/*
 * NOTE(extraction): this file is dynahash.c from the PostgreSQL source tree,
 * captured from a Doxygen-generated page. The page navigation chrome
 * ("PostgreSQL Source Code git master", "Loading...", "Searching...",
 * "No Matches", "Go to the documentation of this file.") has been removed
 * from this header; be aware that the capture may have dropped some
 * hyperlinked source lines elsewhere in the file.
 */
1/*-------------------------------------------------------------------------
2 *
3 * dynahash.c
4 * dynamic chained hash tables
5 *
6 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
7 * shared memory. For shared hash tables, it is the caller's responsibility
8 * to provide appropriate access interlocking. The simplest convention is
9 * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
10 * hash_seq_search) need only shared lock, but any update requires exclusive
11 * lock. For heavily-used shared tables, the single-lock approach creates a
12 * concurrency bottleneck, so we also support "partitioned" locking wherein
13 * there are multiple LWLocks guarding distinct subsets of the table. To use
14 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
15 * to hash_create. This prevents any attempt to split buckets on-the-fly.
16 * Therefore, each hash bucket chain operates independently, and no fields
17 * of the hash header change after init except nentries and freeList.
18 * (A partitioned table uses multiple copies of those fields, guarded by
19 * spinlocks, for additional concurrency.)
20 * This lets any subset of the hash buckets be treated as a separately
21 * lockable partition. We expect callers to use the low-order bits of a
22 * lookup key's hash value as a partition number --- this will work because
23 * of the way calc_bucket() maps hash values to bucket numbers.
24 *
25 * The memory allocator function should match malloc's semantics of returning
26 * NULL on failure. (This is essential for hash tables in shared memory.
27 * For hash tables in local memory, we used to use palloc() which will throw
28 * error on failure; but we no longer do, so it's untested whether this
29 * module will still cope with that behavior.)
30 *
31 * dynahash.c provides support for these types of lookup keys:
32 *
33 * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
34 * compared as though by strcmp(). This is selected by specifying the
35 * HASH_STRINGS flag to hash_create.
36 *
37 * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
38 * (Caller must ensure there are no undefined padding bits in the keys!)
39 * This is selected by specifying the HASH_BLOBS flag to hash_create.
40 *
41 * 3. More complex key behavior can be selected by specifying user-supplied
42 * hashing, comparison, and/or key-copying functions. At least a hashing
43 * function must be supplied; comparison defaults to memcmp() and key copying
44 * to memcpy() when a user-defined hashing function is selected.
45 *
46 * Compared to simplehash, dynahash has the following benefits:
47 *
48 * - It supports partitioning, which is useful for shared memory access using
49 * locks.
50 * - Shared memory hashes are allocated in a fixed size area at startup and
51 * are discoverable by name from other processes.
52 * - Because entries don't need to be moved in the case of hash conflicts,
53 * dynahash has better performance for large entries.
54 * - Guarantees stable pointers to entries.
55 *
56 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
57 * Portions Copyright (c) 1994, Regents of the University of California
58 *
59 *
60 * IDENTIFICATION
61 * src/backend/utils/hash/dynahash.c
62 *
63 *-------------------------------------------------------------------------
64 */
65
66/*
67 * Original comments:
68 *
69 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
70 * Coded into C, with minor code improvements, and with hsearch(3) interface,
71 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
72 * also, hcreate/hdestroy routines added to simulate hsearch(3).
73 *
74 * These routines simulate hsearch(3) and family, with the important
75 * difference that the hash table is dynamic - can grow indefinitely
76 * beyond its original size (as supplied to hcreate()).
77 *
78 * Performance appears to be comparable to that of hsearch(3).
79 * The 'source-code' options referred to in hsearch(3)'s 'man' page
80 * are not implemented; otherwise functionality is identical.
81 *
82 * Compilation controls:
83 * HASH_STATISTICS causes some usage statistics to be maintained, which can be
84 * logged by calling hash_stats().
85 *
86 * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
87 * concatenation property, in probably unnecessary code 'optimization'.
88 *
89 * Modified margo@postgres.berkeley.edu February 1990
90 * added multiple table interface
91 * Modified by sullivan@postgres.berkeley.edu April 1990
92 * changed ctl structure for shared memory
93 */
94
95#include "postgres.h"
96
97#include <limits.h>
98
99#include "access/xact.h"
100#include "common/hashfn.h"
101#include "lib/ilist.h"
102#include "port/pg_bitutils.h"
103#include "storage/shmem.h"
104#include "storage/spin.h"
105#include "utils/memutils.h"
106
107
108/*
109 * Constants
110 *
111 * A hash table has a top-level "directory", each of whose entries points to a
112 * "segment" of HASH_SEGSIZE bucket headers. The maximum number of hash
113 * buckets is thus dsize * HASH_SEGSIZE (but dsize may be expansible). Of
114 * course, the number of records in the table can be larger, but we don't want
115 * a whole lot of records per bucket or performance goes down.
116 *
117 * In a hash table allocated in shared memory, the directory cannot be
118 * expanded because it must stay at a fixed address. The directory size is
119 * chosen at creation based on the initial number of elements, so even though
120 * we support allocating more elements later, performance will suffer if the
121 * table grows much beyond the initial size. (Currently, shared memory hash
122 * tables are only created by ShmemRequestHash()/ShmemInitHash() though, which
123 * doesn't support growing at all.)
124 */
125#define HASH_SEGSIZE 256
126#define HASH_SEGSIZE_SHIFT 8 /* must be log2(HASH_SEGSIZE) */
127#define DEF_DIRSIZE 256
128
129/* Number of freelists to be used for a partitioned hash table. */
130#define NUM_FREELISTS 32
131
132/* A hash bucket is a linked list of HASHELEMENTs */
134
135/* A hash segment is an array of bucket headers */
137
138/*
139 * Per-freelist data.
140 *
141 * In a partitioned hash table, each freelist is associated with a specific
142 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
143 * nentries tracks the number of live hashtable entries having those hashcodes
144 * (NOT the number of entries in the freelist, as you might expect).
145 *
146 * The coverage of a freelist might be more or less than one partition, so it
147 * needs its own lock rather than relying on caller locking. Relying on that
148 * wouldn't work even if the coverage was the same, because of the occasional
149 * need to "borrow" entries from another freelist; see get_hash_entry().
150 *
151 * Using an array of FreeListData instead of separate arrays of mutexes,
152 * nentries and freeLists helps to reduce sharing of cache lines between
153 * different mutexes.
154 */
155typedef struct
156{
157 slock_t mutex; /* spinlock for this freelist */
158 int64 nentries; /* number of entries in associated buckets */
159 HASHELEMENT *freeList; /* chain of free elements */
161
162/*
163 * Header structure for a hash table --- contains all changeable info
164 *
165 * In a shared-memory hash table, the HASHHDR is in shared memory, while
166 * each backend has a local HTAB struct. For a non-shared table, there isn't
167 * any functional difference between HASHHDR and HTAB, but we separate them
168 * anyway to share code between shared and non-shared tables.
169 */
171{
172 /*
173 * The freelist can become a point of contention in high-concurrency hash
174 * tables, so we use an array of freelists, each with its own mutex and
175 * nentries count, instead of just a single one. Although the freelists
176 * normally operate independently, we will scavenge entries from freelists
177 * other than a hashcode's default freelist when necessary.
178 *
179 * If the hash table is not partitioned, only freeList[0] is used and its
180 * spinlock is not used at all; callers' locking is assumed sufficient.
181 */
183
184 /* These fields can change, but not in a partitioned table */
185 /* Also, dsize can't change in a shared table, even if unpartitioned */
186 int64 dsize; /* directory size */
187 int64 nsegs; /* number of allocated segments (<= dsize) */
188 uint32 max_bucket; /* ID of maximum bucket in use */
189 uint32 high_mask; /* mask to modulo into entire table */
190 uint32 low_mask; /* mask to modulo into lower half of table */
191
192 /* These fields are fixed at hashtable creation */
193 Size keysize; /* hash key length in bytes */
194 Size entrysize; /* total user element size in bytes */
195 int64 num_partitions; /* # partitions (must be power of 2), or 0 */
196 int64 max_dsize; /* 'dsize' limit if directory is fixed size */
197 int nelem_alloc; /* number of entries to allocate at once */
198 bool isfixed; /* if true, don't enlarge */
199
200 /* Current directory. In shared tables, this doesn't change */
202
203#ifdef HASH_STATISTICS
204
205 /*
206 * Count statistics here. NB: stats code doesn't bother with mutex, so
207 * counts could be corrupted a bit in a partitioned table.
208 */
212#endif
213};
214
215#define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
216
217#define FREELIST_IDX(hctl, hashcode) \
218 (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
219
220/*
221 * Top control structure for a hashtable --- in a shared table, each backend
222 * has its own copy (OK since no fields change at runtime)
223 */
224struct HTAB
225{
226 HASHHDR *hctl; /* => shared control information */
227 HASHSEGMENT *dir; /* directory of segment starts */
228 HashValueFunc hash; /* hash function */
229 HashCompareFunc match; /* key comparison function */
230 HashCopyFunc keycopy; /* key copying function */
231 HashAllocFunc alloc; /* memory allocator */
232 void *alloc_arg; /* opaque argument passed to allocator */
233 MemoryContext hcxt; /* memory context if default allocator used */
234 char *tabname; /* table name (for error messages) */
235 bool isshared; /* true if table is in shared memory */
236
237 /* freezing a shared table isn't allowed, so we can keep state here */
238 bool frozen; /* true = no more inserts allowed */
239
240 /* We keep local copies of these fixed values to reduce contention */
241 Size keysize; /* hash key length in bytes */
242
243 /*
244 * In a USE_VALGRIND build, non-shared hashtables keep an slist chain of
245 * all the element blocks they have allocated. This pacifies Valgrind,
246 * which would otherwise often claim that the element blocks are "possibly
247 * lost" for lack of any non-interior pointers to their starts.
248 */
249#ifdef USE_VALGRIND
251#endif
252};
253
254/*
255 * Key (also entry) part of a HASHELEMENT
256 */
257#define ELEMENTKEY(helem) (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
258
259/*
260 * Obtain element pointer given pointer to key
261 */
262#define ELEMENT_FROM_KEY(key) \
263 ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
264
265/*
266 * Fast MOD arithmetic, assuming that y is a power of 2 !
267 */
268#define MOD(x,y) ((x) & ((y)-1))
269
270/*
271 * Private function prototypes
272 */
273static void *DynaHashAlloc(Size size, void *alloc_arg);
274static HASHSEGMENT seg_alloc(HTAB *hashp);
275static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
276static bool dir_realloc(HTAB *hashp);
277static bool expand_table(HTAB *hashp);
278static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
279static void hdefault(HTAB *hashp);
280static int choose_nelem_alloc(Size entrysize);
281static bool init_htab(HTAB *hashp, int64 nelem);
282pg_noreturn static void hash_corrupted(HTAB *hashp);
283static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
285static int my_log2(int64 num);
286static int64 next_pow2_int64(int64 num);
287static int next_pow2_int(int64 num);
288static void register_seq_scan(HTAB *hashp);
289static void deregister_seq_scan(HTAB *hashp);
290static bool has_seq_scans(HTAB *hashp);
291
292
293/*
294 * memory allocation support
295 */
296static void *
297DynaHashAlloc(Size size, void *alloc_arg)
298{
299 MemoryContext hcxt = (MemoryContext) alloc_arg;
300
303}
304
305
306/*
307 * HashCompareFunc for string keys
308 *
309 * Because we copy keys with strlcpy(), they will be truncated at keysize-1
310 * bytes, so we can only compare that many ... hence strncmp is almost but
311 * not quite the right thing.
312 */
313static int
314string_compare(const char *key1, const char *key2, Size keysize)
315{
316 return strncmp(key1, key2, keysize - 1);
317}
318
319
320/************************** CREATE ROUTINES **********************/
321
322/*
323 * hash_create -- create a new dynamic hash table
324 *
325 * tabname: a name for the table (for debugging purposes)
326 * nelem: maximum number of elements expected
327 * *info: additional table parameters, as indicated by flags
328 * flags: bitmask indicating which parameters to take from *info
329 *
330 * The flags value *must* include HASH_ELEM. (Formerly, this was nominally
331 * optional, but the default keysize and entrysize values were useless.)
332 * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
333 * or HASH_FUNCTION, to define the key hashing semantics (C strings,
334 * binary blobs, or custom, respectively). Callers specifying a custom
335 * hash function will likely also want to use HASH_COMPARE, and perhaps
336 * also HASH_KEYCOPY, to control key comparison and copying.
337 * Another often-used flag is HASH_CONTEXT, to allocate the hash table
338 * under info->hcxt rather than under TopMemoryContext; the default
339 * behavior is only suitable for session-lifespan hash tables.
340 * Other flags bits are special-purpose and seldom used, except for those
341 * associated with shared-memory hash tables, for which see
342 * ShmemRequestHash().
343 *
344 * Fields in *info are read only when the associated flags bit is set.
345 * It is not necessary to initialize other fields of *info.
346 * Neither tabname nor *info need persist after the hash_create() call.
347 *
348 * Note: It is deprecated for callers of hash_create() to explicitly specify
349 * string_hash, tag_hash, uint32_hash, or oid_hash. Just set HASH_STRINGS or
350 * HASH_BLOBS. Use HASH_FUNCTION only when you want something other than
351 * one of these.
352 *
353 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
354 * estimate, since we can't expand the table on the fly. But an unshared
355 * hashtable can be expanded on-the-fly, so it's better for nelem to be
356 * on the small side and let the table grow if it's exceeded. An overly
357 * large nelem will penalize hash_seq_search speed without buying much.
358 */
359HTAB *
360hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
361{
362 HTAB *hashp;
363 HASHHDR *hctl;
364 MemoryContext hcxt;
365
366 /*
367 * Hash tables now allocate space for key and data, but you have to say
368 * how much space to allocate.
369 */
370 Assert(flags & HASH_ELEM);
371 Assert(info->keysize > 0);
372 Assert(info->entrysize >= info->keysize);
373
374 /*
375 * For shared hash tables, we have a local hash header (HTAB struct) that
376 * we allocate in TopMemoryContext; all else is in shared memory.
377 *
378 * For non-shared hash tables, everything including the hash header is in
379 * a memory context created specially for the hash table --- this makes
380 * hash_destroy very simple. The memory context is made a child of either
381 * a context specified by the caller, or TopMemoryContext if nothing is
382 * specified.
383 *
384 * Note that HASH_ALLOC had better be set as well.
385 */
386 if (flags & HASH_SHARED_MEM)
387 {
388 /* Set up to allocate the hash header */
389 hcxt = TopMemoryContext;
390 }
391 else
392 {
393 /* Create the hash table's private memory context */
394 if (flags & HASH_CONTEXT)
395 hcxt = info->hcxt;
396 else
397 hcxt = TopMemoryContext;
398 hcxt = AllocSetContextCreate(hcxt, "dynahash",
400 }
401
402 /* Initialize the hash header, plus a copy of the table name */
403 hashp = (HTAB *) MemoryContextAlloc(hcxt,
404 sizeof(HTAB) + strlen(tabname) + 1);
405 MemSet(hashp, 0, sizeof(HTAB));
406
407 hashp->tabname = (char *) (hashp + 1);
408 strcpy(hashp->tabname, tabname);
409
410 /* If we have a private context, label it with hashtable's name */
411 if (!(flags & HASH_SHARED_MEM))
413
414 /*
415 * Select the appropriate hash function (see comments at head of file).
416 */
417 if (flags & HASH_FUNCTION)
418 {
419 Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
420 hashp->hash = info->hash;
421 }
422 else if (flags & HASH_BLOBS)
423 {
424 Assert(!(flags & HASH_STRINGS));
425 /* We can optimize hashing for common key sizes */
426 if (info->keysize == sizeof(uint32))
427 hashp->hash = uint32_hash;
428 else
429 hashp->hash = tag_hash;
430 }
431 else
432 {
433 /*
434 * string_hash used to be considered the default hash method, and in a
435 * non-assert build it effectively still is. But we now consider it
436 * an assertion error to not say HASH_STRINGS explicitly. To help
437 * catch mistaken usage of HASH_STRINGS, we also insist on a
438 * reasonably long string length: if the keysize is only 4 or 8 bytes,
439 * it's almost certainly an integer or pointer not a string.
440 */
441 Assert(flags & HASH_STRINGS);
442 Assert(info->keysize > 8);
443
444 hashp->hash = string_hash;
445 }
446
447 /*
448 * If you don't specify a match function, it defaults to string_compare if
449 * you used string_hash, and to memcmp otherwise.
450 *
451 * Note: explicitly specifying string_hash is deprecated, because this
452 * might not work for callers in loadable modules on some platforms due to
453 * referencing a trampoline instead of the string_hash function proper.
454 * Specify HASH_STRINGS instead.
455 */
456 if (flags & HASH_COMPARE)
457 hashp->match = info->match;
458 else if (hashp->hash == string_hash)
460 else
461 hashp->match = memcmp;
462
463 /*
464 * Similarly, the key-copying function defaults to strlcpy or memcpy.
465 */
466 if (flags & HASH_KEYCOPY)
467 hashp->keycopy = info->keycopy;
468 else if (hashp->hash == string_hash)
469 {
470 /*
471 * The signature of keycopy is meant for memcpy(), which returns
472 * void*, but strlcpy() returns size_t. Since we never use the return
473 * value of keycopy, and size_t is pretty much always the same size as
474 * void *, this should be safe. The extra cast in the middle is to
475 * avoid warnings from -Wcast-function-type.
476 */
478 }
479 else
480 hashp->keycopy = memcpy;
481
482 /* And select the entry allocation function, too. */
483 if (flags & HASH_ALLOC)
484 {
485 hashp->alloc = info->alloc;
486 hashp->alloc_arg = info->alloc_arg;
487 }
488 else
489 {
490 hashp->alloc = DynaHashAlloc;
491 hashp->alloc_arg = hcxt;
492 }
493
494 if (flags & HASH_SHARED_MEM)
495 {
496 hashp->hcxt = NULL;
497 hashp->isshared = true;
498
499 /* hash table already exists, we're just attaching to it */
500 if (flags & HASH_ATTACH)
501 {
502 /* Caller must pass the pointer to the shared header */
503 Assert(info->hctl);
504 hashp->hctl = info->hctl;
505
506 /* make local copies of some heavily-used values */
507 hashp->dir = info->hctl->dir;
508 hashp->keysize = info->hctl->keysize;
509
510 return hashp;
511 }
512 }
513 else
514 {
515 /* setup hash table defaults */
516 hashp->hctl = NULL;
517 hashp->dir = NULL;
518 hashp->hcxt = hcxt;
519 hashp->isshared = false;
520 }
521
522 /*
523 * Allocate the header structure.
524 *
525 * XXX: In case of a shared memory hash table, other processes need the
526 * pointer to the header to re-find the hash table. There is currently no
527 * explicit way to pass it back from here, the caller relies on the fact
528 * that this is the first allocation made with the alloc function. That's
529 * a little ugly, but works for now.
530 */
531 hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR), hashp->alloc_arg);
532 if (!hashp->hctl)
535 errmsg("out of memory")));
536
537 hashp->frozen = false;
538
539 hdefault(hashp);
540
541 hctl = hashp->hctl;
542
543 if (flags & HASH_PARTITION)
544 {
545 /* Doesn't make sense to partition a local hash table */
546 Assert(flags & HASH_SHARED_MEM);
547
548 /*
549 * The number of partitions had better be a power of 2. Also, it must
550 * be less than INT_MAX (see init_htab()), so call the int version of
551 * next_pow2.
552 */
554
555 hctl->num_partitions = info->num_partitions;
556 }
557
558 /* remember the entry sizes, too */
559 hctl->keysize = info->keysize;
560 hctl->entrysize = info->entrysize;
561
562 /* make local copies of heavily-used constant fields */
563 hashp->keysize = hctl->keysize;
564
565 /* Build the hash directory structure */
566 if (!init_htab(hashp, nelem))
567 elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
568
569 /*
570 * For a shared hash table, preallocate the requested number of elements.
571 * This reduces problems with run-time out-of-shared-memory conditions.
572 *
573 * For a non-shared hash table, preallocate the requested number of
574 * elements if it's less than our chosen nelem_alloc. This avoids wasting
575 * space if the caller correctly estimates a small table size.
576 */
577 if ((flags & HASH_SHARED_MEM) ||
578 nelem < hctl->nelem_alloc)
579 {
580 int i,
582 nelem_alloc,
584
585 /*
586 * If hash table is partitioned, give each freelist an equal share of
587 * the initial allocation. Otherwise only freeList[0] is used.
588 */
589 if (IS_PARTITIONED(hashp->hctl))
591 else
593
594 nelem_alloc = nelem / freelist_partitions;
595 if (nelem_alloc <= 0)
596 nelem_alloc = 1;
597
598 /*
599 * Make sure we'll allocate all the requested elements; freeList[0]
600 * gets the excess if the request isn't divisible by NUM_FREELISTS.
601 */
602 if (nelem_alloc * freelist_partitions < nelem)
604 nelem - nelem_alloc * (freelist_partitions - 1);
605 else
606 nelem_alloc_first = nelem_alloc;
607
608 for (i = 0; i < freelist_partitions; i++)
609 {
610 int temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
611
612 if (!element_alloc(hashp, temp, i))
615 errmsg("out of memory")));
616 }
617 }
618
619 /* Set isfixed if requested, but not till after we build initial entries */
620 if (flags & HASH_FIXED_SIZE)
621 hctl->isfixed = true;
622
623 return hashp;
624}
625
626/*
627 * Set default HASHHDR parameters.
628 */
629static void
631{
632 HASHHDR *hctl = hashp->hctl;
633
634 MemSet(hctl, 0, sizeof(HASHHDR));
635
636 hctl->num_partitions = 0; /* not partitioned */
637
638 /* table has no fixed maximum size */
639 hctl->max_dsize = NO_MAX_DSIZE;
640
641 hctl->isfixed = false; /* can be enlarged */
642
643#ifdef HASH_STATISTICS
644 hctl->accesses = hctl->collisions = hctl->expansions = 0;
645#endif
646}
647
648/*
649 * Given the user-specified entry size, choose nelem_alloc, ie, how many
650 * elements to add to the hash table when we need more.
651 */
652static int
654{
655 int nelem_alloc;
658
659 /* Each element has a HASHELEMENT header plus user data. */
660 /* NB: this had better match element_alloc() */
661 elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
662
663 /*
664 * The idea here is to choose nelem_alloc at least 32, but round up so
665 * that the allocation request will be a power of 2 or just less. This
666 * makes little difference for hash tables in shared memory, but for hash
667 * tables managed by palloc, the allocation request will be rounded up to
668 * a power of 2 anyway. If we fail to take this into account, we'll waste
669 * as much as half the allocated space.
670 */
671 allocSize = 32 * 4; /* assume elementSize at least 8 */
672 do
673 {
674 allocSize <<= 1;
675 nelem_alloc = allocSize / elementSize;
676 } while (nelem_alloc < 32);
677
678 return nelem_alloc;
679}
680
681/*
682 * Compute derived fields of hctl and build the initial directory/segment
683 * arrays
684 */
685static bool
687{
688 HASHHDR *hctl = hashp->hctl;
690 int nbuckets;
691 int nsegs;
692 int i;
693
694 /*
695 * initialize mutexes if it's a partitioned table
696 */
697 if (IS_PARTITIONED(hctl))
698 for (i = 0; i < NUM_FREELISTS; i++)
699 SpinLockInit(&(hctl->freeList[i].mutex));
700
701 /*
702 * Allocate space for the next greater power of two number of buckets,
703 * assuming a desired maximum load factor of 1.
704 */
705 nbuckets = next_pow2_int(nelem);
706
707 /*
708 * In a partitioned table, nbuckets must be at least equal to
709 * num_partitions; were it less, keys with apparently different partition
710 * numbers would map to the same bucket, breaking partition independence.
711 * (Normally nbuckets will be much bigger; this is just a safety check.)
712 */
713 while (nbuckets < hctl->num_partitions)
714 nbuckets <<= 1;
715
716 hctl->max_bucket = hctl->low_mask = nbuckets - 1;
717 hctl->high_mask = (nbuckets << 1) - 1;
718
719 /*
720 * Figure number of directory segments needed, round up to a power of 2
721 */
722 nsegs = (nbuckets - 1) / HASH_SEGSIZE + 1;
723 nsegs = next_pow2_int(nsegs);
724
725 /*
726 * Make sure directory is big enough.
727 */
728 hctl->dsize = Max(DEF_DIRSIZE, nsegs);
729
730 /* SHM hash tables have a fixed directory. */
731 if (hashp->isshared)
732 hctl->max_dsize = hctl->dsize;
733
734 /* Allocate a directory */
735 hctl->dir = (HASHSEGMENT *)
736 hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT), hashp->alloc_arg);
737 if (!hctl->dir)
738 return false;
739 hashp->dir = hctl->dir;
740
741 /* Allocate initial segments */
742 for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
743 {
744 *segp = seg_alloc(hashp);
745 if (*segp == NULL)
746 return false;
747 }
748
749 /* Choose number of entries to allocate at a time */
751
752 return true;
753}
754
755/*
756 * Estimate the space needed for a hashtable containing the given number
757 * of entries of given size.
758 * NOTE: this is used to estimate the footprint of hashtables in shared
759 * memory; therefore it does not count HTAB which is in local memory.
760 * NB: assumes that all hash structure parameters have default values!
761 */
762Size
763hash_estimate_size(int64 num_entries, Size entrysize)
764{
765 Size size;
767 nSegments,
772
773 /* estimate number of buckets wanted */
774 nBuckets = next_pow2_int64(num_entries);
775 /* # of segments needed for nBuckets */
777 /* directory entries */
779
780 /* fixed control info */
781 size = MAXALIGN(sizeof(HASHHDR)); /* but not HTAB, per above */
782 /* directory */
783 size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
784 /* segments */
785 size = add_size(size, mul_size(nSegments,
786 MAXALIGN(HASH_SEGSIZE * sizeof(HASHBUCKET))));
787 /* elements --- allocated in groups of choose_nelem_alloc() entries */
789 nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
790 elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
791 size = add_size(size,
794
795 return size;
796}
797
798
799/********************** DESTROY ROUTINES ************************/
800
801void
803{
804 if (hashp != NULL)
805 {
806 /* allocation method must be one we know how to free, too */
807 Assert(hashp->alloc == DynaHashAlloc);
808 /* so this hashtable must have its own context */
809 Assert(hashp->hcxt != NULL);
810
811 hash_stats(__func__, hashp);
812
813 /*
814 * Free everything by destroying the hash table's memory context.
815 */
817 }
818}
819
820void
821hash_stats(const char *caller, HTAB *hashp)
822{
823#ifdef HASH_STATISTICS
824 HASHHDR *hctl = hashp->hctl;
825
826 elog(DEBUG4,
827 "hash_stats: Caller: %s Table Name: \"%s\" Accesses: " UINT64_FORMAT " Collisions: " UINT64_FORMAT " Expansions: " UINT64_FORMAT " Entries: " INT64_FORMAT " Key Size: %zu Max Bucket: %u Segment Count: " INT64_FORMAT,
828 caller != NULL ? caller : "(unknown)", hashp->tabname, hctl->accesses,
829 hctl->collisions, hctl->expansions, hash_get_num_entries(hashp),
830 hctl->keysize, hctl->max_bucket, hctl->nsegs);
831#endif
832}
833
834/*******************************SEARCH ROUTINES *****************************/
835
836
837/*
838 * get_hash_value -- exported routine to calculate a key's hash value
839 *
840 * We export this because for partitioned tables, callers need to compute
841 * the partition number (from the low-order bits of the hash value) before
842 * searching.
843 */
844uint32
845get_hash_value(HTAB *hashp, const void *keyPtr)
846{
847 return hashp->hash(keyPtr, hashp->keysize);
848}
849
850/* Convert a hash value to a bucket number */
851static inline uint32
853{
855
856 bucket = hash_val & hctl->high_mask;
857 if (bucket > hctl->max_bucket)
858 bucket = bucket & hctl->low_mask;
859
860 return bucket;
861}
862
863/*
864 * hash_search -- look up key in table and perform action
865 * hash_search_with_hash_value -- same, with key's hash value already computed
866 *
867 * action is one of:
868 * HASH_FIND: look up key in table
869 * HASH_ENTER: look up key in table, creating entry if not present
870 * HASH_ENTER_NULL: same, but return NULL if out of memory
871 * HASH_REMOVE: look up key in table, remove entry if present
872 *
873 * Return value is a pointer to the element found/entered/removed if any,
874 * or NULL if no match was found. (NB: in the case of the REMOVE action,
875 * the result is a dangling pointer that shouldn't be dereferenced!)
876 *
877 * HASH_ENTER will normally ereport a generic "out of memory" error if
878 * it is unable to create a new entry. The HASH_ENTER_NULL operation is
879 * the same except it will return NULL if out of memory.
880 *
881 * If foundPtr isn't NULL, then *foundPtr is set true if we found an
882 * existing entry in the table, false otherwise. This is needed in the
883 * HASH_ENTER case, but is redundant with the return value otherwise.
884 *
885 * For hash_search_with_hash_value, the hashvalue parameter must have been
886 * calculated with get_hash_value().
887 */
888void *
890 const void *keyPtr,
891 HASHACTION action,
892 bool *foundPtr)
893{
894 return hash_search_with_hash_value(hashp,
895 keyPtr,
896 hashp->hash(keyPtr, hashp->keysize),
897 action,
898 foundPtr);
899}
900
/*
 * NOTE(review): extraction gaps — upstream line numbers skip (902, 911-912,
 * 943, 957, 981, 984-985, 991-992, 995, 1017, 1026, 1030, 1035 are absent),
 * so the function-name line, local declarations (currBucket/prevBucketPtr —
 * TODO confirm), spinlock calls, and the entry-allocation/link statements
 * are missing from this view.  Restore from upstream before compiling.
 *
 * Core lookup/insert/delete entry point: finds the bucket for hashvalue,
 * walks the collision chain, then dispatches on `action` (HASH_FIND,
 * HASH_REMOVE, HASH_ENTER, HASH_ENTER_NULL).  *foundPtr (if given) reports
 * whether the key already existed.
 */
 901void *
 903 const void *keyPtr,
 904 uint32 hashvalue,
 905 HASHACTION action,
 906 bool *foundPtr)
 907{
 908 HASHHDR *hctl = hashp->hctl;
 909 int freelist_idx = FREELIST_IDX(hctl, hashvalue);
 910 Size keysize;
 913 HashCompareFunc match;
 914
 915#ifdef HASH_STATISTICS
 916 hctl->accesses++;
 917#endif
 918
 919 /*
 920 * If inserting, check if it is time to split a bucket.
 921 *
 922 * NOTE: failure to expand table is not a fatal error, it just means we
 923 * have to run at higher fill factor than we wanted. However, if we're
 924 * using the palloc allocator then it will throw error anyway on
 925 * out-of-memory, so we must do this before modifying the table.
 926 */
 927 if (action == HASH_ENTER || action == HASH_ENTER_NULL)
 928 {
 929 /*
 930 * Can't split if running in partitioned mode, nor if frozen, nor if
 931 * table is the subject of any active hash_seq_search scans.
 932 */
 933 if (hctl->freeList[0].nentries > (int64) hctl->max_bucket &&
 934 !IS_PARTITIONED(hctl) && !hashp->frozen &&
 935 !has_seq_scans(hashp))
 936 (void) expand_table(hashp);
 937 }
 938
 939 /*
 940 * Do the initial lookup
 941 */
 942 (void) hash_initial_lookup(hashp, hashvalue, &prevBucketPtr);
 /* (extraction gap: upstream line 943 missing here) */
 944
 945 /*
 946 * Follow collision chain looking for matching key
 947 */
 948 match = hashp->match; /* save one fetch in inner loop */
 949 keysize = hashp->keysize; /* ditto */
 950
 951 while (currBucket != NULL)
 952 {
 953 if (currBucket->hashvalue == hashvalue &&
 954 match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
 955 break;
 956 prevBucketPtr = &(currBucket->link);
 /* (extraction gap: upstream line 957 missing here) */
 958#ifdef HASH_STATISTICS
 959 hctl->collisions++;
 960#endif
 961 }
 962
 963 if (foundPtr)
 964 *foundPtr = (bool) (currBucket != NULL);
 965
 966 /*
 967 * OK, now what?
 968 */
 969 switch (action)
 970 {
 971 case HASH_FIND:
 972 if (currBucket != NULL)
 973 return ELEMENTKEY(currBucket);
 974 return NULL;
 975
 976 case HASH_REMOVE:
 977 if (currBucket != NULL)
 978 {
 979 /* if partitioned, must lock to touch nentries and freeList */
 980 if (IS_PARTITIONED(hctl))
 /* (extraction gap: upstream line 981 missing here) */
 982
 983 /* delete the record from the appropriate nentries counter. */
 /* (extraction gap: upstream lines 984-985 missing here) */
 986
 987 /* remove record from hash bucket's chain. */
 988 *prevBucketPtr = currBucket->link;
 989
 990 /* add the record to the appropriate freelist. */
 /* (extraction gap: upstream lines 991-992 missing here) */
 993
 994 if (IS_PARTITIONED(hctl))
 /* (extraction gap: upstream line 995 missing here) */
 996
 997 /*
 998 * better hope the caller is synchronizing access to this
 999 * element, because someone else is going to reuse it the next
 1000 * time something is added to the table
 1001 */
 1002 return ELEMENTKEY(currBucket);
 1003 }
 1004 return NULL;
 1005
 1006 case HASH_ENTER:
 1007 case HASH_ENTER_NULL:
 1008 /* Return existing element if found, else create one */
 1009 if (currBucket != NULL)
 1010 return ELEMENTKEY(currBucket);
 1011
 1012 /* disallow inserts if frozen */
 1013 if (hashp->frozen)
 1014 elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
 1015 hashp->tabname);
 1016
 /* (extraction gap: upstream line 1017, the entry-allocation call, missing) */
 1018 if (currBucket == NULL)
 1019 {
 1020 /* out of memory */
 1021 if (action == HASH_ENTER_NULL)
 1022 return NULL;
 1023 /* report a generic message */
 1024 if (hashp->isshared)
 1025 ereport(ERROR,
 1027 errmsg("out of shared memory")));
 1028 else
 1029 ereport(ERROR,
 1031 errmsg("out of memory")));
 1032 }
 1033
 1034 /* link into hashbucket chain */
 /* (extraction gap: upstream line 1035 missing here) */
 1036 currBucket->link = NULL;
 1037
 1038 /* copy key into record */
 1039 currBucket->hashvalue = hashvalue;
 1040 hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
 1041
 1042 /*
 1043 * Caller is expected to fill the data field on return. DO NOT
 1044 * insert any code that could possibly throw error here, as doing
 1045 * so would leave the table entry incomplete and hence corrupt the
 1046 * caller's data structure.
 1047 */
 1048
 1049 return ELEMENTKEY(currBucket);
 1050 }
 1051
 1052 elog(ERROR, "unrecognized hash action code: %d", (int) action);
 1053
 1054 return NULL; /* keep compiler quiet */
 1055}
1056
 1057/*
 1058 * hash_update_hash_key -- change the hash key of an existing table entry
 1059 *
 1060 * This is equivalent to removing the entry, making a new entry, and copying
 1061 * over its data, except that the entry never goes to the table's freelist.
 1062 * Therefore this cannot suffer an out-of-memory failure, even if there are
 1063 * other processes operating in other partitions of the hashtable.
 1064 *
 1065 * Returns true if successful, false if the requested new hash key is already
 1066 * present. Throws error if the specified entry pointer isn't actually a
 1067 * table member.
 1068 *
 1069 * NB: currently, there is no special case for old and new hash keys being
 1070 * identical, which means we'll report false for that situation. This is
 1071 * preferable for existing uses.
 1072 *
 1073 * NB: for a partitioned hashtable, caller must hold lock on both relevant
 1074 * partitions, if the new hash key would belong to a different partition.
 1075 */
/*
 * NOTE(review): extraction gaps — upstream lines 1077, 1081-1082, 1085-1088,
 * 1109, 1113, 1116, 1123, 1130-1131, 1145, 1154, 1169 are missing from this
 * view (name line, local declarations such as currBucket/prevBucketPtr/
 * oldPrevPtr/newhashvalue/newbucket — TODO confirm — and several statements).
 * Restore from upstream before compiling.
 */
 1076bool
 1078 void *existingEntry,
 1079 const void *newKeyPtr)
 1080{
 1083 Size keysize;
 1084 uint32 bucket;
 1089 HashCompareFunc match;
 1090
 1091#ifdef HASH_STATISTICS
 1092 HASHHDR *hctl = hashp->hctl;
 1093
 1094 hctl->accesses++;
 1095#endif
 1096
 1097 /* disallow updates if frozen */
 1098 if (hashp->frozen)
 1099 elog(ERROR, "cannot update in frozen hashtable \"%s\"",
 1100 hashp->tabname);
 1101
 1102 /*
 1103 * Lookup the existing element using its saved hash value. We need to do
 1104 * this to be able to unlink it from its hash chain, but as a side benefit
 1105 * we can verify the validity of the passed existingEntry pointer.
 1106 */
 1107 bucket = hash_initial_lookup(hashp, existingElement->hashvalue,
 1108 &prevBucketPtr);
 1110
 1111 while (currBucket != NULL)
 1112 {
 /* (extraction gap: loop's match test at upstream line 1113 missing) */
 1114 break;
 1115 prevBucketPtr = &(currBucket->link);
 1117 }
 1118
 1119 if (currBucket == NULL)
 1120 elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
 1121 hashp->tabname);
 1122
 /* (extraction gap: upstream line 1123 missing here) */
 1124
 1125 /*
 1126 * Now perform the equivalent of a HASH_ENTER operation to locate the hash
 1127 * chain we want to put the entry into.
 1128 */
 1129 newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
 /* (extraction gap: upstream lines 1130-1131 missing here) */
 1132
 1133 /*
 1134 * Follow collision chain looking for matching key
 1135 */
 1136 match = hashp->match; /* save one fetch in inner loop */
 1137 keysize = hashp->keysize; /* ditto */
 1138
 1139 while (currBucket != NULL)
 1140 {
 1141 if (currBucket->hashvalue == newhashvalue &&
 1142 match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
 1143 break;
 1144 prevBucketPtr = &(currBucket->link);
 1146#ifdef HASH_STATISTICS
 1147 hctl->collisions++;
 1148#endif
 1149 }
 1150
 1151 if (currBucket != NULL)
 1152 return false; /* collision with an existing entry */
 1153
 /* (extraction gap: upstream line 1154 missing here) */
 1155
 1156 /*
 1157 * If old and new hash values belong to the same bucket, we need not
 1158 * change any chain links, and indeed should not since this simplistic
 1159 * update will corrupt the list if currBucket is the last element. (We
 1160 * cannot fall out earlier, however, since we need to scan the bucket to
 1161 * check for duplicate keys.)
 1162 */
 1163 if (bucket != newbucket)
 1164 {
 1165 /* OK to remove record from old hash bucket's chain. */
 1166 *oldPrevPtr = currBucket->link;
 1167
 1168 /* link into new hashbucket chain */
 /* (extraction gap: upstream line 1169 missing here) */
 1170 currBucket->link = NULL;
 1171 }
 1172
 1173 /* copy new key into record */
 1174 currBucket->hashvalue = newhashvalue;
 1175 hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);
 1176
 1177 /* rest of record is untouched */
 1178
 1179 return true;
 1180}
1181
 1182/*
 1183 * Allocate a new hashtable entry if possible; return NULL if out of memory.
 1184 * (Or, if the underlying space allocator throws error for out-of-memory,
 1185 * we won't return at all.)
 1186 */
/*
 * NOTE(review): extraction gaps — upstream lines 1188, 1191, 1197, 1200,
 * 1206, 1228, 1231-1232, 1235-1236, 1240-1241, 1244-1246, 1251, 1260-1261,
 * 1264 are missing from this view (name line, newElement declaration —
 * TODO confirm — spinlock calls, and the freelist-manipulation statements).
 * Restore from upstream before compiling.
 */
 1187static HASHBUCKET
 1189{
 1190 HASHHDR *hctl = hashp->hctl;
 1192
 1193 for (;;)
 1194 {
 1195 /* if partitioned, must lock to touch nentries and freeList */
 1196 if (IS_PARTITIONED(hctl))
 1198
 1199 /* try to get an entry from the freelist */
 1201
 1202 if (newElement != NULL)
 1203 break;
 1204
 1205 if (IS_PARTITIONED(hctl))
 1207
 1208 /*
 1209 * No free elements in this freelist. In a partitioned table, there
 1210 * might be entries in other freelists, but to reduce contention we
 1211 * prefer to first try to get another chunk of buckets from the main
 1212 * shmem allocator. If that fails, though, we *MUST* root through all
 1213 * the other freelists before giving up. There are multiple callers
 1214 * that assume that they can allocate every element in the initially
 1215 * requested table size, or that deleting an element guarantees they
 1216 * can insert a new element, even if shared memory is entirely full.
 1217 * Failing because the needed element is in a different freelist is
 1218 * not acceptable.
 1219 */
 1220 if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
 1221 {
 1222 int borrow_from_idx;
 1223
 1224 if (!IS_PARTITIONED(hctl))
 1225 return NULL; /* out of memory */
 1226
 1227 /* try to borrow element from another freelist */
 1229 for (;;)
 1230 {
 1233 break; /* examined all freelists, fail */
 1234
 1237
 1238 if (newElement != NULL)
 1239 {
 1242
 1243 /* careful: count the new element in its proper freelist */
 1247
 1248 return newElement;
 1249 }
 1250
 1252 }
 1253
 1254 /* no elements available to borrow either, so out of memory */
 1255 return NULL;
 1256 }
 1257 }
 1258
 1259 /* remove entry from freelist, bump nentries */
 1262
 1263 if (IS_PARTITIONED(hctl))
 1265
 1266 return newElement;
 1267}
1268
 1269/*
 1270 * hash_get_num_entries -- get the number of entries in a hashtable
 1271 */
/*
 * NOTE(review): extraction gap — the function-name line (upstream 1273) is
 * missing from this view.  Sums the per-freelist nentries counters; for a
 * non-partitioned table only freeList[0] is used.
 */
 1272int64
 1274{
 1275 int i;
 1276 int64 sum = hashp->hctl->freeList[0].nentries;
 1277
 1278 /*
 1279 * We currently don't bother with acquiring the mutexes; it's only
 1280 * sensible to call this function if you've got lock on all partitions of
 1281 * the table.
 1282 */
 1283 if (IS_PARTITIONED(hashp->hctl))
 1284 {
 1285 for (i = 1; i < NUM_FREELISTS; i++)
 1286 sum += hashp->hctl->freeList[i].nentries;
 1287 }
 1288
 1289 return sum;
 1290}
1291
 1292/*
 1293 * hash_seq_init/_search/_term
 1294 * Sequentially search through hash table and return
 1295 * all the elements one by one, return NULL when no more.
 1296 *
 1297 * hash_seq_term should be called if and only if the scan is abandoned before
 1298 * completion; if hash_seq_search returns NULL then it has already done the
 1299 * end-of-scan cleanup.
 1300 *
 1301 * NOTE: caller may delete the returned element before continuing the scan.
 1302 * However, deleting any other element while the scan is in progress is
 1303 * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
 1304 * if elements are added to the table while the scan is in progress, it is
 1305 * unspecified whether they will be visited by the scan or not.
 1306 *
 1307 * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
 1308 * worry about hash_seq_term cleanup, if the hashtable is first locked against
 1309 * further insertions by calling hash_freeze.
 1310 *
 1311 * NOTE: to use this with a partitioned hashtable, caller had better hold
 1312 * at least shared lock on all partitions of the table throughout the scan!
 1313 * We can cope with insertions or deletions by our own backend, but *not*
 1314 * with concurrent insertions or deletions by another.
 1315 */
/*
 * NOTE(review): extraction gap — the function-name line (upstream 1317) is
 * missing from this view.  Initializes scan state; frozen tables are not
 * registered since no bucket split can occur on them.
 */
 1316void
 1318{
 1319 status->hashp = hashp;
 1320 status->curBucket = 0;
 1321 status->curEntry = NULL;
 1322 status->hasHashvalue = false;
 1323 if (!hashp->frozen)
 1324 register_seq_scan(hashp);
 1325}
1326
 1327/*
 1328 * Same as above but scan by the given hash value.
 1329 * See also hash_seq_search().
 1330 *
 1331 * NOTE: the default hash function doesn't match syscache hash function.
 1332 * Thus, if you're going to use this function in syscache callback, make sure
 1333 * you're using custom hash function. See relatt_cache_syshash()
 1334 * for example.
 1335 */
/*
 * NOTE(review): extraction gaps — the function-name line (upstream 1337)
 * and the local declaration at upstream 1340 (presumably bucketPtr —
 * TODO confirm) are missing from this view.  Restricts the scan to the one
 * bucket that can contain entries with the given hash value.
 */
 1336void
 1338 uint32 hashvalue)
 1339{
 1341
 1342 hash_seq_init(status, hashp);
 1343
 1344 status->hasHashvalue = true;
 1345 status->hashvalue = hashvalue;
 1346
 1347 status->curBucket = hash_initial_lookup(hashp, hashvalue, &bucketPtr);
 1348 status->curEntry = *bucketPtr;
 1349}
1350
/*
 * NOTE(review): extraction gaps — upstream lines 1352, 1357-1359, 1361 are
 * missing from this view (name line and local declarations: curElem, segp,
 * segment_num, segment_ndx — TODO confirm).  Restore from upstream before
 * compiling.
 *
 * Returns the next element of the scan, or NULL (after performing
 * end-of-scan cleanup via hash_seq_term).  A hash-value-restricted scan
 * walks only the single current bucket; an ordinary scan walks buckets
 * 0..max_bucket in directory/segment order.
 */
 1351void *
 1353{
 1354 HTAB *hashp;
 1355 HASHHDR *hctl;
 1356 uint32 max_bucket;
 1360 uint32 curBucket;
 1362
 1363 if (status->hasHashvalue)
 1364 {
 1365 /*
 1366 * Scan entries only in the current bucket because only this bucket
 1367 * can contain entries with the given hash value.
 1368 */
 1369 while ((curElem = status->curEntry) != NULL)
 1370 {
 1371 status->curEntry = curElem->link;
 1372 if (status->hashvalue != curElem->hashvalue)
 1373 continue;
 1374 return (void *) ELEMENTKEY(curElem);
 1375 }
 1376
 1377 hash_seq_term(status);
 1378 return NULL;
 1379 }
 1380
 1381 if ((curElem = status->curEntry) != NULL)
 1382 {
 1383 /* Continuing scan of curBucket... */
 1384 status->curEntry = curElem->link;
 1385 if (status->curEntry == NULL) /* end of this bucket */
 1386 ++status->curBucket;
 1387 return ELEMENTKEY(curElem);
 1388 }
 1389
 1390 /*
 1391 * Search for next nonempty bucket starting at curBucket.
 1392 */
 1393 curBucket = status->curBucket;
 1394 hashp = status->hashp;
 1395 hctl = hashp->hctl;
 1396 max_bucket = hctl->max_bucket;
 1397
 1398 if (curBucket > max_bucket)
 1399 {
 1400 hash_seq_term(status);
 1401 return NULL; /* search is done */
 1402 }
 1403
 1404 /*
 1405 * first find the right segment in the table directory.
 1406 */
 1407 segment_num = curBucket >> HASH_SEGSIZE_SHIFT;
 1408 segment_ndx = MOD(curBucket, HASH_SEGSIZE);
 1409
 1410 segp = hashp->dir[segment_num];
 1411
 1412 /*
 1413 * Pick up the first item in this bucket's chain. If chain is not empty
 1414 * we can begin searching it. Otherwise we have to advance to find the
 1415 * next nonempty bucket. We try to optimize that case since searching a
 1416 * near-empty hashtable has to iterate this loop a lot.
 1417 */
 1418 while ((curElem = segp[segment_ndx]) == NULL)
 1419 {
 1420 /* empty bucket, advance to next */
 1421 if (++curBucket > max_bucket)
 1422 {
 1423 status->curBucket = curBucket;
 1424 hash_seq_term(status);
 1425 return NULL; /* search is done */
 1426 }
 1427 if (++segment_ndx >= HASH_SEGSIZE)
 1428 {
 1429 segment_num++;
 1430 segment_ndx = 0;
 1431 segp = hashp->dir[segment_num];
 1432 }
 1433 }
 1434
 1435 /* Begin scan of curBucket... */
 1436 status->curEntry = curElem->link;
 1437 if (status->curEntry == NULL) /* end of this bucket */
 1438 ++curBucket;
 1439 status->curBucket = curBucket;
 1440 return ELEMENTKEY(curElem);
 1441}
1442
/*
 * NOTE(review): extraction gap — the function-name line (upstream 1444) is
 * missing from this view.  Deregisters an abandoned scan; frozen tables
 * never registered one (see hash_seq_init), so they are skipped.
 */
 1443void
 1445{
 1446 if (!status->hashp->frozen)
 1447 deregister_seq_scan(status->hashp);
 1448}
1449
 1450/*
 1451 * hash_freeze
 1452 * Freeze a hashtable against future insertions (deletions are
 1453 * still allowed)
 1454 *
 1455 * The reason for doing this is that by preventing any more bucket splits,
 1456 * we no longer need to worry about registering hash_seq_search scans,
 1457 * and thus caller need not be careful about ensuring hash_seq_term gets
 1458 * called at the right times.
 1459 *
 1460 * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
 1461 * with active scans (since hash_seq_term would then do the wrong thing).
 1462 */
/*
 * NOTE(review): extraction gap — the function-name line (upstream 1464) is
 * missing from this view.
 */
 1463void
 1465{
 1466 if (hashp->isshared)
 1467 elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
 1468 if (!hashp->frozen && has_seq_scans(hashp))
 1469 elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
 1470 hashp->tabname);
 1471 hashp->frozen = true;
 1472}
1473
1474
1475/********************************* UTILITIES ************************/
1476
 1477/*
 1478 * Expand the table by adding one more hash bucket.
 1479 */
/*
 * NOTE(review): extraction gaps — upstream lines 1481, 1484, 1486, 1488,
 * 1490, 1492, 1494-1495, 1544-1545, 1550-1551, 1555, 1557, 1560-1561,
 * 1565-1566 are missing from this view (name line, most of the local
 * declaration list — old_seg, new_segnum, old_bucket, old_segnum, oldlink,
 * currElement, nextElement presumably among them; TODO confirm — plus the
 * segment-index computations and the chain-relocation statements).  Restore
 * from upstream before compiling.
 */
 1480static bool
 1482{
 1483 HASHHDR *hctl = hashp->hctl;
 1485 new_seg;
 1487 new_bucket;
 1489 new_segndx;
 1491 old_segndx;
 1493 *newlink;
 1496
 1497 Assert(!IS_PARTITIONED(hctl));
 1498
 1499#ifdef HASH_STATISTICS
 1500 hctl->expansions++;
 1501#endif
 1502
 1503 new_bucket = hctl->max_bucket + 1;
 1504 new_segnum = new_bucket >> HASH_SEGSIZE_SHIFT;
 1505 new_segndx = MOD(new_bucket, HASH_SEGSIZE);
 1506
 1507 if (new_segnum >= hctl->nsegs)
 1508 {
 1509 /* Allocate new segment if necessary -- could fail if dir full */
 1510 if (new_segnum >= hctl->dsize)
 1511 if (!dir_realloc(hashp))
 1512 return false;
 1513 if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
 1514 return false;
 1515 hctl->nsegs++;
 1516 }
 1517
 1518 /* OK, we created a new bucket */
 1519 hctl->max_bucket++;
 1520
 1521 /*
 1522 * *Before* changing masks, find old bucket corresponding to same hash
 1523 * values; values in that bucket may need to be relocated to new bucket.
 1524 * Note that new_bucket is certainly larger than low_mask at this point,
 1525 * so we can skip the first step of the regular hash mask calc.
 1526 */
 1527 old_bucket = (new_bucket & hctl->low_mask);
 1528
 1529 /*
 1530 * If we crossed a power of 2, readjust masks.
 1531 */
 1532 if ((uint32) new_bucket > hctl->high_mask)
 1533 {
 1534 hctl->low_mask = hctl->high_mask;
 1535 hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
 1536 }
 1537
 1538 /*
 1539 * Relocate records to the new bucket. NOTE: because of the way the hash
 1540 * masking is done in calc_bucket, only one old bucket can need to be
 1541 * split at this point. With a different way of reducing the hash value,
 1542 * that might not be true!
 1543 */
 1546
 1547 old_seg = hashp->dir[old_segnum];
 1548 new_seg = hashp->dir[new_segnum];
 1549
 1552
 1553 for (currElement = *oldlink;
 1554 currElement != NULL;
 1556 {
 1558 if ((int64) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
 1559 {
 1562 }
 1563 else
 1564 {
 1567 }
 1568 }
 1569 /* don't forget to terminate the rebuilt hash chains... */
 1570 *oldlink = NULL;
 1571 *newlink = NULL;
 1572
 1573 return true;
 1574}
1575
1576
/*
 * NOTE(review): extraction gaps — upstream lines 1578, 1581-1584, 1599 are
 * missing from this view (name line; declarations of old_p, new_dsize,
 * old_dirsize, new_dirsize — TODO confirm — and the copy of the old
 * directory contents into the new one).  Restore from upstream before
 * compiling.
 *
 * Doubles the segment directory.  Refuses when a max directory size was
 * configured (max_dsize != NO_MAX_DSIZE); on allocation failure the old
 * directory is left intact and false is returned.
 */
 1577static bool
 1579{
 1580 HASHSEGMENT *p;
 1585
 1586 if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
 1587 return false;
 1588
 1589 /* Reallocate directory */
 1590 new_dsize = hashp->hctl->dsize << 1;
 1591 old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
 1592 new_dirsize = new_dsize * sizeof(HASHSEGMENT);
 1593
 1594 old_p = hashp->dir;
 1595 p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize, hashp->alloc_arg);
 1596
 1597 if (p != NULL)
 1598 {
 1600 MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
 1601 hashp->hctl->dir = p;
 1602 hashp->dir = p;
 1603 hashp->hctl->dsize = new_dsize;
 1604
 1605 /* XXX assume the allocator is palloc, so we know how to free */
 1606 Assert(hashp->alloc == DynaHashAlloc);
 1607 pfree(old_p);
 1608
 1609 return true;
 1610 }
 1611
 1612 return false;
 1613}
1614
1615
/*
 * NOTE(review): extraction gaps — the function-name line (upstream 1617)
 * and the segp declaration (upstream 1619) are missing from this view.
 *
 * Allocates and zeroes one bucket segment (HASH_SEGSIZE bucket headers)
 * via the table's allocator; returns NULL on allocation failure.
 */
 1616static HASHSEGMENT
 1618{
 1620
 1621 segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * HASH_SEGSIZE, hashp->alloc_arg);
 1622
 1623 if (!segp)
 1624 return NULL;
 1625
 1626 MemSet(segp, 0, sizeof(HASHBUCKET) * HASH_SEGSIZE);
 1627
 1628 return segp;
 1629}
1630
 1631/*
 1632 * allocate some new elements and link them into the indicated free list
 1633 */
/*
 * NOTE(review): extraction gaps — upstream lines 1634's body details are
 * incomplete: lines 1638-1639, 1641-1643, 1650, 1652, 1673, 1677, 1680,
 * 1685, 1689, 1695, 1698-1699, 1702 are missing from this view (name line,
 * declarations of elementSize/prevElement/tmpElement/firstElement/
 * requestSize — TODO confirm — the element-size/request-size computations,
 * the firstElement assignments, the spinlock calls, and the freelist
 * splice).  Restore from upstream before compiling.
 */
 1634static bool
 1636{
 1637 HASHHDR *hctl = hashp->hctl;
 1640 char *allocedBlock;
 1644 int i;
 1645
 1646 if (hctl->isfixed)
 1647 return false;
 1648
 1649 /* Each element has a HASHELEMENT header plus user data. */
 1651
 1653
 1654 /* Add space for slist_node list link if we need one. */
 1655#ifdef USE_VALGRIND
 1656 if (!hashp->isshared)
 1657 requestSize += MAXALIGN(sizeof(slist_node));
 1658#endif
 1659
 1660 /* Allocate the memory. */
 1661 allocedBlock = hashp->alloc(requestSize, hashp->alloc_arg);
 1662
 1663 if (!allocedBlock)
 1664 return false;
 1665
 1666 /*
 1667 * If USE_VALGRIND, each allocated block of elements of a non-shared
 1668 * hashtable is chained into a list, so that Valgrind won't think it's
 1669 * been leaked.
 1670 */
 1671#ifdef USE_VALGRIND
 1672 if (hashp->isshared)
 1674 else
 1675 {
 1676 slist_push_head(&hashp->element_blocks, (slist_node *) allocedBlock);
 1678 }
 1679#else
 1681#endif
 1682
 1683 /* prepare to link all the new entries into the freelist */
 1684 prevElement = NULL;
 1686 for (i = 0; i < nelem; i++)
 1687 {
 1688 tmpElement->link = prevElement;
 1690 tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
 1691 }
 1692
 1693 /* if partitioned, must lock to touch freeList */
 1694 if (IS_PARTITIONED(hctl))
 1696
 1697 /* freelist could be nonempty if two backends did this concurrently */
 1700
 1701 if (IS_PARTITIONED(hctl))
 1703
 1704 return true;
 1705}
1706
 1707/*
 1708 * Do initial lookup of a bucket for the given hash value, retrieving its
 1709 * bucket number and its hash bucket.
 1710 */
/*
 * NOTE(review): extraction gaps — upstream lines 1715-1717 and 1722-1723,
 * 1730 are missing from this view (declarations of segp/segment_num/
 * segment_ndx — TODO confirm — the segment-index computations, and the
 * *bucketptr assignment).  Restore from upstream before compiling.
 */
 1711static inline uint32
 1713{
 1714 HASHHDR *hctl = hashp->hctl;
 1718 uint32 bucket;
 1719
 1720 bucket = calc_bucket(hctl, hashvalue);
 1721
 1724
 1725 segp = hashp->dir[segment_num];
 1726
 1727 if (segp == NULL)
 1728 hash_corrupted(hashp);
 1729
 1731 return bucket;
 1732}
1733
 1734/* complain when we have detected a corrupted hashtable */
/*
 * NOTE(review): extraction gap — the function-name line (upstream 1736) is
 * missing from this view.  Does not return (declared pg_noreturn upstream
 * per the doxygen index): PANIC forces a cluster-wide restart for shared
 * tables, FATAL kills only this backend otherwise.
 */
 1735static void
 1737{
 1738 /*
 1739 * If the corruption is in a shared hashtable, we'd better force a
 1740 * systemwide restart. Otherwise, just shut down this one backend.
 1741 */
 1742 if (hashp->isshared)
 1743 elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
 1744 else
 1745 elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
 1746}
1747
 1748/* calculate ceil(log base 2) of num */
/*
 * NOTE(review): extraction gap — the function-name line (upstream 1750,
 * taking an int64 num per the callers below) is missing from this view.
 */
 1749static int
 1751{
 1752 /*
 1753 * guard against too-large input, which would be invalid for
 1754 * pg_ceil_log2_*()
 1755 */
 1756 if (num > PG_INT64_MAX / 2)
 1757 num = PG_INT64_MAX / 2;
 1758
 1759 return pg_ceil_log2_64(num);
 1760}
1761
 1762/* calculate first power of 2 >= num, bounded to what will fit in a int64 */
/*
 * NOTE(review): extraction gap — the function-name line (upstream 1764) is
 * missing from this view.  NOTE(review): the visible "1L" literal is only
 * 32 bits on LLP64 platforms — upstream may use an int64-width constant
 * here; confirm before relying on this text.
 */
 1763static int64
 1765{
 1766 /* my_log2's internal range check is sufficient */
 1767 return 1L << my_log2(num);
 1768}
1769
 1770/* calculate first power of 2 >= num, bounded to what will fit in an int */
/*
 * NOTE(review): extraction gap — the function-name line (upstream 1772) is
 * missing from this view.  Clamps first so the shift below cannot exceed
 * int width.
 */
 1771static int
 1773{
 1774 if (num > INT_MAX / 2)
 1775 num = INT_MAX / 2;
 1776 return 1 << my_log2(num);
 1777}
1778
1779
1780/************************* SEQ SCAN TRACKING ************************/
1781
1782/*
1783 * We track active hash_seq_search scans here. The need for this mechanism
1784 * comes from the fact that a scan will get confused if a bucket split occurs
1785 * while it's in progress: it might visit entries twice, or even miss some
1786 * entirely (if it's partway through the same bucket that splits). Hence
1787 * we want to inhibit bucket splits if there are any active scans on the
1788 * table being inserted into. This is a fairly rare case in current usage,
1789 * so just postponing the split until the next insertion seems sufficient.
1790 *
1791 * Given present usages of the function, only a few scans are likely to be
1792 * open concurrently; so a finite-size stack of open scans seems sufficient,
1793 * and we don't worry that linear search is too slow. Note that we do
1794 * allow multiple scans of the same hashtable to be open concurrently.
1795 *
1796 * This mechanism can support concurrent scan and insertion in a shared
1797 * hashtable if it's the same backend doing both. It would fail otherwise,
1798 * but locking reasons seem to preclude any such scenario anyway, so we don't
1799 * worry.
1800 *
1801 * This arrangement is reasonably robust if a transient hashtable is deleted
1802 * without notifying us. The absolute worst case is we might inhibit splits
1803 * in another table created later at exactly the same address. We will give
1804 * a warning at transaction end for reference leaks, so any bugs leading to
1805 * lack of notification should be easy to catch.
1806 */
1807
 1808#define MAX_SEQ_SCANS 100
 1809
 1810static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */
 1811static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */
 1812static int num_seq_scans = 0; /* number of valid entries in the two arrays above */
1813
1814
 1815/* Register a table as having an active hash_seq_search scan */
/*
 * NOTE(review): extraction gaps — upstream lines 1817, 1819, 1822-1823 are
 * missing from this view (name line, the overflow check guarding the elog,
 * and the array-append assignments into seq_scan_tables/seq_scan_level —
 * TODO confirm).  Restore from upstream before compiling.
 */
 1816static void
 1818{
 1820 elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
 1821 hashp->tabname);
 1824 num_seq_scans++;
 1825}
1826
 1827/* Deregister an active scan */
/*
 * NOTE(review): extraction gaps — upstream lines 1829, 1838-1839 are
 * missing from this view (name line and the statements that fill the freed
 * slot from the top of the stack — TODO confirm).  Restore from upstream
 * before compiling.
 */
 1828static void
 1830{
 1831 int i;
 1832
 1833 /* Search backward since it's most likely at the stack top */
 1834 for (i = num_seq_scans - 1; i >= 0; i--)
 1835 {
 1836 if (seq_scan_tables[i] == hashp)
 1837 {
 1840 num_seq_scans--;
 1841 return;
 1842 }
 1843 }
 1844 elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
 1845 hashp->tabname);
 1846}
1847
 1848/* Check if a table has any active scan */
/*
 * NOTE(review): extraction gap — the function-name line (upstream 1850) is
 * missing from this view.  Linear search is fine: MAX_SEQ_SCANS is small
 * and few scans are expected concurrently (see comment above).
 */
 1849static bool
 1851{
 1852 int i;
 1853
 1854 for (i = 0; i < num_seq_scans; i++)
 1855 {
 1856 if (seq_scan_tables[i] == hashp)
 1857 return true;
 1858 }
 1859 return false;
 1860}
1861
 1862/* Clean up any open scans at end of transaction */
/*
 * NOTE(review): extraction gaps — the function-name line (upstream 1864)
 * and the elog argument line (upstream 1882, presumably seq_scan_tables[i]
 * — TODO confirm) are missing from this view.
 */
 1863void
 1865{
 1866 /*
 1867 * During abort cleanup, open scans are expected; just silently clean 'em
 1868 * out. An open scan at commit means someone forgot a hash_seq_term()
 1869 * call, so complain.
 1870 *
 1871 * Note: it's tempting to try to print the tabname here, but refrain for
 1872 * fear of touching deallocated memory. This isn't a user-facing message
 1873 * anyway, so it needn't be pretty.
 1874 */
 1875 if (isCommit)
 1876 {
 1877 int i;
 1878
 1879 for (i = 0; i < num_seq_scans; i++)
 1880 {
 1881 elog(WARNING, "leaked hash_seq_search scan for hash table %p",
 1883 }
 1884 }
 1885 num_seq_scans = 0;
 1886}
1887
 1888/* Clean up any open scans at end of subtransaction */
/*
 * NOTE(review): extraction gaps — the function-name line (upstream 1890)
 * and upstream lines 1905-1907 (the elog argument plus the statements that
 * compact the arrays by moving the top entry into slot i — TODO confirm)
 * are missing from this view.  Restore from upstream before compiling.
 */
 1889void
 1891{
 1892 int i;
 1893
 1894 /*
 1895 * Search backward to make cleanup easy. Note we must check all entries,
 1896 * not only those at the end of the array, because deletion technique
 1897 * doesn't keep them in order.
 1898 */
 1899 for (i = num_seq_scans - 1; i >= 0; i--)
 1900 {
 1901 if (seq_scan_level[i] >= nestDepth)
 1902 {
 1903 if (isCommit)
 1904 elog(WARNING, "leaked hash_seq_search scan for hash table %p",
 1908 num_seq_scans--;
 1909 }
 1910 }
 1911}
#define MAXALIGN(LEN)
Definition c.h:896
#define pg_noreturn
Definition c.h:190
#define Max(x, y)
Definition c.h:1085
#define INT64_FORMAT
Definition c.h:634
#define Assert(condition)
Definition c.h:943
int64_t int64
Definition c.h:621
#define UINT64_FORMAT
Definition c.h:635
#define PG_INT64_MAX
Definition c.h:676
uint64_t uint64
Definition c.h:625
uint32_t uint32
Definition c.h:624
#define MemSet(start, val, len)
Definition c.h:1107
void(* pg_funcptr_t)(void)
Definition c.h:548
size_t Size
Definition c.h:689
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
static HTAB * seq_scan_tables[MAX_SEQ_SCANS]
Definition dynahash.c:1810
static int seq_scan_level[MAX_SEQ_SCANS]
Definition dynahash.c:1811
void hash_seq_init_with_hash_value(HASH_SEQ_STATUS *status, HTAB *hashp, uint32 hashvalue)
Definition dynahash.c:1337
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition dynahash.c:889
#define ELEMENT_FROM_KEY(key)
Definition dynahash.c:262
#define DEF_DIRSIZE
Definition dynahash.c:127
static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx)
Definition dynahash.c:1635
void AtEOXact_HashTables(bool isCommit)
Definition dynahash.c:1864
#define HASH_SEGSIZE_SHIFT
Definition dynahash.c:126
Size hash_estimate_size(int64 num_entries, Size entrysize)
Definition dynahash.c:763
static HASHSEGMENT seg_alloc(HTAB *hashp)
Definition dynahash.c:1617
#define MAX_SEQ_SCANS
Definition dynahash.c:1808
static int next_pow2_int(int64 num)
Definition dynahash.c:1772
static int choose_nelem_alloc(Size entrysize)
Definition dynahash.c:653
static void * DynaHashAlloc(Size size, void *alloc_arg)
Definition dynahash.c:297
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition dynahash.c:360
static void register_seq_scan(HTAB *hashp)
Definition dynahash.c:1817
#define MOD(x, y)
Definition dynahash.c:268
#define IS_PARTITIONED(hctl)
Definition dynahash.c:215
void AtEOSubXact_HashTables(bool isCommit, int nestDepth)
Definition dynahash.c:1890
static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx)
Definition dynahash.c:1188
#define NUM_FREELISTS
Definition dynahash.c:130
void hash_destroy(HTAB *hashp)
Definition dynahash.c:802
static int string_compare(const char *key1, const char *key2, Size keysize)
Definition dynahash.c:314
void * hash_search_with_hash_value(HTAB *hashp, const void *keyPtr, uint32 hashvalue, HASHACTION action, bool *foundPtr)
Definition dynahash.c:902
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition dynahash.c:1352
static bool expand_table(HTAB *hashp)
Definition dynahash.c:1481
static void hdefault(HTAB *hashp)
Definition dynahash.c:630
static void deregister_seq_scan(HTAB *hashp)
Definition dynahash.c:1829
static int my_log2(int64 num)
Definition dynahash.c:1750
#define ELEMENTKEY(helem)
Definition dynahash.c:257
void hash_seq_term(HASH_SEQ_STATUS *status)
Definition dynahash.c:1444
int64 hash_get_num_entries(HTAB *hashp)
Definition dynahash.c:1273
static int num_seq_scans
Definition dynahash.c:1812
static int64 next_pow2_int64(int64 num)
Definition dynahash.c:1764
#define FREELIST_IDX(hctl, hashcode)
Definition dynahash.c:217
void hash_stats(const char *caller, HTAB *hashp)
Definition dynahash.c:821
static bool init_htab(HTAB *hashp, int64 nelem)
Definition dynahash.c:686
static pg_noreturn void hash_corrupted(HTAB *hashp)
Definition dynahash.c:1736
void hash_freeze(HTAB *hashp)
Definition dynahash.c:1464
static bool dir_realloc(HTAB *hashp)
Definition dynahash.c:1578
#define HASH_SEGSIZE
Definition dynahash.c:125
bool hash_update_hash_key(HTAB *hashp, void *existingEntry, const void *newKeyPtr)
Definition dynahash.c:1077
static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr)
Definition dynahash.c:1712
HASHELEMENT * HASHBUCKET
Definition dynahash.c:133
uint32 get_hash_value(HTAB *hashp, const void *keyPtr)
Definition dynahash.c:845
static uint32 calc_bucket(HASHHDR *hctl, uint32 hash_val)
Definition dynahash.c:852
static bool has_seq_scans(HTAB *hashp)
Definition dynahash.c:1850
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition dynahash.c:1317
HASHBUCKET * HASHSEGMENT
Definition dynahash.c:136
int errcode(int sqlerrcode)
Definition elog.c:875
#define FATAL
Definition elog.h:42
#define WARNING
Definition elog.h:37
#define PANIC
Definition elog.h:44
#define ERROR
Definition elog.h:40
#define elog(elevel,...)
Definition elog.h:228
#define ereport(elevel,...)
Definition elog.h:152
#define DEBUG4
Definition elog.h:28
#define MCXT_ALLOC_NO_OOM
Definition fe_memutils.h:29
uint32 tag_hash(const void *key, Size keysize)
Definition hashfn.c:677
uint32 uint32_hash(const void *key, Size keysize)
Definition hashfn.c:688
uint32 string_hash(const void *key, Size keysize)
Definition hashfn.c:660
#define HASH_KEYCOPY
Definition hsearch.h:95
#define HASH_STRINGS
Definition hsearch.h:91
int(* HashCompareFunc)(const void *key1, const void *key2, Size keysize)
Definition hsearch.h:29
HASHACTION
Definition hsearch.h:107
@ HASH_FIND
Definition hsearch.h:108
@ HASH_REMOVE
Definition hsearch.h:110
@ HASH_ENTER
Definition hsearch.h:109
@ HASH_ENTER_NULL
Definition hsearch.h:111
#define HASH_CONTEXT
Definition hsearch.h:97
#define NO_MAX_DSIZE
Definition hsearch.h:103
#define HASH_ELEM
Definition hsearch.h:90
#define HASH_ALLOC
Definition hsearch.h:96
uint32(* HashValueFunc)(const void *key, Size keysize)
Definition hsearch.h:21
#define HASH_ATTACH
Definition hsearch.h:99
#define HASH_COMPARE
Definition hsearch.h:94
#define HASH_FUNCTION
Definition hsearch.h:93
#define HASH_BLOBS
Definition hsearch.h:92
#define HASH_SHARED_MEM
Definition hsearch.h:98
void *(* HashAllocFunc)(Size request, void *alloc_arg)
Definition hsearch.h:43
#define HASH_FIXED_SIZE
Definition hsearch.h:100
#define HASH_PARTITION
Definition hsearch.h:87
void *(* HashCopyFunc)(void *dest, const void *src, Size keysize)
Definition hsearch.h:37
static void slist_push_head(slist_head *head, slist_node *node)
Definition ilist.h:1006
int i
Definition isn.c:77
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1232
void pfree(void *pointer)
Definition mcxt.c:1616
MemoryContext TopMemoryContext
Definition mcxt.c:166
void * MemoryContextAllocExtended(MemoryContext context, Size size, int flags)
Definition mcxt.c:1289
void MemoryContextDelete(MemoryContext context)
Definition mcxt.c:472
void MemoryContextSetIdentifier(MemoryContext context, const char *id)
Definition mcxt.c:661
#define MemoryContextIsValid(context)
Definition memnodes.h:145
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
static char * errmsg
struct MemoryContextData * MemoryContext
Definition palloc.h:36
static uint64 pg_ceil_log2_64(uint64 num)
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
static int fb(int x)
Size add_size(Size s1, Size s2)
Definition shmem.c:1048
Size mul_size(Size s1, Size s2)
Definition shmem.c:1063
static void SpinLockRelease(volatile slock_t *lock)
Definition spin.h:62
static void SpinLockAcquire(volatile slock_t *lock)
Definition spin.h:56
static void SpinLockInit(volatile slock_t *lock)
Definition spin.h:50
slock_t mutex
Definition dynahash.c:157
int64 nentries
Definition dynahash.c:158
HASHELEMENT * freeList
Definition dynahash.c:159
HashAllocFunc alloc
Definition hsearch.h:78
Size keysize
Definition hsearch.h:69
HashValueFunc hash
Definition hsearch.h:72
Size entrysize
Definition hsearch.h:70
void * alloc_arg
Definition hsearch.h:79
HashCompareFunc match
Definition hsearch.h:74
HASHHDR * hctl
Definition hsearch.h:83
MemoryContext hcxt
Definition hsearch.h:81
int64 num_partitions
Definition hsearch.h:67
HashCopyFunc keycopy
Definition hsearch.h:76
struct HASHELEMENT * link
Definition hsearch.h:52
uint32 high_mask
Definition dynahash.c:189
FreeListData freeList[NUM_FREELISTS]
Definition dynahash.c:182
Size entrysize
Definition dynahash.c:194
HASHSEGMENT * dir
Definition dynahash.c:201
uint32 max_bucket
Definition dynahash.c:188
int64 dsize
Definition dynahash.c:186
int64 max_dsize
Definition dynahash.c:196
Size keysize
Definition dynahash.c:193
bool isfixed
Definition dynahash.c:198
int nelem_alloc
Definition dynahash.c:197
int64 num_partitions
Definition dynahash.c:195
uint32 low_mask
Definition dynahash.c:190
int64 nsegs
Definition dynahash.c:187
uint32 hashvalue
Definition hsearch.h:121
HASHELEMENT * curEntry
Definition hsearch.h:119
uint32 curBucket
Definition hsearch.h:118
bool hasHashvalue
Definition hsearch.h:120
bool isshared
Definition dynahash.c:235
HashCompareFunc match
Definition dynahash.c:229
char * tabname
Definition dynahash.c:234
HASHHDR * hctl
Definition dynahash.c:226
MemoryContext hcxt
Definition dynahash.c:233
HashAllocFunc alloc
Definition dynahash.c:231
HashValueFunc hash
Definition dynahash.c:228
void * alloc_arg
Definition dynahash.c:232
HASHSEGMENT * dir
Definition dynahash.c:227
Size keysize
Definition dynahash.c:241
HashCopyFunc keycopy
Definition dynahash.c:230
bool frozen
Definition dynahash.c:238
int GetCurrentTransactionNestLevel(void)
Definition xact.c:931