/*
 * simplehash.h
 *
 * When included this file generates a "templated" (by way of macros)
 * open-addressing hash table implementation specialized to user-defined
 * types.
 *
 * It's probably not worthwhile to generate such a specialized implementation
 * for hash tables that aren't performance or space sensitive.
 *
 * Compared to dynahash, simplehash has the following benefits:
 *
 * - Due to the "templated" code generation has known structure sizes and no
 *   indirect function calls (which show up substantially in dynahash
 *   profiles). These features considerably increase speed for small
 *   entries.
 * - Open addressing has better CPU cache behavior than dynahash's chained
 *   hashtables.
 * - The generated interface is type-safe and easier to use than dynahash,
 *   though at the cost of more complex setup.
 * - Allocates memory in a MemoryContext or another allocator with a
 *   malloc/free style interface (which isn't easily usable in a shared
 *   memory context)
 * - Does not require the overhead of a separate memory context.
 *
 * Usage notes:
 *
 * To generate a hash-table and associated functions for a use case several
 * macros have to be #define'ed before this file is included. Including
 * the file #undef's all those, so a new hash table can be generated
 * afterwards.
 * The relevant parameters are:
 * - SH_PREFIX - prefix for all symbol names generated. A prefix of 'foo'
 *   will result in hash table type 'foo_hash' and functions like
 *   'foo_insert'/'foo_lookup' and so forth.
 * - SH_ELEMENT_TYPE - type of the contained elements
 * - SH_KEY_TYPE - type of the hashtable's key
 * - SH_DECLARE - if defined function prototypes and type declarations are
 *   generated
 * - SH_DEFINE - if defined function definitions are generated
 * - SH_SCOPE - in which scope (e.g. extern, static inline) do function
 *   declarations reside
 * - SH_RAW_ALLOCATOR - if defined, memory contexts are not used; instead,
 *   use this to allocate bytes. The allocator must zero the returned space.
 * - SH_USE_NONDEFAULT_ALLOCATOR - if defined no element allocator functions
 *   are defined, so you can supply your own
 * The following parameters are only relevant when SH_DEFINE is defined:
 * - SH_KEY - name of the element in SH_ELEMENT_TYPE containing the hash key
 * - SH_EQUAL(table, a, b) - compare two table keys
 * - SH_HASH_KEY(table, key) - generate hash for the key
 * - SH_STORE_HASH - if defined the hash is stored in the elements
 * - SH_GET_HASH(tb, a) - return the field to store the hash in
 *
 * The element type is required to contain a "status" member that can store
 * the range of values defined in the SH_STATUS enum.
 *
 * While SH_STORE_HASH (and subsequently SH_GET_HASH) are optional, because
 * the hash table implementation needs to compare hashes to move elements
 * (particularly when growing the hash), it's preferable, if possible, to
 * store the element's hash in the element's data type. If the hash is so
 * stored, the hash table will also compare hashes before calling SH_EQUAL
 * when comparing two keys.
 *
 * For convenience the hash table create functions accept a void pointer
 * that will be stored in the hash table type's member private_data. This
 * allows callbacks to reference caller provided data.
 *
 * For examples of usage look at tidbitmap.c (file local definition) and
 * execnodes.h/execGrouping.c (exposed declaration, file local
 * implementation).
 *
 * Hash table design:
 *
 * The hash table design chosen is a variant of linear open-addressing. The
 * reason for doing so is that linear addressing is CPU cache & pipeline
 * friendly. The biggest disadvantage of simple linear addressing schemes
 * are highly variable lookup times due to clustering, and deletions
 * leaving a lot of tombstones around. To address these issues a variant
 * of "robin hood" hashing is employed. Robin hood hashing optimizes
 * chaining lengths by moving elements close to their optimal bucket
 * ("rich" elements), out of the way if a to-be-inserted element is further
 * away from its optimal position (i.e. it's "poor"). While that can make
 * insertions slower, the average lookup performance is a lot better, and
 * higher fill factors can be used in a still performant manner. To avoid
 * tombstones - which normally solve the issue that a deleted node's
 * presence is relevant to determine whether a lookup needs to continue
 * looking or is done - buckets following a deleted element are shifted
 * backwards, unless they're empty or already at their optimal position.
 *
 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/lib/simplehash.h
 */
95
96
/* helpers to build the prefixed symbol names */
#define SH_MAKE_PREFIX(a) CppConcat(a,_)
#define SH_MAKE_NAME(name) SH_MAKE_NAME_(SH_MAKE_PREFIX(SH_PREFIX),name)
#define SH_MAKE_NAME_(a,b) CppConcat(a,b)

/* name macros for: */

/* type declarations */
#define SH_TYPE SH_MAKE_NAME(hash)
#define SH_STATUS SH_MAKE_NAME(status)
#define SH_STATUS_EMPTY SH_MAKE_NAME(SH_EMPTY)
#define SH_STATUS_IN_USE SH_MAKE_NAME(SH_IN_USE)
#define SH_ITERATOR SH_MAKE_NAME(iterator)

/* function declarations */
#define SH_CREATE SH_MAKE_NAME(create)
#define SH_DESTROY SH_MAKE_NAME(destroy)
#define SH_RESET SH_MAKE_NAME(reset)
#define SH_INSERT SH_MAKE_NAME(insert)
#define SH_INSERT_HASH SH_MAKE_NAME(insert_hash)
#define SH_DELETE_ITEM SH_MAKE_NAME(delete_item)
#define SH_DELETE SH_MAKE_NAME(delete)
#define SH_LOOKUP SH_MAKE_NAME(lookup)
#define SH_LOOKUP_HASH SH_MAKE_NAME(lookup_hash)
#define SH_GROW SH_MAKE_NAME(grow)
#define SH_START_ITERATE SH_MAKE_NAME(start_iterate)
#define SH_START_ITERATE_AT SH_MAKE_NAME(start_iterate_at)
#define SH_ITERATE SH_MAKE_NAME(iterate)
#define SH_ALLOCATE SH_MAKE_NAME(allocate)
#define SH_FREE SH_MAKE_NAME(free)
#define SH_ESTIMATE_SPACE SH_MAKE_NAME(estimate_space)
#define SH_STAT SH_MAKE_NAME(stat)

/* internal helper functions (no externally visible prototypes) */
#define SH_COMPUTE_SIZE SH_MAKE_NAME(compute_size)
#define SH_UPDATE_PARAMETERS SH_MAKE_NAME(update_parameters)
#define SH_NEXT SH_MAKE_NAME(next)
#define SH_PREV SH_MAKE_NAME(prev)
#define SH_DISTANCE_FROM_OPTIMAL SH_MAKE_NAME(distance)
#define SH_INITIAL_BUCKET SH_MAKE_NAME(initial_bucket)
#define SH_ENTRY_HASH SH_MAKE_NAME(entry_hash)
#define SH_INSERT_HASH_INTERNAL SH_MAKE_NAME(insert_hash_internal)
#define SH_LOOKUP_HASH_INTERNAL SH_MAKE_NAME(lookup_hash_internal)
140
/* generate forward declarations necessary to use the hash table */
#ifdef SH_DECLARE

/* type definitions */
typedef struct SH_TYPE
{
    /*
     * Size of data / bucket array, 64 bits to handle UINT32_MAX sized hash
     * tables.  Note that the maximum number of elements is lower
     * (SH_MAX_FILLFACTOR)
     */
    uint64      size;

    /* how many elements have valid contents */
    uint32      members;

    /* mask for bucket and size calculations, based on size */
    uint32      sizemask;

    /* boundary after which to grow hashtable */
    uint32      grow_threshold;

    /* hash buckets */
    SH_ELEMENT_TYPE *data;

#ifndef SH_RAW_ALLOCATOR
    /* memory context to use for allocations */
    MemoryContext ctx;
#endif

    /* user defined data, useful for callbacks */
    void       *private_data;
}           SH_TYPE;

typedef enum SH_STATUS
{
    SH_STATUS_EMPTY = 0x00,
    SH_STATUS_IN_USE = 0x01
} SH_STATUS;

typedef struct SH_ITERATOR
{
    uint32      cur;            /* current element */
    uint32      end;
    bool        done;           /* iterator exhausted? */
}           SH_ITERATOR;

/* externally visible function prototypes */
#ifdef SH_RAW_ALLOCATOR
/* <prefix>_hash <prefix>_create(uint32 nelements, void *private_data) */
SH_SCOPE    SH_TYPE *SH_CREATE(uint32 nelements, void *private_data);
#else
/*
 * <prefix>_hash <prefix>_create(MemoryContext ctx, uint32 nelements,
 *                               void *private_data)
 */
SH_SCOPE    SH_TYPE *SH_CREATE(MemoryContext ctx, uint32 nelements,
                               void *private_data);
#endif

/* void <prefix>_destroy(<prefix>_hash *tb) */
SH_SCOPE void SH_DESTROY(SH_TYPE * tb);

/* void <prefix>_reset(<prefix>_hash *tb) */
SH_SCOPE void SH_RESET(SH_TYPE * tb);

/* void <prefix>_grow(<prefix>_hash *tb, uint64 newsize) */
SH_SCOPE void SH_GROW(SH_TYPE * tb, uint64 newsize);

/* <element> *<prefix>_insert(<prefix>_hash *tb, <key> key, bool *found) */
SH_SCOPE    SH_ELEMENT_TYPE *SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key,
                                       bool *found);

/*
 * <element> *<prefix>_insert_hash(<prefix>_hash *tb, <key> key, uint32 hash,
 *                                 bool *found)
 */
SH_SCOPE    SH_ELEMENT_TYPE *SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key,
                                            uint32 hash, bool *found);

/* <element> *<prefix>_lookup(<prefix>_hash *tb, <key> key) */
SH_SCOPE    SH_ELEMENT_TYPE *SH_LOOKUP(SH_TYPE * tb, SH_KEY_TYPE key);

/* <element> *<prefix>_lookup_hash(<prefix>_hash *tb, <key> key, uint32 hash) */
SH_SCOPE    SH_ELEMENT_TYPE *SH_LOOKUP_HASH(SH_TYPE * tb, SH_KEY_TYPE key,
                                            uint32 hash);

/* void <prefix>_delete_item(<prefix>_hash *tb, <element> *entry) */
SH_SCOPE void SH_DELETE_ITEM(SH_TYPE * tb, SH_ELEMENT_TYPE * entry);

/* bool <prefix>_delete(<prefix>_hash *tb, <key> key) */
SH_SCOPE bool SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key);

/* void <prefix>_start_iterate(<prefix>_hash *tb, <prefix>_iterator *iter) */
SH_SCOPE void SH_START_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);

/*
 * void <prefix>_start_iterate_at(<prefix>_hash *tb, <prefix>_iterator *iter,
 *                                uint32 at)
 */
SH_SCOPE void SH_START_ITERATE_AT(SH_TYPE * tb, SH_ITERATOR * iter, uint32 at);

/* <element> *<prefix>_iterate(<prefix>_hash *tb, <prefix>_iterator *iter) */
SH_SCOPE    SH_ELEMENT_TYPE *SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);

/* size_t <prefix>_estimate_space(double nentries) */
SH_SCOPE size_t SH_ESTIMATE_SPACE(double nentries);

/* void <prefix>_stat(<prefix>_hash *tb) */
SH_SCOPE void SH_STAT(SH_TYPE * tb);

#endif                          /* SH_DECLARE */
252
253
254/* generate implementation of the hash table */
255#ifdef SH_DEFINE
256
257#include "port/pg_bitutils.h"
258
259#ifndef SH_RAW_ALLOCATOR
260#include "utils/memutils.h"
261#endif
262
263/* max data array size,we allow up to PG_UINT32_MAX buckets, including 0 */
264#define SH_MAX_SIZE (((uint64) PG_UINT32_MAX) + 1)
265
266/* normal fillfactor, unless already close to maximum */
267#ifndef SH_FILLFACTOR
268#define SH_FILLFACTOR (0.9)
269#endif
270/* increase fillfactor if we otherwise would error out */
271#define SH_MAX_FILLFACTOR (0.98)
272/* grow if actual and optimal location bigger than */
273#ifndef SH_GROW_MAX_DIB
274#define SH_GROW_MAX_DIB 25
275#endif
276/* grow if more than elements to move when inserting */
277#ifndef SH_GROW_MAX_MOVE
278#define SH_GROW_MAX_MOVE 150
279#endif
280#ifndef SH_GROW_MIN_FILLFACTOR
281/* but do not grow due to SH_GROW_MAX_* if below */
282#define SH_GROW_MIN_FILLFACTOR 0.1
283#endif
284
285#ifdef SH_STORE_HASH
286#define SH_COMPARE_KEYS(tb, ahash, akey, b) (ahash == SH_GET_HASH(tb, b) && SH_EQUAL(tb, b->SH_KEY, akey))
287#else
288#define SH_COMPARE_KEYS(tb, ahash, akey, b) (SH_EQUAL(tb, b->SH_KEY, akey))
289#endif
290
291/*
292 * Wrap the following definitions in include guards, to avoid multiple
293 * definition errors if this header is included more than once. The rest of
294 * the file deliberately has no include guards, because it can be included
295 * with different parameters to define functions and types with non-colliding
296 * names.
297 */
298#ifndef SIMPLEHASH_H
299#define SIMPLEHASH_H
300
301#ifdef FRONTEND
302#define sh_error(...) pg_fatal(__VA_ARGS__)
303#define sh_log(...) pg_log_info(__VA_ARGS__)
304#else
305#define sh_error(...) elog(ERROR, __VA_ARGS__)
306#define sh_log(...) elog(LOG, __VA_ARGS__)
307#endif
308
309#endif
310
311/*
312 * Compute allocation size for hashtable. Result can be passed to
313 * SH_UPDATE_PARAMETERS. (Keep SH_ESTIMATE_SPACE in sync with this!)
314 */
315static inline uint64
317{
318 uint64 size;
319
320 /* supporting zero sized hashes would complicate matters */
321 size = Max(newsize, 2);
322
323 /* round up size to the next power of 2, that's how bucketing works */
324 size = pg_nextpower2_64(size);
325 Assert(size <= SH_MAX_SIZE);
326
327 /*
328 * Verify that allocation of ->data is possible on this platform, without
329 * overflowing Size.
330 */
331 if (unlikely((((uint64) sizeof(SH_ELEMENT_TYPE)) * size) >= SIZE_MAX / 2))
332 sh_error("hash table too large");
333
334 return size;
335}
336
337/*
338 * Update sizing parameters for hashtable. Called when creating and growing
339 * the hashtable.
340 */
341static inline void
343{
345
346 /* now set size */
347 tb->size = size;
348 tb->sizemask = (uint32) (size - 1);
349
350 /*
351 * Compute the next threshold at which we need to grow the hash table
352 * again.
353 */
354 if (tb->size == SH_MAX_SIZE)
355 tb->grow_threshold = ((double) tb->size) * SH_MAX_FILLFACTOR;
356 else
357 tb->grow_threshold = ((double) tb->size) * SH_FILLFACTOR;
358}
359
360/* return the optimal bucket for the hash */
361static inline uint32
363{
364 return hash & tb->sizemask;
365}
366
367/* return next bucket after the current, handling wraparound */
368static inline uint32
370{
371 curelem = (curelem + 1) & tb->sizemask;
372
374
375 return curelem;
376}
377
378/* return bucket before the current, handling wraparound */
379static inline uint32
381{
382 curelem = (curelem - 1) & tb->sizemask;
383
385
386 return curelem;
387}
388
389/* return distance between bucket and its optimal position */
390static inline uint32
392{
393 if (optimal <= bucket)
394 return bucket - optimal;
395 else
396 return (tb->size + bucket) - optimal;
397}
398
399static inline uint32
401{
402#ifdef SH_STORE_HASH
403 return SH_GET_HASH(tb, entry);
404#else
405 return SH_HASH_KEY(tb, entry->SH_KEY);
406#endif
407}
408
409/* default memory allocator function */
410static inline void *SH_ALLOCATE(SH_TYPE * type, Size size);
411static inline void SH_FREE(SH_TYPE * type, void *pointer);
412
413#ifndef SH_USE_NONDEFAULT_ALLOCATOR
414
415/* default memory allocator function */
416static inline void *
418{
419#ifdef SH_RAW_ALLOCATOR
420 return SH_RAW_ALLOCATOR(size);
421#else
422 return MemoryContextAllocExtended(type->ctx, size,
424#endif
425}
426
427/* default memory free function */
428static inline void
429SH_FREE(SH_TYPE * type, void *pointer)
430{
431 pfree(pointer);
432}
433
434#endif
435
436/*
437 * Create a hash table with enough space for `nelements` distinct members.
438 * Memory for the hash table is allocated from the passed-in context. If
439 * desired, the array of elements can be allocated using a passed-in allocator;
440 * this could be useful in order to place the array of elements in a shared
441 * memory, or in a context that will outlive the rest of the hash table.
442 * Memory other than for the array of elements will still be allocated from
443 * the passed-in context.
444 */
445#ifdef SH_RAW_ALLOCATOR
447SH_CREATE(uint32 nelements, void *private_data)
448#else
450SH_CREATE(MemoryContext ctx, uint32 nelements, void *private_data)
451#endif
452{
453 SH_TYPE *tb;
454 uint64 size;
455
456#ifdef SH_RAW_ALLOCATOR
457 tb = (SH_TYPE *) SH_RAW_ALLOCATOR(sizeof(SH_TYPE));
458#else
459 tb = (SH_TYPE *) MemoryContextAllocZero(ctx, sizeof(SH_TYPE));
460 tb->ctx = ctx;
461#endif
462 tb->private_data = private_data;
463
464 /* increase nelements by fillfactor, want to store nelements elements */
465 size = Min((double) SH_MAX_SIZE, ((double) nelements) / SH_FILLFACTOR);
466
467 size = SH_COMPUTE_SIZE(size);
468
469 tb->data = (SH_ELEMENT_TYPE *) SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * size);
470
472 return tb;
473}
474
475/* destroy a previously created hash table */
476SH_SCOPE void
478{
479 SH_FREE(tb, tb->data);
480 pfree(tb);
481}
482
483/* reset the contents of a previously created hash table */
484SH_SCOPE void
486{
487 memset(tb->data, 0, sizeof(SH_ELEMENT_TYPE) * tb->size);
488 tb->members = 0;
489}
490
491/*
492 * Grow a hash table to at least `newsize` buckets.
493 *
494 * Usually this will automatically be called by insertions/deletions, when
495 * necessary. But resizing to the exact input size can be advantageous
496 * performance-wise, when known at some point.
497 */
498SH_SCOPE void
500{
501 uint64 oldsize = tb->size;
502 SH_ELEMENT_TYPE *olddata = tb->data;
504 uint32 i;
505 uint32 startelem = 0;
507
511
513
514 tb->data = (SH_ELEMENT_TYPE *) SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * newsize);
515
516 /*
517 * Update parameters for new table after allocation succeeds to avoid
518 * inconsistent state on OOM.
519 */
521
522 newdata = tb->data;
523
524 /*
525 * Copy entries from the old data to newdata. We theoretically could use
526 * SH_INSERT here, to avoid code duplication, but that's more general than
527 * we need. We neither want tb->members increased, nor do we need to do
528 * deal with deleted elements, nor do we need to compare keys. So a
529 * special-cased implementation is lot faster. As resizing can be time
530 * consuming and frequent, that's worthwhile to optimize.
531 *
532 * To be able to simply move entries over, we have to start not at the
533 * first bucket (i.e olddata[0]), but find the first bucket that's either
534 * empty, or is occupied by an entry at its optimal position. Such a
535 * bucket has to exist in any table with a load factor under 1, as not all
536 * buckets are occupied, i.e. there always has to be an empty bucket. By
537 * starting at such a bucket we can move the entries to the larger table,
538 * without having to deal with conflicts.
539 */
540
541 /* search for the first element in the hash that's not wrapped around */
542 for (i = 0; i < oldsize; i++)
543 {
545 uint32 hash;
547
548 if (oldentry->status != SH_STATUS_IN_USE)
549 {
550 startelem = i;
551 break;
552 }
553
556
557 if (optimal == i)
558 {
559 startelem = i;
560 break;
561 }
562 }
563
564 /* and copy all elements in the old table */
566 for (i = 0; i < oldsize; i++)
567 {
569
570 if (oldentry->status == SH_STATUS_IN_USE)
571 {
572 uint32 hash;
576
580
581 /* find empty element to put data into */
582 while (true)
583 {
585
586 if (newentry->status == SH_STATUS_EMPTY)
587 {
588 break;
589 }
590
592 }
593
594 /* copy entry to new slot */
596 }
597
598 /* can't use SH_NEXT here, would use new size */
599 copyelem++;
600 if (copyelem >= oldsize)
601 {
602 copyelem = 0;
603 }
604 }
605
607}
608
609/*
610 * This is a separate static inline function, so it can be reliably be inlined
611 * into its wrapper functions even if SH_SCOPE is extern.
612 */
613static inline SH_ELEMENT_TYPE *
615{
620
621restart:
622 insertdist = 0;
623
624 /*
625 * We do the grow check even if the key is actually present, to avoid
626 * doing the check inside the loop. This also lets us avoid having to
627 * re-find our position in the hashtable after resizing.
628 *
629 * Note that this also reached when resizing the table due to
630 * SH_GROW_MAX_DIB / SH_GROW_MAX_MOVE.
631 */
632 if (unlikely(tb->members >= tb->grow_threshold))
633 {
634 if (unlikely(tb->size == SH_MAX_SIZE))
635 sh_error("hash table size exceeded");
636
637 /*
638 * When optimizing, it can be very useful to print these out.
639 */
640 /* SH_STAT(tb); */
641 SH_GROW(tb, tb->size * 2);
642 /* SH_STAT(tb); */
643 }
644
645 /* perform insert, start bucket search at optimal location */
646 data = tb->data;
649 while (true)
650 {
654 SH_ELEMENT_TYPE *entry = &data[curelem];
655
656 /* any empty bucket can directly be used */
657 if (entry->status == SH_STATUS_EMPTY)
658 {
659 tb->members++;
660 entry->SH_KEY = key;
661#ifdef SH_STORE_HASH
662 SH_GET_HASH(tb, entry) = hash;
663#endif
664 entry->status = SH_STATUS_IN_USE;
665 *found = false;
666 return entry;
667 }
668
669 /*
670 * If the bucket is not empty, we either found a match (in which case
671 * we're done), or we have to decide whether to skip over or move the
672 * colliding entry. When the colliding element's distance to its
673 * optimal position is smaller than the to-be-inserted entry's, we
674 * shift the colliding entry (and its followers) forward by one.
675 */
676
677 if (SH_COMPARE_KEYS(tb, hash, key, entry))
678 {
679 Assert(entry->status == SH_STATUS_IN_USE);
680 *found = true;
681 return entry;
682 }
683
684 curhash = SH_ENTRY_HASH(tb, entry);
687
688 if (insertdist > curdist)
689 {
690 SH_ELEMENT_TYPE *lastentry = entry;
693 int32 emptydist = 0;
694
695 /* find next empty bucket */
696 while (true)
697 {
699
702
703 if (emptyentry->status == SH_STATUS_EMPTY)
704 {
706 break;
707 }
708
709 /*
710 * To avoid negative consequences from overly imbalanced
711 * hashtables, grow the hashtable if collisions would require
712 * us to move a lot of entries. The most likely cause of such
713 * imbalance is filling a (currently) small table, from a
714 * currently big one, in hash-table order. Don't grow if the
715 * hashtable would be too empty, to prevent quick space
716 * explosion for some weird edge cases.
717 */
719 ((double) tb->members / tb->size) >= SH_GROW_MIN_FILLFACTOR)
720 {
721 tb->grow_threshold = 0;
722 goto restart;
723 }
724 }
725
726 /* shift forward, starting at last occupied element */
727
728 /*
729 * TODO: This could be optimized to be one memcpy in many cases,
730 * excepting wrapping around at the end of ->data. Hasn't shown up
731 * in profiles so far though.
732 */
734 while (moveelem != curelem)
735 {
737
740
743 }
744
745 /* and fill the now empty spot */
746 tb->members++;
747
748 entry->SH_KEY = key;
749#ifdef SH_STORE_HASH
750 SH_GET_HASH(tb, entry) = hash;
751#endif
752 entry->status = SH_STATUS_IN_USE;
753 *found = false;
754 return entry;
755 }
756
758 insertdist++;
759
760 /*
761 * To avoid negative consequences from overly imbalanced hashtables,
762 * grow the hashtable if collisions lead to large runs. The most
763 * likely cause of such imbalance is filling a (currently) small
764 * table, from a currently big one, in hash-table order. Don't grow
765 * if the hashtable would be too empty, to prevent quick space
766 * explosion for some weird edge cases.
767 */
769 ((double) tb->members / tb->size) >= SH_GROW_MIN_FILLFACTOR)
770 {
771 tb->grow_threshold = 0;
772 goto restart;
773 }
774 }
775}
776
777/*
778 * Insert the key into the hash-table, set *found to true if the key already
779 * exists, false otherwise. Returns the hash-table entry in either case.
780 */
782SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found)
783{
784 uint32 hash = SH_HASH_KEY(tb, key);
785
786 return SH_INSERT_HASH_INTERNAL(tb, key, hash, found);
787}
788
789/*
790 * Insert the key into the hash-table using an already-calculated hash. Set
791 * *found to true if the key already exists, false otherwise. Returns the
792 * hash-table entry in either case.
793 */
795SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key, uint32 hash, bool *found)
796{
797 return SH_INSERT_HASH_INTERNAL(tb, key, hash, found);
798}
799
800/*
801 * This is a separate static inline function, so it can be reliably be inlined
802 * into its wrapper functions even if SH_SCOPE is extern.
803 */
804static inline SH_ELEMENT_TYPE *
806{
809
810 while (true)
811 {
812 SH_ELEMENT_TYPE *entry = &tb->data[curelem];
813
814 if (entry->status == SH_STATUS_EMPTY)
815 {
816 return NULL;
817 }
818
819 Assert(entry->status == SH_STATUS_IN_USE);
820
821 if (SH_COMPARE_KEYS(tb, hash, key, entry))
822 return entry;
823
824 /*
825 * TODO: we could stop search based on distance. If the current
826 * buckets's distance-from-optimal is smaller than what we've skipped
827 * already, the entry doesn't exist. Probably only do so if
828 * SH_STORE_HASH is defined, to avoid re-computing hashes?
829 */
830
832 }
833}
834
835/*
836 * Lookup entry in hash table. Returns NULL if key not present.
837 */
840{
841 uint32 hash = SH_HASH_KEY(tb, key);
842
843 return SH_LOOKUP_HASH_INTERNAL(tb, key, hash);
844}
845
846/*
847 * Lookup entry in hash table using an already-calculated hash.
848 *
849 * Returns NULL if key not present.
850 */
853{
854 return SH_LOOKUP_HASH_INTERNAL(tb, key, hash);
855}
856
857/*
858 * Delete entry from hash table by key. Returns whether to-be-deleted key was
859 * present.
860 */
861SH_SCOPE bool
863{
864 uint32 hash = SH_HASH_KEY(tb, key);
867
868 while (true)
869 {
870 SH_ELEMENT_TYPE *entry = &tb->data[curelem];
871
872 if (entry->status == SH_STATUS_EMPTY)
873 return false;
874
875 if (entry->status == SH_STATUS_IN_USE &&
876 SH_COMPARE_KEYS(tb, hash, key, entry))
877 {
878 SH_ELEMENT_TYPE *lastentry = entry;
879
880 tb->members--;
881
882 /*
883 * Backward shift following elements till either an empty element
884 * or an element at its optimal position is encountered.
885 *
886 * While that sounds expensive, the average chain length is short,
887 * and deletions would otherwise require tombstones.
888 */
889 while (true)
890 {
894
896 curentry = &tb->data[curelem];
897
898 if (curentry->status != SH_STATUS_IN_USE)
899 {
900 lastentry->status = SH_STATUS_EMPTY;
901 break;
902 }
903
906
907 /* current is at optimal position, done */
908 if (curoptimal == curelem)
909 {
910 lastentry->status = SH_STATUS_EMPTY;
911 break;
912 }
913
914 /* shift */
916
918 }
919
920 return true;
921 }
922
923 /* TODO: return false; if distance too big */
924
926 }
927}
928
929/*
930 * Delete entry from hash table by entry pointer
931 */
932SH_SCOPE void
934{
935 SH_ELEMENT_TYPE *lastentry = entry;
936 uint32 hash = SH_ENTRY_HASH(tb, entry);
939
940 /* Calculate the index of 'entry' */
941 curelem = entry - &tb->data[0];
942
943 tb->members--;
944
945 /*
946 * Backward shift following elements till either an empty element or an
947 * element at its optimal position is encountered.
948 *
949 * While that sounds expensive, the average chain length is short, and
950 * deletions would otherwise require tombstones.
951 */
952 while (true)
953 {
957
959 curentry = &tb->data[curelem];
960
961 if (curentry->status != SH_STATUS_IN_USE)
962 {
963 lastentry->status = SH_STATUS_EMPTY;
964 break;
965 }
966
969
970 /* current is at optimal position, done */
971 if (curoptimal == curelem)
972 {
973 lastentry->status = SH_STATUS_EMPTY;
974 break;
975 }
976
977 /* shift */
979
981 }
982}
983
984/*
985 * Initialize iterator.
986 */
987SH_SCOPE void
989{
991
992 /*
993 * Search for the first empty element. As deletions during iterations are
994 * supported, we want to start/end at an element that cannot be affected
995 * by elements being shifted.
996 */
997 for (uint32 i = 0; i < tb->size; i++)
998 {
999 SH_ELEMENT_TYPE *entry = &tb->data[i];
1000
1001 if (entry->status != SH_STATUS_IN_USE)
1002 {
1003 startelem = i;
1004 break;
1005 }
1006 }
1007
1008 /* we should have found an empty element */
1010
1011 /*
1012 * Iterate backwards, that allows the current element to be deleted, even
1013 * if there are backward shifts
1014 */
1015 iter->cur = startelem;
1016 iter->end = iter->cur;
1017 iter->done = false;
1018}
1019
1020/*
1021 * Initialize iterator to a specific bucket. That's really only useful for
1022 * cases where callers are partially iterating over the hashspace, and that
1023 * iteration deletes and inserts elements based on visited entries. Doing that
1024 * repeatedly could lead to an unbalanced keyspace when always starting at the
1025 * same position.
1026 */
1027SH_SCOPE void
1029{
1030 /*
1031 * Iterate backwards, that allows the current element to be deleted, even
1032 * if there are backward shifts.
1033 */
1034 iter->cur = at & tb->sizemask; /* ensure at is within a valid range */
1035 iter->end = iter->cur;
1036 iter->done = false;
1037}
1038
1039/*
1040 * Iterate over all entries in the hash-table. Return the next occupied entry,
1041 * or NULL if done.
1042 *
1043 * During iteration the current entry in the hash table may be deleted,
1044 * without leading to elements being skipped or returned twice. Additionally
1045 * the rest of the table may be modified (i.e. there can be insertions or
1046 * deletions), but if so, there's neither a guarantee that all nodes are
1047 * visited at least once, nor a guarantee that a node is visited at most once.
1048 */
1051{
1052 /* validate sanity of the given iterator */
1053 Assert(iter->cur < tb->size);
1054 Assert(iter->end < tb->size);
1055
1056 while (!iter->done)
1057 {
1058 SH_ELEMENT_TYPE *elem;
1059
1060 elem = &tb->data[iter->cur];
1061
1062 /* next element in backward direction */
1063 iter->cur = (iter->cur - 1) & tb->sizemask;
1064
1065 if ((iter->cur & tb->sizemask) == (iter->end & tb->sizemask))
1066 iter->done = true;
1067 if (elem->status == SH_STATUS_IN_USE)
1068 {
1069 return elem;
1070 }
1071 }
1072
1073 return NULL;
1074}
1075
1076/*
1077 * Estimate the amount of space needed for a hashtable with nentries entries.
1078 * Return SIZE_MAX if that's too many entries.
1079 *
1080 * nentries is "double" because this is meant for use by the planner,
1081 * which typically works with double rowcount estimates. So we'd need to
1082 * clamp to integer somewhere and that might as well be here. We do expect
1083 * the value not to be NaN or negative, else the result will be garbage.
1084 */
1085SH_SCOPE size_t
1086SH_ESTIMATE_SPACE(double nentries)
1087{
1088 uint64 size;
1089 uint64 space;
1090
1091 /* scale request by SH_FILLFACTOR, as SH_CREATE does */
1092 nentries = nentries / SH_FILLFACTOR;
1093
1094 /* fail if we'd overrun SH_MAX_SIZE entries */
1095 if (nentries >= SH_MAX_SIZE)
1096 return SIZE_MAX;
1097
1098 /* should be safe to convert to uint64 */
1099 size = (uint64) nentries;
1100
1101 /* supporting zero sized hashes would complicate matters */
1102 size = Max(size, 2);
1103
1104 /* round up size to the next power of 2, that's how bucketing works */
1105 size = pg_nextpower2_64(size);
1106
1107 /* calculate space needed for ->data */
1108 space = ((uint64) sizeof(SH_ELEMENT_TYPE)) * size;
1109
1110 /* verify that allocation of ->data is possible on this platform */
1111 if (space >= SIZE_MAX / 2)
1112 return SIZE_MAX;
1113
1114 return (size_t) space + sizeof(SH_TYPE);
1115}
1116
1117/*
1118 * Report some statistics about the state of the hashtable. For
1119 * debugging/profiling purposes only.
1120 */
1121SH_SCOPE void
1123{
1126 double avg_chain_length;
1127 double fillfactor;
1128 uint32 i;
1129
1130 uint32 *collisions = (uint32 *) palloc0(tb->size * sizeof(uint32));
1133 double avg_collisions;
1134
1135 for (i = 0; i < tb->size; i++)
1136 {
1137 uint32 hash;
1139 uint32 dist;
1140 SH_ELEMENT_TYPE *elem;
1141
1142 elem = &tb->data[i];
1143
1144 if (elem->status != SH_STATUS_IN_USE)
1145 continue;
1146
1147 hash = SH_ENTRY_HASH(tb, elem);
1150
1151 if (dist > max_chain_length)
1154
1156 }
1157
1158 for (i = 0; i < tb->size; i++)
1159 {
1161
1162 if (curcoll == 0)
1163 continue;
1164
1165 /* single contained element is not a collision */
1166 curcoll--;
1168 if (curcoll > max_collisions)
1170 }
1171
1172 /* large enough to be worth freeing, even if just used for debugging */
1174
1175 if (tb->members > 0)
1176 {
1177 fillfactor = tb->members / ((double) tb->size);
1179 avg_collisions = ((double) total_collisions) / tb->members;
1180 }
1181 else
1182 {
1183 fillfactor = 0;
1184 avg_chain_length = 0;
1185 avg_collisions = 0;
1186 }
1187
1188 sh_log("size: " UINT64_FORMAT ", members: %u, filled: %f, total chain: %u, max chain: %u, avg chain: %f, total_collisions: %u, max_collisions: %u, avg_collisions: %f",
1191}
1192
1193#endif /* SH_DEFINE */
1194
1195
/* undefine external parameters, so next hash table can be defined */
#undef SH_PREFIX
#undef SH_KEY_TYPE
#undef SH_KEY
#undef SH_ELEMENT_TYPE
#undef SH_HASH_KEY
#undef SH_SCOPE
#undef SH_DECLARE
#undef SH_DEFINE
#undef SH_GET_HASH
#undef SH_STORE_HASH
#undef SH_USE_NONDEFAULT_ALLOCATOR
#undef SH_EQUAL

/* undefine locally declared macros */
#undef SH_MAKE_PREFIX
#undef SH_MAKE_NAME
#undef SH_MAKE_NAME_
#undef SH_FILLFACTOR
#undef SH_MAX_FILLFACTOR
#undef SH_GROW_MAX_DIB
#undef SH_GROW_MAX_MOVE
#undef SH_GROW_MIN_FILLFACTOR
#undef SH_MAX_SIZE

/* types */
#undef SH_TYPE
#undef SH_STATUS
#undef SH_STATUS_EMPTY
#undef SH_STATUS_IN_USE
#undef SH_ITERATOR

/* external function names */
#undef SH_CREATE
#undef SH_DESTROY
#undef SH_RESET
#undef SH_INSERT
#undef SH_INSERT_HASH
#undef SH_DELETE_ITEM
#undef SH_DELETE
#undef SH_LOOKUP
#undef SH_LOOKUP_HASH
#undef SH_GROW
#undef SH_START_ITERATE
#undef SH_START_ITERATE_AT
#undef SH_ITERATE
#undef SH_ALLOCATE
#undef SH_FREE
#undef SH_ESTIMATE_SPACE
#undef SH_STAT

/* internal function names */
#undef SH_COMPUTE_SIZE
#undef SH_UPDATE_PARAMETERS
#undef SH_COMPARE_KEYS
#undef SH_INITIAL_BUCKET
#undef SH_NEXT
#undef SH_PREV
#undef SH_DISTANCE_FROM_OPTIMAL
#undef SH_ENTRY_HASH
#undef SH_INSERT_HASH_INTERNAL
#undef SH_LOOKUP_HASH_INTERNAL
/*
 * (End of simplehash.h.  A trailing Doxygen cross-reference listing that was
 * fused into this extraction — definitions of Min/Max/Assert, mcxt.c
 * allocator symbols, and the SH_* name-macro index — was removed; it is not
 * part of the header itself.)
 */