PostgreSQL Source Code  git master
simplehash.h
Go to the documentation of this file.
1 /*
2  * simplehash.h
3  *
4  * When included this file generates a "templated" (by way of macros)
5  * open-addressing hash table implementation specialized to user-defined
6  * types.
7  *
8  * It's probably not worthwhile to generate such a specialized implementation
9  * for hash tables that aren't performance or space sensitive.
10  *
11  * Compared to dynahash, simplehash has the following benefits:
12  *
13  * - Due to the "templated" code generation, it has known structure sizes and no
14  * indirect function calls (which show up substantially in dynahash
15  * profiles). These features considerably increase speed for small
16  * entries.
17  * - Open addressing has better CPU cache behavior than dynahash's chained
18  * hashtables.
19  * - The generated interface is type-safe and easier to use than dynahash,
20  * though at the cost of more complex setup.
21  * - Allocates memory in a MemoryContext or another allocator with a
22  * malloc/free style interface (which isn't easily usable in a shared
23  * memory context)
24  * - Does not require the overhead of a separate memory context.
25  *
26  * Usage notes:
27  *
28  * To generate a hash-table and associated functions for a use case several
29  * macros have to be #define'ed before this file is included. Including
30  * the file #undef's all those, so a new hash table can be generated
31  * afterwards.
32  * The relevant parameters are:
33  * - SH_PREFIX - prefix for all symbol names generated. A prefix of 'foo'
34  * will result in hash table type 'foo_hash' and functions like
35  * 'foo_insert'/'foo_lookup' and so forth.
36  * - SH_ELEMENT_TYPE - type of the contained elements
37  * - SH_KEY_TYPE - type of the hashtable's key
38  * - SH_DECLARE - if defined function prototypes and type declarations are
39  * generated
40  * - SH_DEFINE - if defined function definitions are generated
41  * - SH_SCOPE - in which scope (e.g. extern, static inline) do function
42  * declarations reside
43  * - SH_RAW_ALLOCATOR - if defined, memory contexts are not used; instead,
44  * use this to allocate bytes. The allocator must zero the returned space.
45  * - SH_USE_NONDEFAULT_ALLOCATOR - if defined no element allocator functions
46  * are defined, so you can supply your own
47  * The following parameters are only relevant when SH_DEFINE is defined:
48  * - SH_KEY - name of the element in SH_ELEMENT_TYPE containing the hash key
49  * - SH_EQUAL(table, a, b) - compare two table keys
50  * - SH_HASH_KEY(table, key) - generate hash for the key
51  * - SH_STORE_HASH - if defined the hash is stored in the elements
52  * - SH_GET_HASH(tb, a) - return the field to store the hash in
53  *
54  * The element type is required to contain a "status" member that can store
55  * the range of values defined in the SH_STATUS enum.
56  *
57  * While SH_STORE_HASH (and subsequently SH_GET_HASH) are optional, because
58  * the hash table implementation needs to compare hashes to move elements
59  * (particularly when growing the hash), it's preferable, if possible, to
60  * store the element's hash in the element's data type. If the hash is so
61  * stored, the hash table will also compare hashes before calling SH_EQUAL
62  * when comparing two keys.
63  *
64  * For convenience the hash table create functions accept a void pointer
65  * that will be stored in the hash table type's member private_data. This
66  * allows callbacks to reference caller provided data.
67  *
68  * For examples of usage look at tidbitmap.c (file local definition) and
69  * execnodes.h/execGrouping.c (exposed declaration, file local
70  * implementation).
71  *
72  * Hash table design:
73  *
74  * The hash table design chosen is a variant of linear open-addressing. The
75  * reason for doing so is that linear addressing is CPU cache & pipeline
76  * friendly. The biggest disadvantages of simple linear addressing schemes
77  * are highly variable lookup times due to clustering, and deletions
78  * leaving a lot of tombstones around. To address these issues a variant
79  * of "robin hood" hashing is employed. Robin hood hashing optimizes
80  * chaining lengths by moving elements close to their optimal bucket
81  * ("rich" elements), out of the way if a to-be-inserted element is further
82  * away from its optimal position (i.e. it's "poor"). While that can make
83  * insertions slower, the average lookup performance is a lot better, and
84  * higher fill factors can be used in a still performant manner. To avoid
85  * tombstones - which normally solve the issue that a deleted node's
86  * presence is relevant to determine whether a lookup needs to continue
87  * looking or is done - buckets following a deleted element are shifted
88  * backwards, unless they're empty or already at their optimal position.
89  *
90  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
91  * Portions Copyright (c) 1994, Regents of the University of California
92  *
93  * src/include/lib/simplehash.h
94  */
95 
96 #include "port/pg_bitutils.h"
97 
98 /* helpers */
99 #define SH_MAKE_PREFIX(a) CppConcat(a,_)
100 #define SH_MAKE_NAME(name) SH_MAKE_NAME_(SH_MAKE_PREFIX(SH_PREFIX),name)
101 #define SH_MAKE_NAME_(a,b) CppConcat(a,b)
102 
103 /* name macros for: */
104 
105 /* type declarations */
106 #define SH_TYPE SH_MAKE_NAME(hash)
107 #define SH_STATUS SH_MAKE_NAME(status)
108 #define SH_STATUS_EMPTY SH_MAKE_NAME(SH_EMPTY)
109 #define SH_STATUS_IN_USE SH_MAKE_NAME(SH_IN_USE)
110 #define SH_ITERATOR SH_MAKE_NAME(iterator)
111 
112 /* function declarations */
113 #define SH_CREATE SH_MAKE_NAME(create)
114 #define SH_DESTROY SH_MAKE_NAME(destroy)
115 #define SH_RESET SH_MAKE_NAME(reset)
116 #define SH_INSERT SH_MAKE_NAME(insert)
117 #define SH_INSERT_HASH SH_MAKE_NAME(insert_hash)
118 #define SH_DELETE_ITEM SH_MAKE_NAME(delete_item)
119 #define SH_DELETE SH_MAKE_NAME(delete)
120 #define SH_LOOKUP SH_MAKE_NAME(lookup)
121 #define SH_LOOKUP_HASH SH_MAKE_NAME(lookup_hash)
122 #define SH_GROW SH_MAKE_NAME(grow)
123 #define SH_START_ITERATE SH_MAKE_NAME(start_iterate)
124 #define SH_START_ITERATE_AT SH_MAKE_NAME(start_iterate_at)
125 #define SH_ITERATE SH_MAKE_NAME(iterate)
126 #define SH_ALLOCATE SH_MAKE_NAME(allocate)
127 #define SH_FREE SH_MAKE_NAME(free)
128 #define SH_STAT SH_MAKE_NAME(stat)
129 
130 /* internal helper functions (no externally visible prototypes) */
131 #define SH_COMPUTE_PARAMETERS SH_MAKE_NAME(compute_parameters)
132 #define SH_NEXT SH_MAKE_NAME(next)
133 #define SH_PREV SH_MAKE_NAME(prev)
134 #define SH_DISTANCE_FROM_OPTIMAL SH_MAKE_NAME(distance)
135 #define SH_INITIAL_BUCKET SH_MAKE_NAME(initial_bucket)
136 #define SH_ENTRY_HASH SH_MAKE_NAME(entry_hash)
137 #define SH_INSERT_HASH_INTERNAL SH_MAKE_NAME(insert_hash_internal)
138 #define SH_LOOKUP_HASH_INTERNAL SH_MAKE_NAME(lookup_hash_internal)
139 
140 /* generate forward declarations necessary to use the hash table */
141 #ifdef SH_DECLARE
142 
143 /* type definitions */
144 typedef struct SH_TYPE
145 {
146  /*
147  * Size of data / bucket array, 64 bits to handle UINT32_MAX sized hash
148  * tables. Note that the maximum number of elements is lower
149  * (SH_MAX_FILLFACTOR)
150  */
151  uint64 size;
152 
153  /* how many elements have valid contents */
155 
156  /* mask for bucket and size calculations, based on size */
158 
159  /* boundary after which to grow hashtable */
161 
162  /* hash buckets */
164 
165 #ifndef SH_RAW_ALLOCATOR
166  /* memory context to use for allocations */
168 #endif
169 
170  /* user defined data, useful for callbacks */
173 
/*
 * Bucket states stored in each element's "status" member.  Empty must be
 * zero: freshly allocated and reset bucket arrays are zero-filled and are
 * expected to consist of empty buckets.
 */
typedef enum SH_STATUS
{
	SH_STATUS_EMPTY = 0x00,
	SH_STATUS_IN_USE = 0x01
} SH_STATUS;
179 
180 typedef struct SH_ITERATOR
181 {
182  uint32 cur; /* current element */
184  bool done; /* iterator exhausted? */
186 
187 /* externally visible function prototypes */
188 #ifdef SH_RAW_ALLOCATOR
189 /* <prefix>_hash <prefix>_create(uint32 nelements, void *private_data) */
190 SH_SCOPE SH_TYPE *SH_CREATE(uint32 nelements, void *private_data);
191 #else
192 /*
193  * <prefix>_hash <prefix>_create(MemoryContext ctx, uint32 nelements,
194  * void *private_data)
195  */
197  void *private_data);
198 #endif
199 
200 /* void <prefix>_destroy(<prefix>_hash *tb) */
202 
203 /* void <prefix>_reset(<prefix>_hash *tb) */
205 
206 /* void <prefix>_grow(<prefix>_hash *tb, uint64 newsize) */
207 SH_SCOPE void SH_GROW(SH_TYPE * tb, uint64 newsize);
208 
209 /* <element> *<prefix>_insert(<prefix>_hash *tb, <key> key, bool *found) */
211 
212 /*
213  * <element> *<prefix>_insert_hash(<prefix>_hash *tb, <key> key, uint32 hash,
214  * bool *found)
215  */
217  uint32 hash, bool *found);
218 
219 /* <element> *<prefix>_lookup(<prefix>_hash *tb, <key> key) */
221 
222 /* <element> *<prefix>_lookup_hash(<prefix>_hash *tb, <key> key, uint32 hash) */
224  uint32 hash);
225 
226 /* void <prefix>_delete_item(<prefix>_hash *tb, <element> *entry) */
228 
229 /* bool <prefix>_delete(<prefix>_hash *tb, <key> key) */
231 
232 /* void <prefix>_start_iterate(<prefix>_hash *tb, <prefix>_iterator *iter) */
234 
235 /*
236  * void <prefix>_start_iterate_at(<prefix>_hash *tb, <prefix>_iterator *iter,
237  * uint32 at)
238  */
240 
241 /* <element> *<prefix>_iterate(<prefix>_hash *tb, <prefix>_iterator *iter) */
243 
244 /* void <prefix>_stat(<prefix>_hash *tb */
246 
247 #endif /* SH_DECLARE */
248 
249 
250 /* generate implementation of the hash table */
251 #ifdef SH_DEFINE
252 
253 #ifndef SH_RAW_ALLOCATOR
254 #include "utils/memutils.h"
255 #endif
256 
257 /* max data array size, we allow up to PG_UINT32_MAX buckets, including 0 */
258 #define SH_MAX_SIZE (((uint64) PG_UINT32_MAX) + 1)
259 
260 /* normal fillfactor, unless already close to maximum */
261 #ifndef SH_FILLFACTOR
262 #define SH_FILLFACTOR (0.9)
263 #endif
264 /* increase fillfactor if we otherwise would error out */
265 #define SH_MAX_FILLFACTOR (0.98)
266 /* grow if the distance between actual and optimal location is bigger than */
267 #ifndef SH_GROW_MAX_DIB
268 #define SH_GROW_MAX_DIB 25
269 #endif
270 /* grow if more than this many elements have to be moved when inserting */
271 #ifndef SH_GROW_MAX_MOVE
272 #define SH_GROW_MAX_MOVE 150
273 #endif
274 #ifndef SH_GROW_MIN_FILLFACTOR
275 /* but do not grow due to SH_GROW_MAX_* if below */
276 #define SH_GROW_MIN_FILLFACTOR 0.1
277 #endif
278 
/*
 * SH_COMPARE_KEYS(tb, ahash, akey, b) - does element 'b' hold key 'akey'?
 *
 * When SH_STORE_HASH is defined the stored hash is compared first, so
 * SH_EQUAL is only evaluated for elements whose hash matches 'ahash'.
 * The 'ahash' and 'b' arguments are parenthesized so that arbitrary
 * expressions (e.g. "array + i") can be passed safely.
 */
#ifdef SH_STORE_HASH
#define SH_COMPARE_KEYS(tb, ahash, akey, b) \
	((ahash) == SH_GET_HASH(tb, b) && SH_EQUAL(tb, (b)->SH_KEY, akey))
#else
#define SH_COMPARE_KEYS(tb, ahash, akey, b) \
	(SH_EQUAL(tb, (b)->SH_KEY, akey))
#endif
284 
285 /*
286  * Wrap the following definitions in include guards, to avoid multiple
287  * definition errors if this header is included more than once. The rest of
288  * the file deliberately has no include guards, because it can be included
289  * with different parameters to define functions and types with non-colliding
290  * names.
291  */
292 #ifndef SIMPLEHASH_H
293 #define SIMPLEHASH_H
294 
295 #ifdef FRONTEND
296 #define sh_error(...) pg_fatal(__VA_ARGS__)
297 #define sh_log(...) pg_log_info(__VA_ARGS__)
298 #else
299 #define sh_error(...) elog(ERROR, __VA_ARGS__)
300 #define sh_log(...) elog(LOG, __VA_ARGS__)
301 #endif
302 
303 #endif
304 
305 /*
306  * Compute sizing parameters for hashtable. Called when creating and growing
307  * the hashtable.
308  */
309 static inline void
310 SH_COMPUTE_PARAMETERS(SH_TYPE * tb, uint64 newsize)
311 {
312  uint64 size;
313 
314  /* supporting zero sized hashes would complicate matters */
315  size = Max(newsize, 2);
316 
317  /* round up size to the next power of 2, that's how bucketing works */
318  size = pg_nextpower2_64(size);
319  Assert(size <= SH_MAX_SIZE);
320 
321  /*
322  * Verify that allocation of ->data is possible on this platform, without
323  * overflowing Size.
324  */
325  if (unlikely((((uint64) sizeof(SH_ELEMENT_TYPE)) * size) >= SIZE_MAX / 2))
326  sh_error("hash table too large");
327 
328  /* now set size */
329  tb->size = size;
330  tb->sizemask = (uint32) (size - 1);
331 
332  /*
333  * Compute the next threshold at which we need to grow the hash table
334  * again.
335  */
336  if (tb->size == SH_MAX_SIZE)
337  tb->grow_threshold = ((double) tb->size) * SH_MAX_FILLFACTOR;
338  else
339  tb->grow_threshold = ((double) tb->size) * SH_FILLFACTOR;
340 }
341 
342 /* return the optimal bucket for the hash */
343 static inline uint32
345 {
346  return hash & tb->sizemask;
347 }
348 
349 /* return next bucket after the current, handling wraparound */
350 static inline uint32
351 SH_NEXT(SH_TYPE * tb, uint32 curelem, uint32 startelem)
352 {
353  curelem = (curelem + 1) & tb->sizemask;
354 
355  Assert(curelem != startelem);
356 
357  return curelem;
358 }
359 
360 /* return bucket before the current, handling wraparound */
361 static inline uint32
362 SH_PREV(SH_TYPE * tb, uint32 curelem, uint32 startelem)
363 {
364  curelem = (curelem - 1) & tb->sizemask;
365 
366  Assert(curelem != startelem);
367 
368  return curelem;
369 }
370 
371 /* return distance between bucket and its optimal position */
372 static inline uint32
373 SH_DISTANCE_FROM_OPTIMAL(SH_TYPE * tb, uint32 optimal, uint32 bucket)
374 {
375  if (optimal <= bucket)
376  return bucket - optimal;
377  else
378  return (tb->size + bucket) - optimal;
379 }
380 
381 static inline uint32
383 {
384 #ifdef SH_STORE_HASH
385  return SH_GET_HASH(tb, entry);
386 #else
387  return SH_HASH_KEY(tb, entry->SH_KEY);
388 #endif
389 }
390 
391 /* default memory allocator function */
392 static inline void *SH_ALLOCATE(SH_TYPE * type, Size size);
393 static inline void SH_FREE(SH_TYPE * type, void *pointer);
394 
395 #ifndef SH_USE_NONDEFAULT_ALLOCATOR
396 
397 /* default memory allocator function */
398 static inline void *
399 SH_ALLOCATE(SH_TYPE * type, Size size)
400 {
401 #ifdef SH_RAW_ALLOCATOR
402  return SH_RAW_ALLOCATOR(size);
403 #else
404  return MemoryContextAllocExtended(type->ctx, size,
406 #endif
407 }
408 
409 /* default memory free function */
410 static inline void
411 SH_FREE(SH_TYPE * type, void *pointer)
412 {
413  pfree(pointer);
414 }
415 
416 #endif
417 
418 /*
419  * Create a hash table with enough space for `nelements` distinct members.
420  * Memory for the hash table is allocated from the passed-in context. If
421  * desired, the array of elements can be allocated using a passed-in allocator;
422  * this could be useful in order to place the array of elements in a shared
423  * memory, or in a context that will outlive the rest of the hash table.
424  * Memory other than for the array of elements will still be allocated from
425  * the passed-in context.
426  */
427 #ifdef SH_RAW_ALLOCATOR
429 SH_CREATE(uint32 nelements, void *private_data)
430 #else
432 SH_CREATE(MemoryContext ctx, uint32 nelements, void *private_data)
433 #endif
434 {
435  SH_TYPE *tb;
436  uint64 size;
437 
438 #ifdef SH_RAW_ALLOCATOR
439  tb = (SH_TYPE *) SH_RAW_ALLOCATOR(sizeof(SH_TYPE));
440 #else
441  tb = (SH_TYPE *) MemoryContextAllocZero(ctx, sizeof(SH_TYPE));
442  tb->ctx = ctx;
443 #endif
444  tb->private_data = private_data;
445 
446  /* increase nelements by fillfactor, want to store nelements elements */
447  size = Min((double) SH_MAX_SIZE, ((double) nelements) / SH_FILLFACTOR);
448 
449  SH_COMPUTE_PARAMETERS(tb, size);
450 
451  tb->data = (SH_ELEMENT_TYPE *) SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * tb->size);
452 
453  return tb;
454 }
455 
456 /* destroy a previously created hash table */
457 SH_SCOPE void
458 SH_DESTROY(SH_TYPE * tb)
459 {
460  SH_FREE(tb, tb->data);
461  pfree(tb);
462 }
463 
464 /* reset the contents of a previously created hash table */
465 SH_SCOPE void
466 SH_RESET(SH_TYPE * tb)
467 {
468  memset(tb->data, 0, sizeof(SH_ELEMENT_TYPE) * tb->size);
469  tb->members = 0;
470 }
471 
472 /*
473  * Grow a hash table to at least `newsize` buckets.
474  *
475  * Usually this will automatically be called by insertions/deletions, when
476  * necessary. But resizing to the exact input size can be advantageous
477  * performance-wise, when known at some point.
478  */
479 SH_SCOPE void
480 SH_GROW(SH_TYPE * tb, uint64 newsize)
481 {
482  uint64 oldsize = tb->size;
483  SH_ELEMENT_TYPE *olddata = tb->data;
484  SH_ELEMENT_TYPE *newdata;
485  uint32 i;
486  uint32 startelem = 0;
487  uint32 copyelem;
488 
489  Assert(oldsize == pg_nextpower2_64(oldsize));
490  Assert(oldsize != SH_MAX_SIZE);
491  Assert(oldsize < newsize);
492 
493  /* compute parameters for new table */
494  SH_COMPUTE_PARAMETERS(tb, newsize);
495 
496  tb->data = (SH_ELEMENT_TYPE *) SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * tb->size);
497 
498  newdata = tb->data;
499 
500  /*
501  * Copy entries from the old data to newdata. We theoretically could use
502  * SH_INSERT here, to avoid code duplication, but that's more general than
503  * we need. We neither want tb->members increased, nor do we need to do
504  * deal with deleted elements, nor do we need to compare keys. So a
505  * special-cased implementation is lot faster. As resizing can be time
506  * consuming and frequent, that's worthwhile to optimize.
507  *
508  * To be able to simply move entries over, we have to start not at the
509  * first bucket (i.e olddata[0]), but find the first bucket that's either
510  * empty, or is occupied by an entry at its optimal position. Such a
511  * bucket has to exist in any table with a load factor under 1, as not all
512  * buckets are occupied, i.e. there always has to be an empty bucket. By
513  * starting at such a bucket we can move the entries to the larger table,
514  * without having to deal with conflicts.
515  */
516 
517  /* search for the first element in the hash that's not wrapped around */
518  for (i = 0; i < oldsize; i++)
519  {
520  SH_ELEMENT_TYPE *oldentry = &olddata[i];
521  uint32 hash;
522  uint32 optimal;
523 
524  if (oldentry->status != SH_STATUS_IN_USE)
525  {
526  startelem = i;
527  break;
528  }
529 
530  hash = SH_ENTRY_HASH(tb, oldentry);
531  optimal = SH_INITIAL_BUCKET(tb, hash);
532 
533  if (optimal == i)
534  {
535  startelem = i;
536  break;
537  }
538  }
539 
540  /* and copy all elements in the old table */
541  copyelem = startelem;
542  for (i = 0; i < oldsize; i++)
543  {
544  SH_ELEMENT_TYPE *oldentry = &olddata[copyelem];
545 
546  if (oldentry->status == SH_STATUS_IN_USE)
547  {
548  uint32 hash;
549  uint32 startelem2;
550  uint32 curelem;
551  SH_ELEMENT_TYPE *newentry;
552 
553  hash = SH_ENTRY_HASH(tb, oldentry);
554  startelem2 = SH_INITIAL_BUCKET(tb, hash);
555  curelem = startelem2;
556 
557  /* find empty element to put data into */
558  while (true)
559  {
560  newentry = &newdata[curelem];
561 
562  if (newentry->status == SH_STATUS_EMPTY)
563  {
564  break;
565  }
566 
567  curelem = SH_NEXT(tb, curelem, startelem2);
568  }
569 
570  /* copy entry to new slot */
571  memcpy(newentry, oldentry, sizeof(SH_ELEMENT_TYPE));
572  }
573 
574  /* can't use SH_NEXT here, would use new size */
575  copyelem++;
576  if (copyelem >= oldsize)
577  {
578  copyelem = 0;
579  }
580  }
581 
582  SH_FREE(tb, olddata);
583 }
584 
585 /*
586  * This is a separate static inline function, so it can reliably be inlined
587  * into its wrapper functions even if SH_SCOPE is extern.
588  */
589 static inline SH_ELEMENT_TYPE *
591 {
592  uint32 startelem;
593  uint32 curelem;
595  uint32 insertdist;
596 
597 restart:
598  insertdist = 0;
599 
600  /*
601  * We do the grow check even if the key is actually present, to avoid
602  * doing the check inside the loop. This also lets us avoid having to
603  * re-find our position in the hashtable after resizing.
604  *
605  * Note that this also reached when resizing the table due to
606  * SH_GROW_MAX_DIB / SH_GROW_MAX_MOVE.
607  */
608  if (unlikely(tb->members >= tb->grow_threshold))
609  {
610  if (unlikely(tb->size == SH_MAX_SIZE))
611  sh_error("hash table size exceeded");
612 
613  /*
614  * When optimizing, it can be very useful to print these out.
615  */
616  /* SH_STAT(tb); */
617  SH_GROW(tb, tb->size * 2);
618  /* SH_STAT(tb); */
619  }
620 
621  /* perform insert, start bucket search at optimal location */
622  data = tb->data;
623  startelem = SH_INITIAL_BUCKET(tb, hash);
624  curelem = startelem;
625  while (true)
626  {
627  uint32 curdist;
628  uint32 curhash;
629  uint32 curoptimal;
630  SH_ELEMENT_TYPE *entry = &data[curelem];
631 
632  /* any empty bucket can directly be used */
633  if (entry->status == SH_STATUS_EMPTY)
634  {
635  tb->members++;
636  entry->SH_KEY = key;
637 #ifdef SH_STORE_HASH
638  SH_GET_HASH(tb, entry) = hash;
639 #endif
640  entry->status = SH_STATUS_IN_USE;
641  *found = false;
642  return entry;
643  }
644 
645  /*
646  * If the bucket is not empty, we either found a match (in which case
647  * we're done), or we have to decide whether to skip over or move the
648  * colliding entry. When the colliding element's distance to its
649  * optimal position is smaller than the to-be-inserted entry's, we
650  * shift the colliding entry (and its followers) forward by one.
651  */
652 
653  if (SH_COMPARE_KEYS(tb, hash, key, entry))
654  {
655  Assert(entry->status == SH_STATUS_IN_USE);
656  *found = true;
657  return entry;
658  }
659 
660  curhash = SH_ENTRY_HASH(tb, entry);
661  curoptimal = SH_INITIAL_BUCKET(tb, curhash);
662  curdist = SH_DISTANCE_FROM_OPTIMAL(tb, curoptimal, curelem);
663 
664  if (insertdist > curdist)
665  {
666  SH_ELEMENT_TYPE *lastentry = entry;
667  uint32 emptyelem = curelem;
668  uint32 moveelem;
669  int32 emptydist = 0;
670 
671  /* find next empty bucket */
672  while (true)
673  {
674  SH_ELEMENT_TYPE *emptyentry;
675 
676  emptyelem = SH_NEXT(tb, emptyelem, startelem);
677  emptyentry = &data[emptyelem];
678 
679  if (emptyentry->status == SH_STATUS_EMPTY)
680  {
681  lastentry = emptyentry;
682  break;
683  }
684 
685  /*
686  * To avoid negative consequences from overly imbalanced
687  * hashtables, grow the hashtable if collisions would require
688  * us to move a lot of entries. The most likely cause of such
689  * imbalance is filling a (currently) small table, from a
690  * currently big one, in hash-table order. Don't grow if the
691  * hashtable would be too empty, to prevent quick space
692  * explosion for some weird edge cases.
693  */
694  if (unlikely(++emptydist > SH_GROW_MAX_MOVE) &&
695  ((double) tb->members / tb->size) >= SH_GROW_MIN_FILLFACTOR)
696  {
697  tb->grow_threshold = 0;
698  goto restart;
699  }
700  }
701 
702  /* shift forward, starting at last occupied element */
703 
704  /*
705  * TODO: This could be optimized to be one memcpy in many cases,
706  * excepting wrapping around at the end of ->data. Hasn't shown up
707  * in profiles so far though.
708  */
709  moveelem = emptyelem;
710  while (moveelem != curelem)
711  {
712  SH_ELEMENT_TYPE *moveentry;
713 
714  moveelem = SH_PREV(tb, moveelem, startelem);
715  moveentry = &data[moveelem];
716 
717  memcpy(lastentry, moveentry, sizeof(SH_ELEMENT_TYPE));
718  lastentry = moveentry;
719  }
720 
721  /* and fill the now empty spot */
722  tb->members++;
723 
724  entry->SH_KEY = key;
725 #ifdef SH_STORE_HASH
726  SH_GET_HASH(tb, entry) = hash;
727 #endif
728  entry->status = SH_STATUS_IN_USE;
729  *found = false;
730  return entry;
731  }
732 
733  curelem = SH_NEXT(tb, curelem, startelem);
734  insertdist++;
735 
736  /*
737  * To avoid negative consequences from overly imbalanced hashtables,
738  * grow the hashtable if collisions lead to large runs. The most
739  * likely cause of such imbalance is filling a (currently) small
740  * table, from a currently big one, in hash-table order. Don't grow
741  * if the hashtable would be too empty, to prevent quick space
742  * explosion for some weird edge cases.
743  */
744  if (unlikely(insertdist > SH_GROW_MAX_DIB) &&
745  ((double) tb->members / tb->size) >= SH_GROW_MIN_FILLFACTOR)
746  {
747  tb->grow_threshold = 0;
748  goto restart;
749  }
750  }
751 }
752 
753 /*
754  * Insert the key key into the hash-table, set *found to true if the key
755  * already exists, false otherwise. Returns the hash-table entry in either
756  * case.
757  */
759 SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found)
760 {
761  uint32 hash = SH_HASH_KEY(tb, key);
762 
763  return SH_INSERT_HASH_INTERNAL(tb, key, hash, found);
764 }
765 
766 /*
767  * Insert the key key into the hash-table using an already-calculated
768  * hash. Set *found to true if the key already exists, false
769  * otherwise. Returns the hash-table entry in either case.
770  */
772 SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key, uint32 hash, bool *found)
773 {
774  return SH_INSERT_HASH_INTERNAL(tb, key, hash, found);
775 }
776 
777 /*
778  * This is a separate static inline function, so it can reliably be inlined
779  * into its wrapper functions even if SH_SCOPE is extern.
780  */
781 static inline SH_ELEMENT_TYPE *
783 {
784  const uint32 startelem = SH_INITIAL_BUCKET(tb, hash);
785  uint32 curelem = startelem;
786 
787  while (true)
788  {
789  SH_ELEMENT_TYPE *entry = &tb->data[curelem];
790 
791  if (entry->status == SH_STATUS_EMPTY)
792  {
793  return NULL;
794  }
795 
796  Assert(entry->status == SH_STATUS_IN_USE);
797 
798  if (SH_COMPARE_KEYS(tb, hash, key, entry))
799  return entry;
800 
801  /*
802  * TODO: we could stop search based on distance. If the current
803  * buckets's distance-from-optimal is smaller than what we've skipped
804  * already, the entry doesn't exist. Probably only do so if
805  * SH_STORE_HASH is defined, to avoid re-computing hashes?
806  */
807 
808  curelem = SH_NEXT(tb, curelem, startelem);
809  }
810 }
811 
812 /*
813  * Lookup entry in hash table. Returns NULL if key not present.
814  */
817 {
818  uint32 hash = SH_HASH_KEY(tb, key);
819 
820  return SH_LOOKUP_HASH_INTERNAL(tb, key, hash);
821 }
822 
823 /*
824  * Lookup entry in hash table using an already-calculated hash.
825  *
826  * Returns NULL if key not present.
827  */
830 {
831  return SH_LOOKUP_HASH_INTERNAL(tb, key, hash);
832 }
833 
834 /*
835  * Delete entry from hash table by key. Returns whether to-be-deleted key was
836  * present.
837  */
838 SH_SCOPE bool
840 {
841  uint32 hash = SH_HASH_KEY(tb, key);
842  uint32 startelem = SH_INITIAL_BUCKET(tb, hash);
843  uint32 curelem = startelem;
844 
845  while (true)
846  {
847  SH_ELEMENT_TYPE *entry = &tb->data[curelem];
848 
849  if (entry->status == SH_STATUS_EMPTY)
850  return false;
851 
852  if (entry->status == SH_STATUS_IN_USE &&
853  SH_COMPARE_KEYS(tb, hash, key, entry))
854  {
855  SH_ELEMENT_TYPE *lastentry = entry;
856 
857  tb->members--;
858 
859  /*
860  * Backward shift following elements till either an empty element
861  * or an element at its optimal position is encountered.
862  *
863  * While that sounds expensive, the average chain length is short,
864  * and deletions would otherwise require tombstones.
865  */
866  while (true)
867  {
868  SH_ELEMENT_TYPE *curentry;
869  uint32 curhash;
870  uint32 curoptimal;
871 
872  curelem = SH_NEXT(tb, curelem, startelem);
873  curentry = &tb->data[curelem];
874 
875  if (curentry->status != SH_STATUS_IN_USE)
876  {
877  lastentry->status = SH_STATUS_EMPTY;
878  break;
879  }
880 
881  curhash = SH_ENTRY_HASH(tb, curentry);
882  curoptimal = SH_INITIAL_BUCKET(tb, curhash);
883 
884  /* current is at optimal position, done */
885  if (curoptimal == curelem)
886  {
887  lastentry->status = SH_STATUS_EMPTY;
888  break;
889  }
890 
891  /* shift */
892  memcpy(lastentry, curentry, sizeof(SH_ELEMENT_TYPE));
893 
894  lastentry = curentry;
895  }
896 
897  return true;
898  }
899 
900  /* TODO: return false; if distance too big */
901 
902  curelem = SH_NEXT(tb, curelem, startelem);
903  }
904 }
905 
906 /*
907  * Delete entry from hash table by entry pointer
908  */
909 SH_SCOPE void
911 {
912  SH_ELEMENT_TYPE *lastentry = entry;
913  uint32 hash = SH_ENTRY_HASH(tb, entry);
914  uint32 startelem = SH_INITIAL_BUCKET(tb, hash);
915  uint32 curelem;
916 
917  /* Calculate the index of 'entry' */
918  curelem = entry - &tb->data[0];
919 
920  tb->members--;
921 
922  /*
923  * Backward shift following elements till either an empty element or an
924  * element at its optimal position is encountered.
925  *
926  * While that sounds expensive, the average chain length is short, and
927  * deletions would otherwise require tombstones.
928  */
929  while (true)
930  {
931  SH_ELEMENT_TYPE *curentry;
932  uint32 curhash;
933  uint32 curoptimal;
934 
935  curelem = SH_NEXT(tb, curelem, startelem);
936  curentry = &tb->data[curelem];
937 
938  if (curentry->status != SH_STATUS_IN_USE)
939  {
940  lastentry->status = SH_STATUS_EMPTY;
941  break;
942  }
943 
944  curhash = SH_ENTRY_HASH(tb, curentry);
945  curoptimal = SH_INITIAL_BUCKET(tb, curhash);
946 
947  /* current is at optimal position, done */
948  if (curoptimal == curelem)
949  {
950  lastentry->status = SH_STATUS_EMPTY;
951  break;
952  }
953 
954  /* shift */
955  memcpy(lastentry, curentry, sizeof(SH_ELEMENT_TYPE));
956 
957  lastentry = curentry;
958  }
959 }
960 
961 /*
962  * Initialize iterator.
963  */
964 SH_SCOPE void
966 {
967  uint64 startelem = PG_UINT64_MAX;
968 
969  /*
970  * Search for the first empty element. As deletions during iterations are
971  * supported, we want to start/end at an element that cannot be affected
972  * by elements being shifted.
973  */
974  for (uint32 i = 0; i < tb->size; i++)
975  {
976  SH_ELEMENT_TYPE *entry = &tb->data[i];
977 
978  if (entry->status != SH_STATUS_IN_USE)
979  {
980  startelem = i;
981  break;
982  }
983  }
984 
985  /* we should have found an empty element */
986  Assert(startelem < SH_MAX_SIZE);
987 
988  /*
989  * Iterate backwards, that allows the current element to be deleted, even
990  * if there are backward shifts
991  */
992  iter->cur = startelem;
993  iter->end = iter->cur;
994  iter->done = false;
995 }
996 
997 /*
998  * Initialize iterator to a specific bucket. That's really only useful for
999  * cases where callers are partially iterating over the hashspace, and that
1000  * iteration deletes and inserts elements based on visited entries. Doing that
1001  * repeatedly could lead to an unbalanced keyspace when always starting at the
1002  * same position.
1003  */
1004 SH_SCOPE void
1006 {
1007  /*
1008  * Iterate backwards, that allows the current element to be deleted, even
1009  * if there are backward shifts.
1010  */
1011  iter->cur = at & tb->sizemask; /* ensure at is within a valid range */
1012  iter->end = iter->cur;
1013  iter->done = false;
1014 }
1015 
1016 /*
1017  * Iterate over all entries in the hash-table. Return the next occupied entry,
1018  * or NULL if done.
1019  *
1020  * During iteration the current entry in the hash table may be deleted,
1021  * without leading to elements being skipped or returned twice. Additionally
1022  * the rest of the table may be modified (i.e. there can be insertions or
1023  * deletions), but if so, there's neither a guarantee that all nodes are
1024  * visited at least once, nor a guarantee that a node is visited at most once.
1025  */
1027 SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter)
1028 {
1029  while (!iter->done)
1030  {
1031  SH_ELEMENT_TYPE *elem;
1032 
1033  elem = &tb->data[iter->cur];
1034 
1035  /* next element in backward direction */
1036  iter->cur = (iter->cur - 1) & tb->sizemask;
1037 
1038  if ((iter->cur & tb->sizemask) == (iter->end & tb->sizemask))
1039  iter->done = true;
1040  if (elem->status == SH_STATUS_IN_USE)
1041  {
1042  return elem;
1043  }
1044  }
1045 
1046  return NULL;
1047 }
1048 
1049 /*
1050  * Report some statistics about the state of the hashtable. For
1051  * debugging/profiling purposes only.
1052  */
1053 SH_SCOPE void
1054 SH_STAT(SH_TYPE * tb)
1055 {
1056  uint32 max_chain_length = 0;
1057  uint32 total_chain_length = 0;
1058  double avg_chain_length;
1059  double fillfactor;
1060  uint32 i;
1061 
1062  uint32 *collisions = (uint32 *) palloc0(tb->size * sizeof(uint32));
1063  uint32 total_collisions = 0;
1064  uint32 max_collisions = 0;
1065  double avg_collisions;
1066 
1067  for (i = 0; i < tb->size; i++)
1068  {
1069  uint32 hash;
1070  uint32 optimal;
1071  uint32 dist;
1072  SH_ELEMENT_TYPE *elem;
1073 
1074  elem = &tb->data[i];
1075 
1076  if (elem->status != SH_STATUS_IN_USE)
1077  continue;
1078 
1079  hash = SH_ENTRY_HASH(tb, elem);
1080  optimal = SH_INITIAL_BUCKET(tb, hash);
1081  dist = SH_DISTANCE_FROM_OPTIMAL(tb, optimal, i);
1082 
1083  if (dist > max_chain_length)
1084  max_chain_length = dist;
1085  total_chain_length += dist;
1086 
1087  collisions[optimal]++;
1088  }
1089 
1090  for (i = 0; i < tb->size; i++)
1091  {
1092  uint32 curcoll = collisions[i];
1093 
1094  if (curcoll == 0)
1095  continue;
1096 
1097  /* single contained element is not a collision */
1098  curcoll--;
1099  total_collisions += curcoll;
1100  if (curcoll > max_collisions)
1101  max_collisions = curcoll;
1102  }
1103 
1104  if (tb->members > 0)
1105  {
1106  fillfactor = tb->members / ((double) tb->size);
1107  avg_chain_length = ((double) total_chain_length) / tb->members;
1108  avg_collisions = ((double) total_collisions) / tb->members;
1109  }
1110  else
1111  {
1112  fillfactor = 0;
1113  avg_chain_length = 0;
1114  avg_collisions = 0;
1115  }
1116 
1117  sh_log("size: " UINT64_FORMAT ", members: %u, filled: %f, total chain: %u, max chain: %u, avg chain: %f, total_collisions: %u, max_collisions: %u, avg_collisions: %f",
1118  tb->size, tb->members, fillfactor, total_chain_length, max_chain_length, avg_chain_length,
1119  total_collisions, max_collisions, avg_collisions);
1120 }
1121 
1122 #endif /* SH_DEFINE */
1123 
1124 
1125 /* undefine external parameters, so next hash table can be defined */
1126 #undef SH_PREFIX
1127 #undef SH_KEY_TYPE
1128 #undef SH_KEY
1129 #undef SH_ELEMENT_TYPE
1130 #undef SH_HASH_KEY
1131 #undef SH_SCOPE
1132 #undef SH_DECLARE
1133 #undef SH_DEFINE
1134 #undef SH_GET_HASH
1135 #undef SH_STORE_HASH
1136 #undef SH_USE_NONDEFAULT_ALLOCATOR
1137 #undef SH_EQUAL
1138 
1139 /* undefine locally declared macros */
1140 #undef SH_MAKE_PREFIX
1141 #undef SH_MAKE_NAME
1142 #undef SH_MAKE_NAME_
1143 #undef SH_FILLFACTOR
1144 #undef SH_MAX_FILLFACTOR
1145 #undef SH_GROW_MAX_DIB
1146 #undef SH_GROW_MAX_MOVE
1147 #undef SH_GROW_MIN_FILLFACTOR
1148 #undef SH_MAX_SIZE
1149 
1150 /* types */
1151 #undef SH_TYPE
1152 #undef SH_STATUS
1153 #undef SH_STATUS_EMPTY
1154 #undef SH_STATUS_IN_USE
1155 #undef SH_ITERATOR
1156 
1157 /* external function names */
1158 #undef SH_CREATE
1159 #undef SH_DESTROY
1160 #undef SH_RESET
1161 #undef SH_INSERT
1162 #undef SH_INSERT_HASH
1163 #undef SH_DELETE_ITEM
1164 #undef SH_DELETE
1165 #undef SH_LOOKUP
1166 #undef SH_LOOKUP_HASH
1167 #undef SH_GROW
1168 #undef SH_START_ITERATE
1169 #undef SH_START_ITERATE_AT
1170 #undef SH_ITERATE
1171 #undef SH_ALLOCATE
1172 #undef SH_FREE
1173 #undef SH_STAT
1174 
1175 /* internal function names */
1176 #undef SH_COMPUTE_PARAMETERS
1177 #undef SH_COMPARE_KEYS
1178 #undef SH_INITIAL_BUCKET
1179 #undef SH_NEXT
1180 #undef SH_PREV
1181 #undef SH_DISTANCE_FROM_OPTIMAL
1182 #undef SH_ENTRY_HASH
1183 #undef SH_INSERT_HASH_INTERNAL
1184 #undef SH_LOOKUP_HASH_INTERNAL
#define SH_RAW_ALLOCATOR
Definition: common.c:73
unsigned int uint32
Definition: c.h:495
#define Min(x, y)
Definition: c.h:993
signed int int32
Definition: c.h:483
#define Max(x, y)
Definition: c.h:987
#define UINT64_FORMAT
Definition: c.h:538
#define unlikely(x)
Definition: c.h:300
#define PG_UINT64_MAX
Definition: c.h:582
size_t Size
Definition: c.h:594
#define SH_HASH_KEY(tb, key)
#define SH_GET_HASH(tb, a)
#define SH_ELEMENT_TYPE
#define SH_KEY_TYPE
#define SH_SCOPE
#define MCXT_ALLOC_ZERO
Definition: fe_memutils.h:18
#define MCXT_ALLOC_HUGE
Definition: fe_memutils.h:16
int i
Definition: isn.c:73
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
Assert(fmt[strlen(fmt) - 1] !='\n')
void pfree(void *pointer)
Definition: mcxt.c:1456
void * palloc0(Size size)
Definition: mcxt.c:1257
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1064
void * MemoryContextAllocExtended(MemoryContext context, Size size, int flags)
Definition: mcxt.c:1137
static uint64 pg_nextpower2_64(uint64 num)
Definition: pg_bitutils.h:212
const void * data
int fillfactor
Definition: pgbench.c:187
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
#define SH_GROW
Definition: simplehash.h:122
SH_STATUS
Definition: simplehash.h:175
#define SH_STAT
Definition: simplehash.h:128
#define SH_INITIAL_BUCKET
Definition: simplehash.h:135
#define SH_INSERT_HASH
Definition: simplehash.h:117
#define SH_PREV
Definition: simplehash.h:133
#define SH_STATUS
Definition: simplehash.h:107
#define SH_CREATE
Definition: simplehash.h:113
#define SH_LOOKUP_HASH
Definition: simplehash.h:121
#define SH_START_ITERATE
Definition: simplehash.h:123
#define SH_COMPUTE_PARAMETERS
Definition: simplehash.h:131
#define SH_FREE
Definition: simplehash.h:127
#define SH_STATUS_IN_USE
Definition: simplehash.h:109
#define SH_DISTANCE_FROM_OPTIMAL
Definition: simplehash.h:134
#define SH_LOOKUP_HASH_INTERNAL
Definition: simplehash.h:138
#define SH_ITERATOR
Definition: simplehash.h:110
#define SH_NEXT
Definition: simplehash.h:132
#define SH_ITERATE
Definition: simplehash.h:125
#define SH_DELETE
Definition: simplehash.h:119
#define SH_INSERT
Definition: simplehash.h:116
#define SH_INSERT_HASH_INTERNAL
Definition: simplehash.h:137
#define SH_RESET
Definition: simplehash.h:115
#define SH_ENTRY_HASH
Definition: simplehash.h:136
#define SH_DELETE_ITEM
Definition: simplehash.h:118
#define SH_ALLOCATE
Definition: simplehash.h:126
#define SH_LOOKUP
Definition: simplehash.h:120
#define SH_TYPE
Definition: simplehash.h:106
#define SH_START_ITERATE_AT
Definition: simplehash.h:124
#define SH_STATUS_EMPTY
Definition: simplehash.h:108
#define SH_DESTROY
Definition: simplehash.h:114
uint32 cur
Definition: simplehash.h:182
uint32 end
Definition: simplehash.h:183
MemoryContext ctx
Definition: simplehash.h:167
uint32 members
Definition: simplehash.h:154
SH_ELEMENT_TYPE * data
Definition: simplehash.h:163
uint32 grow_threshold
Definition: simplehash.h:160
uint32 sizemask
Definition: simplehash.h:157
void * private_data
Definition: simplehash.h:171
uint64 size
Definition: simplehash.h:151
const char * type