/*
 * PostgreSQL source: src/include/lib/simplehash.h (doxygen-rendered copy).
 */
1 /*
2  * simplehash.h
3  *
4  * Hash table implementation which will be specialized to user-defined
5  * types, by including this file to generate the required code. It's
6  * probably not worthwhile to do so for hash tables that aren't performance
7  * or space sensitive.
8  *
9  * Usage notes:
10  *
11  * To generate a hash-table and associated functions for a use case several
12  * macros have to be #define'ed before this file is included. Including
13  * the file #undef's all those, so a new hash table can be generated
14  * afterwards.
15  * The relevant parameters are:
16  * - SH_PREFIX - prefix for all symbol names generated. A prefix of 'foo'
17  * will result in hash table type 'foo_hash' and functions like
18  * 'foo_insert'/'foo_lookup' and so forth.
19  * - SH_ELEMENT_TYPE - type of the contained elements
20  * - SH_KEY_TYPE - type of the hashtable's key
21  * - SH_DECLARE - if defined function prototypes and type declarations are
22  * generated
23  * - SH_DEFINE - if defined function definitions are generated
24  * - SH_SCOPE - in which scope (e.g. extern, static inline) do function
25  * declarations reside
26  * - SH_RAW_ALLOCATOR - if defined, memory contexts are not used; instead,
27  * use this to allocate bytes
28  * - SH_USE_NONDEFAULT_ALLOCATOR - if defined no element allocator functions
29  * are defined, so you can supply your own
30  * The following parameters are only relevant when SH_DEFINE is defined:
31  * - SH_KEY - name of the element in SH_ELEMENT_TYPE containing the hash key
32  * - SH_EQUAL(table, a, b) - compare two table keys
33  * - SH_HASH_KEY(table, key) - generate hash for the key
34  * - SH_STORE_HASH - if defined the hash is stored in the elements
35  * - SH_GET_HASH(tb, a) - return the field to store the hash in
36  *
37  * For examples of usage look at tidbitmap.c (file local definition) and
38  * execnodes.h/execGrouping.c (exposed declaration, file local
39  * implementation).
40  *
41  * Hash table design:
42  *
43  * The hash table design chosen is a variant of linear open-addressing. The
44  * reason for doing so is that linear addressing is CPU cache & pipeline
45  * friendly. The biggest disadvantage of simple linear addressing schemes
46  * are highly variable lookup times due to clustering, and deletions
47  * leaving a lot of tombstones around. To address these issues a variant
48  * of "robin hood" hashing is employed. Robin hood hashing optimizes
49  * chaining lengths by moving elements close to their optimal bucket
50  * ("rich" elements), out of the way if a to-be-inserted element is further
51  * away from its optimal position (i.e. it's "poor"). While that can make
52  * insertions slower, the average lookup performance is a lot better, and
53  * higher fill factors can be used in a still performant manner. To avoid
54  * tombstones - which normally solve the issue that a deleted node's
55  * presence is relevant to determine whether a lookup needs to continue
56  * looking or is done - buckets following a deleted element are shifted
57  * backwards, unless they're empty or already at their optimal position.
58  */
59 
60 #include "port/pg_bitutils.h"
61 
62 /* helpers */
63 #define SH_MAKE_PREFIX(a) CppConcat(a,_)
64 #define SH_MAKE_NAME(name) SH_MAKE_NAME_(SH_MAKE_PREFIX(SH_PREFIX),name)
65 #define SH_MAKE_NAME_(a,b) CppConcat(a,b)
66 
67 /* name macros for: */
68 
69 /* type declarations */
70 #define SH_TYPE SH_MAKE_NAME(hash)
71 #define SH_STATUS SH_MAKE_NAME(status)
72 #define SH_STATUS_EMPTY SH_MAKE_NAME(SH_EMPTY)
73 #define SH_STATUS_IN_USE SH_MAKE_NAME(SH_IN_USE)
74 #define SH_ITERATOR SH_MAKE_NAME(iterator)
75 
76 /* function declarations */
77 #define SH_CREATE SH_MAKE_NAME(create)
78 #define SH_DESTROY SH_MAKE_NAME(destroy)
79 #define SH_RESET SH_MAKE_NAME(reset)
80 #define SH_INSERT SH_MAKE_NAME(insert)
81 #define SH_INSERT_HASH SH_MAKE_NAME(insert_hash)
82 #define SH_DELETE SH_MAKE_NAME(delete)
83 #define SH_LOOKUP SH_MAKE_NAME(lookup)
84 #define SH_LOOKUP_HASH SH_MAKE_NAME(lookup_hash)
85 #define SH_GROW SH_MAKE_NAME(grow)
86 #define SH_START_ITERATE SH_MAKE_NAME(start_iterate)
87 #define SH_START_ITERATE_AT SH_MAKE_NAME(start_iterate_at)
88 #define SH_ITERATE SH_MAKE_NAME(iterate)
89 #define SH_ALLOCATE SH_MAKE_NAME(allocate)
90 #define SH_FREE SH_MAKE_NAME(free)
91 #define SH_STAT SH_MAKE_NAME(stat)
92 
93 /* internal helper functions (no externally visible prototypes) */
94 #define SH_COMPUTE_PARAMETERS SH_MAKE_NAME(compute_parameters)
95 #define SH_NEXT SH_MAKE_NAME(next)
96 #define SH_PREV SH_MAKE_NAME(prev)
97 #define SH_DISTANCE_FROM_OPTIMAL SH_MAKE_NAME(distance)
98 #define SH_INITIAL_BUCKET SH_MAKE_NAME(initial_bucket)
99 #define SH_ENTRY_HASH SH_MAKE_NAME(entry_hash)
100 #define SH_INSERT_HASH_INTERNAL SH_MAKE_NAME(insert_hash_internal)
101 #define SH_LOOKUP_HASH_INTERNAL SH_MAKE_NAME(lookup_hash_internal)
102 
/* generate forward declarations necessary to use the hash table */
#ifdef SH_DECLARE

/* type definitions */
typedef struct SH_TYPE
{
	/*
	 * Size of data / bucket array, 64 bits to handle UINT32_MAX sized hash
	 * tables.  Note that the maximum number of elements is lower
	 * (SH_MAX_FILLFACTOR)
	 */
	uint64		size;

	/* how many elements have valid contents */
	uint32		members;

	/* mask for bucket and size calculations, based on size */
	uint32		sizemask;

	/* boundary after which to grow hashtable */
	uint32		grow_threshold;

	/* hash buckets */
	SH_ELEMENT_TYPE *data;

#ifndef SH_RAW_ALLOCATOR
	/* memory context to use for allocations */
	MemoryContext ctx;
#endif

	/* user defined data, useful for callbacks */
	void	   *private_data;
} SH_TYPE;

/* bucket states; EMPTY buckets terminate probe chains */
typedef enum SH_STATUS
{
	SH_STATUS_EMPTY = 0x00,
	SH_STATUS_IN_USE = 0x01
} SH_STATUS;

typedef struct SH_ITERATOR
{
	uint32		cur;			/* current element */
	uint32		end;			/* bucket at which iteration stops */
	bool		done;			/* iterator exhausted? */
} SH_ITERATOR;

/* externally visible function prototypes */
#ifdef SH_RAW_ALLOCATOR
SH_SCOPE SH_TYPE *SH_CREATE(uint32 nelements, void *private_data);
#else
SH_SCOPE SH_TYPE *SH_CREATE(MemoryContext ctx, uint32 nelements,
							void *private_data);
#endif
SH_SCOPE void SH_DESTROY(SH_TYPE * tb);
SH_SCOPE void SH_RESET(SH_TYPE * tb);
SH_SCOPE void SH_GROW(SH_TYPE * tb, uint32 newsize);
SH_SCOPE SH_ELEMENT_TYPE *SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found);
SH_SCOPE SH_ELEMENT_TYPE *SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key,
										 uint32 hash, bool *found);
SH_SCOPE SH_ELEMENT_TYPE *SH_LOOKUP(SH_TYPE * tb, SH_KEY_TYPE key);
SH_SCOPE SH_ELEMENT_TYPE *SH_LOOKUP_HASH(SH_TYPE * tb, SH_KEY_TYPE key,
										 uint32 hash);
SH_SCOPE bool SH_DELETE(SH_TYPE * tb, SH_KEY_TYPE key);
SH_SCOPE void SH_START_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);
SH_SCOPE void SH_START_ITERATE_AT(SH_TYPE * tb, SH_ITERATOR * iter, uint32 at);
SH_SCOPE SH_ELEMENT_TYPE *SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);
SH_SCOPE void SH_STAT(SH_TYPE * tb);

#endif							/* SH_DECLARE */
173 
174 
175 /* generate implementation of the hash table */
176 #ifdef SH_DEFINE
177 
178 #ifndef SH_RAW_ALLOCATOR
179 #include "utils/memutils.h"
180 #endif
181 
182 /* max data array size,we allow up to PG_UINT32_MAX buckets, including 0 */
183 #define SH_MAX_SIZE (((uint64) PG_UINT32_MAX) + 1)
184 
185 /* normal fillfactor, unless already close to maximum */
186 #ifndef SH_FILLFACTOR
187 #define SH_FILLFACTOR (0.9)
188 #endif
189 /* increase fillfactor if we otherwise would error out */
190 #define SH_MAX_FILLFACTOR (0.98)
191 /* grow if actual and optimal location bigger than */
192 #ifndef SH_GROW_MAX_DIB
193 #define SH_GROW_MAX_DIB 25
194 #endif
195 /* grow if more than elements to move when inserting */
196 #ifndef SH_GROW_MAX_MOVE
197 #define SH_GROW_MAX_MOVE 150
198 #endif
199 #ifndef SH_GROW_MIN_FILLFACTOR
200 /* but do not grow due to SH_GROW_MAX_* if below */
201 #define SH_GROW_MIN_FILLFACTOR 0.1
202 #endif
203 
204 #ifdef SH_STORE_HASH
205 #define SH_COMPARE_KEYS(tb, ahash, akey, b) (ahash == SH_GET_HASH(tb, b) && SH_EQUAL(tb, b->SH_KEY, akey))
206 #else
207 #define SH_COMPARE_KEYS(tb, ahash, akey, b) (SH_EQUAL(tb, b->SH_KEY, akey))
208 #endif
209 
210 /*
211  * Wrap the following definitions in include guards, to avoid multiple
212  * definition errors if this header is included more than once. The rest of
213  * the file deliberately has no include guards, because it can be included
214  * with different parameters to define functions and types with non-colliding
215  * names.
216  */
217 #ifndef SIMPLEHASH_H
218 #define SIMPLEHASH_H
219 
220 #ifdef FRONTEND
221 #define sh_error(...) pg_log_error(__VA_ARGS__)
222 #define sh_log(...) pg_log_info(__VA_ARGS__)
223 #else
224 #define sh_error(...) elog(ERROR, __VA_ARGS__)
225 #define sh_log(...) elog(LOG, __VA_ARGS__)
226 #endif
227 
228 #endif
229 
230 /*
231  * Compute sizing parameters for hashtable. Called when creating and growing
232  * the hashtable.
233  */
234 static inline void
235 SH_COMPUTE_PARAMETERS(SH_TYPE * tb, uint32 newsize)
236 {
237  uint64 size;
238 
239  /* supporting zero sized hashes would complicate matters */
240  size = Max(newsize, 2);
241 
242  /* round up size to the next power of 2, that's how bucketing works */
243  size = pg_nextpower2_64(size);
244  Assert(size <= SH_MAX_SIZE);
245 
246  /*
247  * Verify that allocation of ->data is possible on this platform, without
248  * overflowing Size.
249  */
250  if ((((uint64) sizeof(SH_ELEMENT_TYPE)) * size) >= SIZE_MAX / 2)
251  sh_error("hash table too large");
252 
253  /* now set size */
254  tb->size = size;
255 
256  if (tb->size == SH_MAX_SIZE)
257  tb->sizemask = 0;
258  else
259  tb->sizemask = tb->size - 1;
260 
261  /*
262  * Compute the next threshold at which we need to grow the hash table
263  * again.
264  */
265  if (tb->size == SH_MAX_SIZE)
266  tb->grow_threshold = ((double) tb->size) * SH_MAX_FILLFACTOR;
267  else
268  tb->grow_threshold = ((double) tb->size) * SH_FILLFACTOR;
269 }
270 
271 /* return the optimal bucket for the hash */
272 static inline uint32
274 {
275  return hash & tb->sizemask;
276 }
277 
278 /* return next bucket after the current, handling wraparound */
279 static inline uint32
280 SH_NEXT(SH_TYPE * tb, uint32 curelem, uint32 startelem)
281 {
282  curelem = (curelem + 1) & tb->sizemask;
283 
284  Assert(curelem != startelem);
285 
286  return curelem;
287 }
288 
289 /* return bucket before the current, handling wraparound */
290 static inline uint32
291 SH_PREV(SH_TYPE * tb, uint32 curelem, uint32 startelem)
292 {
293  curelem = (curelem - 1) & tb->sizemask;
294 
295  Assert(curelem != startelem);
296 
297  return curelem;
298 }
299 
300 /* return distance between bucket and its optimal position */
301 static inline uint32
302 SH_DISTANCE_FROM_OPTIMAL(SH_TYPE * tb, uint32 optimal, uint32 bucket)
303 {
304  if (optimal <= bucket)
305  return bucket - optimal;
306  else
307  return (tb->size + bucket) - optimal;
308 }
309 
310 static inline uint32
312 {
313 #ifdef SH_STORE_HASH
314  return SH_GET_HASH(tb, entry);
315 #else
316  return SH_HASH_KEY(tb, entry->SH_KEY);
317 #endif
318 }
319 
320 /* default memory allocator function */
321 static inline void *SH_ALLOCATE(SH_TYPE * type, Size size);
322 static inline void SH_FREE(SH_TYPE * type, void *pointer);
323 
324 #ifndef SH_USE_NONDEFAULT_ALLOCATOR
325 
326 /* default memory allocator function */
327 static inline void *
328 SH_ALLOCATE(SH_TYPE * type, Size size)
329 {
330 #ifdef SH_RAW_ALLOCATOR
331  return SH_RAW_ALLOCATOR(size);
332 #else
333  return MemoryContextAllocExtended(type->ctx, size,
335 #endif
336 }
337 
338 /* default memory free function */
339 static inline void
340 SH_FREE(SH_TYPE * type, void *pointer)
341 {
342  pfree(pointer);
343 }
344 
345 #endif
346 
347 /*
348  * Create a hash table with enough space for `nelements` distinct members.
349  * Memory for the hash table is allocated from the passed-in context. If
350  * desired, the array of elements can be allocated using a passed-in allocator;
351  * this could be useful in order to place the array of elements in a shared
352  * memory, or in a context that will outlive the rest of the hash table.
353  * Memory other than for the array of elements will still be allocated from
354  * the passed-in context.
355  */
356 #ifdef SH_RAW_ALLOCATOR
358 SH_CREATE(uint32 nelements, void *private_data)
359 #else
361 SH_CREATE(MemoryContext ctx, uint32 nelements, void *private_data)
362 #endif
363 {
364  SH_TYPE *tb;
365  uint64 size;
366 
367 #ifdef SH_RAW_ALLOCATOR
368  tb = SH_RAW_ALLOCATOR(sizeof(SH_TYPE));
369 #else
370  tb = MemoryContextAllocZero(ctx, sizeof(SH_TYPE));
371  tb->ctx = ctx;
372 #endif
373  tb->private_data = private_data;
374 
375  /* increase nelements by fillfactor, want to store nelements elements */
376  size = Min((double) SH_MAX_SIZE, ((double) nelements) / SH_FILLFACTOR);
377 
378  SH_COMPUTE_PARAMETERS(tb, size);
379 
380  tb->data = SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * tb->size);
381 
382  return tb;
383 }
384 
385 /* destroy a previously created hash table */
386 SH_SCOPE void
387 SH_DESTROY(SH_TYPE * tb)
388 {
389  SH_FREE(tb, tb->data);
390  pfree(tb);
391 }
392 
393 /* reset the contents of a previously created hash table */
394 SH_SCOPE void
395 SH_RESET(SH_TYPE * tb)
396 {
397  memset(tb->data, 0, sizeof(SH_ELEMENT_TYPE) * tb->size);
398  tb->members = 0;
399 }
400 
401 /*
402  * Grow a hash table to at least `newsize` buckets.
403  *
404  * Usually this will automatically be called by insertions/deletions, when
405  * necessary. But resizing to the exact input size can be advantageous
406  * performance-wise, when known at some point.
407  */
408 SH_SCOPE void
409 SH_GROW(SH_TYPE * tb, uint32 newsize)
410 {
411  uint64 oldsize = tb->size;
412  SH_ELEMENT_TYPE *olddata = tb->data;
413  SH_ELEMENT_TYPE *newdata;
414  uint32 i;
415  uint32 startelem = 0;
416  uint32 copyelem;
417 
418  Assert(oldsize == pg_nextpower2_64(oldsize));
419  Assert(oldsize != SH_MAX_SIZE);
420  Assert(oldsize < newsize);
421 
422  /* compute parameters for new table */
423  SH_COMPUTE_PARAMETERS(tb, newsize);
424 
425  tb->data = SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * tb->size);
426 
427  newdata = tb->data;
428 
429  /*
430  * Copy entries from the old data to newdata. We theoretically could use
431  * SH_INSERT here, to avoid code duplication, but that's more general than
432  * we need. We neither want tb->members increased, nor do we need to do
433  * deal with deleted elements, nor do we need to compare keys. So a
434  * special-cased implementation is lot faster. As resizing can be time
435  * consuming and frequent, that's worthwhile to optimize.
436  *
437  * To be able to simply move entries over, we have to start not at the
438  * first bucket (i.e olddata[0]), but find the first bucket that's either
439  * empty, or is occupied by an entry at its optimal position. Such a
440  * bucket has to exist in any table with a load factor under 1, as not all
441  * buckets are occupied, i.e. there always has to be an empty bucket. By
442  * starting at such a bucket we can move the entries to the larger table,
443  * without having to deal with conflicts.
444  */
445 
446  /* search for the first element in the hash that's not wrapped around */
447  for (i = 0; i < oldsize; i++)
448  {
449  SH_ELEMENT_TYPE *oldentry = &olddata[i];
450  uint32 hash;
451  uint32 optimal;
452 
453  if (oldentry->status != SH_STATUS_IN_USE)
454  {
455  startelem = i;
456  break;
457  }
458 
459  hash = SH_ENTRY_HASH(tb, oldentry);
460  optimal = SH_INITIAL_BUCKET(tb, hash);
461 
462  if (optimal == i)
463  {
464  startelem = i;
465  break;
466  }
467  }
468 
469  /* and copy all elements in the old table */
470  copyelem = startelem;
471  for (i = 0; i < oldsize; i++)
472  {
473  SH_ELEMENT_TYPE *oldentry = &olddata[copyelem];
474 
475  if (oldentry->status == SH_STATUS_IN_USE)
476  {
477  uint32 hash;
478  uint32 startelem;
479  uint32 curelem;
480  SH_ELEMENT_TYPE *newentry;
481 
482  hash = SH_ENTRY_HASH(tb, oldentry);
483  startelem = SH_INITIAL_BUCKET(tb, hash);
484  curelem = startelem;
485 
486  /* find empty element to put data into */
487  while (true)
488  {
489  newentry = &newdata[curelem];
490 
491  if (newentry->status == SH_STATUS_EMPTY)
492  {
493  break;
494  }
495 
496  curelem = SH_NEXT(tb, curelem, startelem);
497  }
498 
499  /* copy entry to new slot */
500  memcpy(newentry, oldentry, sizeof(SH_ELEMENT_TYPE));
501  }
502 
503  /* can't use SH_NEXT here, would use new size */
504  copyelem++;
505  if (copyelem >= oldsize)
506  {
507  copyelem = 0;
508  }
509  }
510 
511  SH_FREE(tb, olddata);
512 }
513 
514 /*
515  * This is a separate static inline function, so it can be reliably be inlined
516  * into its wrapper functions even if SH_SCOPE is extern.
517  */
518 static inline SH_ELEMENT_TYPE *
520 {
521  uint32 startelem;
522  uint32 curelem;
523  SH_ELEMENT_TYPE *data;
524  uint32 insertdist;
525 
526 restart:
527  insertdist = 0;
528 
529  /*
530  * We do the grow check even if the key is actually present, to avoid
531  * doing the check inside the loop. This also lets us avoid having to
532  * re-find our position in the hashtable after resizing.
533  *
534  * Note that this also reached when resizing the table due to
535  * SH_GROW_MAX_DIB / SH_GROW_MAX_MOVE.
536  */
537  if (unlikely(tb->members >= tb->grow_threshold))
538  {
539  if (tb->size == SH_MAX_SIZE)
540  {
541  sh_error("hash table size exceeded");
542  }
543 
544  /*
545  * When optimizing, it can be very useful to print these out.
546  */
547  /* SH_STAT(tb); */
548  SH_GROW(tb, tb->size * 2);
549  /* SH_STAT(tb); */
550  }
551 
552  /* perform insert, start bucket search at optimal location */
553  data = tb->data;
554  startelem = SH_INITIAL_BUCKET(tb, hash);
555  curelem = startelem;
556  while (true)
557  {
558  uint32 curdist;
559  uint32 curhash;
560  uint32 curoptimal;
561  SH_ELEMENT_TYPE *entry = &data[curelem];
562 
563  /* any empty bucket can directly be used */
564  if (entry->status == SH_STATUS_EMPTY)
565  {
566  tb->members++;
567  entry->SH_KEY = key;
568 #ifdef SH_STORE_HASH
569  SH_GET_HASH(tb, entry) = hash;
570 #endif
571  entry->status = SH_STATUS_IN_USE;
572  *found = false;
573  return entry;
574  }
575 
576  /*
577  * If the bucket is not empty, we either found a match (in which case
578  * we're done), or we have to decide whether to skip over or move the
579  * colliding entry. When the colliding element's distance to its
580  * optimal position is smaller than the to-be-inserted entry's, we
581  * shift the colliding entry (and its followers) forward by one.
582  */
583 
584  if (SH_COMPARE_KEYS(tb, hash, key, entry))
585  {
586  Assert(entry->status == SH_STATUS_IN_USE);
587  *found = true;
588  return entry;
589  }
590 
591  curhash = SH_ENTRY_HASH(tb, entry);
592  curoptimal = SH_INITIAL_BUCKET(tb, curhash);
593  curdist = SH_DISTANCE_FROM_OPTIMAL(tb, curoptimal, curelem);
594 
595  if (insertdist > curdist)
596  {
597  SH_ELEMENT_TYPE *lastentry = entry;
598  uint32 emptyelem = curelem;
599  uint32 moveelem;
600  int32 emptydist = 0;
601 
602  /* find next empty bucket */
603  while (true)
604  {
605  SH_ELEMENT_TYPE *emptyentry;
606 
607  emptyelem = SH_NEXT(tb, emptyelem, startelem);
608  emptyentry = &data[emptyelem];
609 
610  if (emptyentry->status == SH_STATUS_EMPTY)
611  {
612  lastentry = emptyentry;
613  break;
614  }
615 
616  /*
617  * To avoid negative consequences from overly imbalanced
618  * hashtables, grow the hashtable if collisions would require
619  * us to move a lot of entries. The most likely cause of such
620  * imbalance is filling a (currently) small table, from a
621  * currently big one, in hash-table order. Don't grow if the
622  * hashtable would be too empty, to prevent quick space
623  * explosion for some weird edge cases.
624  */
625  if (unlikely(++emptydist > SH_GROW_MAX_MOVE) &&
626  ((double) tb->members / tb->size) >= SH_GROW_MIN_FILLFACTOR)
627  {
628  tb->grow_threshold = 0;
629  goto restart;
630  }
631  }
632 
633  /* shift forward, starting at last occupied element */
634 
635  /*
636  * TODO: This could be optimized to be one memcpy in many cases,
637  * excepting wrapping around at the end of ->data. Hasn't shown up
638  * in profiles so far though.
639  */
640  moveelem = emptyelem;
641  while (moveelem != curelem)
642  {
643  SH_ELEMENT_TYPE *moveentry;
644 
645  moveelem = SH_PREV(tb, moveelem, startelem);
646  moveentry = &data[moveelem];
647 
648  memcpy(lastentry, moveentry, sizeof(SH_ELEMENT_TYPE));
649  lastentry = moveentry;
650  }
651 
652  /* and fill the now empty spot */
653  tb->members++;
654 
655  entry->SH_KEY = key;
656 #ifdef SH_STORE_HASH
657  SH_GET_HASH(tb, entry) = hash;
658 #endif
659  entry->status = SH_STATUS_IN_USE;
660  *found = false;
661  return entry;
662  }
663 
664  curelem = SH_NEXT(tb, curelem, startelem);
665  insertdist++;
666 
667  /*
668  * To avoid negative consequences from overly imbalanced hashtables,
669  * grow the hashtable if collisions lead to large runs. The most
670  * likely cause of such imbalance is filling a (currently) small
671  * table, from a currently big one, in hash-table order. Don't grow
672  * if the hashtable would be too empty, to prevent quick space
673  * explosion for some weird edge cases.
674  */
675  if (unlikely(insertdist > SH_GROW_MAX_DIB) &&
676  ((double) tb->members / tb->size) >= SH_GROW_MIN_FILLFACTOR)
677  {
678  tb->grow_threshold = 0;
679  goto restart;
680  }
681  }
682 }
683 
684 /*
685  * Insert the key key into the hash-table, set *found to true if the key
686  * already exists, false otherwise. Returns the hash-table entry in either
687  * case.
688  */
690 SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found)
691 {
692  uint32 hash = SH_HASH_KEY(tb, key);
693 
694  return SH_INSERT_HASH_INTERNAL(tb, key, hash, found);
695 }
696 
697 /*
698  * Insert the key key into the hash-table using an already-calculated
699  * hash. Set *found to true if the key already exists, false
700  * otherwise. Returns the hash-table entry in either case.
701  */
703 SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key, uint32 hash, bool *found)
704 {
705  return SH_INSERT_HASH_INTERNAL(tb, key, hash, found);
706 }
707 
708 /*
709  * This is a separate static inline function, so it can be reliably be inlined
710  * into its wrapper functions even if SH_SCOPE is extern.
711  */
712 static inline SH_ELEMENT_TYPE *
714 {
715  const uint32 startelem = SH_INITIAL_BUCKET(tb, hash);
716  uint32 curelem = startelem;
717 
718  while (true)
719  {
720  SH_ELEMENT_TYPE *entry = &tb->data[curelem];
721 
722  if (entry->status == SH_STATUS_EMPTY)
723  {
724  return NULL;
725  }
726 
727  Assert(entry->status == SH_STATUS_IN_USE);
728 
729  if (SH_COMPARE_KEYS(tb, hash, key, entry))
730  return entry;
731 
732  /*
733  * TODO: we could stop search based on distance. If the current
734  * buckets's distance-from-optimal is smaller than what we've skipped
735  * already, the entry doesn't exist. Probably only do so if
736  * SH_STORE_HASH is defined, to avoid re-computing hashes?
737  */
738 
739  curelem = SH_NEXT(tb, curelem, startelem);
740  }
741 }
742 
743 /*
744  * Lookup up entry in hash table. Returns NULL if key not present.
745  */
748 {
749  uint32 hash = SH_HASH_KEY(tb, key);
750 
751  return SH_LOOKUP_HASH_INTERNAL(tb, key, hash);
752 }
753 
754 /*
755  * Lookup up entry in hash table using an already-calculated hash.
756  *
757  * Returns NULL if key not present.
758  */
761 {
762  return SH_LOOKUP_HASH_INTERNAL(tb, key, hash);
763 }
764 
765 /*
766  * Delete entry from hash table. Returns whether to-be-deleted key was
767  * present.
768  */
769 SH_SCOPE bool
771 {
772  uint32 hash = SH_HASH_KEY(tb, key);
773  uint32 startelem = SH_INITIAL_BUCKET(tb, hash);
774  uint32 curelem = startelem;
775 
776  while (true)
777  {
778  SH_ELEMENT_TYPE *entry = &tb->data[curelem];
779 
780  if (entry->status == SH_STATUS_EMPTY)
781  return false;
782 
783  if (entry->status == SH_STATUS_IN_USE &&
784  SH_COMPARE_KEYS(tb, hash, key, entry))
785  {
786  SH_ELEMENT_TYPE *lastentry = entry;
787 
788  tb->members--;
789 
790  /*
791  * Backward shift following elements till either an empty element
792  * or an element at its optimal position is encountered.
793  *
794  * While that sounds expensive, the average chain length is short,
795  * and deletions would otherwise require tombstones.
796  */
797  while (true)
798  {
799  SH_ELEMENT_TYPE *curentry;
800  uint32 curhash;
801  uint32 curoptimal;
802 
803  curelem = SH_NEXT(tb, curelem, startelem);
804  curentry = &tb->data[curelem];
805 
806  if (curentry->status != SH_STATUS_IN_USE)
807  {
808  lastentry->status = SH_STATUS_EMPTY;
809  break;
810  }
811 
812  curhash = SH_ENTRY_HASH(tb, curentry);
813  curoptimal = SH_INITIAL_BUCKET(tb, curhash);
814 
815  /* current is at optimal position, done */
816  if (curoptimal == curelem)
817  {
818  lastentry->status = SH_STATUS_EMPTY;
819  break;
820  }
821 
822  /* shift */
823  memcpy(lastentry, curentry, sizeof(SH_ELEMENT_TYPE));
824 
825  lastentry = curentry;
826  }
827 
828  return true;
829  }
830 
831  /* TODO: return false; if distance too big */
832 
833  curelem = SH_NEXT(tb, curelem, startelem);
834  }
835 }
836 
837 /*
838  * Initialize iterator.
839  */
840 SH_SCOPE void
842 {
843  int i;
844  uint64 startelem = PG_UINT64_MAX;
845 
846  /*
847  * Search for the first empty element. As deletions during iterations are
848  * supported, we want to start/end at an element that cannot be affected
849  * by elements being shifted.
850  */
851  for (i = 0; i < tb->size; i++)
852  {
853  SH_ELEMENT_TYPE *entry = &tb->data[i];
854 
855  if (entry->status != SH_STATUS_IN_USE)
856  {
857  startelem = i;
858  break;
859  }
860  }
861 
862  Assert(startelem < SH_MAX_SIZE);
863 
864  /*
865  * Iterate backwards, that allows the current element to be deleted, even
866  * if there are backward shifts
867  */
868  iter->cur = startelem;
869  iter->end = iter->cur;
870  iter->done = false;
871 }
872 
873 /*
874  * Initialize iterator to a specific bucket. That's really only useful for
875  * cases where callers are partially iterating over the hashspace, and that
876  * iteration deletes and inserts elements based on visited entries. Doing that
877  * repeatedly could lead to an unbalanced keyspace when always starting at the
878  * same position.
879  */
880 SH_SCOPE void
882 {
883  /*
884  * Iterate backwards, that allows the current element to be deleted, even
885  * if there are backward shifts.
886  */
887  iter->cur = at & tb->sizemask; /* ensure at is within a valid range */
888  iter->end = iter->cur;
889  iter->done = false;
890 }
891 
892 /*
893  * Iterate over all entries in the hash-table. Return the next occupied entry,
894  * or NULL if done.
895  *
896  * During iteration the current entry in the hash table may be deleted,
897  * without leading to elements being skipped or returned twice. Additionally
898  * the rest of the table may be modified (i.e. there can be insertions or
899  * deletions), but if so, there's neither a guarantee that all nodes are
900  * visited at least once, nor a guarantee that a node is visited at most once.
901  */
903 SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter)
904 {
905  while (!iter->done)
906  {
907  SH_ELEMENT_TYPE *elem;
908 
909  elem = &tb->data[iter->cur];
910 
911  /* next element in backward direction */
912  iter->cur = (iter->cur - 1) & tb->sizemask;
913 
914  if ((iter->cur & tb->sizemask) == (iter->end & tb->sizemask))
915  iter->done = true;
916  if (elem->status == SH_STATUS_IN_USE)
917  {
918  return elem;
919  }
920  }
921 
922  return NULL;
923 }
924 
925 /*
926  * Report some statistics about the state of the hashtable. For
927  * debugging/profiling purposes only.
928  */
929 SH_SCOPE void
930 SH_STAT(SH_TYPE * tb)
931 {
932  uint32 max_chain_length = 0;
933  uint32 total_chain_length = 0;
934  double avg_chain_length;
935  double fillfactor;
936  uint32 i;
937 
938  uint32 *collisions = palloc0(tb->size * sizeof(uint32));
939  uint32 total_collisions = 0;
940  uint32 max_collisions = 0;
941  double avg_collisions;
942 
943  for (i = 0; i < tb->size; i++)
944  {
945  uint32 hash;
946  uint32 optimal;
947  uint32 dist;
948  SH_ELEMENT_TYPE *elem;
949 
950  elem = &tb->data[i];
951 
952  if (elem->status != SH_STATUS_IN_USE)
953  continue;
954 
955  hash = SH_ENTRY_HASH(tb, elem);
956  optimal = SH_INITIAL_BUCKET(tb, hash);
957  dist = SH_DISTANCE_FROM_OPTIMAL(tb, optimal, i);
958 
959  if (dist > max_chain_length)
960  max_chain_length = dist;
961  total_chain_length += dist;
962 
963  collisions[optimal]++;
964  }
965 
966  for (i = 0; i < tb->size; i++)
967  {
968  uint32 curcoll = collisions[i];
969 
970  if (curcoll == 0)
971  continue;
972 
973  /* single contained element is not a collision */
974  curcoll--;
975  total_collisions += curcoll;
976  if (curcoll > max_collisions)
977  max_collisions = curcoll;
978  }
979 
980  if (tb->members > 0)
981  {
982  fillfactor = tb->members / ((double) tb->size);
983  avg_chain_length = ((double) total_chain_length) / tb->members;
984  avg_collisions = ((double) total_collisions) / tb->members;
985  }
986  else
987  {
988  fillfactor = 0;
989  avg_chain_length = 0;
990  avg_collisions = 0;
991  }
992 
993  sh_log("size: " UINT64_FORMAT ", members: %u, filled: %f, total chain: %u, max chain: %u, avg chain: %f, total_collisions: %u, max_collisions: %i, avg_collisions: %f",
994  tb->size, tb->members, fillfactor, total_chain_length, max_chain_length, avg_chain_length,
995  total_collisions, max_collisions, avg_collisions);
996 }
997 
998 #endif /* SH_DEFINE */
999 
1000 
/* undefine external parameters, so next hash table can be defined */
#undef SH_PREFIX
#undef SH_KEY_TYPE
#undef SH_KEY
#undef SH_ELEMENT_TYPE
#undef SH_HASH_KEY
#undef SH_SCOPE
#undef SH_DECLARE
#undef SH_DEFINE
#undef SH_GET_HASH
#undef SH_STORE_HASH
#undef SH_USE_NONDEFAULT_ALLOCATOR
#undef SH_EQUAL

/*
 * SH_RAW_ALLOCATOR is an external parameter too; without undefining it, a
 * subsequent inclusion would silently inherit the previous table's allocator
 * (contradicting the header comment's promise to #undef all parameters).
 */
#undef SH_RAW_ALLOCATOR

/* undefine locally declared macros */
#undef SH_MAKE_PREFIX
#undef SH_MAKE_NAME
#undef SH_MAKE_NAME_
#undef SH_FILLFACTOR
#undef SH_MAX_FILLFACTOR
#undef SH_GROW_MAX_DIB
#undef SH_GROW_MAX_MOVE
#undef SH_GROW_MIN_FILLFACTOR
#undef SH_MAX_SIZE

/* types */
#undef SH_TYPE
#undef SH_STATUS
#undef SH_STATUS_EMPTY
#undef SH_STATUS_IN_USE
#undef SH_ITERATOR

/* external function names */
#undef SH_CREATE
#undef SH_DESTROY
#undef SH_RESET
#undef SH_INSERT
#undef SH_INSERT_HASH
#undef SH_DELETE
#undef SH_LOOKUP
#undef SH_LOOKUP_HASH
#undef SH_GROW
#undef SH_START_ITERATE
#undef SH_START_ITERATE_AT
#undef SH_ITERATE
#undef SH_ALLOCATE
#undef SH_FREE
#undef SH_STAT

/* internal function names */
#undef SH_COMPUTE_PARAMETERS
#undef SH_COMPARE_KEYS
#undef SH_INITIAL_BUCKET
#undef SH_NEXT
#undef SH_PREV
#undef SH_DISTANCE_FROM_OPTIMAL
#undef SH_ENTRY_HASH
#undef SH_INSERT_HASH_INTERNAL
#undef SH_LOOKUP_HASH_INTERNAL
/*
 * (Trailing doxygen cross-reference index removed — rendering artifact, not
 * part of the original source file.)
 */