PostgreSQL Source Code  git master
simplehash.h
Go to the documentation of this file.
1 /*
2  * simplehash.h
3  *
4  * When included this file generates a "templated" (by way of macros)
5  * open-addressing hash table implementation specialized to user-defined
6  * types.
7  *
8  * It's probably not worthwhile to generate such a specialized implementation
9  * for hash tables that aren't performance or space sensitive.
10  *
11  * Compared to dynahash, simplehash has the following benefits:
12  *
13  * - Due to the "templated" code generation has known structure sizes and no
14  * indirect function calls (which show up substantially in dynahash
15  * profiles). These features considerably increase speed for small
16  * entries.
17  * - Open addressing has better CPU cache behavior than dynahash's chained
18  * hashtables.
19  * - The generated interface is type-safe and easier to use than dynahash,
20  * though at the cost of more complex setup.
21  * - Allocates memory in a MemoryContext or another allocator with a
22  * malloc/free style interface (which isn't easily usable in a shared
23  * memory context)
24  * - Does not require the overhead of a separate memory context.
25  *
26  * Usage notes:
27  *
28  * To generate a hash-table and associated functions for a use case several
29  * macros have to be #define'ed before this file is included. Including
30  * the file #undef's all those, so a new hash table can be generated
31  * afterwards.
32  * The relevant parameters are:
33  * - SH_PREFIX - prefix for all symbol names generated. A prefix of 'foo'
34  * will result in hash table type 'foo_hash' and functions like
35  * 'foo_insert'/'foo_lookup' and so forth.
36  * - SH_ELEMENT_TYPE - type of the contained elements
37  * - SH_KEY_TYPE - type of the hashtable's key
38  * - SH_DECLARE - if defined function prototypes and type declarations are
39  * generated
40  * - SH_DEFINE - if defined function definitions are generated
41  * - SH_SCOPE - in which scope (e.g. extern, static inline) do function
42  * declarations reside
43  * - SH_RAW_ALLOCATOR - if defined, memory contexts are not used; instead,
44  * use this to allocate bytes
45  * - SH_USE_NONDEFAULT_ALLOCATOR - if defined no element allocator functions
46  * are defined, so you can supply your own
47  * The following parameters are only relevant when SH_DEFINE is defined:
48  * - SH_KEY - name of the element in SH_ELEMENT_TYPE containing the hash key
49  * - SH_EQUAL(table, a, b) - compare two table keys
50  * - SH_HASH_KEY(table, key) - generate hash for the key
51  * - SH_STORE_HASH - if defined the hash is stored in the elements
52  * - SH_GET_HASH(tb, a) - return the field to store the hash in
53  *
54  * The element type is required to contain a "status" member that can store
55  * the range of values defined in the SH_STATUS enum.
56  *
57  * While SH_STORE_HASH (and subsequently SH_GET_HASH) are optional, because
58  * the hash table implementation needs to compare hashes to move elements
59  * (particularly when growing the hash), it's preferable, if possible, to
60  * store the element's hash in the element's data type. If the hash is so
61  * stored, the hash table will also compare hashes before calling SH_EQUAL
62  * when comparing two keys.
63  *
64  * For convenience the hash table create functions accept a void pointer
65  * that will be stored in the hash table type's member private_data. This
66  * allows callbacks to reference caller provided data.
67  *
68  * For examples of usage look at tidbitmap.c (file local definition) and
69  * execnodes.h/execGrouping.c (exposed declaration, file local
70  * implementation).
71  *
72  * Hash table design:
73  *
74  * The hash table design chosen is a variant of linear open-addressing. The
75  * reason for doing so is that linear addressing is CPU cache & pipeline
76  * friendly. The biggest disadvantage of simple linear addressing schemes
77  * are highly variable lookup times due to clustering, and deletions
78  * leaving a lot of tombstones around. To address these issues a variant
79  * of "robin hood" hashing is employed. Robin hood hashing optimizes
80  * chaining lengths by moving elements close to their optimal bucket
81  * ("rich" elements), out of the way if a to-be-inserted element is further
82  * away from its optimal position (i.e. it's "poor"). While that can make
83  * insertions slower, the average lookup performance is a lot better, and
84  * higher fill factors can be used in a still performant manner. To avoid
85  * tombstones - which normally solve the issue that a deleted node's
86  * presence is relevant to determine whether a lookup needs to continue
87  * looking or is done - buckets following a deleted element are shifted
88  * backwards, unless they're empty or already at their optimal position.
89  */
90 
91 #include "port/pg_bitutils.h"
92 
93 /* helpers */
/* SH_MAKE_NAME(foo) expands to <SH_PREFIX>_foo, e.g. foo_insert for prefix 'foo' */
94 #define SH_MAKE_PREFIX(a) CppConcat(a,_)
95 #define SH_MAKE_NAME(name) SH_MAKE_NAME_(SH_MAKE_PREFIX(SH_PREFIX),name)
96 #define SH_MAKE_NAME_(a,b) CppConcat(a,b)
97 
98 /* name macros for: */
99 
100 /* type declarations */
101 #define SH_TYPE SH_MAKE_NAME(hash)
102 #define SH_STATUS SH_MAKE_NAME(status)
103 #define SH_STATUS_EMPTY SH_MAKE_NAME(SH_EMPTY)
104 #define SH_STATUS_IN_USE SH_MAKE_NAME(SH_IN_USE)
105 #define SH_ITERATOR SH_MAKE_NAME(iterator)
106 
107 /* function declarations */
108 #define SH_CREATE SH_MAKE_NAME(create)
109 #define SH_DESTROY SH_MAKE_NAME(destroy)
110 #define SH_RESET SH_MAKE_NAME(reset)
111 #define SH_INSERT SH_MAKE_NAME(insert)
112 #define SH_INSERT_HASH SH_MAKE_NAME(insert_hash)
113 #define SH_DELETE_ITEM SH_MAKE_NAME(delete_item)
114 #define SH_DELETE SH_MAKE_NAME(delete)
115 #define SH_LOOKUP SH_MAKE_NAME(lookup)
116 #define SH_LOOKUP_HASH SH_MAKE_NAME(lookup_hash)
117 #define SH_GROW SH_MAKE_NAME(grow)
118 #define SH_START_ITERATE SH_MAKE_NAME(start_iterate)
119 #define SH_START_ITERATE_AT SH_MAKE_NAME(start_iterate_at)
120 #define SH_ITERATE SH_MAKE_NAME(iterate)
121 #define SH_ALLOCATE SH_MAKE_NAME(allocate)
122 #define SH_FREE SH_MAKE_NAME(free)
123 #define SH_STAT SH_MAKE_NAME(stat)
124 
125 /* internal helper functions (no externally visible prototypes) */
126 #define SH_COMPUTE_PARAMETERS SH_MAKE_NAME(compute_parameters)
127 #define SH_NEXT SH_MAKE_NAME(next)
128 #define SH_PREV SH_MAKE_NAME(prev)
129 #define SH_DISTANCE_FROM_OPTIMAL SH_MAKE_NAME(distance)
130 #define SH_INITIAL_BUCKET SH_MAKE_NAME(initial_bucket)
131 #define SH_ENTRY_HASH SH_MAKE_NAME(entry_hash)
132 #define SH_INSERT_HASH_INTERNAL SH_MAKE_NAME(insert_hash_internal)
133 #define SH_LOOKUP_HASH_INTERNAL SH_MAKE_NAME(lookup_hash_internal)
134 
135 /* generate forward declarations necessary to use the hash table */
136 #ifdef SH_DECLARE
137 
138 /* type definitions */
/*
 * Control structure for one instantiated hash table; allocated by SH_CREATE
 * and released by SH_DESTROY.
 */
139 typedef struct SH_TYPE
140 {
141  /*
142  * Size of data / bucket array, 64 bits to handle UINT32_MAX sized hash
143  * tables. Note that the maximum number of elements is lower
144  * (SH_MAX_FILLFACTOR)
145  */
146  uint64 size;
147 
148  /* how many elements have valid contents */
149  uint32 members;
150 
151  /* mask for bucket and size calculations, based on size */
152  uint32 sizemask;
153 
154  /* boundary after which to grow hashtable */
155  uint32 grow_threshold;
156 
157  /* hash buckets */
158  SH_ELEMENT_TYPE *data;
159 
160 #ifndef SH_RAW_ALLOCATOR
161  /* memory context to use for allocations */
162  MemoryContext ctx;
163 #endif
164 
165  /* user defined data, useful for callbacks */
166  void *private_data;
167 } SH_TYPE;
168 
/* per-bucket status; the element type's "status" field must be able to hold these */
169 typedef enum SH_STATUS
170 {
171  SH_STATUS_EMPTY = 0x00,
172  SH_STATUS_IN_USE = 0x01
173 } SH_STATUS;
174 
/* iteration state; initialize with <prefix>_start_iterate or <prefix>_start_iterate_at */
175 typedef struct SH_ITERATOR
176 {
177  uint32 cur; /* current element */
178  uint32 end; /* iteration stops once cur reaches end again */
179  bool done; /* iterator exhausted? */
180 } SH_ITERATOR;
181 
182 /* externally visible function prototypes */
183 #ifdef SH_RAW_ALLOCATOR
184 /* <prefix>_hash <prefix>_create(uint32 nelements, void *private_data) */
185 SH_SCOPE SH_TYPE *SH_CREATE(uint32 nelements, void *private_data);
186 #else
187 /*
188  * <prefix>_hash <prefix>_create(MemoryContext ctx, uint32 nelements,
189  * void *private_data)
190  */
192  void *private_data);
193 #endif
194 
195 /* void <prefix>_destroy(<prefix>_hash *tb) */
196 SH_SCOPE void SH_DESTROY(SH_TYPE * tb);
197 
198 /* void <prefix>_reset(<prefix>_hash *tb) */
199 SH_SCOPE void SH_RESET(SH_TYPE * tb);
200 
201 /* void <prefix>_grow(<prefix>_hash *tb) */
202 SH_SCOPE void SH_GROW(SH_TYPE * tb, uint32 newsize);
203 
204 /* <element> *<prefix>_insert(<prefix>_hash *tb, <key> key, bool *found) */
206 
207 /*
208  * <element> *<prefix>_insert_hash(<prefix>_hash *tb, <key> key, uint32 hash,
209  * bool *found)
210  */
212  uint32 hash, bool *found);
213 
214 /* <element> *<prefix>_lookup(<prefix>_hash *tb, <key> key) */
216 
217 /* <element> *<prefix>_lookup_hash(<prefix>_hash *tb, <key> key, uint32 hash) */
219  uint32 hash);
220 
221 /* void <prefix>_delete_item(<prefix>_hash *tb, <element> *entry) */
222 SH_SCOPE void SH_DELETE_ITEM(SH_TYPE * tb, SH_ELEMENT_TYPE * entry);
223 
224 /* bool <prefix>_delete(<prefix>_hash *tb, <key> key) */
226 
227 /* void <prefix>_start_iterate(<prefix>_hash *tb, <prefix>_iterator *iter) */
228 SH_SCOPE void SH_START_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter);
229 
230 /*
231  * void <prefix>_start_iterate_at(<prefix>_hash *tb, <prefix>_iterator *iter,
232  * uint32 at)
233  */
234 SH_SCOPE void SH_START_ITERATE_AT(SH_TYPE * tb, SH_ITERATOR * iter, uint32 at);
235 
236 /* <element> *<prefix>_iterate(<prefix>_hash *tb, <prefix>_iterator *iter) */
238 
239 /* void <prefix>_stat(<prefix>_hash *tb */
240 SH_SCOPE void SH_STAT(SH_TYPE * tb);
241 
242 #endif /* SH_DECLARE */
243 
244 
245 /* generate implementation of the hash table */
246 #ifdef SH_DEFINE
247 
248 #ifndef SH_RAW_ALLOCATOR
249 #include "utils/memutils.h"
250 #endif
251 
252 /* max data array size; we allow up to PG_UINT32_MAX buckets, including 0 */
253 #define SH_MAX_SIZE (((uint64) PG_UINT32_MAX) + 1)
254 
255 /* normal fillfactor, unless already close to maximum */
256 #ifndef SH_FILLFACTOR
257 #define SH_FILLFACTOR (0.9)
258 #endif
259 /* increase fillfactor if we otherwise would error out */
260 #define SH_MAX_FILLFACTOR (0.98)
261 /* grow if the distance between actual and optimal position is bigger than this */
262 #ifndef SH_GROW_MAX_DIB
263 #define SH_GROW_MAX_DIB 25
264 #endif
265 /* grow if more than this many elements would need to be moved when inserting */
266 #ifndef SH_GROW_MAX_MOVE
267 #define SH_GROW_MAX_MOVE 150
268 #endif
269 #ifndef SH_GROW_MIN_FILLFACTOR
270 /* but do not grow due to SH_GROW_MAX_* if fillfactor is below this */
271 #define SH_GROW_MIN_FILLFACTOR 0.1
272 #endif
273 
/* compare stored entry b against (ahash, akey); with SH_STORE_HASH the hashes are compared first as a cheap pre-check */
274 #ifdef SH_STORE_HASH
275 #define SH_COMPARE_KEYS(tb, ahash, akey, b) (ahash == SH_GET_HASH(tb, b) && SH_EQUAL(tb, b->SH_KEY, akey))
276 #else
277 #define SH_COMPARE_KEYS(tb, ahash, akey, b) (SH_EQUAL(tb, b->SH_KEY, akey))
278 #endif
279 
280 /*
281  * Wrap the following definitions in include guards, to avoid multiple
282  * definition errors if this header is included more than once. The rest of
283  * the file deliberately has no include guards, because it can be included
284  * with different parameters to define functions and types with non-colliding
285  * names.
286  */
287 #ifndef SIMPLEHASH_H
288 #define SIMPLEHASH_H
289 
/* error/log wrappers, mapped to the appropriate API for frontend vs. backend code */
290 #ifdef FRONTEND
291 #define sh_error(...) pg_log_error(__VA_ARGS__)
292 #define sh_log(...) pg_log_info(__VA_ARGS__)
293 #else
294 #define sh_error(...) elog(ERROR, __VA_ARGS__)
295 #define sh_log(...) elog(LOG, __VA_ARGS__)
296 #endif
297 
298 #endif
299 
300 /*
301  * Compute sizing parameters for hashtable. Called when creating and growing
302  * the hashtable.
303  */
304 static inline void
305 SH_COMPUTE_PARAMETERS(SH_TYPE * tb, uint32 newsize)
306 {
307  uint64 size;
308 
309  /* supporting zero sized hashes would complicate matters */
310  size = Max(newsize, 2);
311 
312  /* round up size to the next power of 2, that's how bucketing works */
313  size = pg_nextpower2_64(size);
314  Assert(size <= SH_MAX_SIZE);
315 
316  /*
317  * Verify that allocation of ->data is possible on this platform, without
318  * overflowing Size.
319  */
320  if ((((uint64) sizeof(SH_ELEMENT_TYPE)) * size) >= SIZE_MAX / 2)
321  sh_error("hash table too large");
322 
323  /* now set size */
324  tb->size = size;
325 
326  if (tb->size == SH_MAX_SIZE)
327  tb->sizemask = 0;
328  else
329  tb->sizemask = tb->size - 1;
330 
331  /*
332  * Compute the next threshold at which we need to grow the hash table
333  * again.
334  */
335  if (tb->size == SH_MAX_SIZE)
336  tb->grow_threshold = ((double) tb->size) * SH_MAX_FILLFACTOR;
337  else
338  tb->grow_threshold = ((double) tb->size) * SH_FILLFACTOR;
339 }
340 
341 /* return the optimal bucket for the hash */
342 static inline uint32
344 {
345  return hash & tb->sizemask;
346 }
347 
348 /* return next bucket after the current, handling wraparound */
349 static inline uint32
350 SH_NEXT(SH_TYPE * tb, uint32 curelem, uint32 startelem)
351 {
352  curelem = (curelem + 1) & tb->sizemask;
353 
354  Assert(curelem != startelem);
355 
356  return curelem;
357 }
358 
359 /* return bucket before the current, handling wraparound */
360 static inline uint32
361 SH_PREV(SH_TYPE * tb, uint32 curelem, uint32 startelem)
362 {
363  curelem = (curelem - 1) & tb->sizemask;
364 
365  Assert(curelem != startelem);
366 
367  return curelem;
368 }
369 
370 /* return distance between bucket and its optimal position */
371 static inline uint32
372 SH_DISTANCE_FROM_OPTIMAL(SH_TYPE * tb, uint32 optimal, uint32 bucket)
373 {
374  if (optimal <= bucket)
375  return bucket - optimal;
376  else
377  return (tb->size + bucket) - optimal;
378 }
379 
380 static inline uint32
382 {
383 #ifdef SH_STORE_HASH
384  return SH_GET_HASH(tb, entry);
385 #else
386  return SH_HASH_KEY(tb, entry->SH_KEY);
387 #endif
388 }
389 
390 /* default memory allocator function */
391 static inline void *SH_ALLOCATE(SH_TYPE * type, Size size);
392 static inline void SH_FREE(SH_TYPE * type, void *pointer);
393 
394 #ifndef SH_USE_NONDEFAULT_ALLOCATOR
395 
396 /* default memory allocator function */
397 static inline void *
398 SH_ALLOCATE(SH_TYPE * type, Size size)
399 {
400 #ifdef SH_RAW_ALLOCATOR
401  return SH_RAW_ALLOCATOR(size);
402 #else
403  return MemoryContextAllocExtended(type->ctx, size,
405 #endif
406 }
407 
408 /* default memory free function */
409 static inline void
410 SH_FREE(SH_TYPE * type, void *pointer)
411 {
412  pfree(pointer);
413 }
414 
415 #endif
416 
417 /*
418  * Create a hash table with enough space for `nelements` distinct members.
419  * Memory for the hash table is allocated from the passed-in context. If
420  * desired, the array of elements can be allocated using a passed-in allocator;
421  * this could be useful in order to place the array of elements in a shared
422  * memory, or in a context that will outlive the rest of the hash table.
423  * Memory other than for the array of elements will still be allocated from
424  * the passed-in context.
425  */
426 #ifdef SH_RAW_ALLOCATOR
428 SH_CREATE(uint32 nelements, void *private_data)
429 #else
431 SH_CREATE(MemoryContext ctx, uint32 nelements, void *private_data)
432 #endif
433 {
434  SH_TYPE *tb;
435  uint64 size;
436 
437 #ifdef SH_RAW_ALLOCATOR
438  tb = SH_RAW_ALLOCATOR(sizeof(SH_TYPE));
439 #else
440  tb = MemoryContextAllocZero(ctx, sizeof(SH_TYPE));
441  tb->ctx = ctx;
442 #endif
443  tb->private_data = private_data;
444 
445  /* increase nelements by fillfactor, want to store nelements elements */
446  size = Min((double) SH_MAX_SIZE, ((double) nelements) / SH_FILLFACTOR);
447 
448  SH_COMPUTE_PARAMETERS(tb, size);
449 
450  tb->data = SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * tb->size);
451 
452  return tb;
453 }
454 
455 /* destroy a previously created hash table */
456 SH_SCOPE void
457 SH_DESTROY(SH_TYPE * tb)
458 {
/* free the bucket array first, then the table struct itself */
459  SH_FREE(tb, tb->data);
460  pfree(tb);
461 }
462 
463 /* reset the contents of a previously created hash table */
464 SH_SCOPE void
465 SH_RESET(SH_TYPE * tb)
466 {
/* zeroing the array marks every bucket SH_STATUS_EMPTY (which is 0) */
467  memset(tb->data, 0, sizeof(SH_ELEMENT_TYPE) * tb->size);
468  tb->members = 0;
469 }
470 
471 /*
472  * Grow a hash table to at least `newsize` buckets.
473  *
474  * Usually this will automatically be called by insertions/deletions, when
475  * necessary. But resizing to the exact input size can be advantageous
476  * performance-wise, when known at some point.
477  */
478 SH_SCOPE void
479 SH_GROW(SH_TYPE * tb, uint32 newsize)
480 {
481  uint64 oldsize = tb->size;
482  SH_ELEMENT_TYPE *olddata = tb->data;
483  SH_ELEMENT_TYPE *newdata;
484  uint32 i;
485  uint32 startelem = 0;
486  uint32 copyelem;
487 
/* old size must be a power of two, below the maximum, and strictly smaller than newsize */
488  Assert(oldsize == pg_nextpower2_64(oldsize));
489  Assert(oldsize != SH_MAX_SIZE);
490  Assert(oldsize < newsize);
491 
492  /* compute parameters for new table */
493  SH_COMPUTE_PARAMETERS(tb, newsize);
494 
495  tb->data = SH_ALLOCATE(tb, sizeof(SH_ELEMENT_TYPE) * tb->size);
496 
497  newdata = tb->data;
498 
499  /*
500  * Copy entries from the old data to newdata. We theoretically could use
501  * SH_INSERT here, to avoid code duplication, but that's more general than
502  * we need. We neither want tb->members increased, nor do we need to do
503  * deal with deleted elements, nor do we need to compare keys. So a
504  * special-cased implementation is lot faster. As resizing can be time
505  * consuming and frequent, that's worthwhile to optimize.
506  *
507  * To be able to simply move entries over, we have to start not at the
508  * first bucket (i.e olddata[0]), but find the first bucket that's either
509  * empty, or is occupied by an entry at its optimal position. Such a
510  * bucket has to exist in any table with a load factor under 1, as not all
511  * buckets are occupied, i.e. there always has to be an empty bucket. By
512  * starting at such a bucket we can move the entries to the larger table,
513  * without having to deal with conflicts.
514  */
515 
516  /* search for the first element in the hash that's not wrapped around */
517  for (i = 0; i < oldsize; i++)
518  {
519  SH_ELEMENT_TYPE *oldentry = &olddata[i];
520  uint32 hash;
521  uint32 optimal;
522 
523  if (oldentry->status != SH_STATUS_IN_USE)
524  {
525  startelem = i;
526  break;
527  }
528 
529  hash = SH_ENTRY_HASH(tb, oldentry);
530  optimal = SH_INITIAL_BUCKET(tb, hash);
531 
532  if (optimal == i)
533  {
534  startelem = i;
535  break;
536  }
537  }
538 
539  /* and copy all elements in the old table */
540  copyelem = startelem;
541  for (i = 0; i < oldsize; i++)
542  {
543  SH_ELEMENT_TYPE *oldentry = &olddata[copyelem];
544 
545  if (oldentry->status == SH_STATUS_IN_USE)
546  {
547  uint32 hash;
548  uint32 startelem; /* NB: shadows the outer 'startelem' */
549  uint32 curelem;
550  SH_ELEMENT_TYPE *newentry;
551 
552  hash = SH_ENTRY_HASH(tb, oldentry);
553  startelem = SH_INITIAL_BUCKET(tb, hash);
554  curelem = startelem;
555 
556  /* find empty element to put data into */
557  while (true)
558  {
559  newentry = &newdata[curelem];
560 
561  if (newentry->status == SH_STATUS_EMPTY)
562  {
563  break;
564  }
565 
566  curelem = SH_NEXT(tb, curelem, startelem);
567  }
568 
569  /* copy entry to new slot */
570  memcpy(newentry, oldentry, sizeof(SH_ELEMENT_TYPE));
571  }
572 
573  /* can't use SH_NEXT here, would use new size */
574  copyelem++;
575  if (copyelem >= oldsize)
576  {
577  copyelem = 0;
578  }
579  }
580 
581  SH_FREE(tb, olddata);
582 }
583 
584 /*
585  * This is a separate static inline function, so it can be reliably be inlined
586  * into its wrapper functions even if SH_SCOPE is extern.
587  */
588 static inline SH_ELEMENT_TYPE *
590 {
591  uint32 startelem;
592  uint32 curelem;
593  SH_ELEMENT_TYPE *data;
594  uint32 insertdist;
595 
596 restart:
597  insertdist = 0;
598 
599  /*
600  * We do the grow check even if the key is actually present, to avoid
601  * doing the check inside the loop. This also lets us avoid having to
602  * re-find our position in the hashtable after resizing.
603  *
604  * Note that this also reached when resizing the table due to
605  * SH_GROW_MAX_DIB / SH_GROW_MAX_MOVE.
606  */
607  if (unlikely(tb->members >= tb->grow_threshold))
608  {
609  if (tb->size == SH_MAX_SIZE)
610  {
611  sh_error("hash table size exceeded");
612  }
613 
614  /*
615  * When optimizing, it can be very useful to print these out.
616  */
617  /* SH_STAT(tb); */
618  SH_GROW(tb, tb->size * 2);
619  /* SH_STAT(tb); */
620  }
621 
622  /* perform insert, start bucket search at optimal location */
623  data = tb->data;
624  startelem = SH_INITIAL_BUCKET(tb, hash);
625  curelem = startelem;
626  while (true)
627  {
628  uint32 curdist;
629  uint32 curhash;
630  uint32 curoptimal;
631  SH_ELEMENT_TYPE *entry = &data[curelem];
632 
633  /* any empty bucket can directly be used */
634  if (entry->status == SH_STATUS_EMPTY)
635  {
636  tb->members++;
637  entry->SH_KEY = key;
638 #ifdef SH_STORE_HASH
639  SH_GET_HASH(tb, entry) = hash;
640 #endif
641  entry->status = SH_STATUS_IN_USE;
642  *found = false;
643  return entry;
644  }
645 
646  /*
647  * If the bucket is not empty, we either found a match (in which case
648  * we're done), or we have to decide whether to skip over or move the
649  * colliding entry. When the colliding element's distance to its
650  * optimal position is smaller than the to-be-inserted entry's, we
651  * shift the colliding entry (and its followers) forward by one.
652  */
653 
654  if (SH_COMPARE_KEYS(tb, hash, key, entry))
655  {
656  Assert(entry->status == SH_STATUS_IN_USE);
657  *found = true;
658  return entry;
659  }
660 
661  curhash = SH_ENTRY_HASH(tb, entry);
662  curoptimal = SH_INITIAL_BUCKET(tb, curhash);
663  curdist = SH_DISTANCE_FROM_OPTIMAL(tb, curoptimal, curelem);
664 
665  if (insertdist > curdist)
666  {
667  SH_ELEMENT_TYPE *lastentry = entry;
668  uint32 emptyelem = curelem;
669  uint32 moveelem;
670  int32 emptydist = 0;
671 
672  /* find next empty bucket */
673  while (true)
674  {
675  SH_ELEMENT_TYPE *emptyentry;
676 
677  emptyelem = SH_NEXT(tb, emptyelem, startelem);
678  emptyentry = &data[emptyelem];
679 
680  if (emptyentry->status == SH_STATUS_EMPTY)
681  {
682  lastentry = emptyentry;
683  break;
684  }
685 
686  /*
687  * To avoid negative consequences from overly imbalanced
688  * hashtables, grow the hashtable if collisions would require
689  * us to move a lot of entries. The most likely cause of such
690  * imbalance is filling a (currently) small table, from a
691  * currently big one, in hash-table order. Don't grow if the
692  * hashtable would be too empty, to prevent quick space
693  * explosion for some weird edge cases.
694  */
695  if (unlikely(++emptydist > SH_GROW_MAX_MOVE) &&
696  ((double) tb->members / tb->size) >= SH_GROW_MIN_FILLFACTOR)
697  {
698  tb->grow_threshold = 0;
699  goto restart;
700  }
701  }
702 
703  /* shift forward, starting at last occupied element */
704 
705  /*
706  * TODO: This could be optimized to be one memcpy in many cases,
707  * excepting wrapping around at the end of ->data. Hasn't shown up
708  * in profiles so far though.
709  */
710  moveelem = emptyelem;
711  while (moveelem != curelem)
712  {
713  SH_ELEMENT_TYPE *moveentry;
714 
715  moveelem = SH_PREV(tb, moveelem, startelem);
716  moveentry = &data[moveelem];
717 
718  memcpy(lastentry, moveentry, sizeof(SH_ELEMENT_TYPE));
719  lastentry = moveentry;
720  }
721 
722  /* and fill the now empty spot */
723  tb->members++;
724 
725  entry->SH_KEY = key;
726 #ifdef SH_STORE_HASH
727  SH_GET_HASH(tb, entry) = hash;
728 #endif
729  entry->status = SH_STATUS_IN_USE;
730  *found = false;
731  return entry;
732  }
733 
734  curelem = SH_NEXT(tb, curelem, startelem);
735  insertdist++;
736 
737  /*
738  * To avoid negative consequences from overly imbalanced hashtables,
739  * grow the hashtable if collisions lead to large runs. The most
740  * likely cause of such imbalance is filling a (currently) small
741  * table, from a currently big one, in hash-table order. Don't grow
742  * if the hashtable would be too empty, to prevent quick space
743  * explosion for some weird edge cases.
744  */
745  if (unlikely(insertdist > SH_GROW_MAX_DIB) &&
746  ((double) tb->members / tb->size) >= SH_GROW_MIN_FILLFACTOR)
747  {
748  tb->grow_threshold = 0;
749  goto restart;
750  }
751  }
752 }
753 
754 /*
755  * Insert the key key into the hash-table, set *found to true if the key
756  * already exists, false otherwise. Returns the hash-table entry in either
757  * case.
758  */
760 SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found)
761 {
762  uint32 hash = SH_HASH_KEY(tb, key);
763 
764  return SH_INSERT_HASH_INTERNAL(tb, key, hash, found);
765 }
766 
767 /*
768  * Insert the key key into the hash-table using an already-calculated
769  * hash. Set *found to true if the key already exists, false
770  * otherwise. Returns the hash-table entry in either case.
771  */
773 SH_INSERT_HASH(SH_TYPE * tb, SH_KEY_TYPE key, uint32 hash, bool *found)
774 {
775  return SH_INSERT_HASH_INTERNAL(tb, key, hash, found);
776 }
777 
778 /*
779  * This is a separate static inline function, so it can be reliably be inlined
780  * into its wrapper functions even if SH_SCOPE is extern.
781  */
782 static inline SH_ELEMENT_TYPE *
784 {
785  const uint32 startelem = SH_INITIAL_BUCKET(tb, hash);
786  uint32 curelem = startelem;
787 
788  while (true)
789  {
790  SH_ELEMENT_TYPE *entry = &tb->data[curelem];
791 
792  if (entry->status == SH_STATUS_EMPTY)
793  {
794  return NULL;
795  }
796 
797  Assert(entry->status == SH_STATUS_IN_USE);
798 
799  if (SH_COMPARE_KEYS(tb, hash, key, entry))
800  return entry;
801 
802  /*
803  * TODO: we could stop search based on distance. If the current
804  * buckets's distance-from-optimal is smaller than what we've skipped
805  * already, the entry doesn't exist. Probably only do so if
806  * SH_STORE_HASH is defined, to avoid re-computing hashes?
807  */
808 
809  curelem = SH_NEXT(tb, curelem, startelem);
810  }
811 }
812 
813 /*
814  * Lookup up entry in hash table. Returns NULL if key not present.
815  */
818 {
819  uint32 hash = SH_HASH_KEY(tb, key);
820 
821  return SH_LOOKUP_HASH_INTERNAL(tb, key, hash);
822 }
823 
824 /*
825  * Lookup up entry in hash table using an already-calculated hash.
826  *
827  * Returns NULL if key not present.
828  */
831 {
832  return SH_LOOKUP_HASH_INTERNAL(tb, key, hash);
833 }
834 
835 /*
836  * Delete entry from hash table by key. Returns whether to-be-deleted key was
837  * present.
838  */
839 SH_SCOPE bool
841 {
842  uint32 hash = SH_HASH_KEY(tb, key);
843  uint32 startelem = SH_INITIAL_BUCKET(tb, hash);
844  uint32 curelem = startelem;
845 
846  while (true)
847  {
848  SH_ELEMENT_TYPE *entry = &tb->data[curelem];
849 
850  if (entry->status == SH_STATUS_EMPTY)
851  return false;
852 
853  if (entry->status == SH_STATUS_IN_USE &&
854  SH_COMPARE_KEYS(tb, hash, key, entry))
855  {
856  SH_ELEMENT_TYPE *lastentry = entry;
857 
858  tb->members--;
859 
860  /*
861  * Backward shift following elements till either an empty element
862  * or an element at its optimal position is encountered.
863  *
864  * While that sounds expensive, the average chain length is short,
865  * and deletions would otherwise require tombstones.
866  */
867  while (true)
868  {
869  SH_ELEMENT_TYPE *curentry;
870  uint32 curhash;
871  uint32 curoptimal;
872 
873  curelem = SH_NEXT(tb, curelem, startelem);
874  curentry = &tb->data[curelem];
875 
876  if (curentry->status != SH_STATUS_IN_USE)
877  {
878  lastentry->status = SH_STATUS_EMPTY;
879  break;
880  }
881 
882  curhash = SH_ENTRY_HASH(tb, curentry);
883  curoptimal = SH_INITIAL_BUCKET(tb, curhash);
884 
885  /* current is at optimal position, done */
886  if (curoptimal == curelem)
887  {
888  lastentry->status = SH_STATUS_EMPTY;
889  break;
890  }
891 
892  /* shift */
893  memcpy(lastentry, curentry, sizeof(SH_ELEMENT_TYPE));
894 
895  lastentry = curentry;
896  }
897 
898  return true;
899  }
900 
901  /* TODO: return false; if distance too big */
902 
903  curelem = SH_NEXT(tb, curelem, startelem);
904  }
905 }
906 
907 /*
908  * Delete entry from hash table by entry pointer
909  */
910 SH_SCOPE void
912 {
913  SH_ELEMENT_TYPE *lastentry = entry;
914  uint32 hash = SH_ENTRY_HASH(tb, entry);
915  uint32 startelem = SH_INITIAL_BUCKET(tb, hash);
916  uint32 curelem;
917 
918  /* Calculate the index of 'entry' */
919  curelem = entry - &tb->data[0];
920 
921  tb->members--;
922 
923  /*
924  * Backward shift following elements till either an empty element or an
925  * element at its optimal position is encountered.
926  *
927  * While that sounds expensive, the average chain length is short, and
928  * deletions would otherwise require tombstones.
929  */
930  while (true)
931  {
932  SH_ELEMENT_TYPE *curentry;
933  uint32 curhash;
934  uint32 curoptimal;
935 
936  curelem = SH_NEXT(tb, curelem, startelem);
937  curentry = &tb->data[curelem];
938 
939  if (curentry->status != SH_STATUS_IN_USE)
940  {
941  lastentry->status = SH_STATUS_EMPTY;
942  break;
943  }
944 
945  curhash = SH_ENTRY_HASH(tb, curentry);
946  curoptimal = SH_INITIAL_BUCKET(tb, curhash);
947 
948  /* current is at optimal position, done */
949  if (curoptimal == curelem)
950  {
951  lastentry->status = SH_STATUS_EMPTY;
952  break;
953  }
954 
955  /* shift */
956  memcpy(lastentry, curentry, sizeof(SH_ELEMENT_TYPE));
957 
958  lastentry = curentry;
959  }
960 }
961 
962 /*
963  * Initialize iterator.
964  */
965 SH_SCOPE void
967 {
968  int i;
969  uint64 startelem = PG_UINT64_MAX;
970 
971  /*
972  * Search for the first empty element. As deletions during iterations are
973  * supported, we want to start/end at an element that cannot be affected
974  * by elements being shifted.
975  */
976  for (i = 0; i < tb->size; i++)
977  {
978  SH_ELEMENT_TYPE *entry = &tb->data[i];
979 
980  if (entry->status != SH_STATUS_IN_USE)
981  {
982  startelem = i;
983  break;
984  }
985  }
986 
987  Assert(startelem < SH_MAX_SIZE);
988 
989  /*
990  * Iterate backwards, that allows the current element to be deleted, even
991  * if there are backward shifts
992  */
993  iter->cur = startelem;
994  iter->end = iter->cur;
995  iter->done = false;
996 }
997 
998 /*
999  * Initialize iterator to a specific bucket. That's really only useful for
1000  * cases where callers are partially iterating over the hashspace, and that
1001  * iteration deletes and inserts elements based on visited entries. Doing that
1002  * repeatedly could lead to an unbalanced keyspace when always starting at the
1003  * same position.
1004  */
1005 SH_SCOPE void
1007 {
1008  /*
1009  * Iterate backwards, that allows the current element to be deleted, even
1010  * if there are backward shifts.
1011  */
1012  iter->cur = at & tb->sizemask; /* ensure at is within a valid range */
1013  iter->end = iter->cur;
1014  iter->done = false;
1015 }
1016 
1017 /*
1018  * Iterate over all entries in the hash-table. Return the next occupied entry,
1019  * or NULL if done.
1020  *
1021  * During iteration the current entry in the hash table may be deleted,
1022  * without leading to elements being skipped or returned twice. Additionally
1023  * the rest of the table may be modified (i.e. there can be insertions or
1024  * deletions), but if so, there's neither a guarantee that all nodes are
1025  * visited at least once, nor a guarantee that a node is visited at most once.
1026  */
1028 SH_ITERATE(SH_TYPE * tb, SH_ITERATOR * iter)
1029 {
1030  while (!iter->done)
1031  {
1032  SH_ELEMENT_TYPE *elem;
1033 
1034  elem = &tb->data[iter->cur];
1035 
1036  /* next element in backward direction */
1037  iter->cur = (iter->cur - 1) & tb->sizemask;
1038 
1039  if ((iter->cur & tb->sizemask) == (iter->end & tb->sizemask))
1040  iter->done = true;
1041  if (elem->status == SH_STATUS_IN_USE)
1042  {
1043  return elem;
1044  }
1045  }
1046 
1047  return NULL;
1048 }
1049 
1050 /*
1051  * Report some statistics about the state of the hashtable. For
1052  * debugging/profiling purposes only.
1053  */
1054 SH_SCOPE void
1055 SH_STAT(SH_TYPE * tb)
1056 {
1057  uint32 max_chain_length = 0;
1058  uint32 total_chain_length = 0;
1059  double avg_chain_length;
1060  double fillfactor;
1061  uint32 i;
1062 
1063  uint32 *collisions = palloc0(tb->size * sizeof(uint32));
1064  uint32 total_collisions = 0;
1065  uint32 max_collisions = 0;
1066  double avg_collisions;
1067 
1068  for (i = 0; i < tb->size; i++)
1069  {
1070  uint32 hash;
1071  uint32 optimal;
1072  uint32 dist;
1073  SH_ELEMENT_TYPE *elem;
1074 
1075  elem = &tb->data[i];
1076 
1077  if (elem->status != SH_STATUS_IN_USE)
1078  continue;
1079 
1080  hash = SH_ENTRY_HASH(tb, elem);
1081  optimal = SH_INITIAL_BUCKET(tb, hash);
1082  dist = SH_DISTANCE_FROM_OPTIMAL(tb, optimal, i);
1083 
1084  if (dist > max_chain_length)
1085  max_chain_length = dist;
1086  total_chain_length += dist;
1087 
1088  collisions[optimal]++;
1089  }
1090 
1091  for (i = 0; i < tb->size; i++)
1092  {
1093  uint32 curcoll = collisions[i];
1094 
1095  if (curcoll == 0)
1096  continue;
1097 
1098  /* single contained element is not a collision */
1099  curcoll--;
1100  total_collisions += curcoll;
1101  if (curcoll > max_collisions)
1102  max_collisions = curcoll;
1103  }
1104 
1105  if (tb->members > 0)
1106  {
1107  fillfactor = tb->members / ((double) tb->size);
1108  avg_chain_length = ((double) total_chain_length) / tb->members;
1109  avg_collisions = ((double) total_collisions) / tb->members;
1110  }
1111  else
1112  {
1113  fillfactor = 0;
1114  avg_chain_length = 0;
1115  avg_collisions = 0;
1116  }
1117 
1118  sh_log("size: " UINT64_FORMAT ", members: %u, filled: %f, total chain: %u, max chain: %u, avg chain: %f, total_collisions: %u, max_collisions: %i, avg_collisions: %f",
1119  tb->size, tb->members, fillfactor, total_chain_length, max_chain_length, avg_chain_length,
1120  total_collisions, max_collisions, avg_collisions);
1121 }
1122 
1123 #endif /* SH_DEFINE */
1124 
1125 
1126 /* undefine external parameters, so next hash table can be defined */
/* NOTE(review): SH_RAW_ALLOCATOR is not #undef'ed here — confirm that callers reset it before the next inclusion */
1127 #undef SH_PREFIX
1128 #undef SH_KEY_TYPE
1129 #undef SH_KEY
1130 #undef SH_ELEMENT_TYPE
1131 #undef SH_HASH_KEY
1132 #undef SH_SCOPE
1133 #undef SH_DECLARE
1134 #undef SH_DEFINE
1135 #undef SH_GET_HASH
1136 #undef SH_STORE_HASH
1137 #undef SH_USE_NONDEFAULT_ALLOCATOR
1138 #undef SH_EQUAL
1139 
1140 /* undefine locally declared macros */
1141 #undef SH_MAKE_PREFIX
1142 #undef SH_MAKE_NAME
1143 #undef SH_MAKE_NAME_
1144 #undef SH_FILLFACTOR
1145 #undef SH_MAX_FILLFACTOR
1146 #undef SH_GROW_MAX_DIB
1147 #undef SH_GROW_MAX_MOVE
1148 #undef SH_GROW_MIN_FILLFACTOR
1149 #undef SH_MAX_SIZE
1150 
1151 /* types */
1152 #undef SH_TYPE
1153 #undef SH_STATUS
1154 #undef SH_STATUS_EMPTY
1155 #undef SH_STATUS_IN_USE
1156 #undef SH_ITERATOR
1157 
1158 /* external function names */
1159 #undef SH_CREATE
1160 #undef SH_DESTROY
1161 #undef SH_RESET
1162 #undef SH_INSERT
1163 #undef SH_INSERT_HASH
1164 #undef SH_DELETE_ITEM
1165 #undef SH_DELETE
1166 #undef SH_LOOKUP
1167 #undef SH_LOOKUP_HASH
1168 #undef SH_GROW
1169 #undef SH_START_ITERATE
1170 #undef SH_START_ITERATE_AT
1171 #undef SH_ITERATE
1172 #undef SH_ALLOCATE
1173 #undef SH_FREE
1174 #undef SH_STAT
1175 
1176 /* internal function names */
1177 #undef SH_COMPUTE_PARAMETERS
1178 #undef SH_COMPARE_KEYS
1179 #undef SH_INITIAL_BUCKET
1180 #undef SH_NEXT
1181 #undef SH_PREV
1182 #undef SH_DISTANCE_FROM_OPTIMAL
1183 #undef SH_ENTRY_HASH
1184 #undef SH_INSERT_HASH_INTERNAL
1185 #undef SH_LOOKUP_HASH_INTERNAL
#define SH_ALLOCATE
Definition: simplehash.h:121
#define SH_COMPUTE_PARAMETERS
Definition: simplehash.h:126
#define SH_START_ITERATE
Definition: simplehash.h:118
#define PG_UINT64_MAX
Definition: c.h:528
#define SH_HASH_KEY(tb, key)
#define MCXT_ALLOC_HUGE
Definition: fe_memutils.h:16
#define SH_KEY_TYPE
#define SH_RAW_ALLOCATOR
Definition: filemap.c:49
void * MemoryContextAllocExtended(MemoryContext context, Size size, int flags)
Definition: mcxt.c:979
#define SH_DELETE
Definition: simplehash.h:114
#define SH_START_ITERATE_AT
Definition: simplehash.h:119
#define Min(x, y)
Definition: c.h:986
struct cursor * cur
Definition: ecpg.c:28
#define SH_ELEMENT_TYPE
#define SH_STATUS_EMPTY
Definition: simplehash.h:103
#define SH_GET_HASH(tb, a)
#define SH_LOOKUP
Definition: simplehash.h:115
#define SH_INSERT_HASH
Definition: simplehash.h:112
#define SH_RESET
Definition: simplehash.h:110
#define SH_GROW
Definition: simplehash.h:117
#define SH_NEXT
Definition: simplehash.h:127
#define SH_STAT
Definition: simplehash.h:123
#define SH_PREV
Definition: simplehash.h:128
signed int int32
Definition: c.h:429
#define SH_STATUS_IN_USE
Definition: simplehash.h:104
#define SH_ITERATOR
Definition: simplehash.h:105
void pfree(void *pointer)
Definition: mcxt.c:1169
#define SH_INSERT_HASH_INTERNAL
Definition: simplehash.h:132
int fillfactor
Definition: pgbench.c:195
#define SH_SCOPE
#define SH_DESTROY
Definition: simplehash.h:109
unsigned int uint32
Definition: c.h:441
#define SH_FREE
Definition: simplehash.h:122
static uint64 pg_nextpower2_64(uint64 num)
Definition: pg_bitutils.h:169
void * palloc0(Size size)
Definition: mcxt.c:1093
#define SH_STATUS
Definition: simplehash.h:102
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:906
#define SH_DISTANCE_FROM_OPTIMAL
Definition: simplehash.h:129
#define Max(x, y)
Definition: c.h:980
#define Assert(condition)
Definition: c.h:804
#define SH_INSERT
Definition: simplehash.h:111
#define MCXT_ALLOC_ZERO
Definition: fe_memutils.h:19
size_t Size
Definition: c.h:540
#define SH_ITERATE
Definition: simplehash.h:120
#define SH_TYPE
Definition: simplehash.h:101
#define SH_INITIAL_BUCKET
Definition: simplehash.h:130
int i
#define SH_LOOKUP_HASH
Definition: simplehash.h:116
#define unlikely(x)
Definition: c.h:273
#define SH_CREATE
Definition: simplehash.h:108
#define SH_ENTRY_HASH
Definition: simplehash.h:131
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:719
#define UINT64_FORMAT
Definition: c.h:484
#define SH_DELETE_ITEM
Definition: simplehash.h:113
#define SH_LOOKUP_HASH_INTERNAL
Definition: simplehash.h:133