/*-------------------------------------------------------------------------
 *
 * hashfn.c
 *      Generic hashing functions, and hash functions for use in dynahash.c
 *      hashtables
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *      src/common/hashfn.c
 *
 * NOTES
 *      It is expected that every bit of a hash function's 32-bit result is
 *      as random as every other; failure to ensure this is likely to lead
 *      to poor performance of hash tables.  In most cases a hash
 *      function should use hash_bytes() or its variant hash_bytes_uint32(),
 *      or the wrappers hash_any() and hash_uint32 defined in hashfn.h.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "common/hashfn.h"
#include "port/pg_bitutils.h"


/*
 * This hash function was written by Bob Jenkins
 * (bob_jenkins@burtleburtle.net), and superficially adapted
 * for PostgreSQL by Neil Conway.  For more information on this
 * hash function, see http://burtleburtle.net/bob/hash/doobs.html,
 * or Bob's article in Dr. Dobb's Journal, Sept. 1997.
 *
 * In the current code, we have adopted Bob's 2006 update of his hash
 * function to fetch the data a word at a time when it is suitably aligned.
 * This makes for a useful speedup, at the cost of having to maintain
 * four code paths (aligned vs unaligned, and little-endian vs big-endian).
 * It also uses two separate mixing functions mix() and final(), instead
 * of a slower multi-purpose function.
 */

/* Get a bit mask of the bits set in non-uint32 aligned addresses */
#define UINT32_ALIGN_MASK (sizeof(uint32) - 1)

#define rot(x,k) pg_rotate_left32(x, k)

/*----------
 * mix -- mix 3 32-bit values reversibly.
 *
 * This is reversible, so any information in (a,b,c) before mix() is
 * still in (a,b,c) after mix().
 *
 * If four pairs of (a,b,c) inputs are run through mix(), or through
 * mix() in reverse, there are at least 32 bits of the output that
 * are sometimes the same for one pair and different for another pair.
 * This was tested for:
 * * pairs that differed by one bit, by two bits, in any combination
 *   of top bits of (a,b,c), or in any combination of bottom bits of
 *   (a,b,c).
 * * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 *   the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 *   is commonly produced by subtraction) looks like a single 1-bit
 *   difference.
 * * the base values were pseudorandom, all zero but one bit set, or
 *   all zero plus a counter that starts at zero.
 *
 * This does not achieve avalanche.  There are input bits of (a,b,c)
 * that fail to affect some output bits of (a,b,c), especially of a.  The
 * most thoroughly mixed value is c, but it doesn't really even achieve
 * avalanche in c.
 *
 * This allows some parallelism.  Read-after-writes are good at doubling
 * the number of bits affected, so the goal of mixing pulls in the opposite
 * direction from the goal of parallelism.  I did what I could.  Rotates
 * seem to cost as much as shifts on every machine I could lay my hands on,
 * and rotates are much kinder to the top and bottom bits, so I used rotates.
 *----------
 */
#define mix(a,b,c) \
{ \
    a -= c;  a ^= rot(c, 4);  c += b; \
    b -= a;  b ^= rot(a, 6);  a += c; \
    c -= b;  c ^= rot(b, 8);  b += a; \
    a -= c;  a ^= rot(c,16);  c += b; \
    b -= a;  b ^= rot(a,19);  a += c; \
    c -= b;  c ^= rot(b, 4);  b += a; \
}

/*----------
 * final -- final mixing of 3 32-bit values (a,b,c) into c
 *
 * Pairs of (a,b,c) values differing in only a few bits will usually
 * produce values of c that look totally different.  This was tested for
 * * pairs that differed by one bit, by two bits, in any combination
 *   of top bits of (a,b,c), or in any combination of bottom bits of
 *   (a,b,c).
 * * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 *   the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 *   is commonly produced by subtraction) looks like a single 1-bit
 *   difference.
 * * the base values were pseudorandom, all zero but one bit set, or
 *   all zero plus a counter that starts at zero.
 *
 * The use of separate functions for mix() and final() allows for a
 * substantial performance increase since final() does not need to
 * do well in reverse, but it does need to affect all output bits.
 * mix(), on the other hand, does not need to affect all output
 * bits (affecting 32 bits is enough).  The original hash function had
 * a single mixing operation that had to satisfy both sets of requirements
 * and was slower as a result.
 *----------
 */
#define final(a,b,c) \
{ \
    c ^= b; c -= rot(b,14); \
    a ^= c; a -= rot(c,11); \
    b ^= a; b -= rot(a,25); \
    c ^= b; c -= rot(b,16); \
    a ^= c; a -= rot(c, 4); \
    b ^= a; b -= rot(a,14); \
    c ^= b; c -= rot(b,24); \
}

/*
 * hash_bytes() -- hash a variable-length key into a 32-bit value
 *      k       : the key (the unaligned variable-length array of bytes)
 *      len     : the length of the key, counting by bytes
 *
 * Returns a uint32 value.  Every bit of the key affects every bit of
 * the return value.  Every 1-bit and 2-bit delta achieves avalanche.
 * About 6*len+35 instructions.  The best hash table sizes are powers
 * of 2.  There is no need to do mod a prime (mod is sooo slow!).
 * If you need less than 32 bits, use a bitmask.
 *
 * This procedure must never throw elog(ERROR); the ResourceOwner code
 * relies on this not to fail.
 *
 * Note: we could easily change this function to return a 64-bit hash value
 * by using the final values of both b and c.  b is perhaps a little less
 * well mixed than c, however.
 */
uint32
hash_bytes(const unsigned char *k, int keylen)
{
    uint32      a,
                b,
                c,
                len;

    /* Set up the internal state */
    len = keylen;
    a = b = c = 0x9e3779b9 + len + 3923095;

    /* If the source pointer is word-aligned, we use word-wide fetches */
    if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0)
    {
        /* Code path for aligned source data */
        const uint32 *ka = (const uint32 *) k;

        /* handle most of the key */
        while (len >= 12)
        {
            a += ka[0];
            b += ka[1];
            c += ka[2];
            mix(a, b, c);
            ka += 3;
            len -= 12;
        }

        /* handle the last 11 bytes */
        k = (const unsigned char *) ka;
#ifdef WORDS_BIGENDIAN
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 8);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 24);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += ka[1];
                a += ka[0];
                break;
            case 7:
                b += ((uint32) k[6] << 8);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 16);
                /* fall through */
            case 5:
                b += ((uint32) k[4] << 24);
                /* fall through */
            case 4:
                a += ka[0];
                break;
            case 3:
                a += ((uint32) k[2] << 8);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 16);
                /* fall through */
            case 1:
                a += ((uint32) k[0] << 24);
                /* case 0: nothing left to add */
        }
#else                           /* !WORDS_BIGENDIAN */
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 24);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 8);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += ka[1];
                a += ka[0];
                break;
            case 7:
                b += ((uint32) k[6] << 16);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 8);
                /* fall through */
            case 5:
                b += k[4];
                /* fall through */
            case 4:
                a += ka[0];
                break;
            case 3:
                a += ((uint32) k[2] << 16);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 8);
                /* fall through */
            case 1:
                a += k[0];
                /* case 0: nothing left to add */
        }
#endif                          /* WORDS_BIGENDIAN */
    }
    else
    {
        /* Code path for non-aligned source data */

        /* handle most of the key */
        while (len >= 12)
        {
#ifdef WORDS_BIGENDIAN
            a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
            b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
            c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
#else                           /* !WORDS_BIGENDIAN */
            a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
            b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
            c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
#endif                          /* WORDS_BIGENDIAN */
            mix(a, b, c);
            k += 12;
            len -= 12;
        }

        /* handle the last 11 bytes */
#ifdef WORDS_BIGENDIAN
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 8);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 24);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += k[7];
                /* fall through */
            case 7:
                b += ((uint32) k[6] << 8);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 16);
                /* fall through */
            case 5:
                b += ((uint32) k[4] << 24);
                /* fall through */
            case 4:
                a += k[3];
                /* fall through */
            case 3:
                a += ((uint32) k[2] << 8);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 16);
                /* fall through */
            case 1:
                a += ((uint32) k[0] << 24);
                /* case 0: nothing left to add */
        }
#else                           /* !WORDS_BIGENDIAN */
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 24);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 8);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += ((uint32) k[7] << 24);
                /* fall through */
            case 7:
                b += ((uint32) k[6] << 16);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 8);
                /* fall through */
            case 5:
                b += k[4];
                /* fall through */
            case 4:
                a += ((uint32) k[3] << 24);
                /* fall through */
            case 3:
                a += ((uint32) k[2] << 16);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 8);
                /* fall through */
            case 1:
                a += k[0];
                /* case 0: nothing left to add */
        }
#endif                          /* WORDS_BIGENDIAN */
    }

    final(a, b, c);

    /* report the result */
    return c;
}
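
/*
 * Usage sketch (illustrative, not part of the upstream file): picking a
 * bucket in a power-of-two sized table with hash_bytes(), masking the hash
 * rather than taking a modulus, as the comment above recommends.  The
 * function name and table size below are made-up values.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static uint32
example_bucket_for_key(const void *key, int keylen)
{
    uint32      nbuckets = 1024;    /* assumed power-of-2 table size */
    uint32      h = hash_bytes((const unsigned char *) key, keylen);

    return h & (nbuckets - 1);  /* mask instead of "h % nbuckets" */
}
#endif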

/*
 * hash_bytes_extended() -- hash into a 64-bit value, using an optional seed
 *      k       : the key (the unaligned variable-length array of bytes)
 *      len     : the length of the key, counting by bytes
 *      seed    : a 64-bit seed (0 means no seed)
 *
 * Returns a uint64 value.  Otherwise similar to hash_bytes.
 */
uint64
hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed)
{
    uint32      a,
                b,
                c,
                len;

    /* Set up the internal state */
    len = keylen;
    a = b = c = 0x9e3779b9 + len + 3923095;

    /* If the seed is non-zero, use it to perturb the internal state. */
    if (seed != 0)
    {
        /*
         * In essence, the seed is treated as part of the data being hashed,
         * but for simplicity, we pretend that it's padded with four bytes of
         * zeroes so that the seed constitutes a 12-byte chunk.
         */
        a += (uint32) (seed >> 32);
        b += (uint32) seed;
        mix(a, b, c);
    }

    /* If the source pointer is word-aligned, we use word-wide fetches */
    if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0)
    {
        /* Code path for aligned source data */
        const uint32 *ka = (const uint32 *) k;

        /* handle most of the key */
        while (len >= 12)
        {
            a += ka[0];
            b += ka[1];
            c += ka[2];
            mix(a, b, c);
            ka += 3;
            len -= 12;
        }

        /* handle the last 11 bytes */
        k = (const unsigned char *) ka;
#ifdef WORDS_BIGENDIAN
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 8);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 24);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += ka[1];
                a += ka[0];
                break;
            case 7:
                b += ((uint32) k[6] << 8);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 16);
                /* fall through */
            case 5:
                b += ((uint32) k[4] << 24);
                /* fall through */
            case 4:
                a += ka[0];
                break;
            case 3:
                a += ((uint32) k[2] << 8);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 16);
                /* fall through */
            case 1:
                a += ((uint32) k[0] << 24);
                /* case 0: nothing left to add */
        }
#else                           /* !WORDS_BIGENDIAN */
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 24);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 8);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += ka[1];
                a += ka[0];
                break;
            case 7:
                b += ((uint32) k[6] << 16);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 8);
                /* fall through */
            case 5:
                b += k[4];
                /* fall through */
            case 4:
                a += ka[0];
                break;
            case 3:
                a += ((uint32) k[2] << 16);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 8);
                /* fall through */
            case 1:
                a += k[0];
                /* case 0: nothing left to add */
        }
#endif                          /* WORDS_BIGENDIAN */
    }
    else
    {
        /* Code path for non-aligned source data */

        /* handle most of the key */
        while (len >= 12)
        {
#ifdef WORDS_BIGENDIAN
            a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
            b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
            c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
#else                           /* !WORDS_BIGENDIAN */
            a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
            b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
            c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
#endif                          /* WORDS_BIGENDIAN */
            mix(a, b, c);
            k += 12;
            len -= 12;
        }

        /* handle the last 11 bytes */
#ifdef WORDS_BIGENDIAN
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 8);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 24);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += k[7];
                /* fall through */
            case 7:
                b += ((uint32) k[6] << 8);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 16);
                /* fall through */
            case 5:
                b += ((uint32) k[4] << 24);
                /* fall through */
            case 4:
                a += k[3];
                /* fall through */
            case 3:
                a += ((uint32) k[2] << 8);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 16);
                /* fall through */
            case 1:
                a += ((uint32) k[0] << 24);
                /* case 0: nothing left to add */
        }
#else                           /* !WORDS_BIGENDIAN */
        switch (len)
        {
            case 11:
                c += ((uint32) k[10] << 24);
                /* fall through */
            case 10:
                c += ((uint32) k[9] << 16);
                /* fall through */
            case 9:
                c += ((uint32) k[8] << 8);
                /* fall through */
            case 8:
                /* the lowest byte of c is reserved for the length */
                b += ((uint32) k[7] << 24);
                /* fall through */
            case 7:
                b += ((uint32) k[6] << 16);
                /* fall through */
            case 6:
                b += ((uint32) k[5] << 8);
                /* fall through */
            case 5:
                b += k[4];
                /* fall through */
            case 4:
                a += ((uint32) k[3] << 24);
                /* fall through */
            case 3:
                a += ((uint32) k[2] << 16);
                /* fall through */
            case 2:
                a += ((uint32) k[1] << 8);
                /* fall through */
            case 1:
                a += k[0];
                /* case 0: nothing left to add */
        }
#endif                          /* WORDS_BIGENDIAN */
    }

    final(a, b, c);

    /* report the result */
    return ((uint64) b << 32) | c;
}
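
/*
 * Usage sketch (illustrative, not part of the upstream file): deriving two
 * independent 64-bit hashes of the same key by varying the seed.  A zero
 * seed gives the unseeded behavior; the nonzero seed below is an arbitrary
 * made-up constant.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static void
example_seeded_hashes(const unsigned char *key, int keylen)
{
    uint64      h0 = hash_bytes_extended(key, keylen, 0);
    uint64      h1 = hash_bytes_extended(key, keylen,
                                         UINT64CONST(0x9e3779b97f4a7c15));

    /* h0 and h1 are effectively unrelated bit patterns for the same key */
    (void) h0;
    (void) h1;
}
#endif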

/*
 * hash_bytes_uint32() -- hash a 32-bit value to a 32-bit value
 *
 * This has the same result as
 *      hash_bytes(&k, sizeof(uint32))
 * but is faster and doesn't force the caller to store k into memory.
 */
uint32
hash_bytes_uint32(uint32 k)
{
    uint32      a,
                b,
                c;

    a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;
    a += k;

    final(a, b, c);

    /* report the result */
    return c;
}
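
/*
 * Usage sketch (illustrative, not part of the upstream file): the
 * equivalence stated in the comment above, checked with an assertion.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static void
example_uint32_equivalence(uint32 k)
{
    Assert(hash_bytes_uint32(k) ==
           hash_bytes((const unsigned char *) &k, (int) sizeof(uint32)));
}
#endif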

/*
 * hash_bytes_uint32_extended() -- hash 32-bit value to 64-bit value, with seed
 *
 * Like hash_bytes_uint32, this is a convenience function.
 */
uint64
hash_bytes_uint32_extended(uint32 k, uint64 seed)
{
    uint32      a,
                b,
                c;

    a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;

    if (seed != 0)
    {
        a += (uint32) (seed >> 32);
        b += (uint32) seed;
        mix(a, b, c);
    }

    a += k;

    final(a, b, c);

    /* report the result */
    return ((uint64) b << 32) | c;
}
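
/*
 * Usage sketch (illustrative, not part of the upstream file): as the code
 * above shows, with a zero seed the low 32 bits of the extended variant
 * coincide with hash_bytes_uint32(); a nonzero seed perturbs the state and
 * breaks that correspondence.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static void
example_uint32_extended(uint32 k)
{
    uint64      h = hash_bytes_uint32_extended(k, 0);

    Assert((uint32) h == hash_bytes_uint32(k));
}
#endif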

/*
 * string_hash: hash function for keys that are NUL-terminated strings.
 *
 * NOTE: this is the default hash function if none is specified.
 */
uint32
string_hash(const void *key, Size keysize)
{
    /*
     * If the string exceeds keysize-1 bytes, we want to hash only that many,
     * because when it is copied into the hash table it will be truncated at
     * that length.
     */
    Size        s_len = strlen((const char *) key);

    s_len = Min(s_len, keysize - 1);
    return hash_bytes((const unsigned char *) key, (int) s_len);
}
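
/*
 * Usage sketch (illustrative, not part of the upstream file): because only
 * the first keysize - 1 bytes are hashed, keys that differ only beyond that
 * point hash identically, matching the truncation applied when the key is
 * stored.  The key size and strings below are made-up values.
 */
#ifdef HASHFN_USAGE_EXAMPLES
static void
example_string_truncation(void)
{
    Size        keysize = 8;    /* hashes at most the first 7 bytes */

    Assert(string_hash("abcdefg-one", keysize) ==
           string_hash("abcdefg-two", keysize));
}
#endif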

/*
 * tag_hash: hash function for fixed-size tag values
 */
uint32
tag_hash(const void *key, Size keysize)
{
    return hash_bytes((const unsigned char *) key, (int) keysize);
}

/*
 * uint32_hash: hash function for keys that are uint32 or int32
 *
 * (tag_hash works for this case too, but is slower)
 */
uint32
uint32_hash(const void *key, Size keysize)
{
    Assert(keysize == sizeof(uint32));
    return hash_bytes_uint32(*((const uint32 *) key));
}
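
/*
 * Usage sketch (illustrative, not part of the upstream file): wiring one of
 * the functions above into a backend dynahash table.  This assumes the
 * backend-only utils/hsearch.h API; the entry struct, table name, and sizes
 * are made-up values.  string_hash is the default dynahash hash function,
 * so passing it explicitly is shown only for illustration.
 */
#ifdef HASHFN_USAGE_EXAMPLES
#include "utils/hsearch.h"

typedef struct ExampleEntry
{
    char        name[64];       /* hash key; must be first */
    int         value;
} ExampleEntry;

static HTAB *
example_create_table(void)
{
    HASHCTL     ctl;

    ctl.keysize = sizeof(((ExampleEntry *) 0)->name);
    ctl.entrysize = sizeof(ExampleEntry);
    ctl.hash = string_hash;

    return hash_create("example table", 128, &ctl,
                       HASH_ELEM | HASH_FUNCTION);
}
#endif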