PostgreSQL Source Code git master
hashfn.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * hashfn.c
4 * Generic hashing functions, and hash functions for use in dynahash.c
5 * hashtables
6 *
7 *
8 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
9 * Portions Copyright (c) 1994, Regents of the University of California
10 *
11 *
12 * IDENTIFICATION
13 * src/common/hashfn.c
14 *
15 * NOTES
16 * It is expected that every bit of a hash function's 32-bit result is
17 * as random as every other; failure to ensure this is likely to lead
18 * to poor performance of hash tables. In most cases a hash
19 * function should use hash_bytes() or its variant hash_bytes_uint32(),
20 * or the wrappers hash_any() and hash_uint32() defined in hashfn.h.
21 *
22 *-------------------------------------------------------------------------
23 */
24#include "postgres.h"
25
26#include "common/hashfn.h"
27#include "port/pg_bitutils.h"
28
29
30/*
31 * This hash function was written by Bob Jenkins
32 * (bob_jenkins@burtleburtle.net), and superficially adapted
33 * for PostgreSQL by Neil Conway. For more information on this
34 * hash function, see http://burtleburtle.net/bob/hash/doobs.html,
35 * or Bob's article in Dr. Dobb's Journal, Sept. 1997.
36 *
37 * In the current code, we have adopted Bob's 2006 update of his hash
38 * function to fetch the data a word at a time when it is suitably aligned.
39 * This makes for a useful speedup, at the cost of having to maintain
40 * four code paths (aligned vs unaligned, and little-endian vs big-endian).
41 * It also uses two separate mixing functions mix() and final(), instead
42 * of a slower multi-purpose function.
43 */
44
/* Get a bit mask of the bits set in non-uint32 aligned addresses */
#define UINT32_ALIGN_MASK (sizeof(uint32) - 1)

/* Rotate the 32-bit value x left by k bits */
#define rot(x,k) pg_rotate_left32(x, k)
49
50/*----------
51 * mix -- mix 3 32-bit values reversibly.
52 *
53 * This is reversible, so any information in (a,b,c) before mix() is
54 * still in (a,b,c) after mix().
55 *
56 * If four pairs of (a,b,c) inputs are run through mix(), or through
57 * mix() in reverse, there are at least 32 bits of the output that
58 * are sometimes the same for one pair and different for another pair.
59 * This was tested for:
60 * * pairs that differed by one bit, by two bits, in any combination
61 * of top bits of (a,b,c), or in any combination of bottom bits of
62 * (a,b,c).
63 * * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
64 * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
65 * is commonly produced by subtraction) look like a single 1-bit
66 * difference.
67 * * the base values were pseudorandom, all zero but one bit set, or
68 * all zero plus a counter that starts at zero.
69 *
70 * This does not achieve avalanche. There are input bits of (a,b,c)
71 * that fail to affect some output bits of (a,b,c), especially of a. The
72 * most thoroughly mixed value is c, but it doesn't really even achieve
73 * avalanche in c.
74 *
75 * This allows some parallelism. Read-after-writes are good at doubling
76 * the number of bits affected, so the goal of mixing pulls in the opposite
77 * direction from the goal of parallelism. I did what I could. Rotates
78 * seem to cost as much as shifts on every machine I could lay my hands on,
79 * and rotates are much kinder to the top and bottom bits, so I used rotates.
80 *----------
81 */
/*
 * Mix the three 32-bit state words reversibly (design analysis is in the
 * comment block above).  Wrapped in do { } while (0) so that invoking
 * "mix(a, b, c);" forms exactly one statement and stays safe inside an
 * unbraced if/else.
 */
#define mix(a,b,c) \
do { \
	a -= c; a ^= rot(c, 4); c += b; \
	b -= a; b ^= rot(a, 6); a += c; \
	c -= b; c ^= rot(b, 8); b += a; \
	a -= c; a ^= rot(c,16); c += b; \
	b -= a; b ^= rot(a,19); a += c; \
	c -= b; c ^= rot(b, 4); b += a; \
} while (0)
91
92/*----------
93 * final -- final mixing of 3 32-bit values (a,b,c) into c
94 *
95 * Pairs of (a,b,c) values differing in only a few bits will usually
96 * produce values of c that look totally different. This was tested for
97 * * pairs that differed by one bit, by two bits, in any combination
98 * of top bits of (a,b,c), or in any combination of bottom bits of
99 * (a,b,c).
100 * * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
101 * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
102 * is commonly produced by subtraction) look like a single 1-bit
103 * difference.
104 * * the base values were pseudorandom, all zero but one bit set, or
105 * all zero plus a counter that starts at zero.
106 *
107 * The use of separate functions for mix() and final() allow for a
108 * substantial performance increase since final() does not need to
109 * do well in reverse, but it does need to affect all output bits.
110 * mix(), on the other hand, does not need to affect all output
111 * bits (affecting 32 bits is enough). The original hash function had
112 * a single mixing operation that had to satisfy both sets of requirements
113 * and was slower as a result.
114 *----------
115 */
/*
 * Final avalanche pass mixing (a,b,c) into c (see the comment block above).
 * Wrapped in do { } while (0) so that invoking "final(a, b, c);" forms
 * exactly one statement and stays safe inside an unbraced if/else.
 */
#define final(a,b,c) \
do { \
	c ^= b; c -= rot(b,14); \
	a ^= c; a -= rot(c,11); \
	b ^= a; b -= rot(a,25); \
	c ^= b; c -= rot(b,16); \
	a ^= c; a -= rot(c, 4); \
	b ^= a; b -= rot(a,14); \
	c ^= b; c -= rot(b,24); \
} while (0)
126
127/*
128 * hash_bytes() -- hash a variable-length key into a 32-bit value
129 * k : the key (the unaligned variable-length array of bytes)
130 * len : the length of the key, counting by bytes
131 *
132 * Returns a uint32 value. Every bit of the key affects every bit of
133 * the return value. Every 1-bit and 2-bit delta achieves avalanche.
134 * About 6*len+35 instructions. The best hash table sizes are powers
135 * of 2. There is no need to do mod a prime (mod is sooo slow!).
136 * If you need less than 32 bits, use a bitmask.
137 *
138 * This procedure must never throw elog(ERROR); the ResourceOwner code
139 * relies on this not to fail.
140 *
141 * Note: we could easily change this function to return a 64-bit hash value
142 * by using the final values of both b and c. b is perhaps a little less
143 * well mixed than c, however.
144 */
uint32
hash_bytes(const unsigned char *k, int keylen)
{
	uint32		a,
				b,
				c,
				len;

	/* Set up the internal state */
	len = keylen;
	/* 0x9e3779b9 is the golden ratio as a fixed-point fraction of 2^32 */
	a = b = c = 0x9e3779b9 + len + 3923095;

	/* If the source pointer is word-aligned, we use word-wide fetches */
	if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0)
	{
		/* Code path for aligned source data */
		const uint32 *ka = (const uint32 *) k;

		/* handle most of the key: 12 bytes (three words) per round */
		while (len >= 12)
		{
			a += ka[0];
			b += ka[1];
			c += ka[2];
			mix(a, b, c);
			ka += 3;
			len -= 12;
		}

		/*
		 * handle the last 11 bytes; each case falls through so the whole
		 * remaining tail is accumulated into (a,b,c)
		 */
		k = (const unsigned char *) ka;
#ifdef WORDS_BIGENDIAN
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 8);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 24);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ka[1];
				a += ka[0];
				break;
			case 7:
				b += ((uint32) k[6] << 8);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 16);
				/* fall through */
			case 5:
				b += ((uint32) k[4] << 24);
				/* fall through */
			case 4:
				a += ka[0];
				break;
			case 3:
				a += ((uint32) k[2] << 8);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 16);
				/* fall through */
			case 1:
				a += ((uint32) k[0] << 24);
				/* case 0: nothing left to add */
		}
#else							/* !WORDS_BIGENDIAN */
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 24);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 8);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ka[1];
				a += ka[0];
				break;
			case 7:
				b += ((uint32) k[6] << 16);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 8);
				/* fall through */
			case 5:
				b += k[4];
				/* fall through */
			case 4:
				a += ka[0];
				break;
			case 3:
				a += ((uint32) k[2] << 16);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 8);
				/* fall through */
			case 1:
				a += k[0];
				/* case 0: nothing left to add */
		}
#endif							/* WORDS_BIGENDIAN */
	}
	else
	{
		/* Code path for non-aligned source data */

		/* handle most of the key, assembling each word a byte at a time */
		while (len >= 12)
		{
#ifdef WORDS_BIGENDIAN
			a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
			b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
			c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
#else							/* !WORDS_BIGENDIAN */
			a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
			b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
			c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
#endif							/* WORDS_BIGENDIAN */
			mix(a, b, c);
			k += 12;
			len -= 12;
		}

		/*
		 * handle the last 11 bytes; each case falls through so the whole
		 * remaining tail is accumulated into (a,b,c)
		 */
#ifdef WORDS_BIGENDIAN
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 8);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 24);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += k[7];
				/* fall through */
			case 7:
				b += ((uint32) k[6] << 8);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 16);
				/* fall through */
			case 5:
				b += ((uint32) k[4] << 24);
				/* fall through */
			case 4:
				a += k[3];
				/* fall through */
			case 3:
				a += ((uint32) k[2] << 8);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 16);
				/* fall through */
			case 1:
				a += ((uint32) k[0] << 24);
				/* case 0: nothing left to add */
		}
#else							/* !WORDS_BIGENDIAN */
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 24);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 8);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ((uint32) k[7] << 24);
				/* fall through */
			case 7:
				b += ((uint32) k[6] << 16);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 8);
				/* fall through */
			case 5:
				b += k[4];
				/* fall through */
			case 4:
				a += ((uint32) k[3] << 24);
				/* fall through */
			case 3:
				a += ((uint32) k[2] << 16);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 8);
				/* fall through */
			case 1:
				a += k[0];
				/* case 0: nothing left to add */
		}
#endif							/* WORDS_BIGENDIAN */
	}

	/* final avalanche pass drives every input bit into every bit of c */
	final(a, b, c);

	/* report the result */
	return c;
}
362
363/*
364 * hash_bytes_extended() -- hash into a 64-bit value, using an optional seed
365 * k : the key (the unaligned variable-length array of bytes)
366 * len : the length of the key, counting by bytes
367 * seed : a 64-bit seed (0 means no seed)
368 *
369 * Returns a uint64 value. Otherwise similar to hash_bytes.
370 */
uint64
hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed)
{
	uint32		a,
				b,
				c,
				len;

	/* Set up the internal state */
	len = keylen;
	/* 0x9e3779b9 is the golden ratio as a fixed-point fraction of 2^32 */
	a = b = c = 0x9e3779b9 + len + 3923095;

	/* If the seed is non-zero, use it to perturb the internal state. */
	if (seed != 0)
	{
		/*
		 * In essence, the seed is treated as part of the data being hashed,
		 * but for simplicity, we pretend that it's padded with four bytes of
		 * zeroes so that the seed constitutes a 12-byte chunk.
		 */
		a += (uint32) (seed >> 32);
		b += (uint32) seed;
		mix(a, b, c);
	}

	/* If the source pointer is word-aligned, we use word-wide fetches */
	if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0)
	{
		/* Code path for aligned source data */
		const uint32 *ka = (const uint32 *) k;

		/* handle most of the key: 12 bytes (three words) per round */
		while (len >= 12)
		{
			a += ka[0];
			b += ka[1];
			c += ka[2];
			mix(a, b, c);
			ka += 3;
			len -= 12;
		}

		/*
		 * handle the last 11 bytes; each case falls through so the whole
		 * remaining tail is accumulated into (a,b,c)
		 */
		k = (const unsigned char *) ka;
#ifdef WORDS_BIGENDIAN
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 8);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 24);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ka[1];
				a += ka[0];
				break;
			case 7:
				b += ((uint32) k[6] << 8);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 16);
				/* fall through */
			case 5:
				b += ((uint32) k[4] << 24);
				/* fall through */
			case 4:
				a += ka[0];
				break;
			case 3:
				a += ((uint32) k[2] << 8);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 16);
				/* fall through */
			case 1:
				a += ((uint32) k[0] << 24);
				/* case 0: nothing left to add */
		}
#else							/* !WORDS_BIGENDIAN */
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 24);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 8);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ka[1];
				a += ka[0];
				break;
			case 7:
				b += ((uint32) k[6] << 16);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 8);
				/* fall through */
			case 5:
				b += k[4];
				/* fall through */
			case 4:
				a += ka[0];
				break;
			case 3:
				a += ((uint32) k[2] << 16);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 8);
				/* fall through */
			case 1:
				a += k[0];
				/* case 0: nothing left to add */
		}
#endif							/* WORDS_BIGENDIAN */
	}
	else
	{
		/* Code path for non-aligned source data */

		/* handle most of the key, assembling each word a byte at a time */
		while (len >= 12)
		{
#ifdef WORDS_BIGENDIAN
			a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
			b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
			c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
#else							/* !WORDS_BIGENDIAN */
			a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
			b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
			c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
#endif							/* WORDS_BIGENDIAN */
			mix(a, b, c);
			k += 12;
			len -= 12;
		}

		/*
		 * handle the last 11 bytes; each case falls through so the whole
		 * remaining tail is accumulated into (a,b,c)
		 */
#ifdef WORDS_BIGENDIAN
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 8);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 24);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += k[7];
				/* fall through */
			case 7:
				b += ((uint32) k[6] << 8);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 16);
				/* fall through */
			case 5:
				b += ((uint32) k[4] << 24);
				/* fall through */
			case 4:
				a += k[3];
				/* fall through */
			case 3:
				a += ((uint32) k[2] << 8);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 16);
				/* fall through */
			case 1:
				a += ((uint32) k[0] << 24);
				/* case 0: nothing left to add */
		}
#else							/* !WORDS_BIGENDIAN */
		switch (len)
		{
			case 11:
				c += ((uint32) k[10] << 24);
				/* fall through */
			case 10:
				c += ((uint32) k[9] << 16);
				/* fall through */
			case 9:
				c += ((uint32) k[8] << 8);
				/* fall through */
			case 8:
				/* the lowest byte of c is reserved for the length */
				b += ((uint32) k[7] << 24);
				/* fall through */
			case 7:
				b += ((uint32) k[6] << 16);
				/* fall through */
			case 6:
				b += ((uint32) k[5] << 8);
				/* fall through */
			case 5:
				b += k[4];
				/* fall through */
			case 4:
				a += ((uint32) k[3] << 24);
				/* fall through */
			case 3:
				a += ((uint32) k[2] << 16);
				/* fall through */
			case 2:
				a += ((uint32) k[1] << 8);
				/* fall through */
			case 1:
				a += k[0];
				/* case 0: nothing left to add */
		}
#endif							/* WORDS_BIGENDIAN */
	}

	/* final avalanche pass; b and c then carry the 64-bit result */
	final(a, b, c);

	/* report the result */
	return ((uint64) b << 32) | c;
}
601
602/*
603 * hash_bytes_uint32() -- hash a 32-bit value to a 32-bit value
604 *
605 * This has the same result as
606 * hash_bytes(&k, sizeof(uint32))
607 * but is faster and doesn't force the caller to store k into memory.
608 */
609uint32
611{
612 uint32 a,
613 b,
614 c;
615
616 a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;
617 a += k;
618
619 final(a, b, c);
620
621 /* report the result */
622 return c;
623}
624
625/*
626 * hash_bytes_uint32_extended() -- hash 32-bit value to 64-bit value, with seed
627 *
628 * Like hash_bytes_uint32, this is a convenience function.
629 */
630uint64
632{
633 uint32 a,
634 b,
635 c;
636
637 a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;
638
639 if (seed != 0)
640 {
641 a += (uint32) (seed >> 32);
642 b += (uint32) seed;
643 mix(a, b, c);
644 }
645
646 a += k;
647
648 final(a, b, c);
649
650 /* report the result */
651 return ((uint64) b << 32) | c;
652}
653
654/*
655 * string_hash: hash function for keys that are NUL-terminated strings.
656 *
657 * NOTE: this is the default hash function if none is specified.
658 */
659uint32
660string_hash(const void *key, Size keysize)
661{
662 /*
663 * If the string exceeds keysize-1 bytes, we want to hash only that many,
664 * because when it is copied into the hash table it will be truncated at
665 * that length.
666 */
667 Size s_len = strlen((const char *) key);
668
669 s_len = Min(s_len, keysize - 1);
670 return hash_bytes((const unsigned char *) key, (int) s_len);
671}
672
/*
 * tag_hash: hash function for fixed-size tag values
 *
 * Hashes all keysize bytes of the key verbatim via hash_bytes().
 */
uint32
tag_hash(const void *key, Size keysize)
{
	return hash_bytes((const unsigned char *) key, (int) keysize);
}
681
/*
 * uint32_hash: hash function for keys that are uint32 or int32
 *
 * (tag_hash works for this case too, but is slower)
 */
uint32
uint32_hash(const void *key, Size keysize)
{
	/* the key is read directly as a uint32, so its size must match exactly */
	Assert(keysize == sizeof(uint32));
	return hash_bytes_uint32(*((const uint32 *) key));
}
#define Min(x, y)
Definition: c.h:961
#define Assert(condition)
Definition: c.h:815
uint64_t uint64
Definition: c.h:489
uint32_t uint32
Definition: c.h:488
size_t Size
Definition: c.h:562
uint32 hash_bytes_uint32(uint32 k)
Definition: hashfn.c:610
uint64 hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed)
Definition: hashfn.c:372
uint32 hash_bytes(const unsigned char *k, int keylen)
Definition: hashfn.c:146
uint32 tag_hash(const void *key, Size keysize)
Definition: hashfn.c:677
uint64 hash_bytes_uint32_extended(uint32 k, uint64 seed)
Definition: hashfn.c:631
#define UINT32_ALIGN_MASK
Definition: hashfn.c:46
uint32 uint32_hash(const void *key, Size keysize)
Definition: hashfn.c:688
uint32 string_hash(const void *key, Size keysize)
Definition: hashfn.c:660
#define mix(a, b, c)
Definition: hashfn.c:82
int b
Definition: isn.c:69
int a
Definition: isn.c:68
const void size_t len
char * c