PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
hashfunc.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * hashfunc.c
4  * Support functions for hash access method.
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/hash/hashfunc.c
12  *
13  * NOTES
14  * These functions are stored in pg_amproc. For each operator class
15  * defined for hash indexes, they compute the hash value of the argument.
16  *
17  * Additional hash functions appear in /utils/adt/ files for various
18  * specialized datatypes.
19  *
20  * It is expected that every bit of a hash function's 32-bit result is
21  * as random as every other; failure to ensure this is likely to lead
22  * to poor performance of hash joins, for example. In most cases a hash
23  * function should use hash_any() or its variant hash_uint32().
24  *-------------------------------------------------------------------------
25  */
26 
27 #include "postgres.h"
28 
29 #include "access/hash.h"
30 #include "utils/builtins.h"
31 
32 /*
33  * Datatype-specific hash functions.
34  *
35  * These support both hash indexes and hash joins.
36  *
37  * NOTE: some of these are also used by catcache operations, without
38  * any direct connection to hash indexes. Also, the common hash_any
39  * routine is also used by dynahash tables.
40  */
41 
42 /* Note: this is used for both "char" and boolean datatypes */
43 Datum
45 {
46  return hash_uint32((int32) PG_GETARG_CHAR(0));
47 }
48 
49 Datum
51 {
52  return hash_uint32((int32) PG_GETARG_INT16(0));
53 }
54 
55 Datum
57 {
58  return hash_uint32(PG_GETARG_INT32(0));
59 }
60 
61 Datum
63 {
64  /*
65  * The idea here is to produce a hash value compatible with the values
66  * produced by hashint4 and hashint2 for logically equal inputs; this is
67  * necessary to support cross-type hash joins across these input types.
68  * Since all three types are signed, we can xor the high half of the int8
69  * value if the sign is positive, or the complement of the high half when
70  * the sign is negative.
71  */
72  int64 val = PG_GETARG_INT64(0);
73  uint32 lohalf = (uint32) val;
74  uint32 hihalf = (uint32) (val >> 32);
75 
76  lohalf ^= (val >= 0) ? hihalf : ~hihalf;
77 
78  return hash_uint32(lohalf);
79 }
80 
81 Datum
83 {
84  return hash_uint32((uint32) PG_GETARG_OID(0));
85 }
86 
87 Datum
89 {
90  return hash_uint32((uint32) PG_GETARG_OID(0));
91 }
92 
93 Datum
95 {
96  float4 key = PG_GETARG_FLOAT4(0);
97  float8 key8;
98 
99  /*
100  * On IEEE-float machines, minus zero and zero have different bit patterns
101  * but should compare as equal. We must ensure that they have the same
102  * hash value, which is most reliably done this way:
103  */
104  if (key == (float4) 0)
105  PG_RETURN_UINT32(0);
106 
107  /*
108  * To support cross-type hashing of float8 and float4, we want to return
109  * the same hash value hashfloat8 would produce for an equal float8 value.
110  * So, widen the value to float8 and hash that. (We must do this rather
111  * than have hashfloat8 try to narrow its value to float4; that could fail
112  * on overflow.)
113  */
114  key8 = key;
115 
116  return hash_any((unsigned char *) &key8, sizeof(key8));
117 }
118 
119 Datum
121 {
122  float8 key = PG_GETARG_FLOAT8(0);
123 
124  /*
125  * On IEEE-float machines, minus zero and zero have different bit patterns
126  * but should compare as equal. We must ensure that they have the same
127  * hash value, which is most reliably done this way:
128  */
129  if (key == (float8) 0)
130  PG_RETURN_UINT32(0);
131 
132  return hash_any((unsigned char *) &key, sizeof(key));
133 }
134 
135 Datum
137 {
138  oidvector *key = (oidvector *) PG_GETARG_POINTER(0);
139 
140  return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid));
141 }
142 
143 Datum
145 {
146  char *key = NameStr(*PG_GETARG_NAME(0));
147 
148  return hash_any((unsigned char *) key, strlen(key));
149 }
150 
151 Datum
153 {
154  text *key = PG_GETARG_TEXT_PP(0);
155  Datum result;
156 
157  /*
158  * Note: this is currently identical in behavior to hashvarlena, but keep
159  * it as a separate function in case we someday want to do something
160  * different in non-C locales. (See also hashbpchar, if so.)
161  */
162  result = hash_any((unsigned char *) VARDATA_ANY(key),
163  VARSIZE_ANY_EXHDR(key));
164 
165  /* Avoid leaking memory for toasted inputs */
166  PG_FREE_IF_COPY(key, 0);
167 
168  return result;
169 }
170 
171 /*
172  * hashvarlena() can be used for any varlena datatype in which there are
173  * no non-significant bits, ie, distinct bitpatterns never compare as equal.
174  */
175 Datum
177 {
178  struct varlena *key = PG_GETARG_VARLENA_PP(0);
179  Datum result;
180 
181  result = hash_any((unsigned char *) VARDATA_ANY(key),
182  VARSIZE_ANY_EXHDR(key));
183 
184  /* Avoid leaking memory for toasted inputs */
185  PG_FREE_IF_COPY(key, 0);
186 
187  return result;
188 }
189 
190 /*
191  * This hash function was written by Bob Jenkins
192  * (bob_jenkins@burtleburtle.net), and superficially adapted
193  * for PostgreSQL by Neil Conway. For more information on this
194  * hash function, see http://burtleburtle.net/bob/hash/doobs.html,
195  * or Bob's article in Dr. Dobb's Journal, Sept. 1997.
196  *
197  * In the current code, we have adopted Bob's 2006 update of his hash
198  * function to fetch the data a word at a time when it is suitably aligned.
199  * This makes for a useful speedup, at the cost of having to maintain
200  * four code paths (aligned vs unaligned, and little-endian vs big-endian).
201  * It also uses two separate mixing functions mix() and final(), instead
202  * of a slower multi-purpose function.
203  */
204 
205 /* Get a bit mask of the bits set in non-uint32 aligned addresses */
206 #define UINT32_ALIGN_MASK (sizeof(uint32) - 1)
207 
208 /* Rotate a uint32 value left by k bits - note multiple evaluation! */
209 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
210 
211 /*----------
212  * mix -- mix 3 32-bit values reversibly.
213  *
214  * This is reversible, so any information in (a,b,c) before mix() is
215  * still in (a,b,c) after mix().
216  *
217  * If four pairs of (a,b,c) inputs are run through mix(), or through
218  * mix() in reverse, there are at least 32 bits of the output that
219  * are sometimes the same for one pair and different for another pair.
220  * This was tested for:
221  * * pairs that differed by one bit, by two bits, in any combination
222  * of top bits of (a,b,c), or in any combination of bottom bits of
223  * (a,b,c).
224  * * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
225  * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
226  * is commonly produced by subtraction) look like a single 1-bit
227  * difference.
228  * * the base values were pseudorandom, all zero but one bit set, or
229  * all zero plus a counter that starts at zero.
230  *
231  * This does not achieve avalanche. There are input bits of (a,b,c)
232  * that fail to affect some output bits of (a,b,c), especially of a. The
233  * most thoroughly mixed value is c, but it doesn't really even achieve
234  * avalanche in c.
235  *
236  * This allows some parallelism. Read-after-writes are good at doubling
237  * the number of bits affected, so the goal of mixing pulls in the opposite
238  * direction from the goal of parallelism. I did what I could. Rotates
239  * seem to cost as much as shifts on every machine I could lay my hands on,
240  * and rotates are much kinder to the top and bottom bits, so I used rotates.
241  *----------
242  */
243 #define mix(a,b,c) \
244 { \
245  a -= c; a ^= rot(c, 4); c += b; \
246  b -= a; b ^= rot(a, 6); a += c; \
247  c -= b; c ^= rot(b, 8); b += a; \
248  a -= c; a ^= rot(c,16); c += b; \
249  b -= a; b ^= rot(a,19); a += c; \
250  c -= b; c ^= rot(b, 4); b += a; \
251 }
252 
253 /*----------
254  * final -- final mixing of 3 32-bit values (a,b,c) into c
255  *
256  * Pairs of (a,b,c) values differing in only a few bits will usually
257  * produce values of c that look totally different. This was tested for
258  * * pairs that differed by one bit, by two bits, in any combination
259  * of top bits of (a,b,c), or in any combination of bottom bits of
260  * (a,b,c).
261  * * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
262  * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
263  * is commonly produced by subtraction) look like a single 1-bit
264  * difference.
265  * * the base values were pseudorandom, all zero but one bit set, or
266  * all zero plus a counter that starts at zero.
267  *
268  * The use of separate functions for mix() and final() allow for a
269  * substantial performance increase since final() does not need to
270  * do well in reverse, but is does need to affect all output bits.
271  * mix(), on the other hand, does not need to affect all output
272  * bits (affecting 32 bits is enough). The original hash function had
273  * a single mixing operation that had to satisfy both sets of requirements
274  * and was slower as a result.
275  *----------
276  */
277 #define final(a,b,c) \
278 { \
279  c ^= b; c -= rot(b,14); \
280  a ^= c; a -= rot(c,11); \
281  b ^= a; b -= rot(a,25); \
282  c ^= b; c -= rot(b,16); \
283  a ^= c; a -= rot(c, 4); \
284  b ^= a; b -= rot(a,14); \
285  c ^= b; c -= rot(b,24); \
286 }
287 
288 /*
289  * hash_any() -- hash a variable-length key into a 32-bit value
290  * k : the key (the unaligned variable-length array of bytes)
291  * len : the length of the key, counting by bytes
292  *
293  * Returns a uint32 value. Every bit of the key affects every bit of
294  * the return value. Every 1-bit and 2-bit delta achieves avalanche.
295  * About 6*len+35 instructions. The best hash table sizes are powers
296  * of 2. There is no need to do mod a prime (mod is sooo slow!).
297  * If you need less than 32 bits, use a bitmask.
298  *
299  * This procedure must never throw elog(ERROR); the ResourceOwner code
300  * relies on this not to fail.
301  *
302  * Note: we could easily change this function to return a 64-bit hash value
303  * by using the final values of both b and c. b is perhaps a little less
304  * well mixed than c, however.
305  */
306 Datum
307 hash_any(register const unsigned char *k, register int keylen)
308 {
309  register uint32 a,
310  b,
311  c,
312  len;
313 
314  /* Set up the internal state */
315  len = keylen;
316  a = b = c = 0x9e3779b9 + len + 3923095;
317 
318  /* If the source pointer is word-aligned, we use word-wide fetches */
319  if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0)
320  {
321  /* Code path for aligned source data */
322  register const uint32 *ka = (const uint32 *) k;
323 
324  /* handle most of the key */
325  while (len >= 12)
326  {
327  a += ka[0];
328  b += ka[1];
329  c += ka[2];
330  mix(a, b, c);
331  ka += 3;
332  len -= 12;
333  }
334 
335  /* handle the last 11 bytes */
336  k = (const unsigned char *) ka;
337 #ifdef WORDS_BIGENDIAN
338  switch (len)
339  {
340  case 11:
341  c += ((uint32) k[10] << 8);
342  /* fall through */
343  case 10:
344  c += ((uint32) k[9] << 16);
345  /* fall through */
346  case 9:
347  c += ((uint32) k[8] << 24);
348  /* the lowest byte of c is reserved for the length */
349  /* fall through */
350  case 8:
351  b += ka[1];
352  a += ka[0];
353  break;
354  case 7:
355  b += ((uint32) k[6] << 8);
356  /* fall through */
357  case 6:
358  b += ((uint32) k[5] << 16);
359  /* fall through */
360  case 5:
361  b += ((uint32) k[4] << 24);
362  /* fall through */
363  case 4:
364  a += ka[0];
365  break;
366  case 3:
367  a += ((uint32) k[2] << 8);
368  /* fall through */
369  case 2:
370  a += ((uint32) k[1] << 16);
371  /* fall through */
372  case 1:
373  a += ((uint32) k[0] << 24);
374  /* case 0: nothing left to add */
375  }
376 #else /* !WORDS_BIGENDIAN */
377  switch (len)
378  {
379  case 11:
380  c += ((uint32) k[10] << 24);
381  /* fall through */
382  case 10:
383  c += ((uint32) k[9] << 16);
384  /* fall through */
385  case 9:
386  c += ((uint32) k[8] << 8);
387  /* the lowest byte of c is reserved for the length */
388  /* fall through */
389  case 8:
390  b += ka[1];
391  a += ka[0];
392  break;
393  case 7:
394  b += ((uint32) k[6] << 16);
395  /* fall through */
396  case 6:
397  b += ((uint32) k[5] << 8);
398  /* fall through */
399  case 5:
400  b += k[4];
401  /* fall through */
402  case 4:
403  a += ka[0];
404  break;
405  case 3:
406  a += ((uint32) k[2] << 16);
407  /* fall through */
408  case 2:
409  a += ((uint32) k[1] << 8);
410  /* fall through */
411  case 1:
412  a += k[0];
413  /* case 0: nothing left to add */
414  }
415 #endif /* WORDS_BIGENDIAN */
416  }
417  else
418  {
419  /* Code path for non-aligned source data */
420 
421  /* handle most of the key */
422  while (len >= 12)
423  {
424 #ifdef WORDS_BIGENDIAN
425  a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
426  b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
427  c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
428 #else /* !WORDS_BIGENDIAN */
429  a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
430  b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
431  c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
432 #endif /* WORDS_BIGENDIAN */
433  mix(a, b, c);
434  k += 12;
435  len -= 12;
436  }
437 
438  /* handle the last 11 bytes */
439 #ifdef WORDS_BIGENDIAN
440  switch (len) /* all the case statements fall through */
441  {
442  case 11:
443  c += ((uint32) k[10] << 8);
444  case 10:
445  c += ((uint32) k[9] << 16);
446  case 9:
447  c += ((uint32) k[8] << 24);
448  /* the lowest byte of c is reserved for the length */
449  case 8:
450  b += k[7];
451  case 7:
452  b += ((uint32) k[6] << 8);
453  case 6:
454  b += ((uint32) k[5] << 16);
455  case 5:
456  b += ((uint32) k[4] << 24);
457  case 4:
458  a += k[3];
459  case 3:
460  a += ((uint32) k[2] << 8);
461  case 2:
462  a += ((uint32) k[1] << 16);
463  case 1:
464  a += ((uint32) k[0] << 24);
465  /* case 0: nothing left to add */
466  }
467 #else /* !WORDS_BIGENDIAN */
468  switch (len) /* all the case statements fall through */
469  {
470  case 11:
471  c += ((uint32) k[10] << 24);
472  case 10:
473  c += ((uint32) k[9] << 16);
474  case 9:
475  c += ((uint32) k[8] << 8);
476  /* the lowest byte of c is reserved for the length */
477  case 8:
478  b += ((uint32) k[7] << 24);
479  case 7:
480  b += ((uint32) k[6] << 16);
481  case 6:
482  b += ((uint32) k[5] << 8);
483  case 5:
484  b += k[4];
485  case 4:
486  a += ((uint32) k[3] << 24);
487  case 3:
488  a += ((uint32) k[2] << 16);
489  case 2:
490  a += ((uint32) k[1] << 8);
491  case 1:
492  a += k[0];
493  /* case 0: nothing left to add */
494  }
495 #endif /* WORDS_BIGENDIAN */
496  }
497 
498  final(a, b, c);
499 
500  /* report the result */
501  return UInt32GetDatum(c);
502 }
503 
504 /*
505  * hash_uint32() -- hash a 32-bit value
506  *
507  * This has the same result as
508  * hash_any(&k, sizeof(uint32))
509  * but is faster and doesn't force the caller to store k into memory.
510  */
511 Datum
513 {
514  register uint32 a,
515  b,
516  c;
517 
518  a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;
519  a += k;
520 
521  final(a, b, c);
522 
523  /* report the result */
524  return UInt32GetDatum(c);
525 }
#define PG_GETARG_FLOAT8(n)
Definition: fmgr.h:246
Definition: c.h:478
#define PG_GETARG_INT32(n)
Definition: fmgr.h:234
Datum hashoid(PG_FUNCTION_ARGS)
Definition: hashfunc.c:82
#define VARDATA_ANY(PTR)
Definition: postgres.h:347
Datum hashname(PG_FUNCTION_ARGS)
Definition: hashfunc.c:144
Datum hashint8(PG_FUNCTION_ARGS)
Definition: hashfunc.c:62
return result
Definition: formatting.c:1632
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:241
unsigned int Oid
Definition: postgres_ext.h:31
#define PG_RETURN_UINT32(x)
Definition: fmgr.h:315
signed int int32
Definition: c.h:256
#define PG_GETARG_TEXT_PP(n)
Definition: fmgr.h:273
Datum hashchar(PG_FUNCTION_ARGS)
Definition: hashfunc.c:44
Datum hashvarlena(PG_FUNCTION_ARGS)
Definition: hashfunc.c:176
double float8
Definition: c.h:381
Datum hashtext(PG_FUNCTION_ARGS)
Definition: hashfunc.c:152
#define PG_GETARG_VARLENA_PP(n)
Definition: fmgr.h:253
char * c
#define PG_GETARG_OID(n)
Definition: fmgr.h:240
Oid values[FLEXIBLE_ARRAY_MEMBER]
Definition: c.h:486
int dim1
Definition: c.h:484
unsigned int uint32
Definition: c.h:268
Datum hashint4(PG_FUNCTION_ARGS)
Definition: hashfunc.c:56
#define UInt32GetDatum(X)
Definition: postgres.h:499
Datum hashfloat4(PG_FUNCTION_ARGS)
Definition: hashfunc.c:94
#define PG_GETARG_FLOAT4(n)
Definition: fmgr.h:245
Datum hashfloat8(PG_FUNCTION_ARGS)
Definition: hashfunc.c:120
float float4
Definition: c.h:380
#define UINT32_ALIGN_MASK
Definition: hashfunc.c:206
Datum hash_uint32(uint32 k)
Definition: hashfunc.c:512
uintptr_t Datum
Definition: postgres.h:372
#define PG_GETARG_INT16(n)
Definition: fmgr.h:236
#define mix(a, b, c)
Definition: hashfunc.c:243
Datum hash_any(register const unsigned char *k, register int keylen)
Definition: hashfunc.c:307
Datum hashenum(PG_FUNCTION_ARGS)
Definition: hashfunc.c:88
#define PG_FREE_IF_COPY(ptr, n)
Definition: fmgr.h:225
Datum hashint2(PG_FUNCTION_ARGS)
Definition: hashfunc.c:50
#define VARSIZE_ANY_EXHDR(PTR)
Definition: postgres.h:340
Datum hashoidvector(PG_FUNCTION_ARGS)
Definition: hashfunc.c:136
#define NameStr(name)
Definition: c.h:499
Definition: c.h:439
#define PG_FUNCTION_ARGS
Definition: fmgr.h:158
#define PG_GETARG_INT64(n)
Definition: fmgr.h:247
long val
Definition: informix.c:689
#define PG_GETARG_CHAR(n)
Definition: fmgr.h:238
#define PG_GETARG_NAME(n)
Definition: fmgr.h:243