/*
 * hashfn_unstable.h
 *
 * Building blocks for creating fast inlineable hash functions. The
 * functions in this file are not guaranteed to be stable between versions,
 * and may differ by hardware platform. Hence they must not be used in
 * indexes or other on-disk structures. See hashfn.h if you need stability.
 *
 *
 * Portions Copyright (c) 2024, PostgreSQL Global Development Group
 *
 * src/include/common/hashfn_unstable.h
 */
#ifndef HASHFN_UNSTABLE_H
#define HASHFN_UNSTABLE_H

#include "port/pg_bitutils.h"
#include "port/pg_bswap.h"

/*
 * fasthash is a modification of code taken from
 * https://code.google.com/archive/p/fast-hash/source/default/source
 * under the terms of the MIT license. The original copyright
 * notice follows:
 */

/* The MIT License

   Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com)

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/*
 * fasthash as implemented here has two interfaces:
 *
 * 1) Standalone functions, e.g. fasthash32() for a single value with a
 * known length. These return the same hash code as the original, at
 * least on little-endian machines.
 *
 * 2) Incremental interface. This can be used for incorporating multiple
 * inputs. First, initialize the hash state (here with a zero seed):
 *
 * fasthash_state hs;
 * fasthash_init(&hs, 0);
 *
 * If the inputs are of types that can be trivially cast to uint64, it's
 * sufficient to do:
 *
 * hs.accum = value1;
 * fasthash_combine(&hs);
 * hs.accum = value2;
 * fasthash_combine(&hs);
 * ...
 *
 * For longer or variable-length input, fasthash_accum() is a more
 * flexible but more verbose method. The standalone functions use this
 * internally, so see fasthash64() for an example of this.
 *
 * After all inputs have been mixed in, finalize the hash:
 *
 * hashcode = fasthash_final32(&hs, 0);
 *
 * (A complete example of this sequence appears after fasthash_final32()
 * below.)
 *
 * The incremental interface allows an optimization for NUL-terminated
 * C strings:
 *
 * len = fasthash_accum_cstring(&hs, str);
 * hashcode = fasthash_final32(&hs, len);
 *
 * By handling the terminator on-the-fly, we can avoid needing a strlen()
 * call to tell us how many bytes to hash. Experimentation has found that
 * SMHasher fails unless we incorporate the length, so it is passed to
 * the finalizer as a tweak.
 */


typedef struct fasthash_state
{
    /* staging area for chunks of input */
    uint64      accum;

    uint64      hash;
} fasthash_state;

#define FH_SIZEOF_ACCUM sizeof(uint64)


/*
 * Initialize the hash state.
 *
 * 'seed' can be zero.
 */
static inline void
fasthash_init(fasthash_state *hs, uint64 seed)
{
    memset(hs, 0, sizeof(fasthash_state));
    hs->hash = seed ^ 0x880355f21e6d1965;
}

/* both the finalizer and part of the combining step */
static inline uint64
fasthash_mix(uint64 h, uint64 tweak)
{
    h ^= (h >> 23) + tweak;
    h *= 0x2127599bf4325c37;
    h ^= h >> 47;
    return h;
}

/* combine one chunk of input into the hash */
static inline void
fasthash_combine(fasthash_state *hs)
{
    hs->hash ^= fasthash_mix(hs->accum, 0);
    hs->hash *= 0x880355f21e6d1965;
}

/* accumulate up to 8 bytes of input and combine it into the hash */
static inline void
fasthash_accum(fasthash_state *hs, const char *k, size_t len)
{
    uint32      lower_four;

    Assert(len <= FH_SIZEOF_ACCUM);
    hs->accum = 0;

    /*
     * For consistency, bytewise loads must match the platform's endianness.
     */
#ifdef WORDS_BIGENDIAN
    switch (len)
    {
        case 8:
            memcpy(&hs->accum, k, 8);
            break;
        case 7:
            hs->accum |= (uint64) k[6] << 8;
            /* FALLTHROUGH */
        case 6:
            hs->accum |= (uint64) k[5] << 16;
            /* FALLTHROUGH */
        case 5:
            hs->accum |= (uint64) k[4] << 24;
            /* FALLTHROUGH */
        case 4:
            memcpy(&lower_four, k, sizeof(lower_four));
            hs->accum |= (uint64) lower_four << 32;
            break;
        case 3:
            hs->accum |= (uint64) k[2] << 40;
            /* FALLTHROUGH */
        case 2:
            hs->accum |= (uint64) k[1] << 48;
            /* FALLTHROUGH */
        case 1:
            hs->accum |= (uint64) k[0] << 56;
            break;
        case 0:
            return;
    }
#else
    switch (len)
    {
        case 8:
            memcpy(&hs->accum, k, 8);
            break;
        case 7:
            hs->accum |= (uint64) k[6] << 48;
            /* FALLTHROUGH */
        case 6:
            hs->accum |= (uint64) k[5] << 40;
            /* FALLTHROUGH */
        case 5:
            hs->accum |= (uint64) k[4] << 32;
            /* FALLTHROUGH */
        case 4:
            memcpy(&lower_four, k, sizeof(lower_four));
            hs->accum |= lower_four;
            break;
        case 3:
            hs->accum |= (uint64) k[2] << 16;
            /* FALLTHROUGH */
        case 2:
            hs->accum |= (uint64) k[1] << 8;
            /* FALLTHROUGH */
        case 1:
            hs->accum |= (uint64) k[0];
            break;
        case 0:
            return;
    }
#endif

    fasthash_combine(hs);
}

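/*
 * Worked example (added commentary, not in the original header): on a
 * little-endian machine, fasthash_accum(hs, "abc", 3) falls through the
 * "case 3", "case 2", and "case 1" branches, leaving
 *
 *   hs->accum = (uint64) 'c' << 16 | (uint64) 'b' << 8 | (uint64) 'a'
 *             = 0x0000000000636261
 *
 * This is the same value an 8-byte load would produce if the buffer were
 * zero-padded, which is the property the masking in
 * fasthash_accum_cstring_aligned() below must reproduce.
 */
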
/*
 * Set high bit in lowest byte where the input is zero, from:
 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
#define haszero64(v) \
    (((v) - 0x0101010101010101) & ~(v) & 0x8080808080808080)

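/*
 * Worked example (added commentary): consider the little-endian chunk
 * v = 0xFFFFFFFFFF006261, i.e. the bytes {0x61, 0x62, 0x00, 0xFF, ...} --
 * "ab", its NUL terminator, then junk:
 *
 *   v - 0x0101010101010101    = 0xFEFEFEFEFDFF6160
 *   ... & ~v                  = 0x0000000000FF0100
 *   ... & 0x8080808080808080  = 0x0000000000800000
 *
 * Only the high bit of byte 2 survives, marking the position of the
 * zero byte.
 */
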
/* get first byte in memory order */
#ifdef WORDS_BIGENDIAN
#define firstbyte64(v) ((v) >> 56)
#else
#define firstbyte64(v) ((v) & 0xFF)
#endif

/*
 * all-purpose workhorse for fasthash_accum_cstring
 */
static inline size_t
fasthash_accum_cstring_unaligned(fasthash_state *hs, const char *str)
{
    const char *const start = str;

    while (*str)
    {
        size_t      chunk_len = 0;

        while (chunk_len < FH_SIZEOF_ACCUM && str[chunk_len] != '\0')
            chunk_len++;

        fasthash_accum(hs, str, chunk_len);
        str += chunk_len;
    }

    return str - start;
}

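/*
 * Illustration (added commentary): for str = "hello, world" (12 bytes),
 * the loop above calls fasthash_accum() twice -- once with a full 8-byte
 * chunk and once with the remaining 4 bytes -- and returns 12.
 */
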
/*
 * specialized workhorse for fasthash_accum_cstring
 *
 * With an aligned pointer, we consume the string a word at a time.
 * Loading the word containing the NUL terminator cannot segfault since
 * allocation boundaries are suitably aligned. To keep from setting
 * off alarms with address sanitizers, exclude this function from
 * such testing.
 */
pg_attribute_no_sanitize_address()
static inline size_t
fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
{
    const char *const start = str;
    uint64      chunk;
    uint64      zero_byte_low;

    Assert(PointerIsAligned(start, uint64));

    /*
     * For every chunk of input, check for zero bytes before mixing into the
     * hash. The chunk with zeros must contain the NUL terminator. We arrange
     * so that zero_byte_low tells us not only that a zero exists, but also
     * where it is, so we can hash the remainder of the string.
     *
     * The haszero64 calculation will set bits corresponding to the lowest
     * byte where a zero exists, so that suffices for little-endian machines.
     * For big-endian machines, we would need bits set for the highest zero
     * byte in the chunk, since the trailing junk past the terminator could
     * contain additional zeros. haszero64 does not give us that, so we
     * byteswap the chunk first.
     */
    for (;;)
    {
        chunk = *(uint64 *) str;

#ifdef WORDS_BIGENDIAN
        zero_byte_low = haszero64(pg_bswap64(chunk));
#else
        zero_byte_low = haszero64(chunk);
#endif
        if (zero_byte_low)
            break;

        hs->accum = chunk;
        fasthash_combine(hs);
        str += FH_SIZEOF_ACCUM;
    }

    if (firstbyte64(chunk) != 0)
    {
        size_t      remainder;
        uint64      mask;

        /*
         * The byte corresponding to the NUL will be 0x80, so the rightmost
         * bit position will be in the range 15, 23, ..., 63. Turn this into
         * byte position by dividing by 8.
         */
        remainder = pg_rightmost_one_pos64(zero_byte_low) / BITS_PER_BYTE;

        /*
         * Create a mask for the remaining bytes so we can combine them into
         * the hash. This must have the same result as mixing the remaining
         * bytes with fasthash_accum().
         */
#ifdef WORDS_BIGENDIAN
        mask = ~UINT64CONST(0) << BITS_PER_BYTE * (FH_SIZEOF_ACCUM - remainder);
#else
        mask = ~UINT64CONST(0) >> BITS_PER_BYTE * (FH_SIZEOF_ACCUM - remainder);
#endif
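        /*
         * Worked example (added commentary): with remainder = 3 on a
         * little-endian machine, mask = ~0 >> 40 = 0x0000000000FFFFFF, so
         * "chunk & mask" keeps exactly the three bytes before the NUL --
         * the same value fasthash_accum() would build for a 3-byte input.
         */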
        hs->accum = chunk & mask;
        fasthash_combine(hs);

        str += remainder;
    }

    return str - start;
}

/*
 * Mix 'str' into the hash state and return the length of the string.
 */
static inline size_t
fasthash_accum_cstring(fasthash_state *hs, const char *str)
{
#if SIZEOF_VOID_P >= 8

    size_t      len;
#ifdef USE_ASSERT_CHECKING
    size_t      len_check;
    fasthash_state hs_check;

    memcpy(&hs_check, hs, sizeof(fasthash_state));
    len_check = fasthash_accum_cstring_unaligned(&hs_check, str);
#endif
    if (PointerIsAligned(str, uint64))
    {
        len = fasthash_accum_cstring_aligned(hs, str);
        Assert(len_check == len);
        Assert(hs_check.hash == hs->hash);
        return len;
    }
#endif                          /* SIZEOF_VOID_P */

    /*
     * It's not worth it to try to make the word-at-a-time optimization work
     * on 32-bit platforms.
     */
    return fasthash_accum_cstring_unaligned(hs, str);
}

/*
 * The finalizer
 *
 * 'tweak' is intended to be the input length when the caller doesn't know
 * the length ahead of time, such as for NUL-terminated strings, otherwise
 * zero.
 */
static inline uint64
fasthash_final64(fasthash_state *hs, uint64 tweak)
{
    return fasthash_mix(hs->hash, tweak);
}

/*
 * Reduce a 64-bit hash to a 32-bit hash.
 *
 * This optional step provides a bit of additional mixing compared to
 * just taking the lower 32 bits.
 */
static inline uint32
fasthash_reduce32(uint64 h)
{
    /*
     * Convert the 64-bit hashcode to Fermat residue, which retains
     * information from both the higher and lower parts of the hashcode.
     */
    return h - (h >> 32);
}

/* finalize and reduce */
static inline uint32
fasthash_final32(fasthash_state *hs, uint64 tweak)
{
    return fasthash_reduce32(fasthash_final64(hs, tweak));
}

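/*
 * A minimal end-to-end sketch of the incremental interface described at
 * the top of this file (example only; this function is not part of the
 * original header). Two integer inputs are mixed in via direct assignment
 * to the staging area, then finalized with a zero tweak since the input
 * length is fixed and known.
 */
static inline uint32
example_hash_two_uint32(uint32 a, uint32 b)
{
    fasthash_state hs;

    fasthash_init(&hs, 0);

    /* inputs trivially castable to uint64 can be assigned directly */
    hs.accum = a;
    fasthash_combine(&hs);
    hs.accum = b;
    fasthash_combine(&hs);

    return fasthash_final32(&hs, 0);
}
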
/*
 * The original fasthash64 function, re-implemented using the incremental
 * interface. Returns a 64-bit hashcode. 'len' controls not only how
 * many bytes to hash, but also modifies the internal seed.
 * 'seed' can be zero.
 */
static inline uint64
fasthash64(const char *k, size_t len, uint64 seed)
{
    fasthash_state hs;

    fasthash_init(&hs, 0);

    /* re-initialize the seed according to input length */
    hs.hash = seed ^ (len * 0x880355f21e6d1965);

    while (len >= FH_SIZEOF_ACCUM)
    {
        fasthash_accum(&hs, k, FH_SIZEOF_ACCUM);
        k += FH_SIZEOF_ACCUM;
        len -= FH_SIZEOF_ACCUM;
    }

    fasthash_accum(&hs, k, len);
    return fasthash_final64(&hs, 0);
}

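/*
 * Usage sketch (example only, not part of the original header): any
 * fixed-size, trivially-copyable key can be hashed with the standalone
 * interface by pointing at its bytes.
 */
static inline uint64
example_hash_int64_key(int64 key)
{
    /* a zero seed is fine; 'len' also perturbs the internal seed */
    return fasthash64((const char *) &key, sizeof(key), 0);
}
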
/* like fasthash64, but returns a 32-bit hashcode */
static inline uint32
fasthash32(const char *k, size_t len, uint64 seed)
{
    return fasthash_reduce32(fasthash64(k, len, seed));
}

/*
 * Convenience function for hashing NUL-terminated strings
 */
static inline uint32
hash_string(const char *s)
{
    fasthash_state hs;
    size_t      s_len;

    fasthash_init(&hs, 0);

    /*
     * Combine string into the hash and save the length for tweaking the
     * final mix.
     */
    s_len = fasthash_accum_cstring(&hs, s);

    return fasthash_final32(&hs, s_len);
}

#endif                          /* HASHFN_UNSTABLE_H */