/*
 * hashfn_unstable.h
 *
 * Building blocks for creating fast inlineable hash functions. The
 * functions in this file are not guaranteed to be stable between versions,
 * and may differ by hardware platform. Hence they must not be used in
 * indexes or other on-disk structures. See hashfn.h if you need stability.
 *
 *
 * Portions Copyright (c) 2024-2025, PostgreSQL Global Development Group
 *
 * src/include/common/hashfn_unstable.h
 */
#ifndef HASHFN_UNSTABLE_H
#define HASHFN_UNSTABLE_H

#include "port/pg_bitutils.h"
#include "port/pg_bswap.h"
/*
 * fasthash is a modification of code taken from
 * https://code.google.com/archive/p/fast-hash/source/default/source
 * under the terms of the MIT license. The original copyright
 * notice follows:
 */

/* The MIT License

   Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com)

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/*
 * fasthash as implemented here has two interfaces:
 *
 * 1) Standalone functions, e.g. fasthash32() for a single value with a
 * known length. These return the same hash code as the original, at
 * least on little-endian machines.
 *
 * 2) Incremental interface. This can be used for incorporating multiple
 * inputs. First, initialize the hash state (here with a zero seed):
 *
 * fasthash_state hs;
 * fasthash_init(&hs, 0);
 *
 * If the inputs are of types that can be trivially cast to uint64, it's
 * sufficient to do:
 *
 * hs.accum = value1;
 * fasthash_combine(&hs);
 * hs.accum = value2;
 * fasthash_combine(&hs);
 * ...
 *
 * For longer or variable-length input, fasthash_accum() is a more
 * flexible, but more verbose method. The standalone functions use this
 * internally, so see fasthash64() for an example of this.
 *
 * After all inputs have been mixed in, finalize the hash:
 *
 * hashcode = fasthash_final32(&hs, 0);
 *
 * The incremental interface allows an optimization for NUL-terminated
 * C strings:
 *
 * len = fasthash_accum_cstring(&hs, str);
 * hashcode = fasthash_final32(&hs, len);
 *
 * By handling the terminator on-the-fly, we can avoid needing a strlen()
 * call to tell us how many bytes to hash. Experimentation has found that
 * SMHasher fails unless we incorporate the length, so it is passed to
 * the finalizer as a tweak.
 */

95typedef struct fasthash_state
96{
97 /* staging area for chunks of input */
99
102
103#define FH_SIZEOF_ACCUM sizeof(uint64)
104
105
106/*
107 * Initialize the hash state.
108 *
109 * 'seed' can be zero.
110 */
111static inline void
113{
114 memset(hs, 0, sizeof(fasthash_state));
115 hs->hash = seed ^ 0x880355f21e6d1965;
116}
117
118/* both the finalizer and part of the combining step */
119static inline uint64
121{
122 h ^= (h >> 23) + tweak;
123 h *= 0x2127599bf4325c37;
124 h ^= h >> 47;
125 return h;
126}
127
128/* combine one chunk of input into the hash */
129static inline void
131{
132 hs->hash ^= fasthash_mix(hs->accum, 0);
133 hs->hash *= 0x880355f21e6d1965;
134}
135
136/* accumulate up to 8 bytes of input and combine it into the hash */
137static inline void
138fasthash_accum(fasthash_state *hs, const char *k, size_t len)
139{
140 uint32 lower_four;
141
143 hs->accum = 0;
144
145 /*
146 * For consistency, bytewise loads must match the platform's endianness.
147 */
148#ifdef WORDS_BIGENDIAN
149 switch (len)
150 {
151 case 8:
152 memcpy(&hs->accum, k, 8);
153 break;
154 case 7:
155 hs->accum |= (uint64) k[6] << 8;
156 /* FALLTHROUGH */
157 case 6:
158 hs->accum |= (uint64) k[5] << 16;
159 /* FALLTHROUGH */
160 case 5:
161 hs->accum |= (uint64) k[4] << 24;
162 /* FALLTHROUGH */
163 case 4:
164 memcpy(&lower_four, k, sizeof(lower_four));
165 hs->accum |= (uint64) lower_four << 32;
166 break;
167 case 3:
168 hs->accum |= (uint64) k[2] << 40;
169 /* FALLTHROUGH */
170 case 2:
171 hs->accum |= (uint64) k[1] << 48;
172 /* FALLTHROUGH */
173 case 1:
174 hs->accum |= (uint64) k[0] << 56;
175 break;
176 case 0:
177 return;
178 }
179#else
180 switch (len)
181 {
182 case 8:
183 memcpy(&hs->accum, k, 8);
184 break;
185 case 7:
186 hs->accum |= (uint64) k[6] << 48;
187 /* FALLTHROUGH */
188 case 6:
189 hs->accum |= (uint64) k[5] << 40;
190 /* FALLTHROUGH */
191 case 5:
192 hs->accum |= (uint64) k[4] << 32;
193 /* FALLTHROUGH */
194 case 4:
195 memcpy(&lower_four, k, sizeof(lower_four));
196 hs->accum |= lower_four;
197 break;
198 case 3:
199 hs->accum |= (uint64) k[2] << 16;
200 /* FALLTHROUGH */
201 case 2:
202 hs->accum |= (uint64) k[1] << 8;
203 /* FALLTHROUGH */
204 case 1:
205 hs->accum |= (uint64) k[0];
206 break;
207 case 0:
208 return;
209 }
210#endif
211
213}
214
/*
 * Set high bit in lowest byte where the input is zero, from:
 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
#define haszero64(v) \
	(((v) - 0x0101010101010101) & ~(v) & 0x8080808080808080)

/* get first byte in memory order */
#ifdef WORDS_BIGENDIAN
#define firstbyte64(v) ((v) >> 56)
#else
#define firstbyte64(v) ((v) & 0xFF)
#endif

229/*
230 * all-purpose workhorse for fasthash_accum_cstring
231 */
232static inline size_t
234{
235 const char *const start = str;
236
237 while (*str)
238 {
239 size_t chunk_len = 0;
240
241 while (chunk_len < FH_SIZEOF_ACCUM && str[chunk_len] != '\0')
242 chunk_len++;
243
244 fasthash_accum(hs, str, chunk_len);
245 str += chunk_len;
246 }
247
248 return str - start;
249}
250
251/*
252 * specialized workhorse for fasthash_accum_cstring
253 *
254 * With an aligned pointer, we consume the string a word at a time.
255 * Loading the word containing the NUL terminator cannot segfault since
256 * allocation boundaries are suitably aligned. To keep from setting
257 * off alarms with address sanitizers, exclude this function from
258 * such testing.
259 */
261static inline size_t
262fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
263{
264 const char *const start = str;
267
269
270 /*
271 * For every chunk of input, check for zero bytes before mixing into the
272 * hash. The chunk with zeros must contain the NUL terminator. We arrange
273 * so that zero_byte_low tells us not only that a zero exists, but also
274 * where it is, so we can hash the remainder of the string.
275 *
276 * The haszero64 calculation will set bits corresponding to the lowest
277 * byte where a zero exists, so that suffices for little-endian machines.
278 * For big-endian machines, we would need bits set for the highest zero
279 * byte in the chunk, since the trailing junk past the terminator could
280 * contain additional zeros. haszero64 does not give us that, so we
281 * byteswap the chunk first.
282 */
283 for (;;)
284 {
285 chunk = *(uint64 *) str;
286
287#ifdef WORDS_BIGENDIAN
289#else
291#endif
292 if (zero_byte_low)
293 break;
294
295 hs->accum = chunk;
298 }
299
301 {
302 size_t remainder;
303 uint64 mask;
304
305 /*
306 * The byte corresponding to the NUL will be 0x80, so the rightmost
307 * bit position will be in the range 15, 23, ..., 63. Turn this into
308 * byte position by dividing by 8.
309 */
311
312 /*
313 * Create a mask for the remaining bytes so we can combine them into
314 * the hash. This must have the same result as mixing the remaining
315 * bytes with fasthash_accum().
316 */
317#ifdef WORDS_BIGENDIAN
318 mask = ~UINT64CONST(0) << BITS_PER_BYTE * (FH_SIZEOF_ACCUM - remainder);
319#else
320 mask = ~UINT64CONST(0) >> BITS_PER_BYTE * (FH_SIZEOF_ACCUM - remainder);
321#endif
322 hs->accum = chunk & mask;
324
325 str += remainder;
326 }
327
328 return str - start;
329}
330
331/*
332 * Mix 'str' into the hash state and return the length of the string.
333 */
334static inline size_t
336{
337#if SIZEOF_VOID_P >= 8
338
339 size_t len;
340#ifdef USE_ASSERT_CHECKING
341 size_t len_check;
342 fasthash_state hs_check;
343
344 memcpy(&hs_check, hs, sizeof(fasthash_state));
345 len_check = fasthash_accum_cstring_unaligned(&hs_check, str);
346#endif
348 {
349 len = fasthash_accum_cstring_aligned(hs, str);
350 Assert(len_check == len);
351 Assert(hs_check.hash == hs->hash);
352 return len;
353 }
354#endif /* SIZEOF_VOID_P */
355
356 /*
357 * It's not worth it to try to make the word-at-a-time optimization work
358 * on 32-bit platforms.
359 */
361}
362
363/*
364 * The finalizer
365 *
366 * 'tweak' is intended to be the input length when the caller doesn't know
367 * the length ahead of time, such as for NUL-terminated strings, otherwise
368 * zero.
369 */
370static inline uint64
372{
373 return fasthash_mix(hs->hash, tweak);
374}
375
376/*
377 * Reduce a 64-bit hash to a 32-bit hash.
378 *
379 * This optional step provides a bit more additional mixing compared to
380 * just taking the lower 32-bits.
381 */
382static inline uint32
384{
385 /*
386 * Convert the 64-bit hashcode to Fermat residue, which shall retain
387 * information from both the higher and lower parts of hashcode.
388 */
389 return h - (h >> 32);
390}
391
392/* finalize and reduce */
393static inline uint32
395{
396 return fasthash_reduce32(fasthash_final64(hs, tweak));
397}
398
399/*
400 * The original fasthash64 function, re-implemented using the incremental
401 * interface. Returns a 64-bit hashcode. 'len' controls not only how
402 * many bytes to hash, but also modifies the internal seed.
403 * 'seed' can be zero.
404 */
405static inline uint64
406fasthash64(const char *k, size_t len, uint64 seed)
407{
409
410 fasthash_init(&hs, 0);
411
412 /* re-initialize the seed according to input length */
413 hs.hash = seed ^ (len * 0x880355f21e6d1965);
414
415 while (len >= FH_SIZEOF_ACCUM)
416 {
418 k += FH_SIZEOF_ACCUM;
420 }
421
422 fasthash_accum(&hs, k, len);
423 return fasthash_final64(&hs, 0);
424}
425
426/* like fasthash64, but returns a 32-bit hashcode */
427static inline uint32
428fasthash32(const char *k, size_t len, uint64 seed)
429{
430 return fasthash_reduce32(fasthash64(k, len, seed));
431}
432
433/*
434 * Convenience function for hashing NUL-terminated strings
435 */
436static inline uint32
437hash_string(const char *s)
438{
440 size_t s_len;
441
442 fasthash_init(&hs, 0);
443
444 /*
445 * Combine string into the hash and save the length for tweaking the final
446 * mix.
447 */
448 s_len = fasthash_accum_cstring(&hs, s);
449
450 return fasthash_final32(&hs, s_len);
451}

#endif							/* HASHFN_UNSTABLE_H */