PostgreSQL Source Code git master
Loading...
Searching...
No Matches
pg_popcount_x86.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * pg_popcount_x86.c
4 * Holds the x86-64 pg_popcount() implementations.
5 *
6 * Copyright (c) 2024-2026, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/port/pg_popcount_x86.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "c.h"
14
15#ifdef HAVE_X86_64_POPCNTQ
16
17#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
18#include <cpuid.h>
19#endif
20
21#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
22#include <immintrin.h>
23#endif
24
25#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
26#include <intrin.h>
27#endif
28
29#include "port/pg_bitutils.h"
30
31/*
32 * The SSE4.2 versions are built regardless of whether we are building the
33 * AVX-512 versions.
34 *
35 * Technically, POPCNT is not part of SSE4.2, and isn't even a vector
36 * operation, but in practice this is close enough, and "sse42" seems easier to
37 * follow than "popcnt" for these names.
38 */
39static inline int pg_popcount32_sse42(uint32 word);
40static inline int pg_popcount64_sse42(uint64 word);
41static uint64 pg_popcount_sse42(const char *buf, int bytes);
42static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask);
43
44/*
45 * These are the AVX-512 implementations of the popcount functions.
46 */
47#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
48static uint64 pg_popcount_avx512(const char *buf, int bytes);
49static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
50#endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
51
52/*
53 * The function pointers are initially set to "choose" functions. These
54 * functions will first set the pointers to the right implementations (base on
55 * what the current CPU supports) and then will call the pointer to fulfill the
56 * caller's request.
57 */
60static uint64 pg_popcount_choose(const char *buf, int bytes);
61static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
64uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
66
67/*
68 * Return true if CPUID indicates that the POPCNT instruction is available.
69 */
70static bool
72{
73 unsigned int exx[4] = {0, 0, 0, 0};
74
75#if defined(HAVE__GET_CPUID)
76 __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
77#elif defined(HAVE__CPUID)
78 __cpuid(exx, 1);
79#else
80#error cpuid instruction not available
81#endif
82
83 return (exx[2] & (1 << 23)) != 0; /* POPCNT */
84}
85
86#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
87
88/*
89 * Does CPUID say there's support for XSAVE instructions?
90 */
91static inline bool
93{
94 unsigned int exx[4] = {0, 0, 0, 0};
95
96#if defined(HAVE__GET_CPUID)
97 __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
98#elif defined(HAVE__CPUID)
99 __cpuid(exx, 1);
100#else
101#error cpuid instruction not available
102#endif
103 return (exx[2] & (1 << 27)) != 0; /* osxsave */
104}
105
106/*
107 * Does XGETBV say the ZMM registers are enabled?
108 *
109 * NB: Caller is responsible for verifying that xsave_available() returns true
110 * before calling this.
111 */
112#ifdef HAVE_XSAVE_INTRINSICS
113pg_attribute_target("xsave")
114#endif
115static inline bool
117{
118#ifdef HAVE_XSAVE_INTRINSICS
119 return (_xgetbv(0) & 0xe6) == 0xe6;
120#else
121 return false;
122#endif
123}
124
125/*
126 * Does CPUID say there's support for AVX-512 popcount and byte-and-word
127 * instructions?
128 */
129static inline bool
131{
132 unsigned int exx[4] = {0, 0, 0, 0};
133
134#if defined(HAVE__GET_CPUID_COUNT)
135 __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
136#elif defined(HAVE__CPUIDEX)
137 __cpuidex(exx, 7, 0);
138#else
139#error cpuid instruction not available
140#endif
141 return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
142 (exx[1] & (1 << 30)) != 0; /* avx512-bw */
143}
144
145/*
146 * Returns true if the CPU supports the instructions required for the AVX-512
147 * pg_popcount() implementation.
148 */
149static bool
151{
152 return xsave_available() &&
155}
156
157#endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
158
159/*
160 * These functions get called on the first call to pg_popcount32 etc.
161 * They detect whether we can use the asm implementations, and replace
162 * the function pointers so that subsequent calls are routed directly to
163 * the chosen implementation.
164 */
165static inline void
167{
169 {
174 }
175 else
176 {
181 }
182
183#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
185 {
188 }
189#endif
190}
191
192static int
194{
196 return pg_popcount32(word);
197}
198
199static int
201{
203 return pg_popcount64(word);
204}
205
206static uint64
207pg_popcount_choose(const char *buf, int bytes)
208{
210 return pg_popcount_optimized(buf, bytes);
211}
212
213static uint64
214pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
215{
217 return pg_popcount_masked(buf, bytes, mask);
218}
219
220#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
221
222/*
223 * pg_popcount_avx512
224 * Returns the number of 1-bits in buf
225 */
226pg_attribute_target("avx512vpopcntdq,avx512bw")
228pg_popcount_avx512(const char *buf, int bytes)
229{
230 __m512i val,
231 cnt;
233 const char *final;
234 int tail_idx;
235 __mmask64 mask = ~UINT64CONST(0);
236
237 /*
238 * Align buffer down to avoid double load overhead from unaligned access.
239 * Calculate a mask to ignore preceding bytes. Find start offset of final
240 * iteration and ensure it is not empty.
241 */
242 mask <<= ((uintptr_t) buf) % sizeof(__m512i);
243 tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
244 final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
245 buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
246
247 /*
248 * Iterate through all but the final iteration. Starting from the second
249 * iteration, the mask is ignored.
250 */
251 if (buf < final)
252 {
253 val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
255 accum = _mm512_add_epi64(accum, cnt);
256
257 buf += sizeof(__m512i);
258 mask = ~UINT64CONST(0);
259
260 for (; buf < final; buf += sizeof(__m512i))
261 {
262 val = _mm512_load_si512((const __m512i *) buf);
264 accum = _mm512_add_epi64(accum, cnt);
265 }
266 }
267
268 /* Final iteration needs to ignore bytes that are not within the length */
269 mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
270
271 val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
273 accum = _mm512_add_epi64(accum, cnt);
274
275 return _mm512_reduce_add_epi64(accum);
276}
277
278/*
279 * pg_popcount_masked_avx512
280 * Returns the number of 1-bits in buf after applying the mask to each byte
281 */
282pg_attribute_target("avx512vpopcntdq,avx512bw")
284pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
285{
286 __m512i val,
287 vmasked,
288 cnt;
290 const char *final;
291 int tail_idx;
293 const __m512i maskv = _mm512_set1_epi8(mask);
294
295 /*
296 * Align buffer down to avoid double load overhead from unaligned access.
297 * Calculate a mask to ignore preceding bytes. Find start offset of final
298 * iteration and ensure it is not empty.
299 */
300 bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
301 tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
302 final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
303 buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
304
305 /*
306 * Iterate through all but the final iteration. Starting from the second
307 * iteration, the mask is ignored.
308 */
309 if (buf < final)
310 {
314 accum = _mm512_add_epi64(accum, cnt);
315
316 buf += sizeof(__m512i);
317 bmask = ~UINT64CONST(0);
318
319 for (; buf < final; buf += sizeof(__m512i))
320 {
321 val = _mm512_load_si512((const __m512i *) buf);
324 accum = _mm512_add_epi64(accum, cnt);
325 }
326 }
327
328 /* Final iteration needs to ignore bytes that are not within the length */
329 bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
330
334 accum = _mm512_add_epi64(accum, cnt);
335
336 return _mm512_reduce_add_epi64(accum);
337}
338
339#endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
340
341/*
342 * pg_popcount32_sse42
343 * Return the number of 1 bits set in word
344 */
345static inline int
347{
348#ifdef _MSC_VER
349 return __popcnt(word);
350#else
351 uint32 res;
352
353__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
354 return (int) res;
355#endif
356}
357
358/*
359 * pg_popcount64_sse42
360 * Return the number of 1 bits set in word
361 */
362static inline int
364{
365#ifdef _MSC_VER
366 return __popcnt64(word);
367#else
368 uint64 res;
369
370__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
371 return (int) res;
372#endif
373}
374
375/*
376 * pg_popcount_sse42
377 * Returns the number of 1-bits in buf
378 */
379static uint64
380pg_popcount_sse42(const char *buf, int bytes)
381{
382 uint64 popcnt = 0;
383
384#if SIZEOF_VOID_P >= 8
385 /* Process in 64-bit chunks if the buffer is aligned. */
386 if (buf == (const char *) TYPEALIGN(8, buf))
387 {
388 const uint64 *words = (const uint64 *) buf;
389
390 while (bytes >= 8)
391 {
392 popcnt += pg_popcount64_sse42(*words++);
393 bytes -= 8;
394 }
395
396 buf = (const char *) words;
397 }
398#else
399 /* Process in 32-bit chunks if the buffer is aligned. */
400 if (buf == (const char *) TYPEALIGN(4, buf))
401 {
402 const uint32 *words = (const uint32 *) buf;
403
404 while (bytes >= 4)
405 {
406 popcnt += pg_popcount32_sse42(*words++);
407 bytes -= 4;
408 }
409
410 buf = (const char *) words;
411 }
412#endif
413
414 /* Process any remaining bytes */
415 while (bytes--)
416 popcnt += pg_number_of_ones[(unsigned char) *buf++];
417
418 return popcnt;
419}
420
421/*
422 * pg_popcount_masked_sse42
423 * Returns the number of 1-bits in buf after applying the mask to each byte
424 */
425static uint64
426pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask)
427{
428 uint64 popcnt = 0;
429
430#if SIZEOF_VOID_P >= 8
431 /* Process in 64-bit chunks if the buffer is aligned */
432 uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
433
434 if (buf == (const char *) TYPEALIGN(8, buf))
435 {
436 const uint64 *words = (const uint64 *) buf;
437
438 while (bytes >= 8)
439 {
440 popcnt += pg_popcount64_sse42(*words++ & maskv);
441 bytes -= 8;
442 }
443
444 buf = (const char *) words;
445 }
446#else
447 /* Process in 32-bit chunks if the buffer is aligned. */
448 uint32 maskv = ~((uint32) 0) / 0xFF * mask;
449
450 if (buf == (const char *) TYPEALIGN(4, buf))
451 {
452 const uint32 *words = (const uint32 *) buf;
453
454 while (bytes >= 4)
455 {
456 popcnt += pg_popcount32_sse42(*words++ & maskv);
457 bytes -= 4;
458 }
459
460 buf = (const char *) words;
461 }
462#endif
463
464 /* Process any remaining bytes */
465 while (bytes--)
466 popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
467
468 return popcnt;
469}
470
471#endif /* HAVE_X86_64_POPCNTQ */
#define TYPEALIGN(ALIGNVAL, LEN)
Definition c.h:819
uint8 bits8
Definition c.h:553
uint64_t uint64
Definition c.h:547
uint32_t uint32
Definition c.h:546
#define pg_attribute_target(...)
Definition c.h:212
#define TYPEALIGN_DOWN(ALIGNVAL, LEN)
Definition c.h:831
long val
Definition informix.c:689
uint64 pg_popcount_portable(const char *buf, int bytes)
uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
PGDLLIMPORT const uint8 pg_number_of_ones[256]
Definition pg_bitutils.c:80
uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask)
uint64 pg_popcount_optimized(const char *buf, int bytes)
int pg_popcount64(uint64 word)
int pg_popcount32(uint32 word)
int pg_popcount32_portable(uint32 word)
static uint64 pg_popcount_masked(const char *buf, int bytes, bits8 mask)
int pg_popcount64_portable(uint64 word)
static bool zmm_regs_available(void)
static char buf[DEFAULT_XLOG_SEG_SIZE]
static int fb(int x)
static void word(struct vars *v, int dir, struct state *lp, struct state *rp)
Definition regcomp.c:1476