PostgreSQL Source Code git master
Loading...
Searching...
No Matches
pg_popcount_x86.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * pg_popcount_x86.c
4 * Holds the x86-64 pg_popcount() implementations.
5 *
6 * Copyright (c) 2024-2026, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/port/pg_popcount_x86.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "c.h"
14
15#ifdef HAVE_X86_64_POPCNTQ
16
17#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
18#include <immintrin.h>
19#endif
20
21#include "port/pg_bitutils.h"
22#include "port/pg_cpu.h"
23
24/*
25 * The SSE4.2 versions are built regardless of whether we are building the
26 * AVX-512 versions.
27 *
28 * Technically, POPCNT is not part of SSE4.2, and isn't even a vector
29 * operation, but in practice this is close enough, and "sse42" seems easier to
30 * follow than "popcnt" for these names.
31 */
32static uint64 pg_popcount_sse42(const char *buf, int bytes);
33static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask);
34
35/*
36 * These are the AVX-512 implementations of the popcount functions.
37 */
38#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
39static uint64 pg_popcount_avx512(const char *buf, int bytes);
40static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
41#endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
42
43/*
44 * The function pointers are initially set to "choose" functions. These
45 * functions will first set the pointers to the right implementations (base on
46 * what the current CPU supports) and then will call the pointer to fulfill the
47 * caller's request.
48 */
49static uint64 pg_popcount_choose(const char *buf, int bytes);
50static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
51uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
53
54
55#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
56
57/*
58 * Returns true if the CPU supports the instructions required for the AVX-512
59 * pg_popcount() implementation.
60 */
61static bool
63{
66}
67
68#endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
69
70/*
71 * These functions get called on the first call to pg_popcount(), etc.
72 * They detect whether we can use the asm implementations, and replace
73 * the function pointers so that subsequent calls are routed directly to
74 * the chosen implementation.
75 */
76static inline void
78{
80 {
83 }
84 else
85 {
88 }
89
90#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
92 {
95 }
96#endif
97}
98
99static uint64
100pg_popcount_choose(const char *buf, int bytes)
101{
103 return pg_popcount_optimized(buf, bytes);
104}
105
106static uint64
107pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
108{
110 return pg_popcount_masked(buf, bytes, mask);
111}
112
113#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
114
115/*
116 * pg_popcount_avx512
117 * Returns the number of 1-bits in buf
118 */
119pg_attribute_target("avx512vpopcntdq,avx512bw")
121pg_popcount_avx512(const char *buf, int bytes)
122{
123 __m512i val,
124 cnt;
126 const char *final;
127 int tail_idx;
128 __mmask64 mask = ~UINT64CONST(0);
129
130 /*
131 * Align buffer down to avoid double load overhead from unaligned access.
132 * Calculate a mask to ignore preceding bytes. Find start offset of final
133 * iteration and ensure it is not empty.
134 */
135 mask <<= ((uintptr_t) buf) % sizeof(__m512i);
136 tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
137 final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
138 buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
139
140 /*
141 * Iterate through all but the final iteration. Starting from the second
142 * iteration, the mask is ignored.
143 */
144 if (buf < final)
145 {
146 val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
148 accum = _mm512_add_epi64(accum, cnt);
149
150 buf += sizeof(__m512i);
151 mask = ~UINT64CONST(0);
152
153 for (; buf < final; buf += sizeof(__m512i))
154 {
155 val = _mm512_load_si512((const __m512i *) buf);
157 accum = _mm512_add_epi64(accum, cnt);
158 }
159 }
160
161 /* Final iteration needs to ignore bytes that are not within the length */
162 mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
163
164 val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
166 accum = _mm512_add_epi64(accum, cnt);
167
168 return _mm512_reduce_add_epi64(accum);
169}
170
171/*
172 * pg_popcount_masked_avx512
173 * Returns the number of 1-bits in buf after applying the mask to each byte
174 */
175pg_attribute_target("avx512vpopcntdq,avx512bw")
177pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
178{
179 __m512i val,
180 vmasked,
181 cnt;
183 const char *final;
184 int tail_idx;
186 const __m512i maskv = _mm512_set1_epi8(mask);
187
188 /*
189 * Align buffer down to avoid double load overhead from unaligned access.
190 * Calculate a mask to ignore preceding bytes. Find start offset of final
191 * iteration and ensure it is not empty.
192 */
193 bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
194 tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
195 final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
196 buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
197
198 /*
199 * Iterate through all but the final iteration. Starting from the second
200 * iteration, the mask is ignored.
201 */
202 if (buf < final)
203 {
207 accum = _mm512_add_epi64(accum, cnt);
208
209 buf += sizeof(__m512i);
210 bmask = ~UINT64CONST(0);
211
212 for (; buf < final; buf += sizeof(__m512i))
213 {
214 val = _mm512_load_si512((const __m512i *) buf);
217 accum = _mm512_add_epi64(accum, cnt);
218 }
219 }
220
221 /* Final iteration needs to ignore bytes that are not within the length */
222 bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
223
227 accum = _mm512_add_epi64(accum, cnt);
228
229 return _mm512_reduce_add_epi64(accum);
230}
231
232#endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
233
234/*
235 * pg_popcount64_sse42
236 * Return the number of 1 bits set in word
237 */
238static inline int
240{
241#ifdef _MSC_VER
242 return __popcnt64(word);
243#else
244 uint64 res;
245
246__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
247 return (int) res;
248#endif
249}
250
251/*
252 * pg_popcount_sse42
253 * Returns the number of 1-bits in buf
254 */
257pg_popcount_sse42(const char *buf, int bytes)
258{
259 uint64 popcnt = 0;
260 const uint64 *words = (const uint64 *) buf;
261
262 while (bytes >= 8)
263 {
264 popcnt += pg_popcount64_sse42(*words++);
265 bytes -= 8;
266 }
267
268 buf = (const char *) words;
269
270 /* Process any remaining bytes */
271 while (bytes--)
272 popcnt += pg_number_of_ones[(unsigned char) *buf++];
273
274 return popcnt;
275}
276
277/*
278 * pg_popcount_masked_sse42
279 * Returns the number of 1-bits in buf after applying the mask to each byte
280 */
283pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask)
284{
285 uint64 popcnt = 0;
286 uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
287 const uint64 *words = (const uint64 *) buf;
288
289 while (bytes >= 8)
290 {
291 popcnt += pg_popcount64_sse42(*words++ & maskv);
292 bytes -= 8;
293 }
294
295 buf = (const char *) words;
296
297 /* Process any remaining bytes */
298 while (bytes--)
299 popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
300
301 return popcnt;
302}
303
304#endif /* HAVE_X86_64_POPCNTQ */
#define pg_attribute_no_sanitize_alignment()
Definition c.h:201
uint8 bits8
Definition c.h:565
uint64_t uint64
Definition c.h:559
#define pg_attribute_target(...)
Definition c.h:224
#define TYPEALIGN_DOWN(ALIGNVAL, LEN)
Definition c.h:843
long val
Definition informix.c:689
uint64 pg_popcount_portable(const char *buf, int bytes)
uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
PGDLLIMPORT const uint8 pg_number_of_ones[256]
Definition pg_bitutils.c:80
uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask)
uint64 pg_popcount_optimized(const char *buf, int bytes)
static uint64 pg_popcount_masked(const char *buf, int bytes, bits8 mask)
static char buf[DEFAULT_XLOG_SEG_SIZE]
static int fb(int x)
static void word(struct vars *v, int dir, struct state *lp, struct state *rp)
Definition regcomp.c:1476