pg_popcount_aarch64.c
/*-------------------------------------------------------------------------
 *
 * pg_popcount_aarch64.c
 *    Holds the AArch64 popcount implementations.
 *
 * Copyright (c) 2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *    src/port/pg_popcount_aarch64.c
 *
 *-------------------------------------------------------------------------
 */
#include "c.h"

#include "port/pg_bitutils.h"

#ifdef POPCNT_AARCH64

#include <arm_neon.h>

#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
#include <arm_sve.h>

#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
#include <sys/auxv.h>
#endif
#endif

/*
 * The Neon versions are built regardless of whether we are building the SVE
 * versions.
 */
static uint64 pg_popcount_neon(const char *buf, int bytes);
static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask);

#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK

/*
 * These are the SVE implementations of the popcount functions.
 */
static uint64 pg_popcount_sve(const char *buf, int bytes);
static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask);

/*
 * The function pointers are initially set to "choose" functions.  These
 * functions will first set the pointers to the right implementations (based
 * on what the current CPU supports) and then will call the pointer to
 * fulfill the caller's request.
 */
static uint64 pg_popcount_choose(const char *buf, int bytes);
static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;

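/*
 * Check at runtime whether the CPU supports SVE.  The kernel reports
 * available instruction-set extensions through AT_HWCAP; elf_aux_info() is
 * the FreeBSD interface and getauxval() the Linux one.  If neither is
 * available, conservatively report that SVE is unsupported.
 */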
static inline bool
pg_popcount_sve_available(void)
{
#ifdef HAVE_ELF_AUX_INFO
    unsigned long value;

    return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
        (value & HWCAP_SVE) != 0;
#elif defined(HAVE_GETAUXVAL)
    return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
#else
    return false;
#endif
}

static inline void
choose_popcount_functions(void)
{
    if (pg_popcount_sve_available())
    {
        pg_popcount_optimized = pg_popcount_sve;
        pg_popcount_masked_optimized = pg_popcount_masked_sve;
    }
    else
    {
        pg_popcount_optimized = pg_popcount_neon;
        pg_popcount_masked_optimized = pg_popcount_masked_neon;
    }
}

static uint64
pg_popcount_choose(const char *buf, int bytes)
{
    choose_popcount_functions();
    return pg_popcount_optimized(buf, bytes);
}

static uint64
pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
{
    choose_popcount_functions();
    return pg_popcount_masked_optimized(buf, bytes, mask);
}

/*
 * pg_popcount_sve
 *    Returns number of 1 bits in buf
 */
pg_attribute_target("arch=armv8-a+sve")
static uint64
pg_popcount_sve(const char *buf, int bytes)
{
    svbool_t    pred = svptrue_b64();
    svuint64_t  accum1 = svdup_u64(0),
                accum2 = svdup_u64(0),
                accum3 = svdup_u64(0),
                accum4 = svdup_u64(0);
    uint32      vec_len = svcntb(),
                bytes_per_iteration = 4 * vec_len;
    uint64      popcnt = 0;

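    /*
     * SVE vectors have an implementation-defined length, so query the byte
     * width at runtime with svcntb().  svptrue_b64() gives an all-active
     * predicate for the full-vector loads below; svcnt_u64_x() then counts
     * the 1 bits within each 64-bit lane.
     */
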
    /*
     * For better instruction-level parallelism, each loop iteration operates
     * on a block of four registers.
     */
    for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
    {
        svuint64_t  vec;

        vec = svld1_u64(pred, (const uint64 *) buf);
        accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svld1_u64(pred, (const uint64 *) buf);
        accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svld1_u64(pred, (const uint64 *) buf);
        accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svld1_u64(pred, (const uint64 *) buf);
        accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
        buf += vec_len;
    }

    /*
     * If enough data remains, do another iteration on a block of two
     * registers.
     */
    bytes_per_iteration = 2 * vec_len;
    if (bytes >= bytes_per_iteration)
    {
        svuint64_t  vec;

        vec = svld1_u64(pred, (const uint64 *) buf);
        accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svld1_u64(pred, (const uint64 *) buf);
        accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
        buf += vec_len;

        bytes -= bytes_per_iteration;
    }

    /*
     * Add the accumulators.
     */
    popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
    popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));

    /*
     * Process any remaining data.  svwhilelt_b8_s32(0, bytes) builds a
     * predicate with only the first "bytes" lanes active, so the final
     * partial vector is loaded without reading past the end of the buffer.
     */
    for (; bytes > 0; bytes -= vec_len)
    {
        svuint8_t   vec;

        pred = svwhilelt_b8_s32(0, bytes);
        vec = svld1_u8(pred, (const uint8 *) buf);
        popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
        buf += vec_len;
    }

    return popcnt;
}

/*
 * pg_popcount_masked_sve
 *    Returns number of 1 bits in buf after applying the mask to each byte
 */
pg_attribute_target("arch=armv8-a+sve")
static uint64
pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
{
    svbool_t    pred = svptrue_b64();
    svuint64_t  accum1 = svdup_u64(0),
                accum2 = svdup_u64(0),
                accum3 = svdup_u64(0),
                accum4 = svdup_u64(0);
    uint32      vec_len = svcntb(),
                bytes_per_iteration = 4 * vec_len;
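    /*
     * ~0 / 0xFF is 0x0101010101010101, so multiplying it by the one-byte
     * mask below replicates the mask into each of mask64's eight bytes for
     * use with the 64-bit lane loads.
     */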
    uint64      popcnt = 0,
                mask64 = ~UINT64CONST(0) / 0xFF * mask;

    /*
     * For better instruction-level parallelism, each loop iteration operates
     * on a block of four registers.
     */
    for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
    {
        svuint64_t  vec;

        vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
        accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
        accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
        accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
        accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
        buf += vec_len;
    }

    /*
     * If enough data remains, do another iteration on a block of two
     * registers.
     */
    bytes_per_iteration = 2 * vec_len;
    if (bytes >= bytes_per_iteration)
    {
        svuint64_t  vec;

        vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
        accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
        accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
        buf += vec_len;

        bytes -= bytes_per_iteration;
    }

    /*
     * Add the accumulators.
     */
    popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
    popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));

    /*
     * Process any remaining data.
     */
    for (; bytes > 0; bytes -= vec_len)
    {
        svuint8_t   vec;

        pred = svwhilelt_b8_s32(0, bytes);
        vec = svand_n_u8_x(pred, svld1_u8(pred, (const uint8 *) buf), mask);
        popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
        buf += vec_len;
    }

    return popcnt;
}

#else                           /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */

/*
 * When the SVE version isn't available, there's no point in using function
 * pointers to vary the implementation.  We instead just make these actual
 * external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined.
 * The compiler should be able to inline the Neon versions here.
 */
uint64
pg_popcount_optimized(const char *buf, int bytes)
{
    return pg_popcount_neon(buf, bytes);
}

uint64
pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
{
    return pg_popcount_masked_neon(buf, bytes, mask);
}

#endif                          /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */

/*
 * pg_popcount32
 *    Return number of 1 bits in word
 */
int
pg_popcount32(uint32 word)
{
    /* The cast zero-extends, so the upper 32 bits contribute no 1 bits. */
    return pg_popcount64((uint64) word);
}

/*
 * pg_popcount64
 *    Return number of 1 bits in word
 */
int
pg_popcount64(uint64 word)
{
    /*
     * For some compilers, __builtin_popcountl() already emits Neon
     * instructions.  The line below should compile to the same code on
     * those systems.
     */
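    /*
     * vld1_u8() loads the word's eight bytes into a 64-bit Neon register,
     * vcnt_u8() counts the 1 bits in each byte, and vaddv_u8() sums the
     * eight per-byte counts into the final result.
     */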
    return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
}

/*
 * pg_popcount_neon
 *    Returns number of 1 bits in buf
 */
static uint64
pg_popcount_neon(const char *buf, int bytes)
{
    uint8x16_t  vec;
    uint64x2_t  accum1 = vdupq_n_u64(0),
                accum2 = vdupq_n_u64(0),
                accum3 = vdupq_n_u64(0),
                accum4 = vdupq_n_u64(0);
    uint32      bytes_per_iteration = 4 * sizeof(uint8x16_t);
    uint64      popcnt = 0;

    /*
     * For better instruction-level parallelism, each loop iteration operates
     * on a block of four registers.
     */
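    /*
     * Each step counts 1 bits per byte with vcntq_u8(), widens the per-byte
     * counts pairwise (u8 -> u16 -> u32) with the vpaddlq intrinsics, and
     * accumulates into 64-bit lanes with vpadalq_u32(), which keeps the
     * accumulators from overflowing.
     */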
    for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
    {
        vec = vld1q_u8((const uint8 *) buf);
        accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vld1q_u8((const uint8 *) buf);
        accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vld1q_u8((const uint8 *) buf);
        accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vld1q_u8((const uint8 *) buf);
        accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);
    }

    /*
     * If enough data remains, do another iteration on a block of two
     * registers.
     */
    bytes_per_iteration = 2 * sizeof(uint8x16_t);
    if (bytes >= bytes_per_iteration)
    {
        vec = vld1q_u8((const uint8 *) buf);
        accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vld1q_u8((const uint8 *) buf);
        accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        bytes -= bytes_per_iteration;
    }

    /*
     * Add the accumulators.
     */
    popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
    popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));

    /*
     * Process remaining 8-byte blocks.
     */
    for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
    {
        popcnt += pg_popcount64(*((uint64 *) buf));
        buf += sizeof(uint64);
    }

    /*
     * Process any remaining data byte-by-byte.
     */
    while (bytes--)
        popcnt += pg_number_of_ones[(unsigned char) *buf++];

    return popcnt;
}

/*
 * pg_popcount_masked_neon
 *    Returns number of 1 bits in buf after applying the mask to each byte
 */
static uint64
pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask)
{
    uint8x16_t  vec,
                maskv = vdupq_n_u8(mask);
    uint64x2_t  accum1 = vdupq_n_u64(0),
                accum2 = vdupq_n_u64(0),
                accum3 = vdupq_n_u64(0),
                accum4 = vdupq_n_u64(0);
    uint32      bytes_per_iteration = 4 * sizeof(uint8x16_t);
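    /*
     * maskv applies the mask byte-wise in the vector loops below, while
     * mask64 replicates the same byte mask across a uint64 for the scalar
     * eight-byte tail loop.
     */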
    uint64      popcnt = 0,
                mask64 = ~UINT64CONST(0) / 0xFF * mask;

    /*
     * For better instruction-level parallelism, each loop iteration operates
     * on a block of four registers.
     */
    for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
    {
        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
        accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
        accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
        accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
        accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);
    }

    /*
     * If enough data remains, do another iteration on a block of two
     * registers.
     */
    bytes_per_iteration = 2 * sizeof(uint8x16_t);
    if (bytes >= bytes_per_iteration)
    {
        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
        accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
        accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        bytes -= bytes_per_iteration;
    }

    /*
     * Add the accumulators.
     */
    popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
    popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));

    /*
     * Process remaining 8-byte blocks.
     */
    for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
    {
        popcnt += pg_popcount64(*((uint64 *) buf) & mask64);
        buf += sizeof(uint64);
    }

    /*
     * Process any remaining data byte-by-byte.
     */
    while (bytes--)
        popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];

    return popcnt;
}

#endif                          /* POPCNT_AARCH64 */