PostgreSQL Source Code git master
pg_popcount_aarch64.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * pg_popcount_aarch64.c
4 * Holds the AArch64 popcount implementations.
5 *
6 * Copyright (c) 2025, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/port/pg_popcount_aarch64.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "c.h"
14
15#include "port/pg_bitutils.h"
16
17#ifdef POPCNT_AARCH64
18
19#include <arm_neon.h>
20
21#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
22#include <arm_sve.h>
23
24#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
25#include <sys/auxv.h>
26/* Ancient glibc releases don't include the HWCAPxxx macros in sys/auxv.h */
27#if defined(__linux__) && !defined(HWCAP_SVE)
28#include <asm/hwcap.h>
29#endif
30#endif
31#endif
32
33/*
34 * The Neon versions are built regardless of whether we are building the SVE
35 * versions.
36 */
37static uint64 pg_popcount_neon(const char *buf, int bytes);
38static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask);
39
40#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
41
42/*
43 * These are the SVE implementations of the popcount functions.
44 */
45static uint64 pg_popcount_sve(const char *buf, int bytes);
46static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask);
47
48/*
49 * The function pointers are initially set to "choose" functions. These
50 * functions will first set the pointers to the right implementations (based on
51 * what the current CPU supports) and then will call the pointer to fulfill the
52 * caller's request.
53 */
54static uint64 pg_popcount_choose(const char *buf, int bytes);
55static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
56uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
57uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
58
/*
 * pg_popcount_sve_available
 *		Returns true if the running CPU reports SVE support via the auxiliary
 *		vector, i.e. if it is safe to call the SVE implementations below.
 */
static inline bool
pg_popcount_sve_available(void)
{
#ifdef HAVE_ELF_AUX_INFO
	unsigned long value;

	/*
	 * FreeBSD-style interface: elf_aux_info() returns 0 on success and fills
	 * in the AT_HWCAP bitmask, in which HWCAP_SVE indicates SVE support.
	 */
	return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
		(value & HWCAP_SVE) != 0;
#elif defined(HAVE_GETAUXVAL)
	/* Linux/glibc interface: query the AT_HWCAP bitmask directly. */
	return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
#else
	/* No way to query hardware capabilities; conservatively assume no SVE. */
	return false;
#endif
}
73
74static inline void
75choose_popcount_functions(void)
76{
77 if (pg_popcount_sve_available())
78 {
79 pg_popcount_optimized = pg_popcount_sve;
80 pg_popcount_masked_optimized = pg_popcount_masked_sve;
81 }
82 else
83 {
84 pg_popcount_optimized = pg_popcount_neon;
85 pg_popcount_masked_optimized = pg_popcount_masked_neon;
86 }
87}
88
89static uint64
90pg_popcount_choose(const char *buf, int bytes)
91{
92 choose_popcount_functions();
93 return pg_popcount_optimized(buf, bytes);
94}
95
96static uint64
97pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
98{
99 choose_popcount_functions();
100 return pg_popcount_masked_optimized(buf, bytes, mask);
101}
102
/*
 * pg_popcount_sve
 *		Returns number of 1 bits in buf
 */
pg_attribute_target("arch=armv8-a+sve")
static uint64
pg_popcount_sve(const char *buf, int bytes)
{
	/* All-true predicate: the main loops always process full vectors. */
	svbool_t	pred = svptrue_b64();
	svuint64_t	accum1 = svdup_u64(0),
				accum2 = svdup_u64(0),
				accum3 = svdup_u64(0),
				accum4 = svdup_u64(0);
	/* svcntb() is the SVE vector length in bytes, known only at run time. */
	uint32		vec_len = svcntb(),
				bytes_per_iteration = 4 * vec_len;
	uint64		popcnt = 0;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 *
	 * svcnt_u64_x() counts the 1 bits within each 64-bit lane; the per-lane
	 * counts are summed into vector accumulators and only reduced to a
	 * scalar once, after the loops.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
		buf += vec_len;
	}

	/*
	 * If enough data remains, do another iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * vec_len;
	if (bytes >= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators.
	 */
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));

	/*
	 * Process any remaining data.  svwhilelt_b8_s32() builds a predicate
	 * covering only the first "bytes" lanes, so the final (possibly partial)
	 * vector load cannot read past the end of the buffer.
	 */
	for (; bytes > 0; bytes -= vec_len)
	{
		svuint8_t	vec;

		pred = svwhilelt_b8_s32(0, bytes);
		vec = svld1_u8(pred, (const uint8 *) buf);
		popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
		buf += vec_len;
	}

	return popcnt;
}
186
/*
 * pg_popcount_masked_sve
 *		Returns number of 1 bits in buf after applying the mask to each byte
 */
pg_attribute_target("arch=armv8-a+sve")
static uint64
pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
{
	/* All-true predicate: the main loops always process full vectors. */
	svbool_t	pred = svptrue_b64();
	svuint64_t	accum1 = svdup_u64(0),
				accum2 = svdup_u64(0),
				accum3 = svdup_u64(0),
				accum4 = svdup_u64(0);
	/* svcntb() is the SVE vector length in bytes, known only at run time. */
	uint32		vec_len = svcntb(),
				bytes_per_iteration = 4 * vec_len;

	/*
	 * mask64 replicates the 8-bit mask into every byte of a 64-bit word
	 * (~0/0xFF == 0x0101...01), so one AND masks eight bytes at a time.
	 */
	uint64		popcnt = 0,
				mask64 = ~UINT64CONST(0) / 0xFF * mask;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
		buf += vec_len;
	}

	/*
	 * If enough data remains, do another iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * vec_len;
	if (bytes >= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators.
	 */
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));

	/*
	 * Process any remaining data.  svwhilelt_b8_s32() builds a predicate
	 * covering only the first "bytes" lanes, so the final (possibly partial)
	 * vector load cannot read past the end of the buffer.  Here the 8-bit
	 * mask is applied per byte rather than via mask64.
	 */
	for (; bytes > 0; bytes -= vec_len)
	{
		svuint8_t	vec;

		pred = svwhilelt_b8_s32(0, bytes);
		vec = svand_n_u8_x(pred, svld1_u8(pred, (const uint8 *) buf), mask);
		popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
		buf += vec_len;
	}

	return popcnt;
}
271
272#else /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
273
274/*
275 * When the SVE version isn't available, there's no point in using function
276 * pointers to vary the implementation. We instead just make these actual
277 * external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined.
278 * The compiler should be able to inline the Neon versions here.
279 */
280uint64
281pg_popcount_optimized(const char *buf, int bytes)
282{
283 return pg_popcount_neon(buf, bytes);
284}
285
286uint64
287pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
288{
289 return pg_popcount_masked_neon(buf, bytes, mask);
290}
291
292#endif /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
293
294/*
295 * pg_popcount32
296 * Return number of 1 bits in word
297 */
298int
300{
301 return pg_popcount64((uint64) word);
302}
303
/*
 * pg_popcount64
 *		Return number of 1 bits in word
 */
int
pg_popcount64(uint64 word)
{
	/*
	 * For some compilers, __builtin_popcountl() already emits Neon
	 * instructions.  The line below should compile to the same code on those
	 * systems: load the 8 bytes of "word" into a Neon register, count the 1
	 * bits in each byte, and sum the eight byte counts.
	 */
	return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
}
318
/*
 * pg_popcount_neon
 *		Returns number of 1 bits in buf
 */
static uint64
pg_popcount_neon(const char *buf, int bytes)
{
	uint8x16_t	vec;
	uint64x2_t	accum1 = vdupq_n_u64(0),
				accum2 = vdupq_n_u64(0),
				accum3 = vdupq_n_u64(0),
				accum4 = vdupq_n_u64(0);
	uint32		bytes_per_iteration = 4 * sizeof(uint8x16_t);
	uint64		popcnt = 0;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 *
	 * vcntq_u8() yields per-byte bit counts; the vpaddlq/vpadalq chain
	 * pairwise-widens them through 16- and 32-bit lanes into the 64-bit
	 * accumulators, so no lane can overflow for any "int" number of bytes.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		vec = vld1q_u8((const uint8 *) buf);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);
	}

	/*
	 * If enough data remains, do another iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * sizeof(uint8x16_t);
	if (bytes >= bytes_per_iteration)
	{
		vec = vld1q_u8((const uint8 *) buf);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators.
	 */
	popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
	popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));

	/*
	 * Process remaining 8-byte blocks.  NOTE(review): the direct uint64
	 * dereference presumes unaligned scalar loads are acceptable here, as is
	 * conventional on AArch64 — confirm against the project's portability
	 * assumptions.
	 */
	for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
	{
		popcnt += pg_popcount64(*((uint64 *) buf));
		buf += sizeof(uint64);
	}

	/*
	 * Process any remaining data byte-by-byte.
	 */
	while (bytes--)
		popcnt += pg_number_of_ones[(unsigned char) *buf++];

	return popcnt;
}
398
/*
 * pg_popcount_masked_neon
 *		Returns number of 1 bits in buf after applying the mask to each byte
 */
static uint64
pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask)
{
	/* maskv broadcasts the 8-bit mask into all 16 lanes of a Neon vector. */
	uint8x16_t	vec,
				maskv = vdupq_n_u8(mask);
	uint64x2_t	accum1 = vdupq_n_u64(0),
				accum2 = vdupq_n_u64(0),
				accum3 = vdupq_n_u64(0),
				accum4 = vdupq_n_u64(0);
	uint32		bytes_per_iteration = 4 * sizeof(uint8x16_t);

	/*
	 * mask64 replicates the 8-bit mask into every byte of a 64-bit word
	 * (~0/0xFF == 0x0101...01), for the scalar 8-byte tail loop below.
	 */
	uint64		popcnt = 0,
				mask64 = ~UINT64CONST(0) / 0xFF * mask;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.  The vcntq_u8() + vpaddlq/vpadalq chain
	 * widens per-byte counts into the 64-bit accumulators without overflow.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);
	}

	/*
	 * If enough data remains, do another iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * sizeof(uint8x16_t);
	if (bytes >= bytes_per_iteration)
	{
		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators.
	 */
	popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
	popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));

	/*
	 * Process remaining 8-byte blocks, masking eight bytes at a time with
	 * mask64.  NOTE(review): presumes unaligned uint64 loads are acceptable,
	 * as is conventional on AArch64 — confirm against project assumptions.
	 */
	for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
	{
		popcnt += pg_popcount64(*((uint64 *) buf) & mask64);
		buf += sizeof(uint64);
	}

	/*
	 * Process any remaining data byte-by-byte.
	 */
	while (bytes--)
		popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];

	return popcnt;
}
480
481#endif /* POPCNT_AARCH64 */
uint8_t uint8
Definition: c.h:539
uint8 bits8
Definition: c.h:548
uint64_t uint64
Definition: c.h:542
uint32_t uint32
Definition: c.h:541
#define pg_attribute_target(...)
Definition: c.h:217
static struct @171 value
uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
Definition: pg_bitutils.c:535
PGDLLIMPORT const uint8 pg_number_of_ones[256]
Definition: pg_bitutils.c:87
uint64 pg_popcount_optimized(const char *buf, int bytes)
Definition: pg_bitutils.c:525
int pg_popcount64(uint64 word)
Definition: pg_bitutils.c:515
int pg_popcount32(uint32 word)
Definition: pg_bitutils.c:509
static char * buf
Definition: pg_test_fsync.c:72
static void word(struct vars *v, int dir, struct state *lp, struct state *rp)
Definition: regcomp.c:1476