PostgreSQL Source Code
git master
Loading...
Searching...
No Matches
pg_popcount_aarch64.c
Go to the documentation of this file.
1
/*-------------------------------------------------------------------------
2
*
3
* pg_popcount_aarch64.c
4
* Holds the AArch64 popcount implementations.
5
*
6
* Copyright (c) 2025-2026, PostgreSQL Global Development Group
7
*
8
* IDENTIFICATION
9
* src/port/pg_popcount_aarch64.c
10
*
11
*-------------------------------------------------------------------------
12
*/
13
#include "
c.h
"
14
15
#ifdef USE_NEON
16
17
#include <arm_neon.h>
18
19
#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
20
#include <arm_sve.h>
21
22
#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
23
#include <sys/auxv.h>
24
/* Ancient glibc releases don't include the HWCAPxxx macros in sys/auxv.h */
25
#if defined(__linux__) && !defined(HWCAP_SVE)
26
#include <asm/hwcap.h>
27
#endif
28
#endif
29
#endif
30
31
#include "
port/pg_bitutils.h
"
32
33
/*
 * The Neon versions are built regardless of whether we are building the SVE
 * versions.
 */
static uint64 pg_popcount_neon(const char *buf, int bytes);
static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask);

#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK

/*
 * These are the SVE implementations of the popcount functions.
 */
static uint64 pg_popcount_sve(const char *buf, int bytes);
static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask);

/*
 * The function pointers are initially set to "choose" functions. These
 * functions will first set the pointers to the right implementations (based on
 * what the current CPU supports) and then will call the pointer to fulfill the
 * caller's request.
 */
static uint64 pg_popcount_choose(const char *buf, int bytes);
static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);

/*
 * Exported function pointers (declared in port/pg_bitutils.h).  They start
 * out pointing at the "choose" stubs above and are re-pointed at the real
 * implementations on first use.
 */
uint64		(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
uint64		(*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
58
59
/*
 * pg_popcount_sve_available
 *		Returns true if the runtime CPU advertises SVE support.
 *
 * The check reads the AT_HWCAP entry of the auxiliary vector, via whichever
 * interface the platform provides (elf_aux_info or getauxval), and tests the
 * HWCAP_SVE bit.  If neither interface is available, we conservatively report
 * that SVE is not available.
 */
static inline bool
pg_popcount_sve_available(void)
{
#ifdef HAVE_ELF_AUX_INFO
	unsigned long value;

	return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
		(value & HWCAP_SVE) != 0;
#elif defined(HAVE_GETAUXVAL)
	return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
#else
	return false;
#endif
}
73
74
static
inline
void
75
choose_popcount_functions
(
void
)
76
{
77
if
(
pg_popcount_sve_available
())
78
{
79
pg_popcount_optimized
=
pg_popcount_sve
;
80
pg_popcount_masked_optimized
=
pg_popcount_masked_sve
;
81
}
82
else
83
{
84
pg_popcount_optimized
=
pg_popcount_neon
;
85
pg_popcount_masked_optimized
=
pg_popcount_masked_neon
;
86
}
87
}
88
89
/*
 * pg_popcount_choose
 *		One-time trampoline: pick the right implementations, then delegate.
 */
static uint64
pg_popcount_choose(const char *buf, int bytes)
{
	/* Re-point the function pointers, then call the freshly-installed one. */
	choose_popcount_functions();
	return pg_popcount_optimized(buf, bytes);
}
95
96
/*
 * pg_popcount_masked_choose
 *		One-time trampoline for the masked variant: pick the right
 *		implementations, then delegate.
 */
static uint64
pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
{
	choose_popcount_functions();
	return pg_popcount_masked_optimized(buf, bytes, mask);
}
102
103
/*
 * pg_popcount_sve
 *		Returns number of 1 bits in buf
 */
pg_attribute_target("arch=armv8-a+sve")
static uint64
pg_popcount_sve(const char *buf, int bytes)
{
	/* All-lanes-true predicate: the main loops always load full vectors. */
	svbool_t	pred = svptrue_b64();

	/*
	 * Four independent 64-bit-lane accumulators, so the unrolled loop body
	 * has no cross-iteration data dependency between its four counts.
	 */
	svuint64_t	accum1 = svdup_u64(0),
				accum2 = svdup_u64(0),
				accum3 = svdup_u64(0),
				accum4 = svdup_u64(0);

	/* svcntb() yields the SVE register width in bytes (unknown at compile time) */
	uint32		vec_len = svcntb(),
				bytes_per_iteration = 4 * vec_len;
	uint64		popcnt = 0;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		svuint64_t	vec;

		/* svcnt_u64_x counts bits per 64-bit lane; sum lanes into accum */
		vec = svld1_u64(pred, (const uint64 *) buf);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
		buf += vec_len;
	}

	/*
	 * If enough data remains, do another iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * vec_len;
	if (bytes >= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators (horizontal reduction of the 64-bit lanes).
	 */
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));

	/*
	 * Process any remaining data.  The predicate is re-derived each pass
	 * with svwhilelt_b8_s32(0, bytes) so only the first "bytes" byte lanes
	 * are active, which keeps the final partial-vector load in bounds.
	 */
	for (; bytes > 0; bytes -= vec_len)
	{
		svuint8_t	vec;

		pred = svwhilelt_b8_s32(0, bytes);
		vec = svld1_u8(pred, (const uint8 *) buf);
		popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
		buf += vec_len;
	}

	return popcnt;
}
186
187
/*
 * pg_popcount_masked_sve
 *		Returns number of 1 bits in buf after applying the mask to each byte
 */
pg_attribute_target("arch=armv8-a+sve")
static uint64
pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
{
	/* All-lanes-true predicate: the main loops always load full vectors. */
	svbool_t	pred = svptrue_b64();

	/* Four independent accumulators for instruction-level parallelism. */
	svuint64_t	accum1 = svdup_u64(0),
				accum2 = svdup_u64(0),
				accum3 = svdup_u64(0),
				accum4 = svdup_u64(0);

	/* svcntb() yields the SVE register width in bytes (unknown at compile time) */
	uint32		vec_len = svcntb(),
				bytes_per_iteration = 4 * vec_len;

	/*
	 * mask64 replicates the single mask byte into all eight bytes of a
	 * 64-bit word: ~0 / 0xFF == 0x0101010101010101, times mask.  This lets
	 * us AND the mask against whole 64-bit lanes.
	 */
	uint64		popcnt = 0,
				mask64 = ~UINT64CONST(0) / 0xFF * mask;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		svuint64_t	vec;

		/* load, mask off unwanted bits, then count per 64-bit lane */
		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
		buf += vec_len;
	}

	/*
	 * If enough data remains, do another iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * vec_len;
	if (bytes >= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators (horizontal reduction of the 64-bit lanes).
	 */
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));

	/*
	 * Process any remaining data.  The byte-granular predicate from
	 * svwhilelt_b8_s32 keeps the final partial-vector load in bounds; here
	 * the single-byte mask is applied directly per byte lane.
	 */
	for (; bytes > 0; bytes -= vec_len)
	{
		svuint8_t	vec;

		pred = svwhilelt_b8_s32(0, bytes);
		vec = svand_n_u8_x(pred, svld1_u8(pred, (const uint8 *) buf), mask);
		popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
		buf += vec_len;
	}

	return popcnt;
}
271
272
#else
/* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
273
274
/*
 * When the SVE version isn't available, there's no point in using function
 * pointers to vary the implementation. We instead just make these actual
 * external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined.
 * The compiler should be able to inline the Neon versions here.
 */
uint64
pg_popcount_optimized(const char *buf, int bytes)
{
	return pg_popcount_neon(buf, bytes);
}
285
286
/*
 * Masked counterpart: direct call to the Neon implementation when the SVE
 * runtime check is not compiled in.
 */
uint64
pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
{
	return pg_popcount_masked_neon(buf, bytes, mask);
}
291
292
#endif
/* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
293
294
/*
 * pg_popcount64_neon
 *		Return number of 1 bits in word
 *
 * Loads the 8-byte word into a 64-bit Neon register, counts the set bits in
 * each byte (vcnt_u8), and sums the eight per-byte counts (vaddv_u8).  The
 * result fits easily in an int (max 64).
 */
static inline int
pg_popcount64_neon(uint64 word)
{
	return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
}
303
304
/*
 * pg_popcount_neon
 *		Returns number of 1 bits in buf
 */
static uint64
pg_popcount_neon(const char *buf, int bytes)
{
	uint8x16_t	vec;

	/* Four independent accumulators for instruction-level parallelism. */
	uint64x2_t	accum1 = vdupq_n_u64(0),
				accum2 = vdupq_n_u64(0),
				accum3 = vdupq_n_u64(0),
				accum4 = vdupq_n_u64(0);
	uint32		bytes_per_iteration = 4 * sizeof(uint8x16_t);
	uint64		popcnt = 0;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 *
	 * Each step counts bits per byte (vcntq_u8), then pairwise-widens the
	 * counts u8 -> u16 -> u32 (vpaddlq_*) and accumulates into 64-bit lanes
	 * (vpadalq_u32), so the running totals cannot overflow.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		vec = vld1q_u8((const uint8 *) buf);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);
	}

	/*
	 * If enough data remains, do another iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * sizeof(uint8x16_t);
	if (bytes >= bytes_per_iteration)
	{
		vec = vld1q_u8((const uint8 *) buf);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators (horizontal reduction of the 64-bit lanes).
	 */
	popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
	popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));

	/*
	 * Process remaining 8-byte blocks.
	 */
	for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
	{
		popcnt += pg_popcount64_neon(*((const uint64 *) buf));
		buf += sizeof(uint64);
	}

	/*
	 * Process any remaining data byte-by-byte.
	 */
	while (bytes--)
		popcnt += pg_number_of_ones[(unsigned char) *buf++];

	return popcnt;
}
383
384
/*
 * pg_popcount_masked_neon
 *		Returns number of 1 bits in buf after applying the mask to each byte
 */
static uint64
pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask)
{
	uint8x16_t	vec,
				maskv = vdupq_n_u8(mask);	/* mask byte in every lane */

	/* Four independent accumulators for instruction-level parallelism. */
	uint64x2_t	accum1 = vdupq_n_u64(0),
				accum2 = vdupq_n_u64(0),
				accum3 = vdupq_n_u64(0),
				accum4 = vdupq_n_u64(0);
	uint32		bytes_per_iteration = 4 * sizeof(uint8x16_t);

	/*
	 * mask64 replicates the mask byte into all eight bytes of a 64-bit word
	 * (~0 / 0xFF == 0x0101010101010101), for the scalar 8-byte tail loop.
	 */
	uint64		popcnt = 0,
				mask64 = ~UINT64CONST(0) / 0xFF * mask;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 *
	 * Each step ANDs the loaded bytes with the mask, counts bits per byte
	 * (vcntq_u8), then pairwise-widens u8 -> u16 -> u32 (vpaddlq_*) and
	 * accumulates into 64-bit lanes (vpadalq_u32).
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);
	}

	/*
	 * If enough data remains, do another iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * sizeof(uint8x16_t);
	if (bytes >= bytes_per_iteration)
	{
		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators (horizontal reduction of the 64-bit lanes).
	 */
	popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
	popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));

	/*
	 * Process remaining 8-byte blocks.
	 */
	for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
	{
		popcnt += pg_popcount64_neon(*((const uint64 *) buf) & mask64);
		buf += sizeof(uint64);
	}

	/*
	 * Process any remaining data byte-by-byte.
	 */
	while (bytes--)
		popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];

	return popcnt;
}
465
466
#endif
/* USE_NEON */
c.h
uint8
uint8_t uint8
Definition
c.h:556
bits8
uint8 bits8
Definition
c.h:565
uint64
uint64_t uint64
Definition
c.h:559
uint32
uint32_t uint32
Definition
c.h:558
pg_attribute_target
#define pg_attribute_target(...)
Definition
c.h:224
value
static struct @174 value
pg_bitutils.h
pg_popcount_masked_optimized
uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
Definition
pg_bitutils.c:189
pg_number_of_ones
PGDLLIMPORT const uint8 pg_number_of_ones[256]
Definition
pg_bitutils.c:80
pg_popcount_optimized
uint64 pg_popcount_optimized(const char *buf, int bytes)
Definition
pg_bitutils.c:179
buf
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition
pg_test_fsync.c:71
fb
static int fb(int x)
Definition
preproc-init.c:92
word
static void word(struct vars *v, int dir, struct state *lp, struct state *rp)
Definition
regcomp.c:1476
src
port
pg_popcount_aarch64.c
Generated on Mon Mar 2 2026 00:13:18 for PostgreSQL Source Code by
1.9.8