#include "c.h"

#include <arm_neon.h>

#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
#include <arm_sve.h>

#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
#include <sys/auxv.h>
#endif
#endif

#include "port/pg_bitutils.h"
static uint64 pg_popcount_neon(const char *buf, int bytes);
static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask);
#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK

static uint64 pg_popcount_sve(const char *buf, int bytes);
static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask);
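
/*
 * The "choose" functions are the initial targets of the function pointers
 * below.  On first call they probe the CPU for SVE support, repoint the
 * pointers at the best available implementation, and then forward the
 * caller's request.
 */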
static uint64 pg_popcount_choose(const char *buf, int bytes);
static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);

uint64      (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
uint64      (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
static inline bool
pg_popcount_sve_available(void)
{
#ifdef HAVE_ELF_AUX_INFO
    unsigned long value;

    return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
        (value & HWCAP_SVE) != 0;
#elif defined(HAVE_GETAUXVAL)
    return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
#else
    return false;
#endif
}
static inline void
choose_popcount_functions(void)
{
    if (pg_popcount_sve_available())
    {
        pg_popcount_optimized = pg_popcount_sve;
        pg_popcount_masked_optimized = pg_popcount_masked_sve;
    }
    else
    {
        pg_popcount_optimized = pg_popcount_neon;
        pg_popcount_masked_optimized = pg_popcount_masked_neon;
    }
}
static uint64
pg_popcount_choose(const char *buf, int bytes)
{
    choose_popcount_functions();
    return pg_popcount_optimized(buf, bytes);
}
static uint64
pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
{
    choose_popcount_functions();
    return pg_popcount_masked_optimized(buf, bytes, mask);
}
/*
 * pg_popcount_sve
 *		Returns the number of 1-bits in buf
 */
pg_attribute_target("arch=armv8-a+sve")
static uint64
pg_popcount_sve(const char *buf, int bytes)
{
    svbool_t    pred = svptrue_b64();
    svuint64_t  accum1 = svdup_u64(0),
                accum2 = svdup_u64(0),
                accum3 = svdup_u64(0),
                accum4 = svdup_u64(0);
    uint32      vec_len = svcntb(),
                bytes_per_iteration = 4 * vec_len;
    uint64      popcnt = 0;
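
    /*
     * For better instruction-level parallelism, each iteration of the loop
     * below operates on a block of four registers, each with its own
     * accumulator, so the dependency chains stay short.
     */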
    for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
    {
        svuint64_t  vec;

        vec = svld1_u64(pred, (const uint64 *) buf);
        accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svld1_u64(pred, (const uint64 *) buf);
        accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svld1_u64(pred, (const uint64 *) buf);
        accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svld1_u64(pred, (const uint64 *) buf);
        accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
        buf += vec_len;
    }
    /* If enough data remains, do one more block of two registers. */
    bytes_per_iteration = 2 * vec_len;
    if (bytes >= bytes_per_iteration)
    {
        svuint64_t  vec;

        vec = svld1_u64(pred, (const uint64 *) buf);
        accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svld1_u64(pred, (const uint64 *) buf);
        accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
        buf += vec_len;

        bytes -= bytes_per_iteration;
    }
    /* Combine the four accumulators into the scalar result. */
    popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
    popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));
    /*
     * Process any remaining data one vector at a time, using a predicate to
     * mask off the bytes past the end of the buffer.
     */
    for (; bytes > 0; bytes -= vec_len)
    {
        svuint8_t   vec;

        pred = svwhilelt_b8_s32(0, bytes);
        vec = svld1_u8(pred, (const uint8 *) buf);
        popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
        buf += vec_len;
    }

    return popcnt;
}
/*
 * pg_popcount_masked_sve
 *		Returns the number of 1-bits in buf after applying the mask to each byte
 */
pg_attribute_target("arch=armv8-a+sve")
static uint64
pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
{
    svbool_t    pred = svptrue_b64();
    svuint64_t  accum1 = svdup_u64(0),
                accum2 = svdup_u64(0),
                accum3 = svdup_u64(0),
                accum4 = svdup_u64(0);
    uint32      vec_len = svcntb(),
                bytes_per_iteration = 4 * vec_len;
    uint64      popcnt = 0,
                mask64 = ~UINT64CONST(0) / 0xFF * mask;
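
    /*
     * ~0 / 0xFF is 0x0101010101010101, so multiplying it by the one-byte
     * mask replicates the mask into every byte lane of a 64-bit word; e.g.
     * mask = 0x0f yields mask64 = 0x0f0f0f0f0f0f0f0f.
     */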
    /*
     * For better instruction-level parallelism, each loop iteration operates
     * on a block of four registers.
     */
    for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
    {
        svuint64_t  vec;

        vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
        accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
        accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
        accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
        accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
        buf += vec_len;
    }
    /* If enough data remains, do one more block of two registers. */
    bytes_per_iteration = 2 * vec_len;
    if (bytes >= bytes_per_iteration)
    {
        svuint64_t  vec;

        vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
        accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
        buf += vec_len;

        vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
        accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
        buf += vec_len;

        bytes -= bytes_per_iteration;
    }
    /* Combine the four accumulators into the scalar result. */
    popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
    popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));
    /*
     * Process any remaining data one vector at a time, using a predicate to
     * mask off the bytes past the end of the buffer.
     */
    for (; bytes > 0; bytes -= vec_len)
    {
        svuint8_t   vec;

        pred = svwhilelt_b8_s32(0, bytes);
        vec = svand_n_u8_x(pred, svld1_u8(pred, (const uint8 *) buf), mask);
        popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
        buf += vec_len;
    }

    return popcnt;
}
#else							/* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */

/*
 * When the SVE implementation is not built, there is nothing to choose at
 * runtime, so expose the Neon versions directly instead of going through
 * function pointers.
 */
uint64
pg_popcount_optimized(const char *buf, int bytes)
{
    return pg_popcount_neon(buf, bytes);
}

uint64
pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
{
    return pg_popcount_masked_neon(buf, bytes, mask);
}

#endif							/* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
/*
 * pg_popcount32
 *		Return the number of 1-bits in word
 */
int
pg_popcount32(uint32 word)
{
    return pg_popcount64((uint64) word);
}

/*
 * pg_popcount64
 *		Return the number of 1-bits in word
 */
int
pg_popcount64(uint64 word)
{
    return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
}
/*
 * pg_popcount_neon
 *		Returns the number of 1-bits in buf
 */
static uint64
pg_popcount_neon(const char *buf, int bytes)
{
    uint8x16_t  vec;
    uint64x2_t  accum1 = vdupq_n_u64(0),
                accum2 = vdupq_n_u64(0),
                accum3 = vdupq_n_u64(0),
                accum4 = vdupq_n_u64(0);
    uint32      bytes_per_iteration = 4 * sizeof(uint8x16_t);
    uint64      popcnt = 0;
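
    /*
     * Each 16-byte register is reduced with vcntq_u8 (per-byte popcount) and
     * then widened pairwise: vpaddlq_u8 (u8 -> u16), vpaddlq_u16 (u16 ->
     * u32), and vpadalq_u32, which adds into the 64-bit accumulator lanes.
     * Four independent accumulators keep the dependency chains short for
     * better instruction-level parallelism.
     */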
    for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
    {
        vec = vld1q_u8((const uint8 *) buf);
        accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vld1q_u8((const uint8 *) buf);
        accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vld1q_u8((const uint8 *) buf);
        accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vld1q_u8((const uint8 *) buf);
        accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);
    }
    /* If enough data remains, do one more block of two registers. */
    bytes_per_iteration = 2 * sizeof(uint8x16_t);
    if (bytes >= bytes_per_iteration)
    {
        vec = vld1q_u8((const uint8 *) buf);
        accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vld1q_u8((const uint8 *) buf);
        accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        bytes -= bytes_per_iteration;
    }
    /* Combine the four accumulators into the scalar result. */
    popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
    popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));
    /* Process remaining 8-byte blocks. */
    for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
    {
        popcnt += pg_popcount64(*((uint64 *) buf));
        buf += sizeof(uint64);
    }

    /* Process any remaining data byte-by-byte. */
    while (bytes--)
        popcnt += pg_number_of_ones[(unsigned char) *buf++];

    return popcnt;
}
/*
 * pg_popcount_masked_neon
 *		Returns the number of 1-bits in buf after applying the mask to each byte
 */
static uint64
pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask)
{
    uint8x16_t  vec,
                maskv = vdupq_n_u8(mask);
    uint64x2_t  accum1 = vdupq_n_u64(0),
                accum2 = vdupq_n_u64(0),
                accum3 = vdupq_n_u64(0),
                accum4 = vdupq_n_u64(0);
    uint32      bytes_per_iteration = 4 * sizeof(uint8x16_t);
    uint64      popcnt = 0,
                mask64 = ~UINT64CONST(0) / 0xFF * mask;
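
    /*
     * maskv applies the mask in the vector loops below; mask64 is the same
     * one-byte mask replicated across a 64-bit word for the scalar 8-byte
     * tail loop at the end.
     */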
    for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
    {
        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
        accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
        accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
        accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
        accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);
    }
    /* If enough data remains, do one more block of two registers. */
    bytes_per_iteration = 2 * sizeof(uint8x16_t);
    if (bytes >= bytes_per_iteration)
    {
        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
        accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
        accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
        buf += sizeof(uint8x16_t);

        bytes -= bytes_per_iteration;
    }
    /* Combine the four accumulators into the scalar result. */
    popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
    popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));
    /* Process remaining 8-byte blocks. */
    for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
    {
        popcnt += pg_popcount64(*((uint64 *) buf) & mask64);
        buf += sizeof(uint64);
    }

    /* Process any remaining data byte-by-byte. */
    while (bytes--)
        popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];

    return popcnt;
}
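
/*
 * Usage sketch: callers normally reach these implementations through the
 * pg_popcount() and pg_popcount_masked() wrappers declared in
 * port/pg_bitutils.h rather than calling them directly, e.g.:
 *
 *		uint64	ones = pg_popcount(buf, nbytes);
 */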