#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
#include <arm_sve.h>

#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
#include <sys/auxv.h>
#if defined(__linux__) && !defined(HWCAP_SVE)
#define HWCAP_SVE (1 << 22)
#endif
#endif
#endif

/*
 * The Neon versions are built regardless of whether we are building the SVE
 * versions.
 */
static uint64 pg_popcount_neon(const char *buf, int bytes);
static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask);

#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK

static uint64 pg_popcount_sve(const char *buf, int bytes);
static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask);

/*
 * The function pointers are initially set to "choose" functions.  These
 * functions first detect what the current CPU supports, set the pointers to
 * the corresponding implementations, and then call through the pointer to
 * fulfill the caller's request.
 */
static uint64 pg_popcount_choose(const char *buf, int bytes);
static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
uint64		(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
uint64		(*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
/*
 * Return true if the CPU advertises SVE support via the hardware
 * capability bits (AT_HWCAP).
 */
static inline bool
pg_popcount_sve_available(void)
{
#ifdef HAVE_ELF_AUX_INFO
	unsigned long value;

	return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
		(value & HWCAP_SVE) != 0;
#elif defined(HAVE_GETAUXVAL)
	return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
#else
	return false;
#endif
}
static inline void
choose_popcount_functions(void)
{
	if (pg_popcount_sve_available())
	{
		pg_popcount_optimized = pg_popcount_sve;
		pg_popcount_masked_optimized = pg_popcount_masked_sve;
	}
	else
	{
		pg_popcount_optimized = pg_popcount_neon;
		pg_popcount_masked_optimized = pg_popcount_masked_neon;
	}
}

static uint64
pg_popcount_choose(const char *buf, int bytes)
{
	choose_popcount_functions();
	return pg_popcount_optimized(buf, bytes);
}

static uint64
pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
{
	choose_popcount_functions();
	return pg_popcount_masked_optimized(buf, bytes, mask);
}
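/*
 * Usage sketch (hypothetical caller; "buf" and "len" are placeholders, not
 * names from this file): the first call through either pointer runs the
 * runtime check and installs the SVE or Neon implementation, so subsequent
 * calls dispatch directly to the chosen function:
 *
 *		uint64		ones = pg_popcount_optimized(buf, len);
 */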
/*
 * pg_popcount_sve
 *		Returns the number of 1-bits in buf
 */
pg_attribute_target("arch=armv8-a+sve")
static uint64
pg_popcount_sve(const char *buf, int bytes)
{
	svbool_t	pred = svptrue_b64();
	svuint64_t	accum1 = svdup_u64(0),
				accum2 = svdup_u64(0),
				accum3 = svdup_u64(0),
				accum4 = svdup_u64(0);
	uint32		vec_len = svcntb(),
				bytes_per_iteration = 4 * vec_len;
	uint64		popcnt = 0;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
		buf += vec_len;
	}

	/*
	 * If enough data remains, do one more iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * vec_len;
	if (bytes >= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svld1_u64(pred, (const uint64 *) buf);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators.
	 */
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));

	/*
	 * Process any remaining data with predicated loads so we never read past
	 * the end of the buffer.
	 */
	for (; bytes > 0; bytes -= vec_len)
	{
		svuint8_t	vec;

		pred = svwhilelt_b8_s32(0, bytes);
		vec = svld1_u8(pred, (const uint8 *) buf);
		popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
		buf += vec_len;
	}

	return popcnt;
}
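/*
 * Worked example for the tail loop above, assuming a hypothetical 256-bit
 * SVE implementation (vec_len = svcntb() = 32): with 20 bytes remaining,
 * svwhilelt_b8_s32(0, 20) produces a predicate whose first 20 byte lanes
 * are active, so svld1_u8 loads exactly those 20 bytes (never reading past
 * the buffer) and svaddv_u8 sums the per-byte bit counts of the active
 * lanes only.  The loop then exits because 20 - 32 <= 0.
 */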
/*
 * pg_popcount_masked_sve
 *		Returns the number of 1-bits in buf after applying the mask to each
 *		byte
 */
pg_attribute_target("arch=armv8-a+sve")
static uint64
pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
{
	svbool_t	pred = svptrue_b64();
	svuint64_t	accum1 = svdup_u64(0),
				accum2 = svdup_u64(0),
				accum3 = svdup_u64(0),
				accum4 = svdup_u64(0);
	uint32		vec_len = svcntb(),
				bytes_per_iteration = 4 * vec_len;
	uint64		popcnt = 0,
				mask64 = ~UINT64CONST(0) / 0xFF * mask;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
		buf += vec_len;
	}

	/*
	 * If enough data remains, do one more iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * vec_len;
	if (bytes >= bytes_per_iteration)
	{
		svuint64_t	vec;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
		buf += vec_len;

		vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
		accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
		buf += vec_len;

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators.
	 */
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
	popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));

	/*
	 * Process any remaining data with predicated loads.
	 */
	for (; bytes > 0; bytes -= vec_len)
	{
		svuint8_t	vec;

		pred = svwhilelt_b8_s32(0, bytes);
		vec = svand_n_u8_x(pred, svld1_u8(pred, (const uint8 *) buf), mask);
		popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
		buf += vec_len;
	}

	return popcnt;
}
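/*
 * Note on the mask64 computation above: ~UINT64CONST(0) / 0xFF equals
 * 0x0101010101010101, so multiplying by the one-byte mask replicates it
 * into every byte; e.g., mask = 0x0F yields mask64 = 0x0F0F0F0F0F0F0F0F.
 * ANDing uint64 lanes with mask64 is thus equivalent to ANDing each byte
 * with mask.
 */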
#else							/* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */

/*
 * When the SVE implementations are not built, the "optimized" entry points
 * are plain functions that call the Neon versions directly.
 */
uint64
pg_popcount_optimized(const char *buf, int bytes)
{
	return pg_popcount_neon(buf, bytes);
}

uint64
pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
{
	return pg_popcount_masked_neon(buf, bytes, mask);
}

#endif							/* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
/*
 * pg_popcount32
 *		Return the number of 1 bits set in word
 */
int
pg_popcount32(uint32 word)
{
	return pg_popcount64((uint64) word);
}

/*
 * pg_popcount64
 *		Return the number of 1 bits set in word
 */
int
pg_popcount64(uint64 word)
{
	return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
}
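/*
 * Example of the intrinsic chain above: vld1_u8 loads the eight bytes of
 * "word" into a 64-bit vector, vcnt_u8 yields per-byte bit counts (each
 * 0..8), and vaddv_u8 sums them.  For word = 0x00FF00FF00FF00FF, four bytes
 * count 8 and four count 0, giving 32.
 */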
/*
 * pg_popcount_neon
 *		Returns the number of 1-bits in buf
 */
static uint64
pg_popcount_neon(const char *buf, int bytes)
{
	uint8x16_t	vec;
	uint64x2_t	accum1 = vdupq_n_u64(0),
				accum2 = vdupq_n_u64(0),
				accum3 = vdupq_n_u64(0),
				accum4 = vdupq_n_u64(0);
	uint32		bytes_per_iteration = 4 * sizeof(uint8x16_t);
	uint64		popcnt = 0;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		vec = vld1q_u8((const uint8 *) buf);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);
	}

	/*
	 * If enough data remains, do one more iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * sizeof(uint8x16_t);
	if (bytes >= bytes_per_iteration)
	{
		vec = vld1q_u8((const uint8 *) buf);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vld1q_u8((const uint8 *) buf);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators.
	 */
	popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
	popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));

	/*
	 * Process remaining 8-byte blocks.
	 */
	for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
	{
		popcnt += pg_popcount64(*((uint64 *) buf));
		buf += sizeof(uint64);
	}

	/*
	 * Process any remaining data byte-by-byte.
	 */
	while (bytes--)
		popcnt += pg_number_of_ones[(unsigned char) *buf++];

	return popcnt;
}
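/*
 * How the accumulation above avoids overflow while staying in vector
 * registers: vcntq_u8 yields 16 per-byte counts (each 0..8), vpaddlq_u8
 * pairwise-widens them to 8 uint16 sums, vpaddlq_u16 widens again to 4
 * uint32 sums, and vpadalq_u32 adds those into two running uint64 lanes,
 * which the final vaddvq_u64 calls collapse to a scalar.
 */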
/*
 * pg_popcount_masked_neon
 *		Returns the number of 1-bits in buf after applying the mask to each
 *		byte
 */
static uint64
pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask)
{
	uint8x16_t	vec,
				maskv = vdupq_n_u8(mask);
	uint64x2_t	accum1 = vdupq_n_u64(0),
				accum2 = vdupq_n_u64(0),
				accum3 = vdupq_n_u64(0),
				accum4 = vdupq_n_u64(0);
	uint32		bytes_per_iteration = 4 * sizeof(uint8x16_t);
	uint64		popcnt = 0,
				mask64 = ~UINT64CONST(0) / 0xFF * mask;

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.
	 */
	for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
	{
		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);
	}

	/*
	 * If enough data remains, do one more iteration on a block of two
	 * registers.
	 */
	bytes_per_iteration = 2 * sizeof(uint8x16_t);
	if (bytes >= bytes_per_iteration)
	{
		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
		accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
		buf += sizeof(uint8x16_t);

		bytes -= bytes_per_iteration;
	}

	/*
	 * Add the accumulators.
	 */
	popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
	popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));

	/*
	 * Process remaining 8-byte blocks.
	 */
	for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
	{
		popcnt += pg_popcount64(*((uint64 *) buf) & mask64);
		buf += sizeof(uint64);
	}

	/*
	 * Process any remaining data byte-by-byte.
	 */
	while (bytes--)
		popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];

	return popcnt;
}
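/*
 * Example relating the masked and unmasked variants (illustrative values):
 * a mask of 0xFF makes pg_popcount_masked_neon(buf, n, 0xFF) equivalent to
 * pg_popcount_neon(buf, n), while mask = 0x0F counts only the bits in the
 * low nibble of each byte.
 */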