15 #include <immintrin.h>
24 #ifdef TRY_POPCNT_FAST
/*
 * pg_popcount_avx512
 *		Sum the population counts of 512-bit chunks read from "buf" and
 *		return the total as a single integer.
 *
 * buf:   start of the data to examine.
 * bytes: length of the data; used below to locate the last byte
 *        (buf + bytes - 1).
 *
 * The first and the last chunk are read with byte-masked loads so that
 * masked-off bytes are read as zero and add nothing to the count; chunks in
 * between are read whole.
 *
 * NOTE(review): this excerpt does not show the return type, the braces, the
 * declarations of val/cnt/final/tail_idx, the assignment of "final", or any
 * condition guarding the first load.  Comments below that depend on those
 * lines are marked as assumptions -- confirm against the full file.
 */
31 pg_popcount_avx512(
const char *
buf,
int bytes)
/* Per-64-bit-lane running totals, all starting at zero. */
35 __m512i accum = _mm512_setzero_si512();
/* One mask bit per byte of a 512-bit chunk; initially every byte is enabled. */
38 __mmask64 mask = ~UINT64CONST(0);
/*
 * Shift the mask left by buf's offset within a sizeof(__m512i)-sized unit.
 * The shift zero-fills the low bits, so that many leading bytes of the first
 * masked load are disabled.
 */
45 mask <<= ((uintptr_t)
buf) %
sizeof(__m512i);
/*
 * Offset of the last data byte (buf + bytes - 1) within its
 * sizeof(__m512i)-sized unit, plus one: a value in 1..sizeof(__m512i) giving
 * how many leading bytes of the final chunk are wanted.
 */
46 tail_idx = (((uintptr_t)
buf + bytes - 1) %
sizeof(__m512i)) + 1;
/*
 * First chunk: masked load (bytes whose mask bit is clear are read as zero),
 * then count bits per 64-bit lane and add the counts to the totals.
 * NOTE(review): presumably buf has been rounded down to a chunk boundary by
 * a statement not shown here, which is what makes the leading-byte mask
 * necessary -- confirm.
 */
56 val = _mm512_maskz_loadu_epi8(mask, (
const __m512i *)
buf);
57 cnt = _mm512_popcnt_epi64(
val);
58 accum = _mm512_add_epi64(accum, cnt);
/* Step to the next chunk and re-enable every byte in the mask. */
60 buf +=
sizeof(__m512i);
61 mask = ~UINT64CONST(0);
/*
 * Whole chunks: advance one chunk at a time while buf is below "final".
 * _mm512_load_si512 is the aligned load, so buf must sit on a 64-byte
 * boundary here.
 * NOTE(review): "final" is assigned outside this excerpt; it looks like the
 * address of the last chunk -- confirm.
 */
63 for (;
buf <
final;
buf +=
sizeof(__m512i))
65 val = _mm512_load_si512((
const __m512i *)
buf);
66 cnt = _mm512_popcnt_epi64(
val);
67 accum = _mm512_add_epi64(accum, cnt);
/*
 * Last chunk: keep only the low tail_idx mask bits so bytes past the end of
 * the data are read as zero.  "&=" (rather than "=") retains whatever
 * leading-byte exclusion is still held in mask at this point.
 */
72 mask &= (~UINT64CONST(0) >> (
sizeof(__m512i) - tail_idx));
74 val = _mm512_maskz_loadu_epi8(mask, (
const __m512i *)
buf);
75 cnt = _mm512_popcnt_epi64(
val);
76 accum = _mm512_add_epi64(accum, cnt);
/* Add the eight 64-bit lane totals together to form the result. */
78 return _mm512_reduce_add_epi64(accum);
/*
 * pg_popcount_masked_avx512
 *		Like a plain popcount over 512-bit chunks read from "buf", except that
 *		every byte is ANDed with "mask" before its bits are counted.  Returns
 *		the total as a single integer.
 *
 * buf:   start of the data to examine.
 * bytes: length of the data; used below to locate the last byte
 *        (buf + bytes - 1).
 * mask:  byte value ANDed with each data byte; only bits set in both are
 *        counted.
 *
 * Two different masks are in play: "mask"/"maskv" selects bits within each
 * byte, while "bmask" selects which bytes of a chunk are loaded at all.
 *
 * NOTE(review): this excerpt does not show the return type, the braces, the
 * declarations of val/vmasked/cnt/final/tail_idx, the assignment of "final",
 * or any condition guarding the first load.  Comments below that depend on
 * those lines are marked as assumptions -- confirm against the full file.
 */
86 pg_popcount_masked_avx512(
const char *
buf,
int bytes,
bits8 mask)
/* Per-64-bit-lane running totals, all starting at zero. */
91 __m512i accum = _mm512_setzero_si512();
/* One bit per byte of a 512-bit chunk; initially every byte is enabled. */
94 __mmask64 bmask = ~UINT64CONST(0);
/* The caller's mask byte replicated into every byte position of a vector. */
95 const __m512i maskv = _mm512_set1_epi8(mask);
/*
 * Shift the byte mask left by buf's offset within a sizeof(__m512i)-sized
 * unit.  The shift zero-fills the low bits, so that many leading bytes of
 * the first masked load are disabled.
 */
102 bmask <<= ((uintptr_t)
buf) %
sizeof(__m512i);
/*
 * Offset of the last data byte (buf + bytes - 1) within its
 * sizeof(__m512i)-sized unit, plus one: a value in 1..sizeof(__m512i) giving
 * how many leading bytes of the final chunk are wanted.
 */
103 tail_idx = (((uintptr_t)
buf + bytes - 1) %
sizeof(__m512i)) + 1;
/*
 * First chunk: masked load (bytes whose bmask bit is clear are read as
 * zero), AND with the replicated mask, count bits per 64-bit lane, and add
 * the counts to the totals.
 * NOTE(review): presumably buf has been rounded down to a chunk boundary by
 * a statement not shown here, which is what makes the leading-byte mask
 * necessary -- confirm.
 */
113 val = _mm512_maskz_loadu_epi8(bmask, (
const __m512i *)
buf);
114 vmasked = _mm512_and_si512(
val, maskv);
115 cnt = _mm512_popcnt_epi64(vmasked);
116 accum = _mm512_add_epi64(accum, cnt);
/* Step to the next chunk and re-enable every byte in the byte mask. */
118 buf +=
sizeof(__m512i);
119 bmask = ~UINT64CONST(0);
/*
 * Whole chunks: advance one chunk at a time while buf is below "final".
 * _mm512_load_si512 is the aligned load, so buf must sit on a 64-byte
 * boundary here.
 * NOTE(review): "final" is assigned outside this excerpt; it looks like the
 * address of the last chunk -- confirm.
 */
121 for (;
buf <
final;
buf +=
sizeof(__m512i))
123 val = _mm512_load_si512((
const __m512i *)
buf);
124 vmasked = _mm512_and_si512(
val, maskv);
125 cnt = _mm512_popcnt_epi64(vmasked);
126 accum = _mm512_add_epi64(accum, cnt);
/*
 * Last chunk: keep only the low tail_idx bits of the byte mask so bytes past
 * the end of the data are read as zero.  "&=" (rather than "=") retains
 * whatever leading-byte exclusion is still held in bmask at this point.
 */
131 bmask &= (~UINT64CONST(0) >> (
sizeof(__m512i) - tail_idx));
133 val = _mm512_maskz_loadu_epi8(bmask, (
const __m512i *)
buf);
134 vmasked = _mm512_and_si512(
val, maskv);
135 cnt = _mm512_popcnt_epi64(vmasked);
136 accum = _mm512_add_epi64(accum, cnt);
/* Add the eight 64-bit lane totals together to form the result. */
138 return _mm512_reduce_add_epi64(accum);
#define TYPEALIGN_DOWN(ALIGNVAL, LEN)