15 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
17 #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
21 #include <immintrin.h>
23 #if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
34 #ifdef TRY_POPCNT_FAST
42 unsigned int exx[4] = {0, 0, 0, 0};
44 #if defined(HAVE__GET_CPUID)
45 __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
46 #elif defined(HAVE__CPUID)
49 #error cpuid instruction not available
51 return (exx[2] & (1 << 27)) != 0;
60 #ifdef HAVE_XSAVE_INTRINSICS
64 zmm_regs_available(
void)
66 #ifdef HAVE_XSAVE_INTRINSICS
67 return (_xgetbv(0) & 0xe6) == 0xe6;
78 avx512_popcnt_available(
void)
80 unsigned int exx[4] = {0, 0, 0, 0};
82 #if defined(HAVE__GET_CPUID_COUNT)
83 __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
84 #elif defined(HAVE__CPUIDEX)
87 #error cpuid instruction not available
89 return (exx[2] & (1 << 14)) != 0 &&
90 (exx[1] & (1 << 30)) != 0;
98 pg_popcount_avx512_available(
void)
100 return xsave_available() &&
101 zmm_regs_available() &&
102 avx512_popcnt_available();
111 pg_popcount_avx512(const
char *
buf,
int bytes)
115 __m512i accum = _mm512_setzero_si512();
125 mask <<= ((uintptr_t)
buf) %
sizeof(__m512i);
126 tail_idx = (((uintptr_t)
buf + bytes - 1) %
sizeof(__m512i)) + 1;
136 val = _mm512_maskz_loadu_epi8(mask, (
const __m512i *)
buf);
137 cnt = _mm512_popcnt_epi64(
val);
138 accum = _mm512_add_epi64(accum, cnt);
140 buf +=
sizeof(__m512i);
143 for (;
buf <
final;
buf +=
sizeof(__m512i))
145 val = _mm512_load_si512((
const __m512i *)
buf);
146 cnt = _mm512_popcnt_epi64(
val);
147 accum = _mm512_add_epi64(accum, cnt);
152 mask &= (~
UINT64CONST(0) >> (
sizeof(__m512i) - tail_idx));
154 val = _mm512_maskz_loadu_epi8(mask, (
const __m512i *)
buf);
155 cnt = _mm512_popcnt_epi64(
val);
156 accum = _mm512_add_epi64(accum, cnt);
158 return _mm512_reduce_add_epi64(accum);
167 pg_popcount_masked_avx512(const
char *
buf,
int bytes,
bits8 mask)
172 __m512i accum = _mm512_setzero_si512();
176 const __m512i maskv = _mm512_set1_epi8(mask);
183 bmask <<= ((uintptr_t)
buf) %
sizeof(__m512i);
184 tail_idx = (((uintptr_t)
buf + bytes - 1) %
sizeof(__m512i)) + 1;
194 val = _mm512_maskz_loadu_epi8(bmask, (
const __m512i *)
buf);
195 vmasked = _mm512_and_si512(
val, maskv);
196 cnt = _mm512_popcnt_epi64(vmasked);
197 accum = _mm512_add_epi64(accum, cnt);
199 buf +=
sizeof(__m512i);
202 for (;
buf <
final;
buf +=
sizeof(__m512i))
204 val = _mm512_load_si512((
const __m512i *)
buf);
205 vmasked = _mm512_and_si512(
val, maskv);
206 cnt = _mm512_popcnt_epi64(vmasked);
207 accum = _mm512_add_epi64(accum, cnt);
212 bmask &= (~
UINT64CONST(0) >> (
sizeof(__m512i) - tail_idx));
214 val = _mm512_maskz_loadu_epi8(bmask, (
const __m512i *)
buf);
215 vmasked = _mm512_and_si512(
val, maskv);
216 cnt = _mm512_popcnt_epi64(vmasked);
217 accum = _mm512_add_epi64(accum, cnt);
219 return _mm512_reduce_add_epi64(accum);
#define pg_attribute_target(...)
#define TYPEALIGN_DOWN(ALIGNVAL, LEN)