PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
pg_popcount_avx512.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * pg_popcount_avx512.c
4 * Holds the AVX-512 pg_popcount() implementation.
5 *
6 * Copyright (c) 2024-2025, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/port/pg_popcount_avx512.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "c.h"
14
15#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
16
17#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
18#include <cpuid.h>
19#endif
20
21#include <immintrin.h>
22
23#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
24#include <intrin.h>
25#endif
26
27#include "port/pg_bitutils.h"
28
29/*
30 * It's probably unlikely that TRY_POPCNT_X86_64 won't be set if we are able to
31 * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
32 * the function pointers that are only used when TRY_POPCNT_X86_64 is set.
33 */
34#ifdef TRY_POPCNT_X86_64
35
36/*
37 * Does CPUID say there's support for XSAVE instructions?
38 */
39static inline bool
40xsave_available(void)
41{
42 unsigned int exx[4] = {0, 0, 0, 0};
43
44#if defined(HAVE__GET_CPUID)
45 __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
46#elif defined(HAVE__CPUID)
47 __cpuid(exx, 1);
48#else
49#error cpuid instruction not available
50#endif
51 return (exx[2] & (1 << 27)) != 0; /* osxsave */
52}
53
/*
 * Does XGETBV say the ZMM registers are enabled?
 *
 * Reads extended control register 0 and requires bits 1, 2, 5, 6 and 7
 * (mask 0xe6) to all be set.  When the XSAVE intrinsics are not available at
 * build time, this conservatively reports false.
 *
 * NB: Caller is responsible for verifying that xsave_available() returns true
 * before calling this.
 */
#ifdef HAVE_XSAVE_INTRINSICS
pg_attribute_target("xsave")
#endif
static inline bool
zmm_regs_available(void)
{
#ifdef HAVE_XSAVE_INTRINSICS
	return (_xgetbv(0) & 0xe6) == 0xe6;
#else
	return false;
#endif
}
72
73/*
74 * Does CPUID say there's support for AVX-512 popcount and byte-and-word
75 * instructions?
76 */
77static inline bool
78avx512_popcnt_available(void)
79{
80 unsigned int exx[4] = {0, 0, 0, 0};
81
82#if defined(HAVE__GET_CPUID_COUNT)
83 __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
84#elif defined(HAVE__CPUIDEX)
85 __cpuidex(exx, 7, 0);
86#else
87#error cpuid instruction not available
88#endif
89 return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
90 (exx[1] & (1 << 30)) != 0; /* avx512-bw */
91}
92
/*
 * Returns true if the CPU supports the instructions required for the AVX-512
 * pg_popcount() implementation.
 *
 * The checks are ordered deliberately: zmm_regs_available() may only be
 * called once xsave_available() has returned true, and short-circuit
 * evaluation guarantees that.
 */
bool
pg_popcount_avx512_available(void)
{
	return xsave_available() &&
		zmm_regs_available() &&
		avx512_popcnt_available();
}
104
105/*
106 * pg_popcount_avx512
107 * Returns the number of 1-bits in buf
108 */
109pg_attribute_target("avx512vpopcntdq,avx512bw")
110uint64
111pg_popcount_avx512(const char *buf, int bytes)
112{
113 __m512i val,
114 cnt;
115 __m512i accum = _mm512_setzero_si512();
116 const char *final;
117 int tail_idx;
118 __mmask64 mask = ~UINT64CONST(0);
119
120 /*
121 * Align buffer down to avoid double load overhead from unaligned access.
122 * Calculate a mask to ignore preceding bytes. Find start offset of final
123 * iteration and ensure it is not empty.
124 */
125 mask <<= ((uintptr_t) buf) % sizeof(__m512i);
126 tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
127 final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
128 buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
129
130 /*
131 * Iterate through all but the final iteration. Starting from the second
132 * iteration, the mask is ignored.
133 */
134 if (buf < final)
135 {
136 val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
137 cnt = _mm512_popcnt_epi64(val);
138 accum = _mm512_add_epi64(accum, cnt);
139
140 buf += sizeof(__m512i);
141 mask = ~UINT64CONST(0);
142
143 for (; buf < final; buf += sizeof(__m512i))
144 {
145 val = _mm512_load_si512((const __m512i *) buf);
146 cnt = _mm512_popcnt_epi64(val);
147 accum = _mm512_add_epi64(accum, cnt);
148 }
149 }
150
151 /* Final iteration needs to ignore bytes that are not within the length */
152 mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
153
154 val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
155 cnt = _mm512_popcnt_epi64(val);
156 accum = _mm512_add_epi64(accum, cnt);
157
158 return _mm512_reduce_add_epi64(accum);
159}
160
161/*
162 * pg_popcount_masked_avx512
163 * Returns the number of 1-bits in buf after applying the mask to each byte
164 */
165pg_attribute_target("avx512vpopcntdq,avx512bw")
166uint64
167pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
168{
169 __m512i val,
170 vmasked,
171 cnt;
172 __m512i accum = _mm512_setzero_si512();
173 const char *final;
174 int tail_idx;
175 __mmask64 bmask = ~UINT64CONST(0);
176 const __m512i maskv = _mm512_set1_epi8(mask);
177
178 /*
179 * Align buffer down to avoid double load overhead from unaligned access.
180 * Calculate a mask to ignore preceding bytes. Find start offset of final
181 * iteration and ensure it is not empty.
182 */
183 bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
184 tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
185 final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
186 buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
187
188 /*
189 * Iterate through all but the final iteration. Starting from the second
190 * iteration, the mask is ignored.
191 */
192 if (buf < final)
193 {
194 val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
195 vmasked = _mm512_and_si512(val, maskv);
196 cnt = _mm512_popcnt_epi64(vmasked);
197 accum = _mm512_add_epi64(accum, cnt);
198
199 buf += sizeof(__m512i);
200 bmask = ~UINT64CONST(0);
201
202 for (; buf < final; buf += sizeof(__m512i))
203 {
204 val = _mm512_load_si512((const __m512i *) buf);
205 vmasked = _mm512_and_si512(val, maskv);
206 cnt = _mm512_popcnt_epi64(vmasked);
207 accum = _mm512_add_epi64(accum, cnt);
208 }
209 }
210
211 /* Final iteration needs to ignore bytes that are not within the length */
212 bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
213
214 val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
215 vmasked = _mm512_and_si512(val, maskv);
216 cnt = _mm512_popcnt_epi64(vmasked);
217 accum = _mm512_add_epi64(accum, cnt);
218
219 return _mm512_reduce_add_epi64(accum);
220}
221
222#endif /* TRY_POPCNT_X86_64 */
223#endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
uint8 bits8
Definition: c.h:509
uint64_t uint64
Definition: c.h:503
#define pg_attribute_target(...)
Definition: c.h:213
#define TYPEALIGN_DOWN(ALIGNVAL, LEN)
Definition: c.h:787
long val
Definition: informix.c:689
static bool zmm_regs_available(void)
static char * buf
Definition: pg_test_fsync.c:72