pg_popcount_avx512.c
/*-------------------------------------------------------------------------
 *
 * pg_popcount_avx512.c
 *    Holds the AVX-512 pg_popcount() implementation.
 *
 * Copyright (c) 2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *    src/port/pg_popcount_avx512.c
 *
 *-------------------------------------------------------------------------
 */
#include "c.h"

#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK

#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
#include <cpuid.h>
#endif

#include <immintrin.h>

#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
#include <intrin.h>
#endif

#include "port/pg_bitutils.h"

/*
 * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
 * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
 * the function pointers that are only used when TRY_POPCNT_FAST is set.
 */
#ifdef TRY_POPCNT_FAST

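/*
 * For context, this file's functions are wired up through that dispatch. A
 * minimal sketch of the resolution step, assuming a pg_popcount_optimized
 * function pointer and a generic pg_popcount_fast fallback (illustrative
 * names, not necessarily the exact pg_bitutils.c code):
 *
 *     if (pg_popcount_avx512_available())
 *         pg_popcount_optimized = pg_popcount_avx512;
 *     else
 *         pg_popcount_optimized = pg_popcount_fast;
 */
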
/*
 * Does CPUID say there's support for XSAVE instructions?
 */
static inline bool
xsave_available(void)
{
    unsigned int exx[4] = {0, 0, 0, 0};

#if defined(HAVE__GET_CPUID)
    __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUID)
    __cpuid(exx, 1);
#else
#error cpuid instruction not available
#endif
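    /*
     * Leaf 1, ECX bit 27 is OSXSAVE: the OS has enabled XSAVE, so XGETBV
     * can be used below to ask which register states the OS saves and
     * restores.
     */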
    return (exx[2] & (1 << 27)) != 0;   /* osxsave */
}

/*
 * Does XGETBV say the ZMM registers are enabled?
 *
 * NB: Caller is responsible for verifying that xsave_available() returns true
 * before calling this.
 */
#ifdef HAVE_XSAVE_INTRINSICS
pg_attribute_target("xsave")
#endif
static inline bool
zmm_regs_available(void)
{
#ifdef HAVE_XSAVE_INTRINSICS
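    /*
     * 0xe6 sets XCR0 bits 1 and 2 (SSE and AVX state) plus bits 5-7 (the
     * opmask registers and both halves of the ZMM register file), i.e., all
     * the state the OS must manage for AVX-512 code to run safely.
     */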
    return (_xgetbv(0) & 0xe6) == 0xe6;
#else
    return false;
#endif
}

/*
 * Does CPUID say there's support for AVX-512 popcount and byte-and-word
 * instructions?
 */
static inline bool
avx512_popcnt_available(void)
{
    unsigned int exx[4] = {0, 0, 0, 0};

#if defined(HAVE__GET_CPUID_COUNT)
    __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUIDEX)
    __cpuidex(exx, 7, 0);
#else
#error cpuid instruction not available
#endif
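    /*
     * Both features are required: AVX512-VPOPCNTDQ provides
     * _mm512_popcnt_epi64(), and AVX512-BW provides the masked byte loads
     * (_mm512_maskz_loadu_epi8()) used for the unaligned head and tail of
     * the buffer.
     */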
    return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
        (exx[1] & (1 << 30)) != 0;      /* avx512-bw */
}

/*
 * Returns true if the CPU supports the instructions required for the AVX-512
 * pg_popcount() implementation.
 */
bool
pg_popcount_avx512_available(void)
{
    return xsave_available() &&
        zmm_regs_available() &&
        avx512_popcnt_available();
}

/*
 * pg_popcount_avx512
 *      Returns the number of 1-bits in buf
 */
pg_attribute_target("avx512vpopcntdq,avx512bw")
uint64
pg_popcount_avx512(const char *buf, int bytes)
{
    __m512i     val,
                cnt;
    __m512i     accum = _mm512_setzero_si512();
    const char *final;
    int         tail_idx;
    __mmask64   mask = ~UINT64CONST(0);

    /*
     * Align buffer down to avoid double load overhead from unaligned access.
     * Calculate a mask to ignore preceding bytes. Find start offset of final
     * iteration and ensure it is not empty.
     */
    mask <<= ((uintptr_t) buf) % sizeof(__m512i);
    tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
    final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
    buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
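
    /*
     * Worked example with illustrative numbers: for 64-byte vectors, buf =
     * 0x1005, and bytes = 70, the shift clears the low 5 bits of mask,
     * tail_idx = ((0x1005 + 69) % 64) + 1 = 11, final = 0x1040, and buf is
     * aligned down to 0x1000.  The first masked load then counts bytes
     * 0x1005..0x103f (59 bytes) and the final masked load counts
     * 0x1040..0x104a (11 bytes), exactly the 70 bytes requested.
     */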

    /*
     * Iterate through all but the final iteration. Starting from the second
     * iteration, the mask is ignored.
     */
    if (buf < final)
    {
        val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
        cnt = _mm512_popcnt_epi64(val);
        accum = _mm512_add_epi64(accum, cnt);

        buf += sizeof(__m512i);
        mask = ~UINT64CONST(0);

        for (; buf < final; buf += sizeof(__m512i))
        {
            val = _mm512_load_si512((const __m512i *) buf);
            cnt = _mm512_popcnt_epi64(val);
            accum = _mm512_add_epi64(accum, cnt);
        }
    }

    /* Final iteration needs to ignore bytes that are not within the length */
    mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));

    val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
    cnt = _mm512_popcnt_epi64(val);
    accum = _mm512_add_epi64(accum, cnt);

    return _mm512_reduce_add_epi64(accum);
}

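/*
 * For testing or review: the vector path above must agree with a plain
 * scalar loop. A minimal sketch of such a reference implementation
 * (popcount_scalar is a hypothetical helper, not part of this file):
 *
 *     static uint64
 *     popcount_scalar(const char *buf, int bytes)
 *     {
 *         uint64 cnt = 0;
 *
 *         for (int i = 0; i < bytes; i++)
 *         {
 *             unsigned char b = (unsigned char) buf[i];
 *
 *             while (b)
 *             {
 *                 cnt += b & 1;
 *                 b >>= 1;
 *             }
 *         }
 *         return cnt;
 *     }
 */
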
/*
 * pg_popcount_masked_avx512
 *      Returns the number of 1-bits in buf after applying the mask to each
 *      byte
 */
pg_attribute_target("avx512vpopcntdq,avx512bw")
uint64
pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
{
    __m512i     val,
                vmasked,
                cnt;
    __m512i     accum = _mm512_setzero_si512();
    const char *final;
    int         tail_idx;
    __mmask64   bmask = ~UINT64CONST(0);
    const __m512i maskv = _mm512_set1_epi8(mask);

    /*
     * Align buffer down to avoid double load overhead from unaligned access.
     * Calculate a mask to ignore preceding bytes. Find start offset of final
     * iteration and ensure it is not empty.
     */
    bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
    tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
    final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
    buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);

    /*
     * Iterate through all but the final iteration. Starting from the second
     * iteration, the mask is ignored.
     */
    if (buf < final)
    {
        val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
        vmasked = _mm512_and_si512(val, maskv);
        cnt = _mm512_popcnt_epi64(vmasked);
        accum = _mm512_add_epi64(accum, cnt);

        buf += sizeof(__m512i);
        bmask = ~UINT64CONST(0);

        for (; buf < final; buf += sizeof(__m512i))
        {
            val = _mm512_load_si512((const __m512i *) buf);
            vmasked = _mm512_and_si512(val, maskv);
            cnt = _mm512_popcnt_epi64(vmasked);
            accum = _mm512_add_epi64(accum, cnt);
        }
    }

    /* Final iteration needs to ignore bytes that are not within the length */
    bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));

    val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
    vmasked = _mm512_and_si512(val, maskv);
    cnt = _mm512_popcnt_epi64(vmasked);
    accum = _mm512_add_epi64(accum, cnt);

    return _mm512_reduce_add_epi64(accum);
}

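/*
 * Semantics of the masked variant, with illustrative numbers: every byte is
 * ANDed with mask before counting, so with mask = 0x0f a buffer of two 0xff
 * bytes yields 8 rather than 16. The head/tail alignment handling is
 * identical to pg_popcount_avx512() above.
 */
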
#endif                          /* TRY_POPCNT_FAST */
#endif                          /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */