PostgreSQL Source Code  git master
pg_popcount_avx512.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * pg_popcount_avx512.c
4  * Holds the AVX-512 pg_popcount() implementation.
5  *
6  * Copyright (c) 2024, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  * src/port/pg_popcount_avx512.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "c.h"
14 
15 #include <immintrin.h>
16 
17 #include "port/pg_bitutils.h"
18 
19 /*
20  * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
21  * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
22  * the function pointers that are only used when TRY_POPCNT_FAST is set.
23  */
24 #ifdef TRY_POPCNT_FAST
25 
26 /*
27  * pg_popcount_avx512
28  * Returns the number of 1-bits in buf
29  */
30 uint64
31 pg_popcount_avx512(const char *buf, int bytes)
32 {
33  __m512i val,
34  cnt;
35  __m512i accum = _mm512_setzero_si512();
36  const char *final;
37  int tail_idx;
38  __mmask64 mask = ~UINT64CONST(0);
39 
40  /*
41  * Align buffer down to avoid double load overhead from unaligned access.
42  * Calculate a mask to ignore preceding bytes. Find start offset of final
43  * iteration and ensure it is not empty.
44  */
45  mask <<= ((uintptr_t) buf) % sizeof(__m512i);
46  tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
47  final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
48  buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
49 
50  /*
51  * Iterate through all but the final iteration. Starting from the second
52  * iteration, the mask is ignored.
53  */
54  if (buf < final)
55  {
56  val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
57  cnt = _mm512_popcnt_epi64(val);
58  accum = _mm512_add_epi64(accum, cnt);
59 
60  buf += sizeof(__m512i);
61  mask = ~UINT64CONST(0);
62 
63  for (; buf < final; buf += sizeof(__m512i))
64  {
65  val = _mm512_load_si512((const __m512i *) buf);
66  cnt = _mm512_popcnt_epi64(val);
67  accum = _mm512_add_epi64(accum, cnt);
68  }
69  }
70 
71  /* Final iteration needs to ignore bytes that are not within the length */
72  mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
73 
74  val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
75  cnt = _mm512_popcnt_epi64(val);
76  accum = _mm512_add_epi64(accum, cnt);
77 
78  return _mm512_reduce_add_epi64(accum);
79 }
80 
81 /*
82  * pg_popcount_masked_avx512
83  * Returns the number of 1-bits in buf after applying the mask to each byte
84  */
85 uint64
86 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
87 {
88  __m512i val,
89  vmasked,
90  cnt;
91  __m512i accum = _mm512_setzero_si512();
92  const char *final;
93  int tail_idx;
94  __mmask64 bmask = ~UINT64CONST(0);
95  const __m512i maskv = _mm512_set1_epi8(mask);
96 
97  /*
98  * Align buffer down to avoid double load overhead from unaligned access.
99  * Calculate a mask to ignore preceding bytes. Find start offset of final
100  * iteration and ensure it is not empty.
101  */
102  bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
103  tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
104  final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
105  buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
106 
107  /*
108  * Iterate through all but the final iteration. Starting from the second
109  * iteration, the mask is ignored.
110  */
111  if (buf < final)
112  {
113  val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
114  vmasked = _mm512_and_si512(val, maskv);
115  cnt = _mm512_popcnt_epi64(vmasked);
116  accum = _mm512_add_epi64(accum, cnt);
117 
118  buf += sizeof(__m512i);
119  bmask = ~UINT64CONST(0);
120 
121  for (; buf < final; buf += sizeof(__m512i))
122  {
123  val = _mm512_load_si512((const __m512i *) buf);
124  vmasked = _mm512_and_si512(val, maskv);
125  cnt = _mm512_popcnt_epi64(vmasked);
126  accum = _mm512_add_epi64(accum, cnt);
127  }
128  }
129 
130  /* Final iteration needs to ignore bytes that are not within the length */
131  bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
132 
133  val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
134  vmasked = _mm512_and_si512(val, maskv);
135  cnt = _mm512_popcnt_epi64(vmasked);
136  accum = _mm512_add_epi64(accum, cnt);
137 
138  return _mm512_reduce_add_epi64(accum);
139 }
140 
141 #endif /* TRY_POPCNT_FAST */
uint8 bits8
Definition: c.h:513
#define TYPEALIGN_DOWN(ALIGNVAL, LEN)
Definition: c.h:816
long val
Definition: informix.c:670
static char * buf
Definition: pg_test_fsync.c:73