PostgreSQL Source Code  git master
simd.h
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * simd.h
4  * Support for platform-specific vector operations.
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * src/include/port/simd.h
10  *
11  * NOTES
12  * - VectorN in this file refers to a register where the element operands
13  * are N bits wide. The vector width is platform-specific, so users that care
14  * about that will need to inspect "sizeof(VectorN)".
15  *
16  *-------------------------------------------------------------------------
17  */
18 #ifndef SIMD_H
19 #define SIMD_H
20 
21 #if (defined(__x86_64__) || defined(_M_AMD64))
22 /*
23  * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
24  * that compilers targeting this architecture understand SSE2 intrinsics.
25  *
26  * We use emmintrin.h rather than the comprehensive header immintrin.h in
27  * order to exclude extensions beyond SSE2. This is because MSVC, at least,
28  * will allow the use of intrinsics that haven't been enabled at compile
29  * time.
30  */
31 #include <emmintrin.h>
32 #define USE_SSE2
33 typedef __m128i Vector8;
34 typedef __m128i Vector32;
35 
36 #elif defined(__aarch64__) && defined(__ARM_NEON)
37 /*
38  * We use the Neon instructions if the compiler provides access to them (as
39  * indicated by __ARM_NEON) and we are on aarch64. While Neon support is
40  * technically optional for aarch64, it appears that all available 64-bit
41  * hardware does have it. Neon exists in some 32-bit hardware too, but we
42  * could not realistically use it there without a run-time check, which seems
43  * not worth the trouble for now.
44  */
45 #include <arm_neon.h>
46 #define USE_NEON
47 typedef uint8x16_t Vector8;
48 typedef uint32x4_t Vector32;
49 
50 #else
51 /*
52  * If no SIMD instructions are available, we can in some cases emulate vector
53  * operations using bitwise operations on unsigned integers. Note that many
54  * of the functions in this file presently do not have non-SIMD
55  * implementations. In particular, none of the functions involving Vector32
56  * are implemented without SIMD since it's likely not worthwhile to represent
57  * two 32-bit integers using a uint64.
58  */
59 #define USE_NO_SIMD
60 typedef uint64 Vector8;
61 #endif
62 
63 /* load/store operations */
64 static inline void vector8_load(Vector8 *v, const uint8 *s);
65 #ifndef USE_NO_SIMD
66 static inline void vector32_load(Vector32 *v, const uint32 *s);
67 #endif
68 
69 /* assignment operations */
70 static inline Vector8 vector8_broadcast(const uint8 c);
71 #ifndef USE_NO_SIMD
72 static inline Vector32 vector32_broadcast(const uint32 c);
73 #endif
74 
75 /* element-wise comparisons to a scalar */
76 static inline bool vector8_has(const Vector8 v, const uint8 c);
77 static inline bool vector8_has_zero(const Vector8 v);
78 static inline bool vector8_has_le(const Vector8 v, const uint8 c);
79 static inline bool vector8_is_highbit_set(const Vector8 v);
80 #ifndef USE_NO_SIMD
81 static inline bool vector32_is_highbit_set(const Vector32 v);
82 static inline uint32 vector8_highbit_mask(const Vector8 v);
83 #endif
84 
85 /* arithmetic operations */
86 static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
87 #ifndef USE_NO_SIMD
88 static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
89 static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
90 #endif
91 
92 /*
93  * comparisons between vectors
94  *
95  * Note: These return a vector rather than boolean, which is why we don't
96  * have non-SIMD implementations.
97  */
98 #ifndef USE_NO_SIMD
99 static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
100 static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2);
101 static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
102 #endif
103 
104 /*
105  * Load a chunk of memory into the given vector.
106  */
107 static inline void
109 {
110 #if defined(USE_SSE2)
111  *v = _mm_loadu_si128((const __m128i *) s);
112 #elif defined(USE_NEON)
113  *v = vld1q_u8(s);
114 #else
115  memcpy(v, s, sizeof(Vector8));
116 #endif
117 }
118 
119 #ifndef USE_NO_SIMD
120 static inline void
121 vector32_load(Vector32 *v, const uint32 *s)
122 {
123 #ifdef USE_SSE2
124  *v = _mm_loadu_si128((const __m128i *) s);
125 #elif defined(USE_NEON)
126  *v = vld1q_u32(s);
127 #endif
128 }
129 #endif /* ! USE_NO_SIMD */
130 
131 /*
132  * Create a vector with all elements set to the same value.
133  */
134 static inline Vector8
136 {
137 #if defined(USE_SSE2)
138  return _mm_set1_epi8(c);
139 #elif defined(USE_NEON)
140  return vdupq_n_u8(c);
141 #else
142  return ~UINT64CONST(0) / 0xFF * c;
143 #endif
144 }
145 
146 #ifndef USE_NO_SIMD
147 static inline Vector32
148 vector32_broadcast(const uint32 c)
149 {
150 #ifdef USE_SSE2
151  return _mm_set1_epi32(c);
152 #elif defined(USE_NEON)
153  return vdupq_n_u32(c);
154 #endif
155 }
156 #endif /* ! USE_NO_SIMD */
157 
158 /*
159  * Return true if any elements in the vector are equal to the given scalar.
160  */
161 static inline bool
162 vector8_has(const Vector8 v, const uint8 c)
163 {
164  bool result;
165 
166  /* pre-compute the result for assert checking */
167 #ifdef USE_ASSERT_CHECKING
168  bool assert_result = false;
169 
170  for (Size i = 0; i < sizeof(Vector8); i++)
171  {
172  if (((const uint8 *) &v)[i] == c)
173  {
174  assert_result = true;
175  break;
176  }
177  }
178 #endif /* USE_ASSERT_CHECKING */
179 
180 #if defined(USE_NO_SIMD)
181  /* any bytes in v equal to c will evaluate to zero via XOR */
182  result = vector8_has_zero(v ^ vector8_broadcast(c));
183 #else
184  result = vector8_is_highbit_set(vector8_eq(v, vector8_broadcast(c)));
185 #endif
186 
187  Assert(assert_result == result);
188  return result;
189 }
190 
191 /*
192  * Convenience function equivalent to vector8_has(v, 0)
193  */
194 static inline bool
196 {
197 #if defined(USE_NO_SIMD)
198  /*
199  * We cannot call vector8_has() here, because that would lead to a
200  * circular definition.
201  */
202  return vector8_has_le(v, 0);
203 #else
204  return vector8_has(v, 0);
205 #endif
206 }
207 
208 /*
209  * Return true if any elements in the vector are less than or equal to the
210  * given scalar.
211  */
212 static inline bool
213 vector8_has_le(const Vector8 v, const uint8 c)
214 {
215  bool result = false;
216 
217  /* pre-compute the result for assert checking */
218 #ifdef USE_ASSERT_CHECKING
219  bool assert_result = false;
220 
221  for (Size i = 0; i < sizeof(Vector8); i++)
222  {
223  if (((const uint8 *) &v)[i] <= c)
224  {
225  assert_result = true;
226  break;
227  }
228  }
229 #endif /* USE_ASSERT_CHECKING */
230 
231 #if defined(USE_NO_SIMD)
232 
233  /*
234  * To find bytes <= c, we can use bitwise operations to find bytes < c+1,
235  * but it only works if c+1 <= 128 and if the highest bit in v is not set.
236  * Adapted from
237  * https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
238  */
239  if ((int64) v >= 0 && c < 0x80)
240  result = (v - vector8_broadcast(c + 1)) & ~v & vector8_broadcast(0x80);
241  else
242  {
243  /* one byte at a time */
244  for (Size i = 0; i < sizeof(Vector8); i++)
245  {
246  if (((const uint8 *) &v)[i] <= c)
247  {
248  result = true;
249  break;
250  }
251  }
252  }
253 #else
254 
255  /*
256  * Use saturating subtraction to find bytes <= c, which will present as
257  * NUL bytes. This approach is a workaround for the lack of unsigned
258  * comparison instructions on some architectures.
259  */
260  result = vector8_has_zero(vector8_ssub(v, vector8_broadcast(c)));
261 #endif
262 
263  Assert(assert_result == result);
264  return result;
265 }
266 
267 /*
268  * Return true if the high bit of any element is set
269  */
270 static inline bool
272 {
273 #ifdef USE_SSE2
274  return _mm_movemask_epi8(v) != 0;
275 #elif defined(USE_NEON)
276  return vmaxvq_u8(v) > 0x7F;
277 #else
278  return v & vector8_broadcast(0x80);
279 #endif
280 }
281 
282 /*
283  * Exactly like vector8_is_highbit_set except for the input type, so it
284  * looks at each byte separately.
285  *
286  * XXX x86 uses the same underlying type for 8-bit, 16-bit, and 32-bit
287  * integer elements, but Arm does not, hence the need for a separate
288  * function. We could instead adopt the behavior of Arm's vmaxvq_u32(), i.e.
289  * check each 32-bit element, but that would require an additional mask
290  * operation on x86.
291  */
292 #ifndef USE_NO_SIMD
293 static inline bool
294 vector32_is_highbit_set(const Vector32 v)
295 {
296 #if defined(USE_NEON)
297  return vector8_is_highbit_set((Vector8) v);
298 #else
299  return vector8_is_highbit_set(v);
300 #endif
301 }
302 #endif /* ! USE_NO_SIMD */
303 
304 /*
305  * Return a bitmask formed from the high-bit of each element.
306  */
307 #ifndef USE_NO_SIMD
308 static inline uint32
309 vector8_highbit_mask(const Vector8 v)
310 {
311 #ifdef USE_SSE2
312  return (uint32) _mm_movemask_epi8(v);
313 #elif defined(USE_NEON)
314  /*
315  * Note: It would be faster to use vget_lane_u64 and vshrn_n_u16, but that
316  * returns a uint64, making it inconvenient to combine mask values from
317  * multiple vectors.
318  */
319  static const uint8 mask[16] = {
320  1 << 0, 1 << 1, 1 << 2, 1 << 3,
321  1 << 4, 1 << 5, 1 << 6, 1 << 7,
322  1 << 0, 1 << 1, 1 << 2, 1 << 3,
323  1 << 4, 1 << 5, 1 << 6, 1 << 7,
324  };
325 
326  uint8x16_t masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8((int8x16_t) v, 7));
327  uint8x16_t maskedhi = vextq_u8(masked, masked, 8);
328 
329  return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
330 #endif
331 }
332 #endif /* ! USE_NO_SIMD */
333 
334 /*
335  * Return the bitwise OR of the inputs
336  */
337 static inline Vector8
338 vector8_or(const Vector8 v1, const Vector8 v2)
339 {
340 #ifdef USE_SSE2
341  return _mm_or_si128(v1, v2);
342 #elif defined(USE_NEON)
343  return vorrq_u8(v1, v2);
344 #else
345  return v1 | v2;
346 #endif
347 }
348 
349 #ifndef USE_NO_SIMD
350 static inline Vector32
351 vector32_or(const Vector32 v1, const Vector32 v2)
352 {
353 #ifdef USE_SSE2
354  return _mm_or_si128(v1, v2);
355 #elif defined(USE_NEON)
356  return vorrq_u32(v1, v2);
357 #endif
358 }
359 #endif /* ! USE_NO_SIMD */
360 
361 /*
362  * Return the result of subtracting the respective elements of the input
363  * vectors using saturation (i.e., if the operation would yield a value less
364  * than zero, zero is returned instead). For more information on saturation
365  * arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
366  */
367 #ifndef USE_NO_SIMD
368 static inline Vector8
369 vector8_ssub(const Vector8 v1, const Vector8 v2)
370 {
371 #ifdef USE_SSE2
372  return _mm_subs_epu8(v1, v2);
373 #elif defined(USE_NEON)
374  return vqsubq_u8(v1, v2);
375 #endif
376 }
377 #endif /* ! USE_NO_SIMD */
378 
379 /*
380  * Return a vector with all bits set in each lane where the corresponding
381  * lanes in the inputs are equal.
382  */
383 #ifndef USE_NO_SIMD
384 static inline Vector8
385 vector8_eq(const Vector8 v1, const Vector8 v2)
386 {
387 #ifdef USE_SSE2
388  return _mm_cmpeq_epi8(v1, v2);
389 #elif defined(USE_NEON)
390  return vceqq_u8(v1, v2);
391 #endif
392 }
393 #endif /* ! USE_NO_SIMD */
394 
395 #ifndef USE_NO_SIMD
396 static inline Vector32
397 vector32_eq(const Vector32 v1, const Vector32 v2)
398 {
399 #ifdef USE_SSE2
400  return _mm_cmpeq_epi32(v1, v2);
401 #elif defined(USE_NEON)
402  return vceqq_u32(v1, v2);
403 #endif
404 }
405 #endif /* ! USE_NO_SIMD */
406 
407 /*
408  * Given two vectors, return a vector with the minimum element of each.
409  */
410 #ifndef USE_NO_SIMD
411 static inline Vector8
412 vector8_min(const Vector8 v1, const Vector8 v2)
413 {
414 #ifdef USE_SSE2
415  return _mm_min_epu8(v1, v2);
416 #elif defined(USE_NEON)
417  return vminq_u8(v1, v2);
418 #endif
419 }
420 #endif /* ! USE_NO_SIMD */
421 
422 #endif /* SIMD_H */
uint8_t uint8
Definition: c.h:483
#define Assert(condition)
Definition: c.h:812
int64_t int64
Definition: c.h:482
uint64_t uint64
Definition: c.h:486
uint32_t uint32
Definition: c.h:485
#define UINT64CONST(x)
Definition: c.h:500
size_t Size
Definition: c.h:559
int i
Definition: isn.c:72
char * c
static bool vector8_has_le(const Vector8 v, const uint8 c)
Definition: simd.h:213
static Vector8 vector8_broadcast(const uint8 c)
Definition: simd.h:135
static void vector8_load(Vector8 *v, const uint8 *s)
Definition: simd.h:108
static bool vector8_has_zero(const Vector8 v)
Definition: simd.h:195
static Vector8 vector8_or(const Vector8 v1, const Vector8 v2)
Definition: simd.h:338
uint64 Vector8
Definition: simd.h:60
static bool vector8_is_highbit_set(const Vector8 v)
Definition: simd.h:271
static bool vector8_has(const Vector8 v, const uint8 c)
Definition: simd.h:162