PostgreSQL Source Code (git master)
simd.h
1/*-------------------------------------------------------------------------
2 *
3 * simd.h
4 * Support for platform-specific vector operations.
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * src/include/port/simd.h
10 *
11 * NOTES
12 * - VectorN in this file refers to a register where the element operands
13 * are N bits wide. The vector width is platform-specific, so users that care
14 * about that will need to inspect "sizeof(VectorN)".
15 *
16 *-------------------------------------------------------------------------
17 */
18#ifndef SIMD_H
19#define SIMD_H
20
21#if (defined(__x86_64__) || defined(_M_AMD64))
22/*
23 * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
24 * that compilers targeting this architecture understand SSE2 intrinsics.
25 *
26 * We use emmintrin.h rather than the comprehensive header immintrin.h in
27 * order to exclude extensions beyond SSE2. This is because MSVC, at least,
28 * will allow the use of intrinsics that haven't been enabled at compile
29 * time.
30 */
31#include <emmintrin.h>
32#define USE_SSE2
33typedef __m128i Vector8;
34typedef __m128i Vector32;
35
36#elif defined(__aarch64__) && defined(__ARM_NEON)
37/*
38 * We use the Neon instructions if the compiler provides access to them (as
39 * indicated by __ARM_NEON) and we are on aarch64. While Neon support is
40 * technically optional for aarch64, it appears that all available 64-bit
41 * hardware does have it. Neon exists in some 32-bit hardware too, but we
42 * could not realistically use it there without a run-time check, which seems
43 * not worth the trouble for now.
44 */
45#include <arm_neon.h>
46#define USE_NEON
47typedef uint8x16_t Vector8;
48typedef uint32x4_t Vector32;
49
50#else
51/*
52 * If no SIMD instructions are available, we can in some cases emulate vector
53 * operations using bitwise operations on unsigned integers. Note that many
54 * of the functions in this file presently do not have non-SIMD
55 * implementations. In particular, none of the functions involving Vector32
56 * are implemented without SIMD since it's likely not worthwhile to represent
57 * two 32-bit integers using a uint64.
58 */
59#define USE_NO_SIMD
60typedef uint64 Vector8;
61#endif
62
63/* load/store operations */
64static inline void vector8_load(Vector8 *v, const uint8 *s);
65#ifndef USE_NO_SIMD
66static inline void vector32_load(Vector32 *v, const uint32 *s);
67#endif
68
69/* assignment operations */
70static inline Vector8 vector8_broadcast(const uint8 c);
71#ifndef USE_NO_SIMD
72static inline Vector32 vector32_broadcast(const uint32 c);
73#endif
74
75/* element-wise comparisons to a scalar */
76static inline bool vector8_has(const Vector8 v, const uint8 c);
77static inline bool vector8_has_zero(const Vector8 v);
78static inline bool vector8_has_le(const Vector8 v, const uint8 c);
79static inline bool vector8_is_highbit_set(const Vector8 v);
80#ifndef USE_NO_SIMD
81static inline bool vector32_is_highbit_set(const Vector32 v);
82static inline uint32 vector8_highbit_mask(const Vector8 v);
83#endif
84
85/* arithmetic operations */
86static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
87#ifndef USE_NO_SIMD
88static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
89#endif
90
91/*
92 * comparisons between vectors
93 *
94 * Note: These return a vector rather than boolean, which is why we don't
95 * have non-SIMD implementations.
96 */
97#ifndef USE_NO_SIMD
98static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
99static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2);
100static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
101#endif
102
103/*
104 * Load a chunk of memory into the given vector.
105 */
106static inline void
107vector8_load(Vector8 *v, const uint8 *s)
108{
109#if defined(USE_SSE2)
110 *v = _mm_loadu_si128((const __m128i *) s);
111#elif defined(USE_NEON)
112 *v = vld1q_u8(s);
113#else
114 memcpy(v, s, sizeof(Vector8));
115#endif
116}
117
118#ifndef USE_NO_SIMD
119static inline void
120vector32_load(Vector32 *v, const uint32 *s)
121{
122#ifdef USE_SSE2
123 *v = _mm_loadu_si128((const __m128i *) s);
124#elif defined(USE_NEON)
125 *v = vld1q_u32(s);
126#endif
127}
128#endif /* ! USE_NO_SIMD */
129
130/*
131 * Store a vector into the given memory address.
132 */
133#ifndef USE_NO_SIMD
134static inline void
135vector8_store(uint8 *s, Vector8 v)
136{
137#ifdef USE_SSE2
138 _mm_storeu_si128((Vector8 *) s, v);
139#elif defined(USE_NEON)
140 vst1q_u8(s, v);
141#endif
142}
143#endif /* ! USE_NO_SIMD */
144
145/*
146 * Create a vector with all elements set to the same value.
147 */
148static inline Vector8
149vector8_broadcast(const uint8 c)
150{
151#if defined(USE_SSE2)
152 return _mm_set1_epi8(c);
153#elif defined(USE_NEON)
154 return vdupq_n_u8(c);
155#else
156 return ~UINT64CONST(0) / 0xFF * c;
157#endif
158}
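/*
 * Illustrative note (not part of simd.h): in the non-SIMD branch,
 * ~UINT64CONST(0) / 0xFF evaluates to 0x0101010101010101, so multiplying by
 * "c" replicates the byte into every lane of the uint64. For example:
 *
 *	Vector8 v = vector8_broadcast(0x5A);	/* 0x5A5A5A5A5A5A5A5A without SIMD */
 */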
159
160#ifndef USE_NO_SIMD
161static inline Vector32
162vector32_broadcast(const uint32 c)
163{
164#ifdef USE_SSE2
165 return _mm_set1_epi32(c);
166#elif defined(USE_NEON)
167 return vdupq_n_u32(c);
168#endif
169}
170#endif /* ! USE_NO_SIMD */
171
172/*
173 * Return true if any elements in the vector are equal to the given scalar.
174 */
175static inline bool
176vector8_has(const Vector8 v, const uint8 c)
177{
178 bool result;
179
180 /* pre-compute the result for assert checking */
181#ifdef USE_ASSERT_CHECKING
182 bool assert_result = false;
183
184 for (Size i = 0; i < sizeof(Vector8); i++)
185 {
186 if (((const uint8 *) &v)[i] == c)
187 {
188 assert_result = true;
189 break;
190 }
191 }
192#endif /* USE_ASSERT_CHECKING */
193
194#if defined(USE_NO_SIMD)
195 /* any bytes in v equal to c will evaluate to zero via XOR */
196 result = vector8_has_zero(v ^ vector8_broadcast(c));
197#else
198 result = vector8_is_highbit_set(vector8_eq(v, vector8_broadcast(c)));
199#endif
200
201 Assert(assert_result == result);
202 return result;
203}
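/*
 * Illustrative usage sketch (not part of simd.h): a caller can search a
 * buffer for a byte value by testing sizeof(Vector8) bytes per iteration and
 * falling back to a scalar loop for the remainder, roughly the pattern that
 * pg_lfind8() in port/pg_lfind.h is built on. The helper name is
 * hypothetical.
 */
static inline bool
example_find_byte(const uint8 *base, Size len, uint8 target)
{
	Size		i = 0;

	/* test one vector's worth of bytes at a time */
	for (; i + sizeof(Vector8) <= len; i += sizeof(Vector8))
	{
		Vector8		chunk;

		vector8_load(&chunk, &base[i]);
		if (vector8_has(chunk, target))
			return true;
	}

	/* check any remaining bytes one at a time */
	for (; i < len; i++)
	{
		if (base[i] == target)
			return true;
	}

	return false;
}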
204
205/*
206 * Convenience function equivalent to vector8_has(v, 0)
207 */
208static inline bool
209vector8_has_zero(const Vector8 v)
210{
211#if defined(USE_NO_SIMD)
212 /*
213 * We cannot call vector8_has() here, because that would lead to a
214 * circular definition.
215 */
216 return vector8_has_le(v, 0);
217#else
218 return vector8_has(v, 0);
219#endif
220}
221
222/*
223 * Return true if any elements in the vector are less than or equal to the
224 * given scalar.
225 */
226static inline bool
227vector8_has_le(const Vector8 v, const uint8 c)
228{
229 bool result = false;
230#ifdef USE_SSE2
231 Vector8 umin;
232 Vector8 cmpe;
233#endif
234
235 /* pre-compute the result for assert checking */
236#ifdef USE_ASSERT_CHECKING
237 bool assert_result = false;
238
239 for (Size i = 0; i < sizeof(Vector8); i++)
240 {
241 if (((const uint8 *) &v)[i] <= c)
242 {
243 assert_result = true;
244 break;
245 }
246 }
247#endif /* USE_ASSERT_CHECKING */
248
249#if defined(USE_NO_SIMD)
250
251 /*
252 * To find bytes <= c, we can use bitwise operations to find bytes < c+1,
253 * but it only works if c+1 <= 128 and if the highest bit in v is not set.
254 * Adapted from
255 * https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
256 */
257 if ((int64) v >= 0 && c < 0x80)
258 result = (v - vector8_broadcast(c + 1)) & ~v & vector8_broadcast(0x80);
259 else
260 {
261 /* one byte at a time */
262 for (Size i = 0; i < sizeof(Vector8); i++)
263 {
264 if (((const uint8 *) &v)[i] <= c)
265 {
266 result = true;
267 break;
268 }
269 }
270 }
271#elif defined(USE_SSE2)
272 umin = vector8_min(v, vector8_broadcast(c));
273 cmpe = vector8_eq(umin, v);
274 result = vector8_is_highbit_set(cmpe);
275#elif defined(USE_NEON)
276 result = vminvq_u8(v) <= c;
277#endif
278
279 Assert(assert_result == result);
280 return result;
281}
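/*
 * Illustrative usage sketch (not part of simd.h): this allows rejecting a
 * whole chunk of input that contains an ASCII control character (byte value
 * <= 0x1F) with a single test, roughly the per-chunk check that
 * pg_lfind8_le() in port/pg_lfind.h builds on. The helper name is
 * hypothetical; the caller must supply at least sizeof(Vector8) bytes.
 */
static inline bool
example_chunk_has_control_char(const uint8 *chunk_start)
{
	Vector8		chunk;

	vector8_load(&chunk, chunk_start);
	return vector8_has_le(chunk, 0x1F);
}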
282
283/*
284 * Returns true if any elements in the vector are greater than or equal to the
285 * given scalar.
286 */
287#ifndef USE_NO_SIMD
288static inline bool
289vector8_has_ge(const Vector8 v, const uint8 c)
290{
291#ifdef USE_SSE2
292 Vector8 umax = _mm_max_epu8(v, vector8_broadcast(c));
293 Vector8 cmpe = vector8_eq(umax, v);
294
295 return vector8_is_highbit_set(cmpe);
296#elif defined(USE_NEON)
297 return vmaxvq_u8(v) >= c;
298#endif
299}
300#endif /* ! USE_NO_SIMD */
301
302/*
303 * Return true if the high bit of any element is set
304 */
305static inline bool
306vector8_is_highbit_set(const Vector8 v)
307{
308#ifdef USE_SSE2
309 return _mm_movemask_epi8(v) != 0;
310#elif defined(USE_NEON)
311 return vmaxvq_u8(v) > 0x7F;
312#else
313 return v & vector8_broadcast(0x80);
314#endif
315}
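/*
 * Illustrative usage sketch (not part of simd.h): a set high bit in any byte
 * means a chunk is not pure ASCII, so this gives a one-test "contains a
 * non-ASCII byte" check for a single vector. The helper name is hypothetical;
 * the caller must supply at least sizeof(Vector8) bytes.
 */
static inline bool
example_chunk_has_non_ascii(const uint8 *chunk_start)
{
	Vector8		chunk;

	vector8_load(&chunk, chunk_start);
	return vector8_is_highbit_set(chunk);
}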
316
317/*
318 * Exactly like vector8_is_highbit_set except for the input type, so it
319 * looks at each byte separately.
320 *
321 * XXX x86 uses the same underlying type for 8-bit, 16-bit, and 32-bit
322 * integer elements, but Arm does not, hence the need for a separate
323 * function. We could instead adopt the behavior of Arm's vmaxvq_u32(), i.e.
324 * check each 32-bit element, but that would require an additional mask
325 * operation on x86.
326 */
327#ifndef USE_NO_SIMD
328static inline bool
329vector32_is_highbit_set(const Vector32 v)
330{
331#if defined(USE_NEON)
332 return vmaxvq_u32(v) > 0x7FFFFFFF;
333#else
334 return vector8_is_highbit_set(v);
335#endif
336}
337#endif /* ! USE_NO_SIMD */
338
339/*
340 * Return a bitmask formed from the high-bit of each element.
341 */
342#ifndef USE_NO_SIMD
343static inline uint32
344vector8_highbit_mask(const Vector8 v)
345{
346#ifdef USE_SSE2
347 return (uint32) _mm_movemask_epi8(v);
348#elif defined(USE_NEON)
349 /*
350 * Note: It would be faster to use vget_lane_u64 and vshrn_n_u16, but that
351 * returns a uint64, making it inconvenient to combine mask values from
352 * multiple vectors.
353 */
354 static const uint8 mask[16] = {
355 1 << 0, 1 << 1, 1 << 2, 1 << 3,
356 1 << 4, 1 << 5, 1 << 6, 1 << 7,
357 1 << 0, 1 << 1, 1 << 2, 1 << 3,
358 1 << 4, 1 << 5, 1 << 6, 1 << 7,
359 };
360
361 uint8x16_t masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8((int8x16_t) v, 7));
362 uint8x16_t maskedhi = vextq_u8(masked, masked, 8);
363
364 return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
365#endif
366}
367#endif /* ! USE_NO_SIMD */
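/*
 * Illustrative usage sketch (not part of simd.h, SIMD builds only): combining
 * a comparison result with vector8_highbit_mask() yields one mask bit per
 * byte, so the offset of the first match in a chunk can be recovered from the
 * rightmost set bit, e.g. with pg_rightmost_one_pos32() from
 * port/pg_bitutils.h. The helper name is hypothetical and assumes the chunk
 * contains at least one match.
 */
static inline int
example_first_match_offset(const Vector8 chunk, uint8 target)
{
	uint32		mask = vector8_highbit_mask(vector8_eq(chunk, vector8_broadcast(target)));

	Assert(mask != 0);
	return pg_rightmost_one_pos32(mask);
}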
368
369/*
370 * Return the bitwise OR of the inputs
371 */
372static inline Vector8
373vector8_or(const Vector8 v1, const Vector8 v2)
374{
375#ifdef USE_SSE2
376 return _mm_or_si128(v1, v2);
377#elif defined(USE_NEON)
378 return vorrq_u8(v1, v2);
379#else
380 return v1 | v2;
381#endif
382}
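/*
 * Illustrative usage sketch (not part of simd.h): when scanning a large
 * buffer for non-ASCII input it is often cheaper to OR the chunks together
 * and test the high bits once at the end, roughly the approach taken by
 * is_valid_ascii() in mb/pg_wchar.h. The helper name is hypothetical and
 * assumes "len" is a multiple of sizeof(Vector8).
 */
static inline bool
example_buffer_is_ascii(const uint8 *base, Size len)
{
	Vector8		accum = vector8_broadcast(0);

	for (Size i = 0; i < len; i += sizeof(Vector8))
	{
		Vector8		chunk;

		vector8_load(&chunk, &base[i]);
		accum = vector8_or(accum, chunk);	/* collect any high bits seen */
	}

	return !vector8_is_highbit_set(accum);
}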
383
384#ifndef USE_NO_SIMD
385static inline Vector32
386vector32_or(const Vector32 v1, const Vector32 v2)
387{
388#ifdef USE_SSE2
389 return _mm_or_si128(v1, v2);
390#elif defined(USE_NEON)
391 return vorrq_u32(v1, v2);
392#endif
393}
394#endif /* ! USE_NO_SIMD */
395
396/*
397 * Return the bitwise AND of the inputs.
398 */
399#ifndef USE_NO_SIMD
400static inline Vector8
401vector8_and(const Vector8 v1, const Vector8 v2)
402{
403#ifdef USE_SSE2
404 return _mm_and_si128(v1, v2);
405#elif defined(USE_NEON)
406 return vandq_u8(v1, v2);
407#endif
408}
409#endif /* ! USE_NO_SIMD */
410
411/*
412 * Return the result of adding the respective elements of the input vectors.
413 */
414#ifndef USE_NO_SIMD
415static inline Vector8
416vector8_add(const Vector8 v1, const Vector8 v2)
417{
418#ifdef USE_SSE2
419 return _mm_add_epi8(v1, v2);
420#elif defined(USE_NEON)
421 return vaddq_u8(v1, v2);
422#endif
423}
424#endif /* ! USE_NO_SIMD */
425
426/*
427 * Return the result of subtracting the respective elements of the input
428 * vectors using signed saturation (i.e., if the operation would yield a value
429 * less than -128, -128 is returned instead). For more information on
430 * saturation arithmetic, see
431 * https://en.wikipedia.org/wiki/Saturation_arithmetic
432 */
433#ifndef USE_NO_SIMD
434static inline Vector8
435vector8_issub(const Vector8 v1, const Vector8 v2)
436{
437#ifdef USE_SSE2
438 return _mm_subs_epi8(v1, v2);
439#elif defined(USE_NEON)
440 return (Vector8) vqsubq_s8((int8x16_t) v1, (int8x16_t) v2);
441#endif
442}
443#endif /* ! USE_NO_SIMD */
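/*
 * Illustrative note (not part of simd.h): with signed saturation the result
 * clamps instead of wrapping. For example,
 *
 *	Vector8 r = vector8_issub(vector8_broadcast(0x80), vector8_broadcast(0x01));
 *
 * leaves every lane at 0x80 (-128), because -128 - 1 would underflow and is
 * clamped to -128 rather than wrapping around to 127.
 */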
444
445/*
446 * Return a vector with all bits set in each lane where the corresponding
447 * lanes in the inputs are equal.
448 */
449#ifndef USE_NO_SIMD
450static inline Vector8
451vector8_eq(const Vector8 v1, const Vector8 v2)
452{
453#ifdef USE_SSE2
454 return _mm_cmpeq_epi8(v1, v2);
455#elif defined(USE_NEON)
456 return vceqq_u8(v1, v2);
457#endif
458}
459#endif /* ! USE_NO_SIMD */
460
461#ifndef USE_NO_SIMD
462static inline Vector32
463vector32_eq(const Vector32 v1, const Vector32 v2)
464{
465#ifdef USE_SSE2
466 return _mm_cmpeq_epi32(v1, v2);
467#elif defined(USE_NEON)
468 return vceqq_u32(v1, v2);
469#endif
470}
471#endif /* ! USE_NO_SIMD */
472
473/*
474 * Return a vector with all bits set for each lane of v1 that is greater than
475 * the corresponding lane of v2. NB: The comparison treats the elements as
476 * signed.
477 */
478#ifndef USE_NO_SIMD
479static inline Vector8
480vector8_gt(const Vector8 v1, const Vector8 v2)
481{
482#ifdef USE_SSE2
483 return _mm_cmpgt_epi8(v1, v2);
484#elif defined(USE_NEON)
485 return vcgtq_s8((int8x16_t) v1, (int8x16_t) v2);
486#endif
487}
488#endif /* ! USE_NO_SIMD */
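/*
 * Illustrative note (not part of simd.h): because the comparison is signed,
 * lanes holding 0x80..0xFF compare as negative values. For example,
 *
 *	Vector8 r = vector8_gt(vector8_broadcast(0xFF), vector8_broadcast(0x01));
 *
 * yields all-zero lanes, since 0xFF compares as -1 rather than as 255.
 */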
489
490/*
491 * Given two vectors, return a vector with the minimum element of each.
492 */
493#ifndef USE_NO_SIMD
494static inline Vector8
495vector8_min(const Vector8 v1, const Vector8 v2)
496{
497#ifdef USE_SSE2
498 return _mm_min_epu8(v1, v2);
499#elif defined(USE_NEON)
500 return vminq_u8(v1, v2);
501#endif
502}
503#endif /* ! USE_NO_SIMD */
504
505/*
506 * Interleave elements of low halves (e.g., for SSE2, bits 0-63) of given
507 * vectors. Bytes 0, 2, 4, etc. use v1, and bytes 1, 3, 5, etc. use v2.
508 */
509#ifndef USE_NO_SIMD
510static inline Vector8
511vector8_interleave_low(const Vector8 v1, const Vector8 v2)
512{
513#ifdef USE_SSE2
514 return _mm_unpacklo_epi8(v1, v2);
515#elif defined(USE_NEON)
516 return vzip1q_u8(v1, v2);
517#endif
518}
519#endif /* ! USE_NO_SIMD */
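/*
 * Illustrative usage sketch (not part of simd.h, SIMD builds only):
 * interleaving with a zero vector spreads each byte of the low half into
 * every other byte position, which on little-endian targets amounts to
 * zero-extending those bytes to 16-bit elements. The helper name is
 * hypothetical.
 */
static inline Vector8
example_widen_low_bytes(const Vector8 v)
{
	return vector8_interleave_low(v, vector8_broadcast(0));
}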
520
521/*
522 * Interleave elements of high halves (e.g., for SSE2, bits 64-127) of given
523 * vectors. Bytes 0, 2, 4, etc. use v1, and bytes 1, 3, 5, etc. use v2.
524 */
525#ifndef USE_NO_SIMD
526static inline Vector8
527vector8_interleave_high(const Vector8 v1, const Vector8 v2)
528{
529#ifdef USE_SSE2
530 return _mm_unpackhi_epi8(v1, v2);
531#elif defined(USE_NEON)
532 return vzip2q_u8(v1, v2);
533#endif
534}
535#endif /* ! USE_NO_SIMD */
536
537/*
538 * Pack 16-bit elements in the given vectors into a single vector of 8-bit
539 * elements. The first half of the return vector (e.g., for SSE2, bits 0-63)
540 * uses v1, and the second half (e.g., for SSE2, bits 64-127) uses v2.
541 *
542 * NB: The upper 8-bits of each 16-bit element must be zeros, else this will
543 * produce different results on different architectures.
544 */
545#ifndef USE_NO_SIMD
546static inline Vector8
547vector8_pack_16(const Vector8 v1, const Vector8 v2)
548{
549 Vector8 mask PG_USED_FOR_ASSERTS_ONLY;
550
551 mask = vector8_interleave_low(vector8_broadcast(0), vector8_broadcast(0xff));
552 Assert(!vector8_has_ge(vector8_and(v1, mask), 1));
553 Assert(!vector8_has_ge(vector8_and(v2, mask), 1));
554#ifdef USE_SSE2
555 return _mm_packus_epi16(v1, v2);
556#elif defined(USE_NEON)
557 return vuzp1q_u8(v1, v2);
558#endif
559}
560#endif /* ! USE_NO_SIMD */
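/*
 * Illustrative note (not part of simd.h): the zero-high-byte requirement is
 * what keeps the two implementations equivalent. _mm_packus_epi16() saturates
 * each signed 16-bit value to the range 0..255, while vuzp1q_u8() simply
 * keeps the even-numbered bytes; when the upper byte of every 16-bit element
 * is zero, both reduce to "take the low byte of each element". For example,
 * packing elements 0x0041 and 0x0042 from v1 produces adjacent bytes 0x41 and
 * 0x42 on both architectures, whereas an element like 0x0141 would not.
 */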
561
562/*
563 * Unsigned shift left of each 32-bit element in the vector by "i" bits.
564 *
565 * XXX AArch64 requires an integer literal, so we have to list all expected
566 * values of "i" from all callers in a switch statement. If you add a new
567 * caller, be sure your expected values of "i" are handled.
568 */
569#ifndef USE_NO_SIMD
570static inline Vector8
571vector8_shift_left(const Vector8 v1, int i)
572{
573#ifdef USE_SSE2
574 return _mm_slli_epi32(v1, i);
575#elif defined(USE_NEON)
576 switch (i)
577 {
578 case 4:
579 return (Vector8) vshlq_n_u32((Vector32) v1, 4);
580 default:
581 Assert(false);
582 return vector8_broadcast(0);
583 }
584#endif
585}
586#endif /* ! USE_NO_SIMD */
587
588/*
589 * Unsigned shift right of each 32-bit element in the vector by "i" bits.
590 *
591 * XXX AArch64 requires an integer literal, so we have to list all expected
592 * values of "i" from all callers in a switch statement. If you add a new
593 * caller, be sure your expected values of "i" are handled.
594 */
595#ifndef USE_NO_SIMD
596static inline Vector8
597vector8_shift_right(const Vector8 v1, int i)
598{
599#ifdef USE_SSE2
600 return _mm_srli_epi32(v1, i);
601#elif defined(USE_NEON)
602 switch (i)
603 {
604 case 4:
605 return (Vector8) vshrq_n_u32((Vector32) v1, 4);
606 case 8:
607 return (Vector8) vshrq_n_u32((Vector32) v1, 8);
608 default:
609 Assert(false);
610 return vector8_broadcast(0);
611 }
612#endif
613}
614#endif /* ! USE_NO_SIMD */
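/*
 * Illustrative usage sketch (not part of simd.h, SIMD builds only): because
 * the shifts operate on 32-bit elements, bits from neighboring bytes shift
 * in, so per-byte nibble extraction needs a mask afterwards. A sketch for
 * splitting every byte into its high and low nibble (hypothetical variable
 * names):
 *
 *	Vector8 hi = vector8_and(vector8_shift_right(v, 4), vector8_broadcast(0x0F));
 *	Vector8 lo = vector8_and(v, vector8_broadcast(0x0F));
 */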
615
616#endif /* SIMD_H */