/*-------------------------------------------------------------------------
 *
 * simd.h
 *    Support for platform-specific vector operations.
 *
 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/port/simd.h
 *
 * NOTES
 * - VectorN in this file refers to a register where the element operands
 * are N bits wide. The vector width is platform-specific, so users that care
 * about that will need to inspect "sizeof(VectorN)".
 *
 *-------------------------------------------------------------------------
 */
#ifndef SIMD_H
#define SIMD_H

#if defined(USE_SSE2)
/*
 * We use emmintrin.h rather than the comprehensive header immintrin.h in
 * order to exclude extensions beyond SSE2. This is because MSVC, at least,
 * will allow the use of intrinsics that haven't been enabled at compile
 * time.
 */
#include <emmintrin.h>
typedef __m128i Vector8;
typedef __m128i Vector32;

#elif defined(USE_NEON)
#include <arm_neon.h>
typedef uint8x16_t Vector8;
typedef uint32x4_t Vector32;

#else
/*
 * If no SIMD instructions are available, we can in some cases emulate vector
 * operations using bitwise operations on unsigned integers. Note that many
 * of the functions in this file presently do not have non-SIMD
 * implementations. In particular, none of the functions involving Vector32
 * are implemented without SIMD since it's likely not worthwhile to represent
 * two 32-bit integers using a uint64.
 */
#define USE_NO_SIMD
typedef uint64 Vector8;
#endif
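
/*
 * Illustrative note (editor's sketch, not part of the upstream header): with
 * USE_NO_SIMD, a Vector8 is a plain uint64 treated as eight one-byte lanes.
 * For example, after
 *
 *     Vector8 v;
 *     const uint8 buf[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};
 *     vector8_load(&v, buf);
 *
 * v holds 0x8877665544332211 on a little-endian machine, and the bitwise
 * tricks below then operate on all eight lanes at once.
 */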

/* load/store operations */
static inline void vector8_load(Vector8 *v, const uint8 *s);
#ifndef USE_NO_SIMD
static inline void vector32_load(Vector32 *v, const uint32 *s);
#endif

/* assignment operations */
static inline Vector8 vector8_broadcast(const uint8 c);
#ifndef USE_NO_SIMD
static inline Vector32 vector32_broadcast(const uint32 c);
#endif

/* element-wise comparisons to a scalar */
static inline bool vector8_has(const Vector8 v, const uint8 c);
static inline bool vector8_has_zero(const Vector8 v);
static inline bool vector8_has_le(const Vector8 v, const uint8 c);
static inline bool vector8_is_highbit_set(const Vector8 v);
#ifndef USE_NO_SIMD
static inline bool vector32_is_highbit_set(const Vector32 v);
static inline uint32 vector8_highbit_mask(const Vector8 v);
#endif

/* arithmetic operations */
static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
#ifndef USE_NO_SIMD
static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
#endif

/*
 * comparisons between vectors
 *
 * Note: These return a vector rather than boolean, which is why we don't
 * have non-SIMD implementations.
 */
#ifndef USE_NO_SIMD
static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2);
static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
#endif

/*
 * Load a chunk of memory into the given vector.
 */
static inline void
vector8_load(Vector8 *v, const uint8 *s)
{
#if defined(USE_SSE2)
    *v = _mm_loadu_si128((const __m128i *) s);
#elif defined(USE_NEON)
    *v = vld1q_u8(s);
#else
    memcpy(v, s, sizeof(Vector8));
#endif
}

#ifndef USE_NO_SIMD
static inline void
vector32_load(Vector32 *v, const uint32 *s)
{
#ifdef USE_SSE2
    *v = _mm_loadu_si128((const __m128i *) s);
#elif defined(USE_NEON)
    *v = vld1q_u32(s);
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Store a vector into the given memory address.
 */
#ifndef USE_NO_SIMD
static inline void
vector8_store(uint8 *s, const Vector8 v)
{
#ifdef USE_SSE2
    _mm_storeu_si128((Vector8 *) s, v);
#elif defined(USE_NEON)
    vst1q_u8(s, v);
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Create a vector with all elements set to the same value.
 */
static inline Vector8
vector8_broadcast(const uint8 c)
{
#if defined(USE_SSE2)
    return _mm_set1_epi8(c);
#elif defined(USE_NEON)
    return vdupq_n_u8(c);
#else
    return ~UINT64CONST(0) / 0xFF * c;
#endif
}
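
/*
 * Worked example of the non-SIMD branch above (editor's note): the quotient
 * ~UINT64CONST(0) / 0xFF is 0x0101010101010101, so multiplying by c copies c
 * into every byte lane; vector8_broadcast(0x5A) yields 0x5A5A5A5A5A5A5A5A.
 */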

#ifndef USE_NO_SIMD
static inline Vector32
vector32_broadcast(const uint32 c)
{
#ifdef USE_SSE2
    return _mm_set1_epi32(c);
#elif defined(USE_NEON)
    return vdupq_n_u32(c);
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Return true if any elements in the vector are equal to the given scalar.
 */
static inline bool
vector8_has(const Vector8 v, const uint8 c)
{
    bool        result;

    /* pre-compute the result for assert checking */
#ifdef USE_ASSERT_CHECKING
    bool        assert_result = false;

    for (Size i = 0; i < sizeof(Vector8); i++)
    {
        if (((const uint8 *) &v)[i] == c)
        {
            assert_result = true;
            break;
        }
    }
#endif /* USE_ASSERT_CHECKING */

#if defined(USE_NO_SIMD)
    /* any bytes in v equal to c will evaluate to zero via XOR */
    result = vector8_has_zero(v ^ vector8_broadcast(c));
#else
    result = vector8_is_highbit_set(vector8_eq(v, vector8_broadcast(c)));
#endif

    Assert(assert_result == result);
    return result;
}
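
/*
 * Usage sketch (editor's example, not part of the upstream header): scan a
 * buffer one vector at a time for a given byte, in the style of pg_lfind8().
 * The guard macro and function name are hypothetical; tail bytes that do not
 * fill a whole vector are left unchecked here for brevity.
 */
#ifdef SIMD_USAGE_EXAMPLES
static inline bool
example_buffer_has_byte(const uint8 *base, Size len, uint8 c)
{
    for (Size i = 0; i + sizeof(Vector8) <= len; i += sizeof(Vector8))
    {
        Vector8     chunk;

        vector8_load(&chunk, &base[i]);
        if (vector8_has(chunk, c))
            return true;
    }
    return false;
}
#endif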

/*
 * Convenience function equivalent to vector8_has(v, 0)
 */
static inline bool
vector8_has_zero(const Vector8 v)
{
#if defined(USE_NO_SIMD)
    /*
     * We cannot call vector8_has() here, because that would lead to a
     * circular definition.
     */
    return vector8_has_le(v, 0);
#else
    return vector8_has(v, 0);
#endif
}

/*
 * Return true if any elements in the vector are less than or equal to the
 * given scalar.
 */
static inline bool
vector8_has_le(const Vector8 v, const uint8 c)
{
    bool        result = false;
#ifdef USE_SSE2
    Vector8     umin,
                cmpe;
#endif

    /* pre-compute the result for assert checking */
#ifdef USE_ASSERT_CHECKING
    bool        assert_result = false;

    for (Size i = 0; i < sizeof(Vector8); i++)
    {
        if (((const uint8 *) &v)[i] <= c)
        {
            assert_result = true;
            break;
        }
    }
#endif /* USE_ASSERT_CHECKING */

#if defined(USE_NO_SIMD)

    /*
     * To find bytes <= c, we can use bitwise operations to find bytes < c+1,
     * but it only works if c+1 <= 128 and if the highest bit in v is not set.
     * Adapted from
     * https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
     */
    if ((int64) v >= 0 && c < 0x80)
        result = (v - vector8_broadcast(c + 1)) & ~v & vector8_broadcast(0x80);
    else
    {
        /* one byte at a time */
        for (Size i = 0; i < sizeof(Vector8); i++)
        {
            if (((const uint8 *) &v)[i] <= c)
            {
                result = true;
                break;
            }
        }
    }
#elif defined(USE_SSE2)
    umin = vector8_min(v, vector8_broadcast(c));
    cmpe = vector8_eq(umin, v);
    result = vector8_is_highbit_set(cmpe);
#elif defined(USE_NEON)
    result = vminvq_u8(v) <= c;
#endif

    Assert(assert_result == result);
    return result;
}
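
/*
 * Worked example of the bithack above (editor's note): take c = 0x20 and a
 * lane holding 0x0A. Subtracting c + 1 borrows (0x0A - 0x21 = 0xE9), so the
 * lane's high bit is set, ~0x0A keeps it, and the 0x80 mask reports a match.
 * For a lane holding 0x41 there is no borrow (0x41 - 0x21 = 0x20), so the
 * high bit stays clear and that lane reports no match.
 */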

/*
 * Returns true if any elements in the vector are greater than or equal to the
 * given scalar.
 */
#ifndef USE_NO_SIMD
static inline bool
vector8_has_ge(const Vector8 v, const uint8 c)
{
#ifdef USE_SSE2
    Vector8     umax = _mm_max_epu8(v, vector8_broadcast(c));
    Vector8     cmpe = vector8_eq(umax, v);

    return vector8_is_highbit_set(cmpe);
#elif defined(USE_NEON)
    return vmaxvq_u8(v) >= c;
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Return true if the high bit of any element is set
 */
static inline bool
vector8_is_highbit_set(const Vector8 v)
{
#ifdef USE_SSE2
    return _mm_movemask_epi8(v) != 0;
#elif defined(USE_NEON)
    return vmaxvq_u8(v) > 0x7F;
#else
    return v & vector8_broadcast(0x80);
#endif
}

/*
 * Exactly like vector8_is_highbit_set except for the input type, so it
 * looks at each byte separately.
 *
 * XXX x86 uses the same underlying type for 8-bit, 16-bit, and 32-bit
 * integer elements, but Arm does not, hence the need for a separate
 * function. We could instead adopt the behavior of Arm's vmaxvq_u32(), i.e.
 * check each 32-bit element, but that would require an additional mask
 * operation on x86.
 */
#ifndef USE_NO_SIMD
static inline bool
vector32_is_highbit_set(const Vector32 v)
{
#if defined(USE_NEON)
    return vector8_is_highbit_set((Vector8) v);
#else
    return vector8_is_highbit_set(v);
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Return a bitmask formed from the high-bit of each element.
 */
#ifndef USE_NO_SIMD
static inline uint32
vector8_highbit_mask(const Vector8 v)
{
#ifdef USE_SSE2
    return (uint32) _mm_movemask_epi8(v);
#elif defined(USE_NEON)
    /*
     * Note: It would be faster to use vget_lane_u64 and vshrn_n_u16, but that
     * returns a uint64, making it inconvenient to combine mask values from
     * multiple vectors.
     */
    static const uint8 mask[16] = {
        1 << 0, 1 << 1, 1 << 2, 1 << 3,
        1 << 4, 1 << 5, 1 << 6, 1 << 7,
        1 << 0, 1 << 1, 1 << 2, 1 << 3,
        1 << 4, 1 << 5, 1 << 6, 1 << 7,
    };

    uint8x16_t  masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8((int8x16_t) v, 7));
    uint8x16_t  maskedhi = vextq_u8(masked, masked, 8);

    return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
#endif
}
#endif /* ! USE_NO_SIMD */
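
/*
 * Worked example (editor's note): if only lanes 0 and 3 of v have their high
 * bit set, vector8_highbit_mask(v) returns 0b1001 (0x9), matching what
 * _mm_movemask_epi8 produces on x86. The NEON branch builds the same value
 * by masking each sign-extended lane down to a distinct bit and summing.
 */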

/*
 * Return the bitwise OR of the inputs
 */
static inline Vector8
vector8_or(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_or_si128(v1, v2);
#elif defined(USE_NEON)
    return vorrq_u8(v1, v2);
#else
    return v1 | v2;
#endif
}

#ifndef USE_NO_SIMD
static inline Vector32
vector32_or(const Vector32 v1, const Vector32 v2)
{
#ifdef USE_SSE2
    return _mm_or_si128(v1, v2);
#elif defined(USE_NEON)
    return vorrq_u32(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Return the bitwise AND of the inputs.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_and(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_and_si128(v1, v2);
#elif defined(USE_NEON)
    return vandq_u8(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Return the result of adding the respective elements of the input vectors.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_add(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_add_epi8(v1, v2);
#elif defined(USE_NEON)
    return vaddq_u8(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Return the result of subtracting the respective elements of the input
 * vectors using signed saturation (i.e., if the operation would yield a value
 * less than -128, -128 is returned instead). For more information on
 * saturation arithmetic, see
 * https://en.wikipedia.org/wiki/Saturation_arithmetic
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_issub(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_subs_epi8(v1, v2);
#elif defined(USE_NEON)
    return (Vector8) vqsubq_s8((int8x16_t) v1, (int8x16_t) v2);
#endif
}
#endif /* ! USE_NO_SIMD */
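
/*
 * Worked example (editor's note): with lanes read as signed bytes, a lane
 * where v1 = 0x80 (-128) and v2 = 0x01 yields -128 - 1, which saturates to
 * -128 (0x80) instead of wrapping around to +127 (0x7F).
 */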

/*
 * Return a vector with all bits set in each lane where the corresponding
 * lanes in the inputs are equal.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_eq(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_cmpeq_epi8(v1, v2);
#elif defined(USE_NEON)
    return vceqq_u8(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */

#ifndef USE_NO_SIMD
static inline Vector32
vector32_eq(const Vector32 v1, const Vector32 v2)
{
#ifdef USE_SSE2
    return _mm_cmpeq_epi32(v1, v2);
#elif defined(USE_NEON)
    return vceqq_u32(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Return a vector with all bits set for each lane of v1 that is greater than
 * the corresponding lane of v2. NB: The comparison treats the elements as
 * signed.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_gt(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_cmpgt_epi8(v1, v2);
#elif defined(USE_NEON)
    return vcgtq_s8((int8x16_t) v1, (int8x16_t) v2);
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Given two vectors, return a vector with the minimum element of each.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_min(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_min_epu8(v1, v2);
#elif defined(USE_NEON)
    return vminq_u8(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Interleave elements of low halves (e.g., for SSE2, bits 0-63) of given
 * vectors. Bytes 0, 2, 4, etc. use v1, and bytes 1, 3, 5, etc. use v2.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_interleave_low(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_unpacklo_epi8(v1, v2);
#elif defined(USE_NEON)
    return vzip1q_u8(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Interleave elements of high halves (e.g., for SSE2, bits 64-127) of given
 * vectors. Bytes 0, 2, 4, etc. use v1, and bytes 1, 3, 5, etc. use v2.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_interleave_high(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_unpackhi_epi8(v1, v2);
#elif defined(USE_NEON)
    return vzip2q_u8(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */

/*
 * Pack 16-bit elements in the given vectors into a single vector of 8-bit
 * elements. The first half of the return vector (e.g., for SSE2, bits 0-63)
 * uses v1, and the second half (e.g., for SSE2, bits 64-127) uses v2.
 *
 * NB: The upper 8-bits of each 16-bit element must be zeros, else this will
 * produce different results on different architectures.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_pack_16(const Vector8 v1, const Vector8 v2)
{
    Vector8     mask PG_USED_FOR_ASSERTS_ONLY;

    mask = vector8_interleave_low(vector8_broadcast(0), vector8_broadcast(0xFF));
    Assert(!vector8_has_ge(vector8_and(v1, mask), 1));
    Assert(!vector8_has_ge(vector8_and(v2, mask), 1));
#ifdef USE_SSE2
    return _mm_packus_epi16(v1, v2);
#elif defined(USE_NEON)
    return vuzp1q_u8(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */
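
/*
 * Worked example (editor's note) of why the upper bytes must be zero: for a
 * 16-bit element holding 0x1234, SSE2's _mm_packus_epi16 saturates it to
 * 0xFF, while NEON's vuzp1q_u8 simply keeps the low byte 0x34. With the
 * upper byte zero (e.g., 0x0034), both produce 0x34 and agree.
 */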

/*
 * Unsigned shift left of each 32-bit element in the vector by "i" bits.
 *
 * XXX AArch64 requires an integer literal, so we have to list all expected
 * values of "i" from all callers in a switch statement. If you add a new
 * caller, be sure your expected values of "i" are handled.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_shift_left(const Vector8 v1, int i)
{
#ifdef USE_SSE2
    return _mm_slli_epi32(v1, i);
#elif defined(USE_NEON)
    switch (i)
    {
        case 4:
            return (Vector8) vshlq_n_u32((Vector32) v1, 4);
        default:
            Assert(false);
            return vector8_broadcast(0);
    }
#endif
}
#endif /* ! USE_NO_SIMD */
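
/*
 * Usage note (editor's example): on AArch64 the shift amount must resolve to
 * one of the literal cases above, so callers write e.g.
 *
 *     x = vector8_shift_left(x, 4);
 *
 * and adding a caller with a new shift amount requires extending the switch.
 */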

/*
 * Unsigned shift right of each 32-bit element in the vector by "i" bits.
 *
 * XXX AArch64 requires an integer literal, so we have to list all expected
 * values of "i" from all callers in a switch statement. If you add a new
 * caller, be sure your expected values of "i" are handled.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_shift_right(const Vector8 v1, int i)
{
#ifdef USE_SSE2
    return _mm_srli_epi32(v1, i);
#elif defined(USE_NEON)
    switch (i)
    {
        case 4:
            return (Vector8) vshrq_n_u32((Vector32) v1, 4);
        case 8:
            return (Vector8) vshrq_n_u32((Vector32) v1, 8);
        default:
            Assert(false);
            return vector8_broadcast(0);
    }
#endif
}
#endif /* ! USE_NO_SIMD */

#endif /* SIMD_H */