#if (defined(__x86_64__) || defined(_M_AMD64))
typedef __m128i Vector32;
#elif defined(__aarch64__) && defined(__ARM_NEON)
typedef uint32x4_t Vector32;
static inline void vector32_load(Vector32 *v, const uint32 *s);
static inline Vector32 vector32_broadcast(const uint32 c);
static inline bool vector32_is_highbit_set(const Vector32 v);
static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
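/*
 * Illustrative sketch only (not part of simd.h; available only when a SIMD
 * implementation is in use, i.e. not with USE_NO_SIMD): the Vector32 helpers
 * above are typically combined as load + broadcast + eq + is_highbit_set to
 * test whether any 32-bit lane of a chunk equals a key.  chunk_has_key is a
 * hypothetical name used for this example.
 */
static inline bool
chunk_has_key(const uint32 *chunk, uint32 key)
{
    Vector32    vals;
    Vector32    keys = vector32_broadcast(key);

    /* chunk must provide at least sizeof(Vector32) bytes (4 uint32s here) */
    vector32_load(&vals, chunk);
    return vector32_is_highbit_set(vector32_eq(vals, keys));
}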
    *v = _mm_loadu_si128((const __m128i *) s);
#elif defined(USE_NEON)
vector32_load(Vector32 *v, const uint32 *s)
    *v = _mm_loadu_si128((const __m128i *) s);
#elif defined(USE_NEON)
    _mm_storeu_si128((Vector8 *) s, v);
#elif defined(USE_NEON)
    return _mm_set1_epi8(c);
#elif defined(USE_NEON)
    return vdupq_n_u8(c);
#else
    /* no-SIMD fallback: 0x0101...01 * c replicates the byte c into all 8 lanes */
    return ~UINT64CONST(0) / 0xFF * c;
static inline Vector32
vector32_broadcast(const uint32 c)
    return _mm_set1_epi32(c);
#elif defined(USE_NEON)
    return vdupq_n_u32(c);
#ifdef USE_ASSERT_CHECKING
    bool assert_result = false;
        if (((const uint8 *) &v)[i] == c)
            assert_result = true;
#if defined(USE_NO_SIMD)
    Assert(assert_result == result);
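/*
 * Illustrative sketch only (modeled loosely on pg_lfind8() in
 * port/pg_lfind.h, not taken from simd.h): scan a buffer one register at a
 * time with vector8_has(), then fall back to a byte-wise loop for the tail.
 * buf_contains_byte is a hypothetical helper name for this example.
 */
static inline bool
buf_contains_byte(const uint8 *buf, size_t len, uint8 key)
{
    size_t      i = 0;

    /* process sizeof(Vector8) bytes per iteration */
    for (; i + sizeof(Vector8) <= len; i += sizeof(Vector8))
    {
        Vector8     chunk;

        vector8_load(&chunk, &buf[i]);
        if (vector8_has(chunk, key))
            return true;
    }

    /* remaining tail, one byte at a time */
    for (; i < len; i++)
    {
        if (buf[i] == key)
            return true;
    }
    return false;
}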
#if defined(USE_NO_SIMD)
#ifdef USE_ASSERT_CHECKING
    bool assert_result = false;
        if (((const uint8 *) &v)[i] <= c)
            assert_result = true;
#if defined(USE_NO_SIMD)
    /*
     * No-SIMD path: bytes <= c can be found with the "HasLessInWord" bit
     * trick (see Bit Twiddling Hacks), but only when c + 1 <= 0x80 and the
     * high bit of v is not set; otherwise fall back to a byte-wise check.
     */
    if ((int64) v >= 0 && c < 0x80)
            if (((const uint8 *) &v)[i] <= c)
#elif defined(USE_SSE2)
    cmpe = vector8_eq(umin, v);
#elif defined(USE_NEON)
    result = vminvq_u8(v) <= c;
    Assert(assert_result == result);
    Vector8 cmpe = vector8_eq(umax, v);
#elif defined(USE_NEON)
    return vmaxvq_u8(v) >= c;
    return _mm_movemask_epi8(v) != 0;
#elif defined(USE_NEON)
    return vmaxvq_u8(v) > 0x7F;
vector32_is_highbit_set(const Vector32 v)
vector8_highbit_mask(const Vector8 v)
    return (uint32) _mm_movemask_epi8(v);
#elif defined(USE_NEON)
    static const uint8 mask[16] = {
        1 << 0, 1 << 1, 1 << 2, 1 << 3,
        1 << 4, 1 << 5, 1 << 6, 1 << 7,
        1 << 0, 1 << 1, 1 << 2, 1 << 3,
        1 << 4, 1 << 5, 1 << 6, 1 << 7,
    };

    uint8x16_t masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8((int8x16_t) v, 7));
    uint8x16_t maskedhi = vextq_u8(masked, masked, 8);

    return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
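/*
 * Illustrative sketch only (not part of simd.h; requires a SIMD
 * implementation): vector8_highbit_mask() collapses a byte-wise comparison
 * into a bitmask with one bit per lane, so the position of the first match
 * can be recovered.  first_match_index is a hypothetical helper; it returns
 * sizeof(Vector8) when no byte matches.
 */
static inline int
first_match_index(const Vector8 v, const uint8 c)
{
    uint32      mask = vector8_highbit_mask(vector8_eq(v, vector8_broadcast(c)));

    for (int i = 0; i < (int) sizeof(Vector8); i++)
    {
        if (mask & ((uint32) 1 << i))
            return i;           /* bit i corresponds to byte lane i */
    }
    return (int) sizeof(Vector8);
}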
    return _mm_or_si128(v1, v2);
#elif defined(USE_NEON)
    return vorrq_u8(v1, v2);
static inline Vector32
vector32_or(const Vector32 v1, const Vector32 v2)
    return _mm_or_si128(v1, v2);
#elif defined(USE_NEON)
    return vorrq_u32(v1, v2);
    return _mm_and_si128(v1, v2);
#elif defined(USE_NEON)
    return vandq_u8(v1, v2);
    return _mm_add_epi8(v1, v2);
#elif defined(USE_NEON)
    return vaddq_u8(v1, v2);
    return _mm_subs_epi8(v1, v2);
#elif defined(USE_NEON)
    return (Vector8) vqsubq_s8((int8x16_t) v1, (int8x16_t) v2);
    return _mm_cmpeq_epi8(v1, v2);
#elif defined(USE_NEON)
    return vceqq_u8(v1, v2);
static inline Vector32
vector32_eq(const Vector32 v1, const Vector32 v2)
    return _mm_cmpeq_epi32(v1, v2);
#elif defined(USE_NEON)
    return vceqq_u32(v1, v2);
    return _mm_cmpgt_epi8(v1, v2);
#elif defined(USE_NEON)
    return vcgtq_s8((int8x16_t) v1, (int8x16_t) v2);
    return _mm_min_epu8(v1, v2);
#elif defined(USE_NEON)
    return vminq_u8(v1, v2);
    return _mm_unpacklo_epi8(v1, v2);
#elif defined(USE_NEON)
    return vzip1q_u8(v1, v2);
    return _mm_unpackhi_epi8(v1, v2);
#elif defined(USE_NEON)
    return vzip2q_u8(v1, v2);
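/*
 * Reference-semantics sketch only (not from simd.h): interleaving the "low"
 * halves of two 16-byte vectors produces v1[0], v2[0], v1[1], v2[1], ...,
 * v1[7], v2[7]; the "high" variant does the same for lanes 8..15.  The
 * scalar helper below is hypothetical and only spells out that ordering.
 */
static inline void
scalar_interleave_low(const uint8 *v1, const uint8 *v2, uint8 *out)
{
    for (int i = 0; i < 8; i++)
    {
        out[2 * i] = v1[i];         /* even slots come from v1 */
        out[2 * i + 1] = v2[i];     /* odd slots come from v2 */
    }
}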
    Assert(!vector8_has_ge(vector8_and(v1, mask), 1));
    Assert(!vector8_has_ge(vector8_and(v2, mask), 1));
    return _mm_packus_epi16(v1, v2);
#elif defined(USE_NEON)
    return vuzp1q_u8(v1, v2);
vector8_shift_left(const Vector8 v1, int i)
    return _mm_slli_epi32(v1, i);
#elif defined(USE_NEON)
    return (Vector8) vshlq_n_u32((Vector32) v1, 4);
vector8_shift_right(const Vector8 v1, int i)
    return _mm_srli_epi32(v1, i);
#elif defined(USE_NEON)
    return (Vector8) vshrq_n_u32((Vector32) v1, 4);
    return (Vector8) vshrq_n_u32((Vector32) v1, 8);
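/*
 * Note (an assumption, not shown in the fragment above): _mm_slli_epi32()
 * and _mm_srli_epi32() accept a runtime shift count, but the NEON
 * immediate-shift intrinsics vshlq_n_u32()/vshrq_n_u32() require a
 * compile-time constant.  That is presumably why the NEON branches return
 * results for the specific shift amounts 4 and 8 rather than shifting by i
 * directly; the dispatch on i is omitted from this excerpt.
 */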
static inline bool vector8_has_le(const Vector8 v, const uint8 c);
static inline Vector8 vector8_broadcast(const uint8 c);
static inline void vector8_load(Vector8 *v, const uint8 *s);
static inline bool vector8_has_zero(const Vector8 v);
static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
static inline bool vector8_is_highbit_set(const Vector8 v);
static inline bool vector8_has(const Vector8 v, const uint8 c);