PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
pg_crc32c_sse42.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * pg_crc32c_sse42.c
4 * Compute CRC-32C checksum using Intel SSE 4.2 or AVX-512 instructions.
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/port/pg_crc32c_sse42.c
12 *
13 *-------------------------------------------------------------------------
14 */
15#include "c.h"
16
17#include <nmmintrin.h>
18#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
19#include <immintrin.h>
20#endif
21
22#include "port/pg_crc32c.h"
23
25pg_attribute_target("sse4.2")
27pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
28{
29 const unsigned char *p = data;
30 const unsigned char *pend = p + len;
31
32 /*
33 * Process eight bytes of data at a time.
34 *
35 * NB: We do unaligned accesses here. The Intel architecture allows that,
36 * and performance testing didn't show any performance gain from aligning
37 * the begin address.
38 */
39#ifdef __x86_64__
40 while (p + 8 <= pend)
41 {
42 crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
43 p += 8;
44 }
45
46 /* Process remaining full four bytes if any */
47 if (p + 4 <= pend)
48 {
49 crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
50 p += 4;
51 }
52#else
53
54 /*
55 * Process four bytes at a time. (The eight byte instruction is not
56 * available on the 32-bit x86 architecture).
57 */
58 while (p + 4 <= pend)
59 {
60 crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
61 p += 4;
62 }
63#endif /* __x86_64__ */
64
65 /* Process any remaining bytes one at a time. */
66 while (p < pend)
67 {
68 crc = _mm_crc32_u8(crc, *p);
69 p++;
70 }
71
72 return crc;
73}
74
75#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
76
77/*
78 * Note: There is no copyright notice in the following generated code.
79 *
80 * We have modified the output to
81 * - match our function declaration
82 * - match whitespace to our project style
83 * - add a threshold for the alignment stanza
84 */
85
86/* Generated by https://github.com/corsix/fast-crc32/ using: */
87/* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */
88/* MIT licensed */
89
/*
 * Carry-less multiply helpers for the folding code below.  The immediate
 * picks which 64-bit half of each 128-bit lane is taken from each operand:
 * 0x00 multiplies the two low halves, 0x11 the two high halves.
 */
#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0x00))
#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 0x11))
92
93pg_attribute_target("vpclmulqdq,avx512vl")
95pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t length)
96{
97 /* adjust names to match generated code */
98 pg_crc32c crc0 = crc;
99 size_t len = length;
100 const char *buf = data;
101
102 /* Align on cacheline boundary. The threshold is somewhat arbitrary. */
103 if (unlikely(len > 256))
104 {
105 for (; len && ((uintptr_t) buf & 7); --len)
106 crc0 = _mm_crc32_u8(crc0, *buf++);
107 while (((uintptr_t) buf & 56) && len >= 8)
108 {
109 crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
110 buf += 8;
111 len -= 8;
112 }
113 }
114
115 if (len >= 64)
116 {
117 const char *end = buf + len;
118 const char *limit = buf + len - 64;
119 __m128i z0;
120
121 /* First vector chunk. */
122 __m512i x0 = _mm512_loadu_si512((const void *) buf),
123 y0;
124 __m512i k;
125
126 k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0));
127 x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
128 buf += 64;
129
130 /* Main loop. */
131 while (buf <= limit)
132 {
133 y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
134 x0 = _mm512_ternarylogic_epi64(x0, y0,
135 _mm512_loadu_si512((const void *) buf),
136 0x96);
137 buf += 64;
138 }
139
140 /* Reduce 512 bits to 128 bits. */
141 k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0,
142 0x3da6d0cb, 0, 0xba4fc28e, 0,
143 0xf20c0dfe, 0, 0x493c7d27, 0,
144 0, 0, 0, 0);
145 y0 = clmul_lo(x0, k), k = clmul_hi(x0, k);
146 y0 = _mm512_xor_si512(y0, k);
147 z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0),
148 _mm512_extracti32x4_epi32(y0, 1),
149 _mm512_extracti32x4_epi32(y0, 2),
150 0x96);
151 z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));
152
153 /* Reduce 128 bits to 32 bits, and multiply by x^32. */
154 crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
155 crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));
156 len = end - buf;
157 }
158
159 return pg_comp_crc32c_sse42(crc0, buf, len);
160}
161
162#endif
uint64_t uint64
Definition: c.h:503
#define unlikely(x)
Definition: c.h:347
uint32_t uint32
Definition: c.h:502
#define pg_attribute_target(...)
Definition: c.h:213
uint32 pg_crc32c
Definition: pg_crc32c.h:38
const void size_t len
const void * data
return crc
pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2") pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc
const unsigned char * pend
static char * buf
Definition: pg_test_fsync.c:72