PostgreSQL Source Code git master
Loading...
Searching...
No Matches
pg_crc32c_sse42.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * pg_crc32c_sse42.c
4 * Compute CRC-32C checksum using Intel SSE 4.2 or AVX-512 instructions.
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/port/pg_crc32c_sse42.c
12 *
13 *-------------------------------------------------------------------------
14 */
15#include "c.h"
16
17#include <nmmintrin.h>
18#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
19#include <immintrin.h>
20#endif
21
22#include "port/pg_cpu.h"
23#include "port/pg_crc32c.h"
24
25static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len);
26
31{
32 const unsigned char *p = data;
33 const unsigned char *pend = p + len;
34
35 /*
36 * Process eight bytes of data at a time.
37 *
38 * NB: We do unaligned accesses here. The Intel architecture allows that,
39 * and performance testing didn't show any performance gain from aligning
40 * the begin address.
41 */
42#ifdef __x86_64__
43 while (p + 8 <= pend)
44 {
45 crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
46 p += 8;
47 }
48
49 /* Process remaining full four bytes if any */
50 if (p + 4 <= pend)
51 {
52 crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
53 p += 4;
54 }
55#else
56
57 /*
58 * Process four bytes at a time. (The eight byte instruction is not
59 * available on the 32-bit x86 architecture).
60 */
61 while (p + 4 <= pend)
62 {
63 crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
64 p += 4;
65 }
66#endif /* __x86_64__ */
67
68 /* Process any remaining bytes one at a time. */
69 while (p < pend)
70 {
71 crc = _mm_crc32_u8(crc, *p);
72 p++;
73 }
74
75 return crc;
76}
77
78#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
79
80/*
81 * Note: There is no copyright notice in the following generated code.
82 *
83 * We have modified the output to
84 * - match our function declaration
85 * - match whitespace to our project style
86 * - add a threshold for the alignment stanza
87 */
88
89/* Generated by https://github.com/corsix/fast-crc32/ using: */
90/* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */
91/* MIT licensed */
92
/*
 * Carry-less multiply wrappers around _mm512_clmulepi64_epi128.  Per the
 * intrinsic's definition, immediate 0 multiplies the low 64-bit halves of
 * each 128-bit lane of a and b, and immediate 17 (0x11) multiplies the high
 * 64-bit halves.
 */
93#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0))
94#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17))
95
/*
 * CRC-32C routine built on AVX-512 carry-less multiplication for 64-byte
 * chunks, with the scalar CRC32 instructions used for the alignment
 * prologue.
 *
 * NOTE(review): this listing has lost the lines that carry the return type,
 * function name and parameter list (listing lines 97-98), the declarations
 * of crc0 and len (listing line 101), and several statements further down.
 * Each gap is marked below.  Restore them from the real file before relying
 * on this text.
 */
96pg_attribute_target("vpclmulqdq,avx512vl")
99{
100 /* adjust names to match generated code */
/* NOTE(review): listing line 101 is missing; crc0 and len, used below, are presumably declared there -- confirm. */
102 const char *buf = data;
103
104 /* Align on cacheline boundary. The threshold is somewhat arbitrary. */
105 if (unlikely(len > 256))
106 {
/* Consume single bytes until buf is 8-byte aligned (low three address bits clear). */
107 for (; len && ((uintptr_t) buf & 7); --len)
108 crc0 = _mm_crc32_u8(crc0, *buf++);
/*
 * Then consume eight bytes at a time until address bits 3..5 are clear as
 * well, i.e. buf is 64-byte aligned, or fewer than eight bytes remain.
 */
109 while (((uintptr_t) buf & 56) && len >= 8)
110 {
111 crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
112 buf += 8;
113 len -= 8;
114 }
115 }
116
/* Vector path: only taken when at least one full 64-byte chunk is left. */
117 if (len >= 64)
118 {
119 const char *end = buf + len;
/* Highest address from which a full 64-byte load still stays inside the buffer. */
120 const char *limit = buf + len - 64;
121 __m128i z0;
122
123 /* First vector chunk. */
124 __m512i x0 = _mm512_loadu_si512((const void *) buf),
125 y0;
126 __m512i k;
127
/* The same 128-bit constant is replicated into all four lanes of k. */
128 k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0));
/*
 * NOTE(review): listing line 129 is missing.  No surviving line combines
 * crc0 with x0, so presumably that happens there -- confirm.
 */
130 buf += 64;
131
132 /* Main loop. */
133 while (buf <= limit)
134 {
/* Multiply the low and high halves of every lane of the accumulator by k. */
135 y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
/*
 * NOTE(review): listing line 136, the start of the statement whose tail
 * follows, is missing.  Presumably it merges x0, y0 and the next 64 input
 * bytes -- confirm.
 */
137 _mm512_loadu_si512((const void *) buf),
138 0x96);
139 buf += 64;
140 }
141
142 /* Reduce 512 bits to 128 bits. */
/* Unlike the loop constant, k now holds a different value pair per lane; the last lane is zero. */
143 k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0,
144 0x3da6d0cb, 0, 0xba4fc28e, 0,
145 0xf20c0dfe, 0, 0x493c7d27, 0,
146 0, 0, 0, 0);
147 y0 = clmul_lo(x0, k), k = clmul_hi(x0, k);
148 y0 = _mm512_xor_si512(y0, k);
/*
 * NOTE(review): listing lines 149-151 and 153 are missing; only the
 * trailing "0x96);" of one statement survives.  z0 is declared above but no
 * surviving line assigns it, so it is presumably computed here -- confirm.
 */
152 0x96);
154
155 /* Reduce 128 bits to 32 bits, and multiply by x^32. */
/* NOTE(review): listing lines 156-157, which carry this reduction, are missing. */
/* Number of bytes left after the last full 64-byte chunk. */
158 len = end - buf;
159 }
160
/*
 * NOTE(review): listing line 161 is missing; presumably it processes the
 * remaining len bytes and returns the result -- confirm.
 */
162}
163
164#endif /* USE_AVX512_CRC32C_WITH_RUNTIME_CHECK */
165
166/*
167 * This gets called on the first call. It replaces the function pointer
168 * so that subsequent calls are routed directly to the chosen implementation.
169 */
170static pg_crc32c
/*
 * NOTE(review): listing line 171 (the declarator) is missing here.  The
 * forward declaration near the top of the file gives it as
 * pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len).
 */
172{
173 /*
174 * Set fallback. We must guard since slicing-by-8 is not visible
175 * everywhere.
176 */
177#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
/*
 * NOTE(review): listing line 178, the statement guarded by this #ifdef, is
 * missing.  Going by the comment above it presumably installs the
 * slicing-by-8 implementation as the fallback -- confirm.
 */
179#endif
180
/*
 * NOTE(review): listing lines 181-182 are missing; presumably the SSE 4.2
 * implementation is selected here -- confirm.
 */
183
184#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
/*
 * NOTE(review): listing lines 185-187 are missing; presumably the AVX-512
 * implementation is selected here when a runtime check passes -- confirm.
 */
188#endif
189
/*
 * Serve this first call by dispatching through the pg_comp_crc32c function
 * pointer with the caller's arguments; the result is returned unchanged.
 */
190 return pg_comp_crc32c(crc, data, len);
191}
192
uint64_t uint64
Definition c.h:559
#define unlikely(x)
Definition c.h:424
uint32_t uint32
Definition c.h:558
#define pg_attribute_target(...)
Definition c.h:224
uint32 pg_crc32c
Definition pg_crc32c.h:38
pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len)
const void size_t len
const void * data
return crc
pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2") pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc
const unsigned char * pend
pg_crc32c(* pg_comp_crc32c)(pg_crc32c crc, const void *data, size_t len)
static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
static char buf[DEFAULT_XLOG_SEG_SIZE]
static int fb(int x)