PostgreSQL Source Code git master
Loading...
Searching...
No Matches
pg_crc32c_armv8.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * pg_crc32c_armv8.c
4 * Compute CRC-32C checksum using ARMv8 CRC Extension instructions
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/port/pg_crc32c_armv8.c
12 *
13 *-------------------------------------------------------------------------
14 */
15#include "c.h"
16
17#ifdef _MSC_VER
18#include <intrin.h>
19#else
20#include <arm_acle.h>
21#endif
22
23#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
24#include <arm_neon.h>
25#endif
26
27#include "port/pg_crc32c.h"
28
/*
 * NOTE(review): the function header is not present in this listing -- the
 * embedded line numbers jump from 28 to 31.  The cross-reference index at the
 * end of the page lists
 *     pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 * which matches the names used below (crc, data, len); presumably that is the
 * missing header -- confirm against the original file.
 *
 * Folds the len bytes starting at data into the running value crc with the
 * __crc32cb/__crc32ch/__crc32cw/__crc32cd intrinsics (1, 2, 4 and 8 input
 * bytes per call, respectively) and returns the updated value.  The input is
 * consumed in three phases: a short alignment preamble, an
 * eight-bytes-at-a-time main loop, and a tail of at most seven bytes.
 */
31{
32 const unsigned char *p = data;
33 const unsigned char *pend = p + len;
34
35 /*
36 * ARMv8 doesn't require alignment, but aligned memory access is
37 * significantly faster. Process leading bytes so that the loop below
38 * starts with a pointer aligned to eight bytes.
39 */
/*
 * Each of the three steps below is taken only if p is not yet aligned for
 * the next wider type AND at least as many bytes remain as the step would
 * consume; on short inputs a step may therefore be skipped even though p is
 * still misaligned.  (PointerIsAligned comes from c.h; presumably it tests
 * the address against the size of the given type -- confirm there.)
 */
40 if (!PointerIsAligned(p, uint16) &&
41 p + 1 <= pend)
42 {
43 crc = __crc32cb(crc, *p);
44 p += 1;
45 }
46 if (!PointerIsAligned(p, uint32) &&
47 p + 2 <= pend)
48 {
49 crc = __crc32ch(crc, *(const uint16 *) p);
50 p += 2;
51 }
52 if (!PointerIsAligned(p, uint64) &&
53 p + 4 <= pend)
54 {
55 crc = __crc32cw(crc, *(const uint32 *) p);
56 p += 4;
57 }
58
59 /* Process eight bytes at a time, as far as we can. */
60 while (p + 8 <= pend)
61 {
62 crc = __crc32cd(crc, *(const uint64 *) p);
63 p += 8;
64 }
65
66 /* Process remaining 0-7 bytes. */
/*
 * Fewer than eight bytes are left at this point; they are emitted as at most
 * one 4-byte, one 2-byte and one 1-byte step, in that order.
 */
67 if (p + 4 <= pend)
68 {
69 crc = __crc32cw(crc, *(const uint32 *) p);
70 p += 4;
71 }
72 if (p + 2 <= pend)
73 {
74 crc = __crc32ch(crc, *(const uint16 *) p);
75 p += 2;
76 }
77 if (p < pend)
78 {
79 crc = __crc32cb(crc, *p);
80 }
81
82 return crc;
83}
84
85#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
86
87/*
88 * Note: There is no copyright notice in the following generated code.
89 *
90 * We have modified the output to
91 * - match our function declaration
92 * - match whitespace to our project style
93 * - be more friendly for pgindent
94 * - exit early for small inputs
95 */
96
97/* Generated by https://github.com/corsix/fast-crc32/ using: */
98/* ./generate -i neon -p crc32c -a v4e */
99/* MIT licensed */
100
/*
 * NOTE(review): listing lines 102-104 are missing here; presumably they hold
 * the return type, name and parameter list.  The body uses a, b and c and
 * returns a uint64x2_t.  Being the first of the two helpers and using PMULL
 * rather than PMULL2, this is presumably clmul_lo_e(), which is called from
 * the vector routine further down -- confirm against the original file.
 *
 * Computes (a PMULL b) EOR c in two instructions: PMULL on the .1d
 * arrangement of a and b (the low 64-bit lanes, per the Arm ISA), followed
 * by a byte-wise EOR of the 128-bit product with c.  The result is returned
 * in r.
 */
101pg_attribute_target("+crypto")
105{
106 uint64x2_t r;
107
/*
 * Operands: %0 = r (output), %1 = c, %2 = a, %3 = b.  c is declared as a
 * read/write operand ("+w") although the template only reads it.
 */
108__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
109 return r;
110}
111
/*
 * NOTE(review): listing lines 113-115 are missing here; presumably they hold
 * the return type, name and parameter list.  The body uses a, b and c and
 * returns a uint64x2_t.  Being the second of the two helpers and using
 * PMULL2, this is presumably clmul_hi_e(), which is called from the vector
 * routine further down -- confirm against the original file.
 *
 * Same shape as the PMULL helper above, but uses PMULL2 on the .2d
 * arrangement of a and b (the high 64-bit lanes, per the Arm ISA) before
 * the byte-wise EOR of the 128-bit product with c.  The result is returned
 * in r.
 */
112pg_attribute_target("+crypto")
116{
117 uint64x2_t r;
118
/*
 * Operands: %0 = r (output), %1 = c, %2 = a, %3 = b.  c is declared as a
 * read/write operand ("+w") although the template only reads it.
 */
119__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
120 return r;
121}
122
/*
 * NOTE(review): several lines of this function are absent from the listing;
 * the embedded numbering skips 124-125 (presumably the return type, name and
 * parameter list), 128, 137, 180, 209-210 and 214.  Notes below mark each
 * gap.  The names data and len are used without a visible declaration, so
 * they are presumably parameters; crc0 is likewise used without a visible
 * declaration (see the gap at 128).  Restore the missing lines from the
 * original file before building this text.
 *
 * As far as the visible code shows: inputs shorter than
 * 5 * sizeof(uint64x2_t) are diverted early; otherwise leading bytes are
 * consumed with scalar CRC instructions until buf is 16-byte aligned, the
 * bulk is processed 64 bytes per iteration in four vector accumulators
 * (x0..x3) using clmul_lo_e()/clmul_hi_e(), the accumulators are folded
 * into x0, and len is reset to the number of unprocessed trailing bytes.
 */
123pg_attribute_target("+crypto")
126{
127 /* adjust names to match generated code */
/*
 * NOTE(review): listing line 128 is missing; presumably it declares crc0
 * (initialised from the incoming CRC value), since crc0 is used below
 * without any visible declaration -- confirm.
 */
129 const char *buf = data;
130
131 /*
132 * Immediately fall back to the scalar path if the vector path is not
133 * guaranteed to perform at least one iteration after the alignment
134 * preamble.
135 */
136 if (len < 5 * sizeof(uint64x2_t))
/*
 * NOTE(review): listing line 137 -- the statement controlled by the if
 * above -- is missing.  Per the comment it presumably returns the result of
 * the scalar routine.  As the text stands, the for loop below would become
 * the body of the if, which is clearly not the intent.
 */
138
139 /* align to 16 bytes */
/*
 * Two steps: byte-wise until buf is 8-byte aligned (at most 7 bytes), then
 * one 8-byte step if bit 3 of the address is still set.  At most 15 bytes
 * are consumed here in total.
 */
140 for (; (uintptr_t) buf & 7; --len)
141 {
142 crc0 = __crc32cb(crc0, *buf++);
143 }
144 if ((uintptr_t) buf & 8)
145 {
146 crc0 = __crc32cd(crc0, *(const uint64_t *) buf);
147 buf += 8;
148 len -= 8;
149 }
150
/*
 * Provided the early exit above leaves the function for short inputs, len
 * was at least 5 * sizeof(uint64x2_t) on entry to the preamble (80 if the
 * vector type is 16 bytes wide -- assumed), and at most 15 bytes were
 * consumed, so one full 64-byte chunk is still available.
 */
151 Assert(len >= 64);
152
153 {
/*
 * end marks one past the last input byte; limit is the last position from
 * which a whole 64-byte chunk can still be loaded.
 */
154 const char *end = buf + len;
155 const char *limit = buf + len - 64;
156
157 /* First vector chunk. */
158 uint64x2_t x0 = vld1q_u64((const uint64_t *) buf),
159 y0;
160 uint64x2_t x1 = vld1q_u64((const uint64_t *) (buf + 16)),
161 y1;
162 uint64x2_t x2 = vld1q_u64((const uint64_t *) (buf + 32)),
163 y2;
164 uint64x2_t x3 = vld1q_u64((const uint64_t *) (buf + 48)),
165 y3;
166 uint64x2_t k;
167
168 {
/* Constant pair used by the main loop; the values come from the generated code. */
169 static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8};
170
171 k = vld1q_u64(k_);
172 }
173
174 /*
175 * pgindent complained of unmatched parens, so the following has been
176 * re-written with intrinsics:
177 *
178 * x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0);
179 */
/*
 * NOTE(review): listing line 180 is missing; presumably it is the
 * intrinsic-based form of the statement quoted in the comment above, i.e.
 * the point where crc0 is XOR-ed into x0 -- confirm.
 */
181 buf += 64;
182
183 /* Main loop. */
/*
 * Runs while a whole 64-byte chunk remains.  For each accumulator xN the
 * next 16 input bytes are passed as the third argument of clmul_lo_e(), and
 * that intermediate result is in turn passed as the third argument of
 * clmul_hi_e() to produce the new xN.
 */
184 while (buf <= limit)
185 {
186 y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0);
187 y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1);
188 y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2);
189 y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3);
190 buf += 64;
191 }
192
193 /* Reduce x0 ... x3 to just x0. */
/*
 * Done pairwise: x1 is folded into x0 and x3 into x2 with one constant pair,
 * then x2 is folded into x0 with another.
 */
194 {
195 static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27};
196
197 k = vld1q_u64(k_);
198 }
199 y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0);
200 y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2);
201 {
202 static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e};
203
204 k = vld1q_u64(k_);
205 }
206 y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0);
207
208 /* Reduce 128 bits to 32 bits, and multiply by x^32. */
/*
 * NOTE(review): listing lines 209-210 are missing; presumably they perform
 * the reduction described above and store the result in crc0 -- confirm.
 */
/* Number of input bytes not consumed by the vector loop (fewer than 64). */
211 len = end - buf;
212 }
213
/*
 * NOTE(review): listing line 214 is missing; presumably it is the final
 * return, handling the remaining len bytes at buf -- confirm.  As shown, the
 * function ends without returning a value.
 */
215}
216
217#endif
#define PointerIsAligned(pointer, type)
Definition c.h:852
#define Assert(condition)
Definition c.h:943
uint64_t uint64
Definition c.h:625
uint16_t uint16
Definition c.h:623
uint32_t uint32
Definition c.h:624
#define pg_attribute_target(...)
Definition c.h:238
int b
Definition isn.c:74
int a
Definition isn.c:73
uint32 pg_crc32c
Definition pg_crc32c.h:38
pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
const void size_t len
const void * data
return crc
const unsigned char * pend
static char buf[DEFAULT_XLOG_SEG_SIZE]
char * c
static int fb(int x)