pg__crc32c__sse42_8c_source.html

/*-------------------------------------------------------------------------

 *

 * pg_crc32c_sse42.c

 *    Compute CRC-32C checksum using Intel SSE 4.2 or AVX-512 instructions.

 *

 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group

 * Portions Copyright (c) 1994, Regents of the University of California

 *

 *

 * IDENTIFICATION

 *    src/port/pg_crc32c_sse42.c

 *

 *-------------------------------------------------------------------------

 */

#include "c.h"


#include <nmmintrin.h>

#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK

#include <immintrin.h>

#endif


#include "port/pg_crc32c.h"


/*
 * pg_comp_crc32c_sse42
 *      Continue the CRC-32C computation 'crc' over the 'len' bytes starting
 *      at 'data' using the SSE 4.2 CRC32 instructions, and return the
 *      updated value.
 *
 * The input is consumed in the widest chunks available (eight bytes on
 * x86-64, four bytes on 32-bit x86), followed by any leftover bytes one at a
 * time.
 *
 * Progress is tracked by counting down 'len'.  Comparing "p + N" against an
 * end pointer would form a pointer more than one element past the end of the
 * buffer whenever fewer than N bytes remain, which is undefined behavior in
 * C even if the pointer is never dereferenced.
 */
pg_attribute_no_sanitize_alignment()
pg_attribute_target("sse4.2")
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
    const unsigned char *p = data;

    /*
     * Process eight bytes of data at a time.
     *
     * NB: We do unaligned accesses here. The Intel architecture allows that,
     * and performance testing didn't show any performance gain from aligning
     * the begin address.
     */
#ifdef __x86_64__
    while (len >= 8)
    {
        crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
        p += 8;
        len -= 8;
    }

    /* Process remaining full four bytes if any */
    if (len >= 4)
    {
        crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
        p += 4;
        len -= 4;
    }
#else

    /*
     * Process four bytes at a time. (The eight byte instruction is not
     * available on the 32-bit x86 architecture).
     */
    while (len >= 4)
    {
        crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
        p += 4;
        len -= 4;
    }
#endif                          /* __x86_64__ */

    /* Process any remaining bytes one at a time. */
    while (len > 0)
    {
        crc = _mm_crc32_u8(crc, *p);
        p++;
        len--;
    }

    return crc;
}


#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK


/*

 * Note: There is no copyright notice in the following generated code.

 *

 * We have modified the output to

 *   - match our function declaration

 *   - match whitespace to our project style

 *   - add a threshold for the alignment stanza

 */


/* Generated by https://github.com/corsix/fast-crc32/ using: */

/* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */

/* MIT licensed */


/*
 * Carry-less multiply helpers for the folding code below.
 *
 * Both wrap _mm512_clmulepi64_epi128(), which multiplies one 64-bit half of
 * 'a' by one 64-bit half of 'b' within each 128-bit lane.  Bit 0 of the
 * immediate picks the half of 'a' and bit 4 the half of 'b', so 0 multiplies
 * the two low halves and 17 (0x11) the two high halves.
 */
#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0))
#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17))

/*
 * pg_comp_crc32c_avx512
 *      Continue the CRC-32C computation 'crc' over the 'len' bytes starting
 *      at 'data', folding 64 bytes at a time with 512-bit carry-less
 *      multiplies, and return the updated value.
 *
 * The work is done in three stages:
 *   1. For inputs longer than 256 bytes, advance to a 64-byte boundary using
 *      the scalar CRC32 instructions.
 *   2. If at least 64 bytes remain, fold all full 64-byte chunks into one
 *      512-bit accumulator and reduce that back to a 32-bit CRC.
 *   3. Hand whatever is left (always fewer than 64 bytes after stage 2) to
 *      pg_comp_crc32c_sse42().
 *
 * The folding constants are reproduced verbatim from the generator output.
 */
pg_attribute_target("vpclmulqdq,avx512vl")
pg_crc32c
pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len)
{
    /* adjust names to match generated code */
    pg_crc32c   crc0 = crc;
    const char *buf = data;

    /*
     * Align on cacheline boundary. The threshold is somewhat arbitrary.
     *
     * First step one byte at a time until the address is a multiple of 8,
     * then eight bytes at a time until address bits 3..5 (mask 56) are clear
     * as well, i.e. the address is a multiple of 64.
     */
    if (unlikely(len > 256))
    {
        for (; len && ((uintptr_t) buf & 7); --len)
            crc0 = _mm_crc32_u8(crc0, *buf++);
        while (((uintptr_t) buf & 56) && len >= 8)
        {
            crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
            buf += 8;
            len -= 8;
        }
    }

    if (len >= 64)
    {
        const char *end = buf + len;
        const char *limit = buf + len - 64; /* last valid chunk start */
        __m128i     z0;
        __m512i     x0;
        __m512i     y0;
        __m512i     k;

        /* First vector chunk. */
        x0 = _mm512_loadu_si512((const void *) buf);
        k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0));

        /*
         * Mix the incoming CRC into the low 32 bits of the first chunk.
         *
         * This deviates from the generated code, which widened the CRC with
         * _mm512_castsi128_si512().  That intrinsic leaves the upper 384 bits
         * of its result undefined, and they would then be XORed into the
         * data; _mm512_zextsi128_si512() guarantees they are zero.
         */
        x0 = _mm512_xor_si512(_mm512_zextsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
        buf += 64;

        /*
         * Main loop: fold the accumulator forward by one chunk and XOR in
         * the next 64 bytes.  Ternary-logic table 0x96 is a three-way XOR.
         */
        while (buf <= limit)
        {
            y0 = clmul_lo(x0, k);
            x0 = clmul_hi(x0, k);
            x0 = _mm512_ternarylogic_epi64(x0, y0,
                                           _mm512_loadu_si512((const void *) buf),
                                           0x96);
            buf += 64;
        }

        /*
         * Reduce 512 bits to 128 bits.
         *
         * Lanes 0..2 are multiplied by per-lane constants and XORed
         * together; the constant for lane 3 is zero, so that lane of x0 is
         * XORed in unmodified afterwards.
         */
        k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0,
                              0x3da6d0cb, 0, 0xba4fc28e, 0,
                              0xf20c0dfe, 0, 0x493c7d27, 0,
                              0, 0, 0, 0);
        y0 = clmul_lo(x0, k);
        k = clmul_hi(x0, k);
        y0 = _mm512_xor_si512(y0, k);
        z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0),
                                    _mm512_extracti32x4_epi32(y0, 1),
                                    _mm512_extracti32x4_epi32(y0, 2),
                                    0x96);
        z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));

        /* Reduce 128 bits to 32 bits, and multiply by x^32. */
        crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
        crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));

        /* Whatever the vector loop did not consume is the tail. */
        len = end - buf;
    }

    return pg_comp_crc32c_sse42(crc0, buf, len);
}


#endif

c.h

uint64
uint64_t uint64
Definition: c.h:503

unlikely
#define unlikely(x)
Definition: c.h:347

uint32
uint32_t uint32
Definition: c.h:502

pg_attribute_target
#define pg_attribute_target(...)
Definition: c.h:213

pg_crc32c.h

pg_crc32c
uint32 pg_crc32c
Definition: pg_crc32c.h:38

len
const void size_t len
Definition: pg_crc32c_sse42.c:28

data
const void * data
Definition: pg_crc32c_sse42.c:27

crc
return crc
Definition: pg_crc32c_sse42.c:72

pg_attribute_no_sanitize_alignment
pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2") pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc

pend
const unsigned char * pend
Definition: pg_crc32c_sse42.c:30

buf
static char * buf
Definition: pg_test_fsync.c:72