Add PCLMUL implementation of CRC16-CCITT.

2025-12-16 11:14:29 +00:00 · 2025-08-21 00:07:21 +01:00
parent 2e857b0240
commit b8a97a8a05
4 changed files with 262 additions and 2 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,6 +112,7 @@ if ("${CMAKE_BUILD_TYPE}" MATCHES "Release" OR "${CMAKE_BUILD_TYPE}" MATCHES "Re
  endif ()
 endif ()

-add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher16_avx2.c fletcher16_neon.c fletcher16_ssse3.c fletcher32.h fletcher32.c fletcher32_avx2.c fletcher32_neon.c fletcher32_ssse3.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c crc32_vmull.c crc32_simd.h arm_vmull.c arm_vmull.h crc64_vmull.c library.c)
+add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher16_avx2.c fletcher16_neon.c fletcher16_ssse3.c fletcher32.h fletcher32.c fletcher32_avx2.c fletcher32_neon.c fletcher32_ssse3.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c crc32_vmull.c crc32_simd.h arm_vmull.c arm_vmull.h crc64_vmull.c library.c
+        crc16_ccitt_clmul.c)

 add_subdirectory(tests)
--- a/crc16_ccitt.h
+++ b/crc16_ccitt.h
@@ -22,9 +22,10 @@
 typedef struct
 {
    uint16_t crc;
+    int      seen_first;
 } crc16_ccitt_ctx;

-const uint16_t crc16_ccitt_table[8][256] = {
+static const uint16_t crc16_ccitt_table[8][256] = {
    {0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7, 0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD,
     0xE1CE, 0xF1EF, 0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6, 0x9339, 0x8318, 0xB37B, 0xA35A,
     0xD3BD, 0xC39C, 0xF3FF, 0xE3DE, 0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485, 0xA56A, 0xB54B,
@@ -184,4 +185,10 @@ AARU_EXPORT int AARU_CALL              crc16_ccitt_update(crc16_ccitt_ctx *ctx,
 AARU_EXPORT int AARU_CALL              crc16_ccitt_final(crc16_ccitt_ctx *ctx, uint16_t *crc);
 AARU_EXPORT void AARU_CALL             crc16_ccitt_free(crc16_ccitt_ctx *ctx);

+#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
+defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
+// PCLMUL-assisted CRC16-CCITT update; declared only when building for x86 / x86-64.
+// Takes the context to update, the input bytes and their count, and returns an int status.
+AARU_EXPORT TARGET_WITH_CLMUL int AARU_CALL crc16_ccitt_update_clmul(crc16_ccitt_ctx *ctx, const uint8_t *data,
+                                                                     uint32_t         len);
+#endif
+
 #endif  // AARU_CHECKSUMS_NATIVE_CRC16_H
--- a/crc16_ccitt_clmul.c
+++ b/crc16_ccitt_clmul.c
@@ -0,0 +1,157 @@
+/*
+ * This file is part of the Aaru Data Preservation Suite.
+ * Copyright (c) 2019-2025 Natalia Portillo.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
+defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
+
+#include <bits/stdint-uintn.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <immintrin.h>   // for _mm_clmulepi64_si128
+#include <wmmintrin.h>   // some compilers need this for PCLMUL
+
+#include "library.h"
+#include "crc16_ccitt.h"
+
+// Generator polynomial used for the modular reduction in this file. The x^16 term is implicit
+// and therefore not part of the constant. The guard lets an earlier definition take precedence.
+#ifndef CRC16_CCITT_POLY
+#define CRC16_CCITT_POLY 0x1021u  // x^16 + x^12 + x^5 + 1
+#endif
+
+// Carry-less (GF(2) polynomial) multiply of two 16-bit values -> 32-bit polynomial product.
+// Both operands have degree <= 15, so the product has degree <= 30 and lies entirely in the
+// low 32 bits of the PCLMUL result. A single 32-bit extract is therefore enough on both
+// 32-bit and 64-bit x86, with no per-architecture branch.
+TARGET_WITH_CLMUL static inline uint32_t clmul16(uint16_t a, uint16_t b)
+{
+    // _mm_cvtsi32_si128 zero-fills the upper 96 bits, so the low quadword is exactly the operand.
+    const __m128i va   = _mm_cvtsi32_si128((int)a);
+    const __m128i vb   = _mm_cvtsi32_si128((int)b);
+    const __m128i prod = _mm_clmulepi64_si128(va, vb, 0x00); // low quadword x low quadword
+    return (uint32_t)_mm_cvtsi128_si32(prod);
+}
+
+// Reduce a 32-bit polynomial modulo CRC16_CCITT_POLY to a 16-bit remainder (MSB-first).
+// Bits 31..16 are visited from the top down; each set bit folds the polynomial, shifted so that
+// its implicit x^16 term lines up with that bit, into the lower bits. The high half is not
+// cleared explicitly because the final truncation to 16 bits discards it.
+static inline uint16_t gf2_reduce32_to16(uint32_t x)
+{
+    for(unsigned shift = 16; shift-- > 0;)
+    {
+        const uint32_t high_bit = (x >> (shift + 16)) & 1u;
+        if(high_bit != 0) x ^= (uint32_t)CRC16_CCITT_POLY << shift;
+    }
+
+    return (uint16_t)x;
+}
+
+// Multiply two 16-bit polynomials over GF(2) and reduce the result modulo the CRC polynomial.
+// The raw 32-bit product comes from clmul16 (PCLMUL); gf2_reduce32_to16 brings it back to 16 bits.
+static inline uint16_t gf2_mul16_mod(uint16_t a, uint16_t b)
+{
+    return gf2_reduce32_to16(clmul16(a, b));
+}
+
+// Compute x^(8*len) mod P (MSB-first) by square-and-multiply over the bits of len,
+// starting from the least significant bit.
+static inline uint16_t gf2_pow_x8(size_t len)
+{
+    uint16_t acc    = 1u;                // multiplicative identity: x^0
+    uint16_t square = (uint16_t)0x0100u; // x^8; degree 8 < 16, so already reduced
+
+    for(; len != 0; len >>= 1)
+    {
+        if((len & 1) != 0) acc = gf2_mul16_mod(acc, square);
+        square = gf2_mul16_mod(square, square);
+    }
+
+    return acc;
+}
+
+// Compute the CRC of n bytes at p starting from crc = 0.
+// Bytes are handled one at a time until p is 8-byte aligned, then eight at a time with the
+// slice-by-8 tables (table 7 for the first byte of each group, table 0 for the last), and
+// finally one at a time for whatever is left.
+static inline uint16_t crc16_block_slice_by_8(const uint8_t *p, size_t n)
+{
+    uint16_t crc = 0;
+
+    // Head: bytewise until the pointer reaches an 8-byte boundary or the input runs out.
+    for(; n != 0 && ((uintptr_t)p & 7) != 0; ++p, --n)
+        crc = (uint16_t)((crc << 8) ^ crc16_ccitt_table[0][((crc >> 8) ^ *p) & 0xFF]);
+
+    // Body: only the first two bytes of each group are mixed with the running CRC.
+    for(; n >= 8; p += 8, n -= 8)
+    {
+        const uint8_t hi = (uint8_t)(crc >> 8);
+        const uint8_t lo = (uint8_t)(crc & 0xFF);
+
+        crc = (uint16_t)(crc16_ccitt_table[7][p[0] ^ hi] ^ crc16_ccitt_table[6][p[1] ^ lo] ^
+                         crc16_ccitt_table[5][p[2]] ^ crc16_ccitt_table[4][p[3]] ^
+                         crc16_ccitt_table[3][p[4]] ^ crc16_ccitt_table[2][p[5]] ^
+                         crc16_ccitt_table[1][p[6]] ^ crc16_ccitt_table[0][p[7]]);
+    }
+
+    // Tail: fewer than eight bytes remain.
+    for(; n != 0; ++p, --n)
+        crc = (uint16_t)((crc << 8) ^ crc16_ccitt_table[0][((crc >> 8) ^ *p) & 0xFF]);
+
+    return crc;
+}
+
+// Feeds len bytes from data into the running CRC16-CCITT stored in ctx.
+//
+// Full chunks are combined with the running value as
+//     crc' = (crc * x^(8*B) mod P) ^ crc_from_zero(chunk)
+// where the multiplication uses PCLMUL (gf2_mul16_mod) and crc_from_zero is the table-driven
+// crc16_block_slice_by_8. Leading bytes up to 4-byte alignment and the last (< 8) bytes are
+// processed bytewise with table 0.
+//
+// Returns -1 if ctx or data is NULL, 0 otherwise. The chunked paths execute PCLMULQDQ, so this
+// must only be called on CPUs that support it.
+AARU_EXPORT TARGET_WITH_CLMUL int AARU_CALL crc16_ccitt_update_clmul(crc16_ccitt_ctx *ctx, const uint8_t *data,
+                                                                     uint32_t         len)
+{
+    if(!ctx || !data) return -1;
+
+    uint16_t crc = ctx->crc;
+
+    // Align to 4 bytes, byte-at-a-time (stops early if the input is shorter than the gap).
+    uintptr_t unaligned_length = (4 - (((uintptr_t)data) & 3)) & 3;
+    while(len && unaligned_length)
+    {
+        crc = (uint16_t)((crc << 8) ^ crc16_ccitt_table[0][((crc >> 8) ^ *data++) & 0xFF]);
+        len--;
+        unaligned_length--;
+    }
+
+    // Large blocks: 64 bytes each. x^(8*64) mod P is computed once, and only when at least
+    // one full block is present.
+    const size_t BLOCK = 64;
+    if(len >= BLOCK)
+    {
+        const uint16_t pow_block = gf2_pow_x8(BLOCK);
+        while(len >= BLOCK)
+        {
+            uint16_t block_crc = crc16_block_slice_by_8(data, BLOCK);
+            uint16_t folded    = gf2_mul16_mod(crc, pow_block);
+            crc                = (uint16_t)(folded ^ block_crc);
+
+            data += BLOCK;
+            len -= BLOCK;
+        }
+    }
+
+    // Remaining full 8-byte chunks use the same combine step with x^(8*8) mod P.
+    if(len >= 8)
+    {
+        const uint16_t pow8 = gf2_pow_x8(8);
+        while(len >= 8)
+        {
+            uint16_t chunk_crc = crc16_block_slice_by_8(data, 8);
+            uint16_t folded    = gf2_mul16_mod(crc, pow8);
+            crc                = (uint16_t)(folded ^ chunk_crc);
+
+            data += 8;
+            len -= 8;
+        }
+    }
+
+    // Final tail (at most 7 bytes), bytewise.
+    while(len--) crc = (uint16_t)((crc << 8) ^ crc16_ccitt_table[0][((crc >> 8) ^ *data++) & 0xFF]);
+
+    ctx->crc = crc;
+    return 0;
+}
+
+#endif
--- a/tests/crc16_ccitt.cpp
+++ b/tests/crc16_ccitt.cpp
@@ -137,3 +137,98 @@ TEST_F(crc16_ccittFixture, crc16_ccitt_auto_2352bytes)

    EXPECT_EQ(crc, EXPECTED_CRC16_CCITT_2352BYTES);
 }
+
+#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
+defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
+
+// PCLMUL path over 1048576 bytes of the fixture buffer must match the reference CRC.
+TEST_F(crc16_ccittFixture, crc16_ccitt_clmul)
+{
+    // Nothing to check on CPUs without PCLMUL.
+    if(!have_clmul()) return;
+
+    crc16_ccitt_ctx *ctx = crc16_ccitt_init();
+    uint16_t         crc = 0;
+
+    // Stop here on a null context so the comparison below never sees an unset CRC.
+    ASSERT_NE(ctx, nullptr);
+
+    EXPECT_EQ(crc16_ccitt_update_clmul(ctx, buffer, 1048576), 0);
+    crc16_ccitt_final(ctx, &crc);
+
+    EXPECT_EQ(crc, EXPECTED_CRC16_CCITT);
+}
+
+// Same check as the full-buffer case, but reading from one byte past the start of
+// buffer_misaligned so that the bytewise alignment prologue is exercised.
+TEST_F(crc16_ccittFixture, crc16_ccitt_clmul_misaligned)
+{
+    // Nothing to check on CPUs without PCLMUL.
+    if(!have_clmul()) return;
+
+    crc16_ccitt_ctx *ctx = crc16_ccitt_init();
+    uint16_t         crc = 0;
+
+    // Stop here on a null context so the comparison below never sees an unset CRC.
+    ASSERT_NE(ctx, nullptr);
+
+    EXPECT_EQ(crc16_ccitt_update_clmul(ctx, buffer_misaligned + 1, 1048576), 0);
+    crc16_ccitt_final(ctx, &crc);
+
+    EXPECT_EQ(crc, EXPECTED_CRC16_CCITT);
+}
+
+// 15 bytes: shorter than one 64-byte block, so only the 8-byte chunk and bytewise paths can run.
+TEST_F(crc16_ccittFixture, crc16_ccitt_clmul_15bytes)
+{
+    // Nothing to check on CPUs without PCLMUL.
+    if(!have_clmul()) return;
+
+    crc16_ccitt_ctx *ctx = crc16_ccitt_init();
+    uint16_t         crc = 0;
+
+    // Stop here on a null context so the comparison below never sees an unset CRC.
+    ASSERT_NE(ctx, nullptr);
+
+    EXPECT_EQ(crc16_ccitt_update_clmul(ctx, buffer, 15), 0);
+    crc16_ccitt_final(ctx, &crc);
+
+    EXPECT_EQ(crc, EXPECTED_CRC16_CCITT_15BYTES);
+}
+
+// 31 bytes: shorter than one 64-byte block, so only the 8-byte chunk and bytewise paths can run.
+TEST_F(crc16_ccittFixture, crc16_ccitt_clmul_31bytes)
+{
+    // Nothing to check on CPUs without PCLMUL.
+    if(!have_clmul()) return;
+
+    crc16_ccitt_ctx *ctx = crc16_ccitt_init();
+    uint16_t         crc = 0;
+
+    // Stop here on a null context so the comparison below never sees an unset CRC.
+    ASSERT_NE(ctx, nullptr);
+
+    EXPECT_EQ(crc16_ccitt_update_clmul(ctx, buffer, 31), 0);
+    crc16_ccitt_final(ctx, &crc);
+
+    EXPECT_EQ(crc, EXPECTED_CRC16_CCITT_31BYTES);
+}
+
+// 63 bytes: one byte short of a full 64-byte block, so the large-block loop is never entered.
+TEST_F(crc16_ccittFixture, crc16_ccitt_clmul_63bytes)
+{
+    // Nothing to check on CPUs without PCLMUL.
+    if(!have_clmul()) return;
+
+    crc16_ccitt_ctx *ctx = crc16_ccitt_init();
+    uint16_t         crc = 0;
+
+    // Stop here on a null context so the comparison below never sees an unset CRC.
+    ASSERT_NE(ctx, nullptr);
+
+    EXPECT_EQ(crc16_ccitt_update_clmul(ctx, buffer, 63), 0);
+    crc16_ccitt_final(ctx, &crc);
+
+    EXPECT_EQ(crc, EXPECTED_CRC16_CCITT_63BYTES);
+}
+
+// 2352 bytes: long enough to run the 64-byte block loop many times.
+TEST_F(crc16_ccittFixture, crc16_ccitt_clmul_2352bytes)
+{
+    // Nothing to check on CPUs without PCLMUL.
+    if(!have_clmul()) return;
+
+    crc16_ccitt_ctx *ctx = crc16_ccitt_init();
+    uint16_t         crc = 0;
+
+    // Stop here on a null context so the comparison below never sees an unset CRC.
+    ASSERT_NE(ctx, nullptr);
+
+    EXPECT_EQ(crc16_ccitt_update_clmul(ctx, buffer, 2352), 0);
+    crc16_ccitt_final(ctx, &crc);
+
+    EXPECT_EQ(crc, EXPECTED_CRC16_CCITT_2352BYTES);
+}
+
+#endif