mirror of
https://github.com/aaru-dps/Aaru.Checksums.Native.git
synced 2025-12-16 19:24:29 +00:00
Add AVX2 implementation of CRC16-IBM.
This commit is contained in:
@@ -113,6 +113,7 @@ if ("${CMAKE_BUILD_TYPE}" MATCHES "Release" OR "${CMAKE_BUILD_TYPE}" MATCHES "Re
|
|||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher16_avx2.c fletcher16_neon.c fletcher16_ssse3.c fletcher32.h fletcher32.c fletcher32_avx2.c fletcher32_neon.c fletcher32_ssse3.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c crc32_vmull.c crc32_simd.h arm_vmull.c arm_vmull.h crc64_vmull.c library.c
|
add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher16_avx2.c fletcher16_neon.c fletcher16_ssse3.c fletcher32.h fletcher32.c fletcher32_avx2.c fletcher32_neon.c fletcher32_ssse3.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c crc32_vmull.c crc32_simd.h arm_vmull.c arm_vmull.h crc64_vmull.c library.c
|
||||||
crc16_ccitt_clmul.c)
|
crc16_ccitt_clmul.c
|
||||||
|
crc16_avx2.c)
|
||||||
|
|
||||||
add_subdirectory(tests)
|
add_subdirectory(tests)
|
||||||
|
|||||||
7
crc16.h
7
crc16.h
@@ -24,7 +24,7 @@ typedef struct
|
|||||||
uint16_t crc;
|
uint16_t crc;
|
||||||
} crc16_ctx;
|
} crc16_ctx;
|
||||||
|
|
||||||
const uint16_t crc16_table[8][256] = {
|
static const uint16_t crc16_table[8][256] = {
|
||||||
{0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1,
|
{0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1,
|
||||||
0xC481, 0x0440, 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 0x0A00, 0xCAC1, 0xCB81, 0x0B40,
|
0xC481, 0x0440, 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 0x0A00, 0xCAC1, 0xCB81, 0x0B40,
|
||||||
0xC901, 0x09C0, 0x0880, 0xC841, 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 0x1E00, 0xDEC1,
|
0xC901, 0x09C0, 0x0880, 0xC841, 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 0x1E00, 0xDEC1,
|
||||||
@@ -184,4 +184,9 @@ AARU_EXPORT int AARU_CALL crc16_update(crc16_ctx *ctx, const uint8_t *dat
|
|||||||
AARU_EXPORT int AARU_CALL crc16_final(crc16_ctx *ctx, uint16_t *crc);
|
AARU_EXPORT int AARU_CALL crc16_final(crc16_ctx *ctx, uint16_t *crc);
|
||||||
AARU_EXPORT void AARU_CALL crc16_free(crc16_ctx *ctx);
|
AARU_EXPORT void AARU_CALL crc16_free(crc16_ctx *ctx);
|
||||||
|
|
||||||
|
#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
|
||||||
|
defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
|
||||||
|
AARU_EXPORT int AARU_CALL crc16_update_avx2(crc16_ctx *ctx, const uint8_t *data, uint32_t len);
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif // AARU_CHECKSUMS_NATIVE_CRC16_H
|
#endif // AARU_CHECKSUMS_NATIVE_CRC16_H
|
||||||
155
crc16_avx2.c
Normal file
155
crc16_avx2.c
Normal file
@@ -0,0 +1,155 @@
|
|||||||
|
/*
|
||||||
|
* This file is part of the Aaru Data Preservation Suite.
|
||||||
|
* Copyright (c) 2019-2025 Natalia Portillo.
|
||||||
|
*
|
||||||
|
* This library is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Lesser General Public License as
|
||||||
|
* published by the Free Software Foundation; either version 2.1 of the
|
||||||
|
* License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This library is distributed in the hope that it will be useful, but
|
||||||
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
|
||||||
|
defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
#include "library.h"
|
||||||
|
#include "crc16.h"
|
||||||
|
|
||||||
|
static inline void CRC8_chunk(uint16_t *crc, uint64_t lane)
|
||||||
|
{
|
||||||
|
uint32_t one = (uint32_t)lane ^ (uint32_t)(*crc);
|
||||||
|
uint32_t two = (uint32_t)(lane >> 32);
|
||||||
|
|
||||||
|
uint16_t c = crc16_table[0][(two >> 24) & 0xFF] ^ crc16_table[1][(two >> 16) & 0xFF] ^ crc16_table[2][
|
||||||
|
(two >> 8) & 0xFF] ^ crc16_table[3][two & 0xFF] ^ crc16_table[4][(one >> 24) & 0xFF] ^ crc16_table[
|
||||||
|
5][(one >> 16) & 0xFF] ^ crc16_table[6][(one >> 8) & 0xFF] ^ crc16_table[7][one & 0xFF];
|
||||||
|
|
||||||
|
*crc = c;
|
||||||
|
}
|
||||||
|
|
||||||
|
AARU_EXPORT TARGET_WITH_AVX2 int AARU_CALL crc16_update_avx2(crc16_ctx *ctx, const uint8_t *data, uint32_t len)
|
||||||
|
{
|
||||||
|
if(!ctx || !data) return -1;
|
||||||
|
|
||||||
|
uint16_t crc = ctx->crc;
|
||||||
|
const uint8_t *p = data;
|
||||||
|
|
||||||
|
// Head: align to 32B for nicer load grouping
|
||||||
|
uintptr_t mis = (32 - ((uintptr_t)p & 31)) & 31;
|
||||||
|
while(len && mis)
|
||||||
|
{
|
||||||
|
crc = (crc >> 8) ^ crc16_table[0][(crc & 0xFF) ^ *p++];
|
||||||
|
len--;
|
||||||
|
mis--;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Main: 128B per iteration (4x 32B), software-pipelined
|
||||||
|
while(len >= 128)
|
||||||
|
{
|
||||||
|
// Prefetch ahead to keep L1 warm; nudge L2 for bigger strides
|
||||||
|
_mm_prefetch((const char*)(p + 256), _MM_HINT_T0);
|
||||||
|
_mm_prefetch((const char*)(p + 512), _MM_HINT_T1);
|
||||||
|
|
||||||
|
// Load 4x 32B
|
||||||
|
__m256i v0 = _mm256_loadu_si256((const __m256i *)(p + 0));
|
||||||
|
__m256i v1 = _mm256_loadu_si256((const __m256i *)(p + 32));
|
||||||
|
__m256i v2 = _mm256_loadu_si256((const __m256i *)(p + 64));
|
||||||
|
__m256i v3 = _mm256_loadu_si256((const __m256i *)(p + 96));
|
||||||
|
|
||||||
|
// Extract 64-bit lanes (8 lanes total per 64B, 16 lanes per 128B)
|
||||||
|
__m128i v0_lo = _mm256_extracti128_si256(v0, 0);
|
||||||
|
__m128i v0_hi = _mm256_extracti128_si256(v0, 1);
|
||||||
|
__m128i v1_lo = _mm256_extracti128_si256(v1, 0);
|
||||||
|
__m128i v1_hi = _mm256_extracti128_si256(v1, 1);
|
||||||
|
__m128i v2_lo = _mm256_extracti128_si256(v2, 0);
|
||||||
|
__m128i v2_hi = _mm256_extracti128_si256(v2, 1);
|
||||||
|
__m128i v3_lo = _mm256_extracti128_si256(v3, 0);
|
||||||
|
__m128i v3_hi = _mm256_extracti128_si256(v3, 1);
|
||||||
|
|
||||||
|
uint64_t l00 = (uint64_t)_mm_cvtsi128_si64(v0_lo);
|
||||||
|
uint64_t l01 = (uint64_t)_mm_extract_epi64(v0_lo, 1);
|
||||||
|
uint64_t l02 = (uint64_t)_mm_cvtsi128_si64(v0_hi);
|
||||||
|
uint64_t l03 = (uint64_t)_mm_extract_epi64(v0_hi, 1);
|
||||||
|
|
||||||
|
uint64_t l10 = (uint64_t)_mm_cvtsi128_si64(v1_lo);
|
||||||
|
uint64_t l11 = (uint64_t)_mm_extract_epi64(v1_lo, 1);
|
||||||
|
uint64_t l12 = (uint64_t)_mm_cvtsi128_si64(v1_hi);
|
||||||
|
uint64_t l13 = (uint64_t)_mm_extract_epi64(v1_hi, 1);
|
||||||
|
|
||||||
|
uint64_t l20 = (uint64_t)_mm_cvtsi128_si64(v2_lo);
|
||||||
|
uint64_t l21 = (uint64_t)_mm_extract_epi64(v2_lo, 1);
|
||||||
|
uint64_t l22 = (uint64_t)_mm_cvtsi128_si64(v2_hi);
|
||||||
|
uint64_t l23 = (uint64_t)_mm_extract_epi64(v2_hi, 1);
|
||||||
|
|
||||||
|
uint64_t l30 = (uint64_t)_mm_cvtsi128_si64(v3_lo);
|
||||||
|
uint64_t l31 = (uint64_t)_mm_extract_epi64(v3_lo, 1);
|
||||||
|
uint64_t l32 = (uint64_t)_mm_cvtsi128_si64(v3_hi);
|
||||||
|
uint64_t l33 = (uint64_t)_mm_extract_epi64(v3_hi, 1);
|
||||||
|
|
||||||
|
// Process in strict stream order (slicing-by-8 semantics)
|
||||||
|
CRC8_chunk(&crc, l00);
|
||||||
|
CRC8_chunk(&crc, l01);
|
||||||
|
CRC8_chunk(&crc, l02);
|
||||||
|
CRC8_chunk(&crc, l03);
|
||||||
|
|
||||||
|
CRC8_chunk(&crc, l10);
|
||||||
|
CRC8_chunk(&crc, l11);
|
||||||
|
CRC8_chunk(&crc, l12);
|
||||||
|
CRC8_chunk(&crc, l13);
|
||||||
|
|
||||||
|
CRC8_chunk(&crc, l20);
|
||||||
|
CRC8_chunk(&crc, l21);
|
||||||
|
CRC8_chunk(&crc, l22);
|
||||||
|
CRC8_chunk(&crc, l23);
|
||||||
|
|
||||||
|
CRC8_chunk(&crc, l30);
|
||||||
|
CRC8_chunk(&crc, l31);
|
||||||
|
CRC8_chunk(&crc, l32);
|
||||||
|
CRC8_chunk(&crc, l33);
|
||||||
|
|
||||||
|
p += 128;
|
||||||
|
len -= 128;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Drain remaining 32..96 bytes in 32B steps (keeps hot path tight)
|
||||||
|
while(len >= 32)
|
||||||
|
{
|
||||||
|
_mm_prefetch((const char*)(p + 128), _MM_HINT_T0);
|
||||||
|
|
||||||
|
__m256i v = _mm256_loadu_si256((const __m256i *)p);
|
||||||
|
__m128i lo = _mm256_extracti128_si256(v, 0);
|
||||||
|
__m128i hi = _mm256_extracti128_si256(v, 1);
|
||||||
|
|
||||||
|
uint64_t l0 = (uint64_t)_mm_cvtsi128_si64(lo);
|
||||||
|
uint64_t l1 = (uint64_t)_mm_extract_epi64(lo, 1);
|
||||||
|
uint64_t l2 = (uint64_t)_mm_cvtsi128_si64(hi);
|
||||||
|
uint64_t l3 = (uint64_t)_mm_extract_epi64(hi, 1);
|
||||||
|
|
||||||
|
CRC8_chunk(&crc, l0);
|
||||||
|
CRC8_chunk(&crc, l1);
|
||||||
|
CRC8_chunk(&crc, l2);
|
||||||
|
CRC8_chunk(&crc, l3);
|
||||||
|
|
||||||
|
p += 32;
|
||||||
|
len -= 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tail (<=31 bytes): byte-by-byte, identical to scalar
|
||||||
|
while(len--) { crc = (crc >> 8) ^ crc16_table[0][(crc & 0xFF) ^ *p++]; }
|
||||||
|
|
||||||
|
ctx->crc = crc;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -100,7 +100,7 @@ static inline uint16_t crc16_block_slice_by_8(const uint8_t *p, size_t n)
|
|||||||
}
|
}
|
||||||
|
|
||||||
AARU_EXPORT TARGET_WITH_CLMUL int AARU_CALL crc16_ccitt_update_clmul(crc16_ccitt_ctx *ctx, const uint8_t *data,
|
AARU_EXPORT TARGET_WITH_CLMUL int AARU_CALL crc16_ccitt_update_clmul(crc16_ccitt_ctx *ctx, const uint8_t *data,
|
||||||
uint32_t len);
|
uint32_t len)
|
||||||
{
|
{
|
||||||
if(!ctx || !data) return -1;
|
if(!ctx || !data) return -1;
|
||||||
|
|
||||||
|
|||||||
@@ -8,6 +8,8 @@
|
|||||||
|
|
||||||
#include "../library.h"
|
#include "../library.h"
|
||||||
#include "../crc16.h"
|
#include "../crc16.h"
|
||||||
|
#include "../simd.h"
|
||||||
|
#include "/home/claunia/Development/Aaru/Aaru.Checksums.Native/simd.h"
|
||||||
#include "gtest/gtest.h"
|
#include "gtest/gtest.h"
|
||||||
|
|
||||||
#define EXPECTED_CRC16 0x2d6d
|
#define EXPECTED_CRC16 0x2d6d
|
||||||
@@ -137,3 +139,98 @@ TEST_F(crc16Fixture, crc16_auto_2352bytes)
|
|||||||
|
|
||||||
EXPECT_EQ(crc, EXPECTED_CRC16_2352BYTES);
|
EXPECT_EQ(crc, EXPECTED_CRC16_2352BYTES);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
|
||||||
|
defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
|
||||||
|
|
||||||
|
TEST_F(crc16Fixture, crc16_avx2)
|
||||||
|
{
|
||||||
|
if(!have_avx2()) return;
|
||||||
|
|
||||||
|
crc16_ctx *ctx = crc16_init();
|
||||||
|
uint16_t crc;
|
||||||
|
|
||||||
|
EXPECT_NE(ctx, nullptr);
|
||||||
|
|
||||||
|
crc16_update_avx2(ctx, buffer, 1048576);
|
||||||
|
crc16_final(ctx, &crc);
|
||||||
|
|
||||||
|
EXPECT_EQ(crc, EXPECTED_CRC16);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(crc16Fixture, crc16_avx2_misaligned)
|
||||||
|
{
|
||||||
|
if(!have_avx2()) return;
|
||||||
|
|
||||||
|
crc16_ctx *ctx = crc16_init();
|
||||||
|
uint16_t crc;
|
||||||
|
|
||||||
|
EXPECT_NE(ctx, nullptr);
|
||||||
|
|
||||||
|
crc16_update_avx2(ctx, buffer_misaligned + 1, 1048576);
|
||||||
|
crc16_final(ctx, &crc);
|
||||||
|
|
||||||
|
EXPECT_EQ(crc, EXPECTED_CRC16);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(crc16Fixture, crc16_avx2_15bytes)
|
||||||
|
{
|
||||||
|
if(!have_avx2()) return;
|
||||||
|
|
||||||
|
crc16_ctx *ctx = crc16_init();
|
||||||
|
uint16_t crc;
|
||||||
|
|
||||||
|
EXPECT_NE(ctx, nullptr);
|
||||||
|
|
||||||
|
crc16_update_avx2(ctx, buffer, 15);
|
||||||
|
crc16_final(ctx, &crc);
|
||||||
|
|
||||||
|
EXPECT_EQ(crc, EXPECTED_CRC16_15BYTES);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(crc16Fixture, crc16_avx2_31bytes)
|
||||||
|
{
|
||||||
|
if(!have_avx2()) return;
|
||||||
|
|
||||||
|
crc16_ctx *ctx = crc16_init();
|
||||||
|
uint16_t crc;
|
||||||
|
|
||||||
|
EXPECT_NE(ctx, nullptr);
|
||||||
|
|
||||||
|
crc16_update_avx2(ctx, buffer, 31);
|
||||||
|
crc16_final(ctx, &crc);
|
||||||
|
|
||||||
|
EXPECT_EQ(crc, EXPECTED_CRC16_31BYTES);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(crc16Fixture, crc16_avx2_63bytes)
|
||||||
|
{
|
||||||
|
if(!have_avx2()) return;
|
||||||
|
|
||||||
|
crc16_ctx *ctx = crc16_init();
|
||||||
|
uint16_t crc;
|
||||||
|
|
||||||
|
EXPECT_NE(ctx, nullptr);
|
||||||
|
|
||||||
|
crc16_update_avx2(ctx, buffer, 63);
|
||||||
|
crc16_final(ctx, &crc);
|
||||||
|
|
||||||
|
EXPECT_EQ(crc, EXPECTED_CRC16_63BYTES);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(crc16Fixture, crc16_avx2_2352bytes)
|
||||||
|
{
|
||||||
|
if(!have_avx2()) return;
|
||||||
|
|
||||||
|
crc16_ctx *ctx = crc16_init();
|
||||||
|
uint16_t crc;
|
||||||
|
|
||||||
|
EXPECT_NE(ctx, nullptr);
|
||||||
|
|
||||||
|
crc16_update_avx2(ctx, buffer, 2352);
|
||||||
|
crc16_final(ctx, &crc);
|
||||||
|
|
||||||
|
EXPECT_EQ(crc, EXPECTED_CRC16_2352BYTES);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
Reference in New Issue
Block a user