From aa172b43ebf97476b58a5cf13f2f57bf6d5e2d30 Mon Sep 17 00:00:00 2001 From: Natalia Portillo Date: Tue, 5 Oct 2021 00:33:05 +0100 Subject: [PATCH] Separate common parts of SIMD CRC32. --- crc32_clmul.c | 45 +------------------------------------------- crc32_simd.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 44 deletions(-) create mode 100644 crc32_simd.h diff --git a/crc32_clmul.c b/crc32_clmul.c index 2c8785d..12c7c95 100644 --- a/crc32_clmul.c +++ b/crc32_clmul.c @@ -41,6 +41,7 @@ #include "library.h" #include "crc32.h" +#include "crc32_simd.h" CLMUL static void fold_1(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2, __m128i* xmm_crc3) @@ -178,24 +179,6 @@ static void fold_4(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2, __m1 *xmm_crc3 = _mm_castps_si128(ps_res3); } -static const unsigned ALIGNED_(32) pshufb_shf_table[60] = { - 0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */ - 0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */ - 0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */ - 0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */ - 0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */ - 0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */ - 0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl 9 (16 - 7)/shr7 */ - 0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl 8 (16 - 8)/shr8 */ - 0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl 7 (16 - 9)/shr9 */ - 0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl 6 (16 -10)/shr10*/ - 0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl 5 (16 -11)/shr11*/ - 0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl 4 (16 -12)/shr12*/ - 0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl 3 (16 -13)/shr13*/ - 0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/ - 0x0201008f, 
0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/ -}; - CLMUL static void partial_fold(const size_t len, __m128i* xmm_crc0, @@ -246,32 +229,6 @@ static void partial_fold(const size_t len, *xmm_crc3 = _mm_castps_si128(ps_res); } -static const unsigned ALIGNED_(16) crc_k[] = { - 0xccaa009e, - 0x00000000, /* rk1 */ - 0x751997d0, - 0x00000001, /* rk2 */ - 0xccaa009e, - 0x00000000, /* rk5 */ - 0x63cd6124, - 0x00000001, /* rk6 */ - 0xf7011640, - 0x00000001, /* rk7 */ - 0xdb710640, - 0x00000001 /* rk8 */ -}; - -static const unsigned ALIGNED_(16) crc_mask[4] = {0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000}; - -static const unsigned ALIGNED_(16) crc_mask2[4] = {0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; - -#define ONCE(op) \ - if(first) \ - { \ - first = 0; \ - (op); \ - } - /* * somewhat surprisingly the "naive" way of doing this, ie. with a flag and a cond. branch, * is consistently ~5 % faster on average than the implied-recommended branchless way (always xor, diff --git a/crc32_simd.h b/crc32_simd.h new file mode 100644 index 0000000..8552ad1 --- /dev/null +++ b/crc32_simd.h @@ -0,0 +1,52 @@ +// +// Created by claunia on 5/10/21. 
+// + +#ifndef AARU_CHECKSUMS_NATIVE__CRC32_SIMD_H_ +#define AARU_CHECKSUMS_NATIVE__CRC32_SIMD_H_ + +static const unsigned ALIGNED_(32) pshufb_shf_table[60] = { + 0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */ + 0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 2)/shr2 */ + 0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 3)/shr3 */ + 0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */ + 0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */ + 0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */ + 0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl 9 (16 - 7)/shr7 */ + 0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl 8 (16 - 8)/shr8 */ + 0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl 7 (16 - 9)/shr9 */ + 0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl 6 (16 -10)/shr10*/ + 0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl 5 (16 -11)/shr11*/ + 0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl 4 (16 -12)/shr12*/ + 0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl 3 (16 -13)/shr13*/ + 0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/ + 0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/ +}; + +static const uint32_t ALIGNED_(16) crc_k[] = { + 0xccaa009e, + 0x00000000, /* rk1 */ + 0x751997d0, + 0x00000001, /* rk2 */ + 0xccaa009e, + 0x00000000, /* rk5 */ + 0x63cd6124, + 0x00000001, /* rk6 */ + 0xf7011640, + 0x00000001, /* rk7 */ + 0xdb710640, + 0x00000001 /* rk8 */ +}; + +static const unsigned ALIGNED_(16) crc_mask[4] = {0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000}; + +static const unsigned ALIGNED_(16) crc_mask2[4] = {0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; + +#define ONCE(op) \ + if(first) \ + { \ + first = 0; \ + (op); \ + } + +#endif // AARU_CHECKSUMS_NATIVE__CRC32_SIMD_H_