Mirror of https://github.com/aaru-dps/Aaru.Checksums.Native.git, synced 2025-12-16 19:24:29 +00:00
Add CLMUL implementations for CRC32 and CRC64.
CMakeLists.txt
@@ -5,4 +5,4 @@ set(CMAKE_C_STANDARD 90)

 add_compile_options(-flto -ffast-math -march=x86-64 -mfpmath=sse -msse3)

-add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher32.h fletcher32.c library.h spamsum.c spamsum.h)
+add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher32.h fletcher32.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c)
crc32.c (4 changes)
@@ -35,6 +35,9 @@ AARU_EXPORT crc32_ctx* AARU_CALL crc32_init(void)

 AARU_EXPORT int AARU_CALL crc32_update(crc32_ctx* ctx, const uint8_t* data, uint32_t len)
 {
+    ctx->crc = ~crc32_clmul(data, (long)len, ~ctx->crc);
+    return 0;
+    /*
     // Unroll according to Intel slicing by uint8_t
     // http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
     // http://sourceforge.net/projects/slicing-by-8/
@@ -81,6 +84,7 @@ AARU_EXPORT int AARU_CALL crc32_update(crc32_ctx* ctx, const uint8_t* data, uint

     ctx->crc = crc;
     return 0;
+    */
 }

 AARU_EXPORT int AARU_CALL crc32_final(crc32_ctx* ctx, uint32_t* crc)
crc32.h (6 changes)
@@ -21,7 +21,7 @@ typedef struct
     uint32_t crc;
 } crc32_ctx;

-const uint32_t crc32_table[8][256] = {
+static const uint32_t crc32_table[8][256] = {
     {0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3, 0x0EDB8832,
      0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
      0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, 0x136C9856, 0x646BA8C0, 0xFD62F97A,
@@ -259,6 +259,10 @@ const uint32_t crc32_table[8][256] = {
 #define CRC32_ISO_POLY 0xEDB88320
 #define CRC32_ISO_SEED 0xFFFFFFFF

+#define CLMUL __attribute__((target("pclmul,sse4.1")))
+#define ALIGNED_(n) __attribute__((aligned(n)))
+
+CLMUL uint32_t crc32_clmul(const uint8_t* src, long len, uint32_t initial_crc);
 AARU_EXPORT crc32_ctx* AARU_CALL crc32_init();
 AARU_EXPORT int AARU_CALL crc32_update(crc32_ctx* ctx, const uint8_t* data, uint32_t len);
 AARU_EXPORT int AARU_CALL crc32_final(crc32_ctx* ctx, uint32_t* crc);
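A minimal usage sketch of the streaming API declared above (not part of the commit; it assumes the shared library is linked and that releasing the context is handled elsewhere, since no destructor appears in this diff):

#include <stdint.h>
#include "crc32.h"

/* Hedged example: one-shot CRC32 of a buffer through the exported streaming API. */
static int checksum_buffer(const uint8_t* buf, uint32_t len, uint32_t* out)
{
    crc32_ctx* ctx = crc32_init();             /* allocate and seed a context */
    if(!ctx) return -1;
    if(crc32_update(ctx, buf, len)) return -1; /* feed the data, possibly in several calls */
    return crc32_final(ctx, out);              /* writes the finished CRC into *out */
}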
crc32_clmul.c (new file, 534 lines)
@@ -0,0 +1,534 @@
/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Authors:
 *     Wajdi Feghali <wajdi.k.feghali@intel.com>
 *     Jim Guilford <james.guilford@intel.com>
 *     Vinodh Gopal <vinodh.gopal@intel.com>
 *     Erdinc Ozturk <erdinc.ozturk@intel.com>
 *     Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Copyright (c) 2016 Marian Beermann (add support for initial value, restructuring)
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#include <inttypes.h>
#include <smmintrin.h>
#include <wmmintrin.h>

#ifdef _MSC_VER
#include <intrin.h>
#else
/*
 * Newer versions of GCC and clang come with cpuid.h
 * (ftr GCC 4.7 in Debian Wheezy has this)
 */
#include <cpuid.h>

#endif

#include "library.h"
#include "crc32.h"
static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx)
{
#ifdef _MSC_VER
    unsigned int registers[4];
    __cpuid(registers, info);
    *eax = registers[0];
    *ebx = registers[1];
    *ecx = registers[2];
    *edx = registers[3];
#else
    /* GCC, clang */
    unsigned int _eax;
    unsigned int _ebx;
    unsigned int _ecx;
    unsigned int _edx;
    __cpuid(info, _eax, _ebx, _ecx, _edx);
    *eax = _eax;
    *ebx = _ebx;
    *ecx = _ecx;
    *edx = _edx;
#endif
}

static int have_clmul(void)
{
    unsigned eax, ebx, ecx, edx;
    int has_pclmulqdq;
    int has_sse41;
    cpuid(1 /* feature bits */, &eax, &ebx, &ecx, &edx);

    has_pclmulqdq = ecx & 0x2; /* bit 1 */
    has_sse41 = ecx & 0x80000; /* bit 19 */

    return has_pclmulqdq && has_sse41;
}
CLMUL
static void fold_1(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2, __m128i* xmm_crc3)
{
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);

    __m128i x_tmp3;
    __m128 ps_crc0, ps_crc3, ps_res;

    x_tmp3 = *xmm_crc3;

    *xmm_crc3 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_res = _mm_xor_ps(ps_crc0, ps_crc3);

    *xmm_crc0 = *xmm_crc1;
    *xmm_crc1 = *xmm_crc2;
    *xmm_crc2 = x_tmp3;
    *xmm_crc3 = _mm_castps_si128(ps_res);
}

CLMUL
static void fold_2(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2, __m128i* xmm_crc3)
{
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);

    __m128i x_tmp3, x_tmp2;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;

    x_tmp3 = *xmm_crc3;
    x_tmp2 = *xmm_crc2;

    *xmm_crc3 = *xmm_crc1;
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);

    *xmm_crc2 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);

    *xmm_crc0 = x_tmp2;
    *xmm_crc1 = x_tmp3;
    *xmm_crc2 = _mm_castps_si128(ps_res20);
    *xmm_crc3 = _mm_castps_si128(ps_res31);
}

CLMUL
static void fold_3(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2, __m128i* xmm_crc3)
{
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);

    __m128i x_tmp3;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;

    x_tmp3 = *xmm_crc3;

    *xmm_crc3 = *xmm_crc2;
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);

    *xmm_crc2 = *xmm_crc1;
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);

    *xmm_crc1 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);

    *xmm_crc0 = x_tmp3;
    *xmm_crc1 = _mm_castps_si128(ps_res10);
    *xmm_crc2 = _mm_castps_si128(ps_res21);
    *xmm_crc3 = _mm_castps_si128(ps_res32);
}

CLMUL
static void fold_4(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2, __m128i* xmm_crc3)
{
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);

    __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
    __m128 ps_t0, ps_t1, ps_t2, ps_t3;
    __m128 ps_res0, ps_res1, ps_res2, ps_res3;

    x_tmp0 = *xmm_crc0;
    x_tmp1 = *xmm_crc1;
    x_tmp2 = *xmm_crc2;
    x_tmp3 = *xmm_crc3;

    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_t0 = _mm_castsi128_ps(x_tmp0);
    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);

    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_t1 = _mm_castsi128_ps(x_tmp1);
    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);

    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_t2 = _mm_castsi128_ps(x_tmp2);
    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);

    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_t3 = _mm_castsi128_ps(x_tmp3);
    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);

    *xmm_crc0 = _mm_castps_si128(ps_res0);
    *xmm_crc1 = _mm_castps_si128(ps_res1);
    *xmm_crc2 = _mm_castps_si128(ps_res2);
    *xmm_crc3 = _mm_castps_si128(ps_res3);
}
static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
    0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
    0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
    0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */
    0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
    0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
    0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
    0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl 9 (16 - 7)/shr7 */
    0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl 8 (16 - 8)/shr8 */
    0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl 7 (16 - 9)/shr9 */
    0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl 6 (16 -10)/shr10*/
    0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl 5 (16 -11)/shr11*/
    0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl 4 (16 -12)/shr12*/
    0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
    0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
    0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b  /* shl 1 (16 -15)/shr15*/
};
CLMUL
static void partial_fold(const size_t len,
                         __m128i* xmm_crc0,
                         __m128i* xmm_crc1,
                         __m128i* xmm_crc2,
                         __m128i* xmm_crc3,
                         __m128i* xmm_crc_part)
{
    const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
    const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);

    __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
    __m128i xmm_a0_0, xmm_a0_1;
    __m128 ps_crc3, psa0_0, psa0_1, ps_res;

    xmm_shl = _mm_load_si128((__m128i*)pshufb_shf_table + (len - 1));
    xmm_shr = xmm_shl;
    xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);

    xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);

    *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
    xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
    *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);

    *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
    xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
    *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);

    *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
    xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
    *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);

    *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
    *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
    *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);

    xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
    xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);

    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    psa0_0 = _mm_castsi128_ps(xmm_a0_0);
    psa0_1 = _mm_castsi128_ps(xmm_a0_1);

    ps_res = _mm_xor_ps(ps_crc3, psa0_0);
    ps_res = _mm_xor_ps(ps_res, psa0_1);

    *xmm_crc3 = _mm_castps_si128(ps_res);
}
static const unsigned ALIGNED_(16) crc_k[] = {
    0xccaa009e,
    0x00000000, /* rk1 */
    0x751997d0,
    0x00000001, /* rk2 */
    0xccaa009e,
    0x00000000, /* rk5 */
    0x63cd6124,
    0x00000001, /* rk6 */
    0xf7011640,
    0x00000001, /* rk7 */
    0xdb710640,
    0x00000001 /* rk8 */
};

static const unsigned ALIGNED_(16) crc_mask[4] = {0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000};

static const unsigned ALIGNED_(16) crc_mask2[4] = {0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};

#define ONCE(op)       \
    if(first)          \
    {                  \
        first = 0;     \
        (op);          \
    }

/*
 * somewhat surprisingly the "naive" way of doing this, ie. with a flag and a cond. branch,
 * is consistently ~5 % faster on average than the implied-recommended branchless way (always xor,
 * always zero xmm_initial). Guess speculative execution and branch prediction got the better of
 * yet another "optimization tip".
 */
#define XOR_INITIAL(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
CLMUL
uint32_t crc32_clmul(const uint8_t* src, long len, uint32_t initial_crc)
{
    unsigned long algn_diff;
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
    __m128i xmm_initial = _mm_cvtsi32_si128(initial_crc);
    __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
    __m128i xmm_crc1 = _mm_setzero_si128();
    __m128i xmm_crc2 = _mm_setzero_si128();
    __m128i xmm_crc3 = _mm_setzero_si128();
    __m128i xmm_crc_part;

    int first = 1;

    /* fold 512 to 32 step variable declarations for ISO-C90 compat. */
    const __m128i xmm_mask = _mm_load_si128((__m128i*)crc_mask);
    const __m128i xmm_mask2 = _mm_load_si128((__m128i*)crc_mask2);

    uint32_t crc;
    __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;

    if(len < 16)
    {
        if(len == 0) return initial_crc;
        if(len < 4)
        {
            /*
             * no idea how to do this for <4 bytes, delegate to classic impl.
             */
            uint32_t crc = ~initial_crc;
            switch(len)
            {
                case 3: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
                case 2: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
                case 1: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
            }
            return ~crc;
        }
        xmm_crc_part = _mm_loadu_si128((__m128i*)src);
        XOR_INITIAL(xmm_crc_part);
        goto partial;
    }

    /* this alignment computation would be wrong for len<16 handled above */
    algn_diff = (0 - (uintptr_t)src) & 0xF;
    if(algn_diff)
    {
        xmm_crc_part = _mm_loadu_si128((__m128i*)src);
        XOR_INITIAL(xmm_crc_part);

        src += algn_diff;
        len -= algn_diff;

        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
    }

    while((len -= 64) >= 0)
    {
        xmm_t0 = _mm_load_si128((__m128i*)src);
        xmm_t1 = _mm_load_si128((__m128i*)src + 1);
        xmm_t2 = _mm_load_si128((__m128i*)src + 2);
        xmm_t3 = _mm_load_si128((__m128i*)src + 3);

        XOR_INITIAL(xmm_t0);

        fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);

        src += 64;
    }
    /*
     * len = num bytes left - 64
     */
    if(len + 16 >= 0)
    {
        len += 16;

        xmm_t0 = _mm_load_si128((__m128i*)src);
        xmm_t1 = _mm_load_si128((__m128i*)src + 1);
        xmm_t2 = _mm_load_si128((__m128i*)src + 2);

        XOR_INITIAL(xmm_t0);

        fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);

        if(len == 0) goto done;

        xmm_crc_part = _mm_load_si128((__m128i*)src + 3);
    }
    else if(len + 32 >= 0)
    {
        len += 32;

        xmm_t0 = _mm_load_si128((__m128i*)src);
        xmm_t1 = _mm_load_si128((__m128i*)src + 1);

        XOR_INITIAL(xmm_t0);

        fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);

        if(len == 0) goto done;

        xmm_crc_part = _mm_load_si128((__m128i*)src + 2);
    }
    else if(len + 48 >= 0)
    {
        len += 48;

        xmm_t0 = _mm_load_si128((__m128i*)src);

        XOR_INITIAL(xmm_t0);

        fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);

        if(len == 0) goto done;

        xmm_crc_part = _mm_load_si128((__m128i*)src + 1);
    }
    else
    {
        len += 64;
        if(len == 0) goto done;
        xmm_crc_part = _mm_load_si128((__m128i*)src);
        XOR_INITIAL(xmm_crc_part);
    }

partial:
    partial_fold(len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);

done:
    (void)0;

    /* fold 512 to 32 */

    /*
     * k1
     */
    crc_fold = _mm_load_si128((__m128i*)crc_k);

    x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
    xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
    xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
    xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);

    x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
    xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
    xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);

    x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
    xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);

    /*
     * k5
     */
    crc_fold = _mm_load_si128((__m128i*)crc_k + 1);

    xmm_crc0 = xmm_crc3;
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
    xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);

    xmm_crc0 = xmm_crc3;
    xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);

    /*
     * k7
     */
    xmm_crc1 = xmm_crc3;
    xmm_crc2 = xmm_crc3;
    crc_fold = _mm_load_si128((__m128i*)crc_k + 2);

    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);

    xmm_crc2 = xmm_crc3;
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);

    /*
     * could just as well write xmm_crc3[2], doing a movaps and truncating, but
     * no real advantage - it's a tiny bit slower per call, while no additional CPUs
     * would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
     */
    crc = _mm_extract_epi32(xmm_crc3, 2);
    return ~crc;
}
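have_clmul() is defined in the new file but not referenced in the hunks shown here; a hedged sketch of the run-time dispatch it enables, with crc32_slicing() standing in for a hypothetical table-driven fallback of the same signature (not part of the commit):

/* Hypothetical dispatcher: probe the CPU once, then pick the CLMUL or fallback path. */
static uint32_t crc32_dispatch(const uint8_t* src, long len, uint32_t initial_crc)
{
    static int use_clmul = -1;                  /* -1 = not probed yet */
    if(use_clmul < 0) use_clmul = have_clmul(); /* checks the PCLMULQDQ and SSE4.1 feature bits */
    return use_clmul ? crc32_clmul(src, len, initial_crc)
                     : crc32_slicing(src, len, initial_crc); /* hypothetical fallback */
}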
crc64.c (4 changes)
@@ -35,6 +35,9 @@ AARU_EXPORT crc64_ctx* AARU_CALL crc64_init(void)

 AARU_EXPORT int AARU_CALL crc64_update(crc64_ctx* ctx, const uint8_t* data, uint32_t len)
 {
+    ctx->crc = ~crc64_clmul(~ctx->crc, data, len);
+    return 0;
+    /*
     // Unroll according to Intel slicing by uint8_t
     // http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
     // http://sourceforge.net/projects/slicing-by-8/
@@ -70,6 +73,7 @@ AARU_EXPORT int AARU_CALL crc64_update(crc64_ctx* ctx, const uint8_t* data, uint

     ctx->crc = crc;
     return 0;
+    */
 }

 AARU_EXPORT int AARU_CALL crc64_final(crc64_ctx* ctx, uint64_t* crc)
crc64.h (3 changes)
@@ -21,7 +21,7 @@ typedef struct
     uint64_t crc;
 } crc64_ctx;

-const uint64_t crc64_table[4][256] = {
+const static uint64_t crc64_table[4][256] = {
     {0x0000000000000000, 0xB32E4CBE03A75F6F, 0xF4843657A840A05B, 0x47AA7AE9ABE7FF34, 0x7BD0C384FF8F5E33,
      0xC8FE8F3AFC28015C, 0x8F54F5D357CFFE68, 0x3C7AB96D5468A107, 0xF7A18709FF1EBC66, 0x448FCBB7FCB9E309,
      0x0325B15E575E1C3D, 0xB00BFDE054F94352, 0x8C71448D0091E255, 0x3F5F08330336BD3A, 0x78F572DAA8D1420E,
@@ -234,6 +234,7 @@ const uint64_t crc64_table[4][256] = {
 #define CRC64_ECMA_POLY 0xC96C5795D7870F42
 #define CRC64_ECMA_SEED 0xFFFFFFFFFFFFFFFF

+uint64_t crc64_clmul(uint64_t crc, const uint8_t* data, size_t length);
 AARU_EXPORT crc64_ctx* AARU_CALL crc64_init();
 AARU_EXPORT int AARU_CALL crc64_update(crc64_ctx* ctx, const uint8_t* data, uint32_t len);
 AARU_EXPORT int AARU_CALL crc64_final(crc64_ctx* ctx, uint64_t* crc);
crc64_clmul.c (new file, 186 lines)
@@ -0,0 +1,186 @@
#include <inttypes.h>
#include <smmintrin.h>
#include <wmmintrin.h>

#ifdef _MSC_VER
#include <intrin.h>
#endif

#include "library.h"
#include "crc32.h"
#include "crc64.h"

// Reverses bits
static uint64_t bitReflect(uint64_t v)
{
    v = ((v >> 1) & 0x5555555555555555) | ((v & 0x5555555555555555) << 1);
    v = ((v >> 2) & 0x3333333333333333) | ((v & 0x3333333333333333) << 2);
    v = ((v >> 4) & 0x0F0F0F0F0F0F0F0F) | ((v & 0x0F0F0F0F0F0F0F0F) << 4);
    v = ((v >> 8) & 0x00FF00FF00FF00FF) | ((v & 0x00FF00FF00FF00FF) << 8);
    v = ((v >> 16) & 0x0000FFFF0000FFFF) | ((v & 0x0000FFFF0000FFFF) << 16);
    v = (v >> 32) | (v << 32);
    return v;
}
// Computes r*x^N mod p(x)
static uint64_t expMod65(uint32_t n, uint64_t p, uint64_t r)
{
    return n == 0 ? r : expMod65(n - 1, p, (r << 1) ^ (p & ((int64_t)r >> 63)));
}

// Computes x^129 / p(x); the result has an implicit 65th bit.
static uint64_t div129by65(uint64_t poly)
{
    uint64_t q = 0;
    uint64_t h = poly;
    uint32_t i;
    for(i = 0; i < 64; ++i)
    {
        q |= (h & (1ull << 63)) >> i;
        h = (h << 1) ^ (poly & ((int64_t)h >> 63));
    }
    return q;
}

static const uint8_t shuffleMasks[] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
};
CLMUL static void shiftRight128(__m128i in, size_t n, __m128i* outLeft, __m128i* outRight)
{
    const __m128i maskA = _mm_loadu_si128((const __m128i*)(shuffleMasks + (16 - n)));
    const __m128i maskB = _mm_xor_si128(maskA, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()));

    *outLeft = _mm_shuffle_epi8(in, maskB);
    *outRight = _mm_shuffle_epi8(in, maskA);
}

CLMUL static __m128i fold(__m128i in, __m128i foldConstants)
{
    return _mm_xor_si128(_mm_clmulepi64_si128(in, foldConstants, 0x00), _mm_clmulepi64_si128(in, foldConstants, 0x11));
}
CLMUL uint64_t crc64_clmul(uint64_t crc, const uint8_t* data, size_t length)
{
    const uint64_t k1 = 0xe05dd497ca393ae4; // bitReflect(expMod65(128 + 64, poly, 1)) << 1;
    const uint64_t k2 = 0xdabe95afc7875f40; // bitReflect(expMod65(128, poly, 1)) << 1;
    const uint64_t mu = 0x9c3e466c172963d5; // (bitReflect(div129by65(poly)) << 1) | 1;
    const uint64_t p = 0x92d8af2baf0e1e85; // (bitReflect(poly) << 1) | 1;

    const __m128i foldConstants1 = _mm_set_epi64x(k2, k1);
    const __m128i foldConstants2 = _mm_set_epi64x(p, mu);

    const uint8_t* end = data + length;

    // Align pointers
    const __m128i* alignedData = (const __m128i*)((uintptr_t)data & ~(uintptr_t)15);
    const __m128i* alignedEnd = (const __m128i*)(((uintptr_t)end + 15) & ~(uintptr_t)15);

    const size_t leadInSize = data - (const uint8_t*)alignedData;
    const size_t leadOutSize = (const uint8_t*)alignedEnd - end;

    const size_t alignedLength = alignedEnd - alignedData;

    const __m128i leadInMask = _mm_loadu_si128((const __m128i*)(shuffleMasks + (16 - leadInSize)));
    const __m128i data0 = _mm_blendv_epi8(_mm_setzero_si128(), _mm_load_si128(alignedData), leadInMask);

#if defined(_WIN64)
    const __m128i initialCrc = _mm_cvtsi64x_si128(~crc);
#else
    const __m128i initialCrc = _mm_set_epi64x(0, ~crc);
#endif

    __m128i R;
    if(alignedLength == 1)
    {
        // Single data block, initial CRC possibly bleeds into zero padding
        __m128i crc0, crc1;
        shiftRight128(initialCrc, 16 - length, &crc0, &crc1);

        __m128i A, B;
        shiftRight128(data0, leadOutSize, &A, &B);

        const __m128i P = _mm_xor_si128(A, crc0);
        R = _mm_xor_si128(_mm_clmulepi64_si128(P, foldConstants1, 0x10),
                          _mm_xor_si128(_mm_srli_si128(P, 8), _mm_slli_si128(crc1, 8)));
    }
    else if(alignedLength == 2)
    {
        const __m128i data1 = _mm_load_si128(alignedData + 1);

        if(length < 8)
        {
            // Initial CRC bleeds into the zero padding
            __m128i crc0, crc1;
            shiftRight128(initialCrc, 16 - length, &crc0, &crc1);

            __m128i A, B, C, D;
            shiftRight128(data0, leadOutSize, &A, &B);
            shiftRight128(data1, leadOutSize, &C, &D);

            const __m128i P = _mm_xor_si128(_mm_xor_si128(B, C), crc0);
            R = _mm_xor_si128(_mm_clmulepi64_si128(P, foldConstants1, 0x10),
                              _mm_xor_si128(_mm_srli_si128(P, 8), _mm_slli_si128(crc1, 8)));
        }
        else
        {
            // We can fit the initial CRC into the data without bleeding into the zero padding
            __m128i crc0, crc1;
            shiftRight128(initialCrc, leadInSize, &crc0, &crc1);

            __m128i A, B, C, D;
            shiftRight128(_mm_xor_si128(data0, crc0), leadOutSize, &A, &B);
            shiftRight128(_mm_xor_si128(data1, crc1), leadOutSize, &C, &D);

            const __m128i P = _mm_xor_si128(fold(A, foldConstants1), _mm_xor_si128(B, C));
            R = _mm_xor_si128(_mm_clmulepi64_si128(P, foldConstants1, 0x10), _mm_srli_si128(P, 8));
        }
    }
    else
    {
        alignedData++;
        length -= 16 - leadInSize;

        // Initial CRC can simply be added to data
        __m128i crc0, crc1;
        shiftRight128(initialCrc, leadInSize, &crc0, &crc1);

        __m128i accumulator = _mm_xor_si128(fold(_mm_xor_si128(crc0, data0), foldConstants1), crc1);

        while(length >= 32)
        {
            accumulator = fold(_mm_xor_si128(_mm_load_si128(alignedData), accumulator), foldConstants1);

            length -= 16;
            alignedData++;
        }

        __m128i P;
        if(length == 16) { P = _mm_xor_si128(accumulator, _mm_load_si128(alignedData)); }
        else
        {
            const __m128i end0 = _mm_xor_si128(accumulator, _mm_load_si128(alignedData));
            const __m128i end1 = _mm_load_si128(alignedData + 1);

            __m128i A, B, C, D;
            shiftRight128(end0, leadOutSize, &A, &B);
            shiftRight128(end1, leadOutSize, &C, &D);

            P = _mm_xor_si128(fold(A, foldConstants1), _mm_or_si128(B, C));
        }

        R = _mm_xor_si128(_mm_clmulepi64_si128(P, foldConstants1, 0x10), _mm_srli_si128(P, 8));
    }

    // Final Barrett reduction
    const __m128i T1 = _mm_clmulepi64_si128(R, foldConstants2, 0x00);
    const __m128i T2 =
        _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(T1, foldConstants2, 0x10), _mm_slli_si128(T1, 8)), R);

#if defined(_WIN64)
    return ~_mm_extract_epi64(T2, 1);
#else
    return ~(((uint64_t)(uint32_t)_mm_extract_epi32(T2, 3) << 32) | (uint64_t)(uint32_t)_mm_extract_epi32(T2, 2));
#endif
}
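The fold constants in crc64_clmul() are given only as literals with derivation comments; a hedged sketch of regenerating them with the helpers in this file, assuming the `poly` in those comments is CRC64_ECMA_POLY from crc64.h (not part of the commit):

/* Hypothetical helper; it must live in the same translation unit as the static
 * bitReflect(), expMod65() and div129by65() functions above. */
static void crc64_fold_constants(uint64_t* k1, uint64_t* k2, uint64_t* mu, uint64_t* p)
{
    const uint64_t poly = CRC64_ECMA_POLY;              /* 0xC96C5795D7870F42, from crc64.h */
    *k1 = bitReflect(expMod65(128 + 64, poly, 1)) << 1; /* expected 0xe05dd497ca393ae4 */
    *k2 = bitReflect(expMod65(128, poly, 1)) << 1;      /* expected 0xdabe95afc7875f40 */
    *mu = (bitReflect(div129by65(poly)) << 1) | 1;      /* expected 0x9c3e466c172963d5 */
    *p = (bitReflect(poly) << 1) | 1;                   /* expected 0x92d8af2baf0e1e85 */
}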