diff --git a/CMakeLists.txt b/CMakeLists.txt
index 297abc3..b3caa98 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,6 @@ if("${CMAKE_BUILD_TYPE}" MATCHES "Release")
     endif()
 endif()
 
-add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher32.h fletcher32.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c crc32_vmull.c crc32_simd.h)
+add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher32.h fletcher32.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c crc32_vmull.c crc32_simd.h arm_vmull.c arm_vmull.h crc64_vmull.c)
 
 add_subdirectory(tests)
\ No newline at end of file
diff --git a/arm_vmull.c b/arm_vmull.c
new file mode 100644
index 0000000..a175198
--- /dev/null
+++ b/arm_vmull.c
@@ -0,0 +1,140 @@
+//
+// Created by claunia on 12/10/21.
+//
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+
+#include <arm_neon.h>
+
+#include "library.h"
+#include "arm_vmull.h"
+#include "simd.h"
+
+TARGET_WITH_CRYPTO static uint64x2_t sse2neon_vmull_p64_crypto(uint64x1_t _a, uint64x1_t _b)
+{
+    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+    return vreinterpretq_u64_p128(vmull_p64(a, b));
+}
+
+TARGET_WITH_SIMD uint64x2_t sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    // Wraps vmull_p64
+    if(have_arm_crypto()) return sse2neon_vmull_p64_crypto(_a, _b);
+
+    // ARMv7 polyfill
+    // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
+    //
+    // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+    // 64-bit->128-bit polynomial multiply.
+    //
+    // It needs some work and is somewhat slow, but it is still faster than all
+    // known scalar methods.
+    //
+    // Algorithm adapted to C from
+    // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
+    // from "Fast Software Polynomial Multiplication on ARM Processors Using the
+    // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+    // (https://hal.inria.fr/hal-01506572)
+
+    poly8x8_t a = vreinterpret_p8_u64(_a);
+    poly8x8_t b = vreinterpret_p8_u64(_b);
+
+    // Masks
+    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), vcreate_u8(0x00000000ffffffff));
+    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), vcreate_u8(0x0000000000000000));
+
+    // Do the multiplies, rotating with vext to get all combinations
+    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));                // D = A0 * B0
+    uint8x16_t e = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
+    uint8x16_t f = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
+    uint8x16_t g = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
+    uint8x16_t h = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
+    uint8x16_t i = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
+    uint8x16_t j = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
+    uint8x16_t k = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4
+
+    // Add cross products
+    uint8x16_t l = veorq_u8(e, f); // L = E + F
+    uint8x16_t m = veorq_u8(g, h); // M = G + H
+    uint8x16_t n = veorq_u8(i, j); // N = I + J
+
+    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
+    // instructions.
+#if defined(__aarch64__)
+    uint8x16_t lm_p0 = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t lm_p1 = vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t nk_p0 = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+    uint8x16_t nk_p1 = vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#else
+    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
+    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
+    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+#endif
+    // t0 = (L) (P0 + P1) << 8
+    // t1 = (M) (P2 + P3) << 16
+    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
+
+    // t2 = (N) (P4 + P5) << 24
+    // t3 = (K) (P6 + P7) << 32
+    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+
+    // De-interleave
+#if defined(__aarch64__)
+    uint8x16_t t0 = vreinterpretq_u8_u64(vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t1 = vreinterpretq_u8_u64(vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t2 = vreinterpretq_u8_u64(vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+    uint8x16_t t3 = vreinterpretq_u8_u64(vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+#else
+    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
+    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
+#endif
+    // Shift the cross products
+    uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
+    uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
+    uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
+    uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
+
+    // Accumulate the products
+    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+    uint8x16_t mix = veorq_u8(d, cross1);
+    uint8x16_t r = veorq_u8(mix, cross2);
+    return vreinterpretq_u64_u8(r);
+}
+
+TARGET_WITH_SIMD uint64x2_t mm_shuffle_epi8(uint64x2_t a, uint64x2_t b)
+{
+    uint8x16_t tbl = vreinterpretq_u8_u64(a);                // input a
+    uint8x16_t idx = vreinterpretq_u8_u64(b);                // input b
+    uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
+#if defined(__aarch64__)
+    return vreinterpretq_u64_u8(vqtbl1q_u8(tbl, idx_masked));
+#else
+    // use this line if testing on aarch64
+    uint8x8x2_t a_split = {vget_low_u8(tbl), vget_high_u8(tbl)};
+    return vreinterpretq_u64_u8(
+        vcombine_u8(vtbl2_u8(a_split, vget_low_u8(idx_masked)), vtbl2_u8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+TARGET_WITH_SIMD uint64x2_t mm_srli_si128(uint64x2_t a, int imm)
+{
+    uint8x16_t tmp[2] = {vreinterpretq_u8_u64(a), vdupq_n_u8(0)};
+    return vreinterpretq_u64_u8(vld1q_u8(((uint8_t const*)tmp) + imm));
+}
+
+TARGET_WITH_SIMD uint64x2_t mm_slli_si128(uint64x2_t a, int imm)
+{
+    uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_u64(a)};
+    return vreinterpretq_u64_u8(vld1q_u8(((uint8_t const*)tmp) + (16 - imm)));
+}
+
+#endif
\ No newline at end of file
diff --git a/arm_vmull.h b/arm_vmull.h
new file mode 100644
index 0000000..33ab1b2
--- /dev/null
+++ b/arm_vmull.h
@@ -0,0 +1,18 @@
+//
+// Created by claunia on 12/10/21.
+//
+
+#ifndef AARU_CHECKSUMS_NATIVE__ARM_VMULL_H_
+#define AARU_CHECKSUMS_NATIVE__ARM_VMULL_H_
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+
+TARGET_WITH_CRYPTO static uint64x2_t sse2neon_vmull_p64_crypto(uint64x1_t _a, uint64x1_t _b);
+TARGET_WITH_SIMD uint64x2_t sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b);
+TARGET_WITH_SIMD uint64x2_t mm_shuffle_epi8(uint64x2_t a, uint64x2_t b);
+TARGET_WITH_SIMD uint64x2_t mm_srli_si128(uint64x2_t a, int imm);
+TARGET_WITH_SIMD uint64x2_t mm_slli_si128(uint64x2_t a, int imm);
+
+#endif
+
+#endif // AARU_CHECKSUMS_NATIVE__ARM_VMULL_H_
diff --git a/crc32_vmull.c b/crc32_vmull.c
index c4f8a80..6a43e41 100644
--- a/crc32_vmull.c
+++ b/crc32_vmull.c
@@ -11,121 +11,7 @@
 #include "library.h"
 #include "crc32.h"
 #include "crc32_simd.h"
-
-TARGET_WITH_CRYPTO static uint64x2_t sse2neon_vmull_p64_crypto(uint64x1_t _a, uint64x1_t _b)
-{
-    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
-    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
-    return vreinterpretq_u64_p128(vmull_p64(a, b));
-}
-
-FORCE_INLINE TARGET_WITH_SIMD uint64x2_t sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
-    // Wraps vmull_p64
-    if(have_arm_crypto()) return sse2neon_vmull_p64_crypto(_a, _b);
-
-    // ARMv7 polyfill
-    // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
-    //
-    // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
-    // 64-bit->128-bit polynomial multiply.
-    //
-    // It needs some work and is somewhat slow, but it is still faster than all
-    // known scalar methods.
-    //
-    // Algorithm adapted to C from
-    // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
-    // from "Fast Software Polynomial Multiplication on ARM Processors Using the
-    // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
-    // (https://hal.inria.fr/hal-01506572)
-
-    poly8x8_t a = vreinterpret_p8_u64(_a);
-    poly8x8_t b = vreinterpret_p8_u64(_b);
-
-    // Masks
-    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), vcreate_u8(0x00000000ffffffff));
-    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), vcreate_u8(0x0000000000000000));
-
-    // Do the multiplies, rotating with vext to get all combinations
-    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));                // D = A0 * B0
-    uint8x16_t e = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
-    uint8x16_t f = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
-    uint8x16_t g = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
-    uint8x16_t h = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
-    uint8x16_t i = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
-    uint8x16_t j = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
-    uint8x16_t k = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4
-
-    // Add cross products
-    uint8x16_t l = veorq_u8(e, f); // L = E + F
-    uint8x16_t m = veorq_u8(g, h); // M = G + H
-    uint8x16_t n = veorq_u8(i, j); // N = I + J
-
-    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
-    // instructions.
-#if defined(__aarch64__)
-    uint8x16_t lm_p0 = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-    uint8x16_t lm_p1 = vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-    uint8x16_t nk_p0 = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-    uint8x16_t nk_p1 = vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-#else
-    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
-    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
-    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
-    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
-#endif
-    // t0 = (L) (P0 + P1) << 8
-    // t1 = (M) (P2 + P3) << 16
-    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
-    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
-    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
-
-    // t2 = (N) (P4 + P5) << 24
-    // t3 = (K) (P6 + P7) << 32
-    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
-    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
-    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
-
-    // De-interleave
-#if defined(__aarch64__)
-    uint8x16_t t0 = vreinterpretq_u8_u64(vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-    uint8x16_t t1 = vreinterpretq_u8_u64(vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-    uint8x16_t t2 = vreinterpretq_u8_u64(vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-    uint8x16_t t3 = vreinterpretq_u8_u64(vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-#else
-    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
-    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
-    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
-    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
-#endif
-    // Shift the cross products
-    uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
-    uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
-    uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
-    uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
-
-    // Accumulate the products
-    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
-    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
-    uint8x16_t mix = veorq_u8(d, cross1);
-    uint8x16_t r = veorq_u8(mix, cross2);
-    return vreinterpretq_u64_u8(r);
-}
-
-FORCE_INLINE TARGET_WITH_SIMD uint64x2_t mm_shuffle_epi8(uint64x2_t a, uint64x2_t b)
-{
-    uint8x16_t tbl = vreinterpretq_u8_u64(a);                // input a
-    uint8x16_t idx = vreinterpretq_u8_u64(b);                // input b
-    uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
-#if defined(__aarch64__)
-    return vreinterpretq_u64_u8(vqtbl1q_u8(tbl, idx_masked));
-#else
-    // use this line if testing on aarch64
-    uint8x8x2_t a_split = {vget_low_u8(tbl), vget_high_u8(tbl)};
-    return vreinterpretq_u64_u8(
-        vcombine_u8(vtbl2_u8(a_split, vget_low_u8(idx_masked)), vtbl2_u8(a_split, vget_high_u8(idx_masked))));
-#endif
-}
+#include "arm_vmull.h"
 
 /*
  * somewhat surprisingly the "naive" way of doing this, ie. with a flag and a cond. branch,
diff --git a/crc64.c b/crc64.c
index 63e7dde..aa0e15d 100644
--- a/crc64.c
+++ b/crc64.c
@@ -47,6 +47,14 @@ AARU_EXPORT int AARU_CALL crc64_update(crc64_ctx* ctx, const uint8_t* data, uint
 }
 #endif
 
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+    if(have_neon())
+    {
+        ctx->crc = ~crc64_vmull(~ctx->crc, data, len);
+        return 0;
+    }
+#endif
+
 // Unroll according to Intel slicing by uint8_t
 // http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
 // http://sourceforge.net/projects/slicing-by-8/
diff --git a/crc64.h b/crc64.h
index e081731..96aacd1 100644
--- a/crc64.h
+++ b/crc64.h
@@ -243,4 +243,8 @@ AARU_EXPORT void AARU_CALL crc64_slicing(uint64_t* crc, const uint8_t* dat
 #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
     defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
 AARU_EXPORT uint64_t AARU_CALL crc64_clmul(uint64_t crc, const uint8_t* data, long length);
+#endif
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const uint8_t* data, long length);
 #endif
\ No newline at end of file
diff --git a/crc64_vmull.c b/crc64_vmull.c
new file mode 100644
index 0000000..9df958e
--- /dev/null
+++ b/crc64_vmull.c
@@ -0,0 +1,164 @@
+//
+// Created by claunia on 12/10/21.
+//
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+
+#include <stdint.h>
+#include <stddef.h>
+#include <arm_neon.h>
+
+#include "library.h"
+#include "arm_vmull.h"
+#include "crc64.h"
+
+static const uint8_t shuffleMasks[] = {
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
+};
+
+TARGET_WITH_SIMD FORCE_INLINE void shiftRight128(uint64x2_t in, size_t n, uint64x2_t* outLeft, uint64x2_t* outRight)
+{
+    const uint64x2_t maskA =
+        vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(const uint64x2_t*)(shuffleMasks + (16 - n))));
+    uint64x2_t b = vreinterpretq_u64_u8(vceqq_u8(vreinterpretq_u8_u64(vreinterpretq_u64_u32(vdupq_n_u32(0))),
+                                                 vreinterpretq_u8_u64(vreinterpretq_u64_u32(vdupq_n_u32(0)))));
+    const uint64x2_t maskB = vreinterpretq_u64_u32(veorq_u32(vreinterpretq_u32_u64(maskA), vreinterpretq_u32_u64(b)));
+
+    *outLeft = mm_shuffle_epi8(in, maskB);
+    *outRight = mm_shuffle_epi8(in, maskA);
+}
+
+TARGET_WITH_SIMD FORCE_INLINE uint64x2_t fold(uint64x2_t in, uint64x2_t foldConstants)
+{
+    return veorq_u64(sse2neon_vmull_p64(vget_low_u64(in), vget_low_u64(foldConstants)),
+                     sse2neon_vmull_p64(vget_high_u64(in), vget_high_u64(foldConstants)));
+}
+
+AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const uint8_t* data, long length)
+{
+    const uint64_t k1 = 0xe05dd497ca393ae4; // bitReflect(expMod65(128 + 64, poly, 1)) << 1;
+    const uint64_t k2 = 0xdabe95afc7875f40; // bitReflect(expMod65(128, poly, 1)) << 1;
+    const uint64_t mu = 0x9c3e466c172963d5; // (bitReflect(div129by65(poly)) << 1) | 1;
+    const uint64_t p = 0x92d8af2baf0e1e85;  // (bitReflect(poly) << 1) | 1;
+
+    const uint64x2_t foldConstants1 = vcombine_u64(vcreate_u64(k1), vcreate_u64(k2));
+    const uint64x2_t foldConstants2 = vcombine_u64(vcreate_u64(mu), vcreate_u64(p));
+
+    const uint8_t* end = data + length;
+
+    // Align pointers
+    const uint64x2_t* alignedData = (const uint64x2_t*)((uintptr_t)data & ~(uintptr_t)15);
+    const uint64x2_t* alignedEnd = (const uint64x2_t*)(((uintptr_t)end + 15) & ~(uintptr_t)15);
+
+    const size_t leadInSize = data - (const uint8_t*)alignedData;
+    const size_t leadOutSize = (const uint8_t*)alignedEnd - end;
+
+    const size_t alignedLength = alignedEnd - alignedData;
+
+    const uint64x2_t leadInMask =
+        vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(const uint64x2_t*)(shuffleMasks + (16 - leadInSize))));
+    uint64x2_t a = vreinterpretq_u64_u32(vdupq_n_u32(0));
+    uint64x2_t b = vreinterpretq_u64_u32(
+        vld1q_u32((const uint32_t*)alignedData)); // Use a signed shift right to create a mask with the sign bit
+    const uint64x2_t data0 =
+        vreinterpretq_u64_u8(vbslq_u8(vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u64(leadInMask), 7)),
+                                      vreinterpretq_u8_u64(b),
+                                      vreinterpretq_u8_u64(a)));
+
+    const uint64x2_t initialCrc = vsetq_lane_u64(~crc, vdupq_n_u64(0), 0);
+
+    uint64x2_t R;
+    if(alignedLength == 1)
+    {
+        // Single data block, initial CRC possibly bleeds into zero padding
+        uint64x2_t crc0, crc1;
+        shiftRight128(initialCrc, 16 - length, &crc0, &crc1);
+
+        uint64x2_t A, B;
+        shiftRight128(data0, leadOutSize, &A, &B);
+
+        const uint64x2_t P = veorq_u64(A, crc0);
+        R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)),
+                      veorq_u64(mm_srli_si128(P, 8), mm_slli_si128(crc1, 8)));
+    }
+    else if(alignedLength == 2)
+    {
+        const uint64x2_t data1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(alignedData + 1)));
+
+        if(length < 8)
+        {
+            // Initial CRC bleeds into the zero padding
+            uint64x2_t crc0, crc1;
+            shiftRight128(initialCrc, 16 - length, &crc0, &crc1);
+
+            uint64x2_t A, B, C, D;
+            shiftRight128(data0, leadOutSize, &A, &B);
+            shiftRight128(data1, leadOutSize, &C, &D);
+
+            const uint64x2_t P = veorq_u64(veorq_u64(B, C), crc0);
+            R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)),
+                          veorq_u64(mm_srli_si128(P, 8), mm_slli_si128(crc1, 8)));
+        }
+        else
+        {
+            // We can fit the initial CRC into the data without bleeding into the zero padding
+            uint64x2_t crc0, crc1;
+            shiftRight128(initialCrc, leadInSize, &crc0, &crc1);
+
+            uint64x2_t A, B, C, D;
+            shiftRight128(veorq_u64(data0, crc0), leadOutSize, &A, &B);
+            shiftRight128(veorq_u64(data1, crc1), leadOutSize, &C, &D);
+
+            const uint64x2_t P = veorq_u64(fold(A, foldConstants1), veorq_u64(B, C));
+            R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)), mm_srli_si128(P, 8));
+        }
+    }
+    else
+    {
+        alignedData++;
+        length -= 16 - leadInSize;
+
+        // Initial CRC can simply be added to data
+        uint64x2_t crc0, crc1;
+        shiftRight128(initialCrc, leadInSize, &crc0, &crc1);
+
+        uint64x2_t accumulator = veorq_u64(fold(veorq_u64(crc0, data0), foldConstants1), crc1);
+
+        while(length >= 32)
+        {
+            accumulator = fold(veorq_u64(vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)), accumulator),
+                               foldConstants1);
+
+            length -= 16;
+            alignedData++;
+        }
+
+        uint64x2_t P;
+        if(length == 16) P = veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)));
+        else
+        {
+            const uint64x2_t end0 =
+                veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)));
+            const uint64x2_t end1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(alignedData + 1)));
+
+            uint64x2_t A, B, C, D;
+            shiftRight128(end0, leadOutSize, &A, &B);
+            shiftRight128(end1, leadOutSize, &C, &D);
+
+            P = veorq_u64(fold(A, foldConstants1),
+                          vreinterpretq_u64_u32(vorrq_u32(vreinterpretq_u32_u64(B), vreinterpretq_u32_u64(C))));
+        }
+
+        R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)), mm_srli_si128(P, 8));
+    }
+
+    // Final Barrett reduction
+    const uint64x2_t T1 = sse2neon_vmull_p64(vget_low_u64(R), vget_low_u64(foldConstants2));
+    const uint64x2_t T2 = veorq_u64(
+        veorq_u64(sse2neon_vmull_p64(vget_low_u64(T1), vget_high_u64(foldConstants2)), mm_slli_si128(T1, 8)), R);
+
+    return ~vgetq_lane_u64(T2, 1);
+}
+
+#endif
\ No newline at end of file
diff --git a/tests/crc64.cpp b/tests/crc64.cpp
index 473704b..b04e1bd 100644
--- a/tests/crc64.cpp
+++ b/tests/crc64.cpp
@@ -283,3 +283,83 @@ TEST_F(crc64Fixture, crc64_clmul_2352bytes)
     EXPECT_EQ(crc, EXPECTED_CRC64_2352BYTES);
 }
 #endif
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+TEST_F(crc64Fixture, crc64_vmull)
+{
+    if(!have_neon()) return;
+
+    uint64_t crc = CRC64_ECMA_SEED;
+
+    crc = ~crc64_vmull(~crc, buffer, 1048576);
+
+    crc ^= CRC64_ECMA_SEED;
+
+    EXPECT_EQ(crc, EXPECTED_CRC64);
+}
+
+TEST_F(crc64Fixture, crc64_vmull_misaligned)
+{
+    if(!have_neon()) return;
+
+    uint64_t crc = CRC64_ECMA_SEED;
+
+    crc = ~crc64_vmull(~crc, buffer_misaligned + 1, 1048576);
+
+    crc ^= CRC64_ECMA_SEED;
+
+    EXPECT_EQ(crc, EXPECTED_CRC64);
+}
+
+TEST_F(crc64Fixture, crc64_vmull_15bytes)
+{
+    if(!have_neon()) return;
+
+    uint64_t crc = CRC64_ECMA_SEED;
+
+    crc = ~crc64_vmull(~crc, buffer, 15);
+
+    crc ^= CRC64_ECMA_SEED;
+
+    EXPECT_EQ(crc, EXPECTED_CRC64_15BYTES);
+}
+
+TEST_F(crc64Fixture, crc64_vmull_31bytes)
+{
+    if(!have_neon()) return;
+
+    uint64_t crc = CRC64_ECMA_SEED;
+
+    crc = ~crc64_vmull(~crc, buffer, 31);
+
+    crc ^= CRC64_ECMA_SEED;
+
+    EXPECT_EQ(crc, EXPECTED_CRC64_31BYTES);
+}
+
+TEST_F(crc64Fixture, crc64_vmull_63bytes)
+{
+    if(!have_neon()) return;
+
+    uint64_t crc = CRC64_ECMA_SEED;
+
+    crc = ~crc64_vmull(~crc, buffer, 63);
+
+    crc ^= CRC64_ECMA_SEED;
+
+    EXPECT_EQ(crc, EXPECTED_CRC64_63BYTES);
+}
+
+TEST_F(crc64Fixture, crc64_vmull_2352bytes)
+{
+    if(!have_neon()) return;
+
+    uint64_t crc = CRC64_ECMA_SEED;
+
+    crc = ~crc64_vmull(~crc, buffer, 2352);
+
+    crc ^= CRC64_ECMA_SEED;
+
+    EXPECT_EQ(crc, EXPECTED_CRC64_2352BYTES);
+}
+#endif
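
Reviewer note (not part of the patch): the sse2neon_vmull_p64 helper added above emulates the AArch64 PMULL (vmull_p64) instruction, i.e. a 64x64 -> 128-bit carry-less multiply over GF(2), which is what both the folding loop and the final Barrett reduction in crc64_vmull rely on. For reference, here is a minimal scalar sketch of that operation; clmul64_ref is a hypothetical helper used only for illustration and is not part of this repository:

#include <stdint.h>

/* Carry-less (polynomial) 64x64 -> 128-bit multiply, bit by bit: XOR replaces
 * addition, so no carries propagate between bit positions. */
static void clmul64_ref(uint64_t a, uint64_t b, uint64_t* lo, uint64_t* hi)
{
    uint64_t l = 0, h = 0;

    for(int i = 0; i < 64; i++)
    {
        if((b >> i) & 1)
        {
            l ^= a << i;              // low 64 bits of a * x^i
            if(i) h ^= a >> (64 - i); // bits that spill into the high half
        }
    }

    *lo = l; // lane 0 of the uint64x2_t returned by sse2neon_vmull_p64
    *hi = h; // lane 1
}

On cores that advertise the crypto extension, have_arm_crypto() routes sse2neon_vmull_p64 to the single-instruction vmull_p64 path; the vmull_p8 polyfill is only exercised on ARMv7-class NEON, where the file's comments note it still beats the scalar slicing-by-8 fallback in crc64.c.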