From d433af7987ba38142292b0f58d02141aa49393af Mon Sep 17 00:00:00 2001
From: Natalia Portillo
Date: Wed, 29 Sep 2021 02:49:40 +0100
Subject: [PATCH] Add ARM special instructions implementation for CRC32.

---
 CMakeLists.txt   |  2 +-
 crc32.c          |  9 +++++
 crc32.h          | 10 ++++++
 crc32_arm_simd.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++
 simd.c           |  4 +++
 simd.h           |  3 ++
 6 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 crc32_arm_simd.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f43e11f..b788801 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,4 +24,4 @@ if("${CMAKE_BUILD_TYPE}" MATCHES "Release")
     endif()
 endif()
 
-add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher32.h fletcher32.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c)
+add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher32.h fletcher32.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c)
diff --git a/crc32.c b/crc32.c
index 6978dc3..4ac930b 100644
--- a/crc32.c
+++ b/crc32.c
@@ -48,6 +48,15 @@ AARU_EXPORT int AARU_CALL crc32_update(crc32_ctx* ctx, const uint8_t* data, uint
     }
 #endif
 
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+    if(have_arm_crc32())
+    {
+        ctx->crc = armv8_crc32_little(ctx->crc, data, len);
+
+        return 0;
+    }
+#endif
+
     // Unroll according to Intel slicing by uint8_t
     // http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
     // http://sourceforge.net/projects/slicing-by-8/
diff --git a/crc32.h b/crc32.h
index 0804b7e..22bdcb9 100644
--- a/crc32.h
+++ b/crc32.h
@@ -276,4 +276,14 @@ AARU_EXPORT void AARU_CALL     crc32_free(crc32_ctx* ctx);
 #endif
 
 CLMUL uint32_t crc32_clmul(const uint8_t* src, long len, uint32_t initial_crc);
+#endif
+
+#if defined(__aarch64__)
+#define TARGET_ARMV8_WITH_CRC __attribute__((target("+crc")))
+#else // !defined(__aarch64__)
+#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
+#endif // defined(__aarch64__)
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+TARGET_ARMV8_WITH_CRC uint32_t armv8_crc32_little(uint32_t crc, const unsigned char* buf, uint32_t len);
 #endif
\ No newline at end of file
diff --git a/crc32_arm_simd.c b/crc32_arm_simd.c
new file mode 100644
index 0000000..8671281
--- /dev/null
+++ b/crc32_arm_simd.c
@@ -0,0 +1,89 @@
+//
+// Created by claunia on 29/9/21.
+//
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+
+#include <arm_acle.h>
+
+#include "library.h"
+#include "crc32.h"
+
+/**
+ * @brief Updates a CRC32 with the ARMv8 CRC32 instructions (ACLE __crc32b/__crc32w/__crc32d intrinsics).
+ *
+ * The incoming CRC is bit-inverted before processing and the result is bit-inverted again on return.
+ *
+ * @param crc Previous CRC value.
+ * @param buf Data to process.
+ * @param len Number of bytes to process from buf.
+ * @return The updated CRC value.
+ */
+TARGET_ARMV8_WITH_CRC uint32_t armv8_crc32_little(uint32_t crc, const unsigned char* buf, uint32_t len)
+{
+    uint32_t c = (uint32_t)~crc;
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    // Byte-wise until buf is 8-byte aligned
+    while(len && ((uintptr_t)buf & 7))
+    {
+        c = __crc32b(c, *buf++);
+        --len;
+    }
+    const uint64_t* buf8 = (const uint64_t*)buf;
+    // 64 bytes per iteration, as eight 64-bit words
+    while(len >= 64)
+    {
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+        c = __crc32d(c, *buf8++);
+        len -= 64;
+    }
+    while(len >= 8)
+    {
+        c = __crc32d(c, *buf8++);
+        len -= 8;
+    }
+
+    buf = (const unsigned char*)buf8;
+#else // AARCH64
+    // Byte-wise until buf is 4-byte aligned
+    while(len && ((uintptr_t)buf & 3))
+    {
+        c = __crc32b(c, *buf++);
+        --len;
+    }
+    const uint32_t* buf4 = (const uint32_t*)buf;
+    // 32 bytes per iteration, as eight 32-bit words
+    while(len >= 32)
+    {
+        c = __crc32w(c, *buf4++);
+        c = __crc32w(c, *buf4++);
+        c = __crc32w(c, *buf4++);
+        c = __crc32w(c, *buf4++);
+        c = __crc32w(c, *buf4++);
+        c = __crc32w(c, *buf4++);
+        c = __crc32w(c, *buf4++);
+        c = __crc32w(c, *buf4++);
+        len -= 32;
+    }
+    while(len >= 4)
+    {
+        // Only 4 bytes are consumed per step, so this must be the 32-bit word form, not __crc32d
+        c = __crc32w(c, *buf4++);
+        len -= 4;
+    }
+
+    buf = (const unsigned char*)buf4;
+#endif
+
+    // Remaining tail bytes
+    while(len--) { c = __crc32b(c, *buf++); }
+    return ~c;
+}
+#endif
diff --git a/simd.c b/simd.c
index 7fa1efd..a369d9b 100644
--- a/simd.c
+++ b/simd.c
@@ -99,8 +99,12 @@ int have_neon(void)
 {
     return 1; // ARMv8-A made it mandatory
 }
+
+int have_arm_crc32(void) { return getauxval(AT_HWCAP) & HWCAP_CRC32; }
 #endif
 
 #if defined(__arm__) || defined(_M_ARM)
 int have_neon(void) { return getauxval(AT_HWCAP) & HWCAP_NEON; }
+
+int have_arm_crc32(void) { return getauxval(AT_HWCAP2) & HWCAP2_CRC32; }
 #endif
\ No newline at end of file
diff --git a/simd.h b/simd.h
index cb0a88a..a551edf 100644
--- a/simd.h
+++ b/simd.h
@@ -10,12 +10,15 @@ int have_avx2(void);
 
 #if defined(__arm__) || defined(_M_ARM)
 #define HWCAP_NEON (1 << 12)
+#define HWCAP2_CRC32 (1 << 4)
 #endif
 
 #if defined(__aarch64__) || defined(_M_ARM64)
 #define HWCAP_NEON (1 << 1)
+#define HWCAP_CRC32 (1 << 7)
 #endif
 
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
 int have_neon(void);
+int have_arm_crc32(void);
 #endif
\ No newline at end of file