diff --git a/CMakeLists.txt b/CMakeLists.txt index d266b7a..55557ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,4 +22,4 @@ if("${CMAKE_BUILD_TYPE}" MATCHES "Release") endif() endif() -add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher32.h fletcher32.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c) +add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher32.h fletcher32.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c) diff --git a/adler32.c b/adler32.c index 19b314c..1ad47b5 100644 --- a/adler32.c +++ b/adler32.c @@ -51,6 +51,13 @@ AARU_EXPORT int AARU_CALL adler32_update(adler32_ctx* ctx, const uint8_t* data, #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \ defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86) + if(have_avx2()) + { + adler32_avx2(&ctx->sum1, &ctx->sum2, data, len); + + return 0; + } + if(have_ssse3()) { adler32_ssse3(&ctx->sum1, &ctx->sum2, data, len); diff --git a/adler32.h b/adler32.h index 92bda16..ad8f84f 100644 --- a/adler32.h +++ b/adler32.h @@ -38,6 +38,7 @@ AARU_EXPORT void AARU_CALL adler32_free(adler32_ctx* ctx); defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86) void adler32_ssse3(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, size_t len); +void adler32_avx2(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, size_t len); #endif diff --git a/adler32_avx2.c b/adler32_avx2.c new file mode 100644 index 0000000..7075e52 --- /dev/null +++ b/adler32_avx2.c @@ -0,0 +1,150 @@ +// +// Created by claunia on 28/9/21. +// + +#include +#include + +#include "library.h" +#include "adler32.h" +#include "simd.h" + +AVX2 void adler32_avx2(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, size_t len) +{ + uint32_t s1 = *sum1; + uint32_t s2 = *sum2; + + /* + * Process the data in blocks. + */ + const unsigned BLOCK_SIZE = 1 << 5; + size_t blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; + + while(blocks) + { + unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ + + if(n > blocks) n = (unsigned)blocks; + blocks -= n; + + const __m256i tap = _mm256_set_epi8(1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32); + const __m256i zero = _mm256_setzero_si256(); + const __m256i ones = _mm256_set1_epi16(1); + + /* + * Process n blocks of data. At most NMAX data bytes can be + * processed before s2 must be reduced modulo BASE. + */ + __m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n)); + __m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2); + __m256i v_s1 = _mm256_setzero_si256(); + do { + /* + * Load 32 input bytes. + */ + const __m256i bytes = _mm256_lddqu_si256((__m256i*)(buf)); + + /* + * Add previous block byte sum to v_ps. + */ + v_ps = _mm256_add_epi32(v_ps, v_s1); + /* + * Horizontally add the bytes for s1, multiply-adds the + * bytes by [ 32, 31, 30, ... ] for s2. + */ + v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes, zero)); + const __m256i mad = _mm256_maddubs_epi16(bytes, tap); + v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones)); + + buf += BLOCK_SIZE; + } while(--n); + + __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1)); + __m128i hi = _mm_unpackhi_epi64(sum, sum); + sum = _mm_add_epi32(hi, sum); + hi = _mm_shuffle_epi32(sum, 177); + sum = _mm_add_epi32(sum, hi); + s1 += _mm_cvtsi128_si32(sum); + + v_s2 = _mm256_add_epi32(v_s2, _mm256_slli_epi32(v_ps, 5)); + sum = _mm_add_epi32(_mm256_castsi256_si128(v_s2), _mm256_extracti128_si256(v_s2, 1)); + hi = _mm_unpackhi_epi64(sum, sum); + sum = _mm_add_epi32(hi, sum); + hi = _mm_shuffle_epi32(sum, 177); + sum = _mm_add_epi32(sum, hi); + s2 = _mm_cvtsi128_si32(sum); + + /* + * Reduce. + */ + s1 %= ADLER_MODULE; + s2 %= ADLER_MODULE; + } + + /* + * Handle leftover data. + */ + if(len) + { + if(len >= 16) + { + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + len -= 16; + } + while(len--) { s2 += (s1 += *buf++); } + if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE; + s2 %= ADLER_MODULE; + } + /* + * Return the recombined sums. + */ + *sum1 = s1 & 0xFFFF; + *sum2 = s2 & 0xFFFF; +} diff --git a/simd.c b/simd.c index ae42f48..91fd687 100644 --- a/simd.c +++ b/simd.c @@ -35,6 +35,29 @@ static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigne #endif } +static void cpuidex(int info, int count, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) +{ +#ifdef _MSC_VER + unsigned int registers[4]; + __cpuidex(registers, info, count); + *eax = registers[0]; + *ebx = registers[1]; + *ecx = registers[2]; + *edx = registers[3]; +#else + /* GCC, clang */ + unsigned int _eax; + unsigned int _ebx; + unsigned int _ecx; + unsigned int _edx; + __cpuid_count(info, count, _eax, _ebx, _ecx, _edx); + *eax = _eax; + *ebx = _ebx; + *ecx = _ecx; + *edx = _edx; +#endif +} + int have_clmul(void) { unsigned eax, ebx, ecx, edx; @@ -56,4 +79,11 @@ int have_ssse3(void) return ecx & 0x200; } +int have_avx2(void) +{ + unsigned eax, ebx, ecx, edx; + cpuidex(7 /* extended feature bits */, 0, &eax, &ebx, &ecx, &edx); + + return ebx & 0x20; +} #endif \ No newline at end of file diff --git a/simd.h b/simd.h index 301ec6a..a302eb7 100644 --- a/simd.h +++ b/simd.h @@ -1,6 +1,9 @@ #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \ defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86) +#define AVX2 __attribute__((target("avx2"))) + int have_clmul(void); int have_ssse3(void); +int have_avx2(void); #endif