diff --git a/CMakeLists.txt b/CMakeLists.txt
index 47a6c56..af7b257 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,6 +71,6 @@ if("${CMAKE_BUILD_TYPE}" MATCHES "Release")
     endif()
 endif()
 
-add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher16_neon.c fletcher16_ssse3.c fletcher32.h fletcher32.c fletcher32_avx2.c fletcher32_neon.c fletcher32_ssse3.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c crc32_vmull.c crc32_simd.h arm_vmull.c arm_vmull.h crc64_vmull.c library.c)
+add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher16_avx2.c fletcher16_neon.c fletcher16_ssse3.c fletcher32.h fletcher32.c fletcher32_avx2.c fletcher32_neon.c fletcher32_ssse3.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c crc32_vmull.c crc32_simd.h arm_vmull.c arm_vmull.h crc64_vmull.c library.c)
 
 add_subdirectory(tests)
diff --git a/fletcher16.c b/fletcher16.c
index 5b465ea..e43befc 100644
--- a/fletcher16.c
+++ b/fletcher16.c
@@ -77,6 +77,13 @@ AARU_EXPORT int AARU_CALL fletcher16_update(fletcher16_ctx *ctx, const uint8_t *
 #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
     defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
 
+    if(have_avx2())
+    {
+        fletcher16_avx2(&ctx->sum1, &ctx->sum2, data, len);
+
+        return 0;
+    }
+
     if(have_ssse3())
     {
         fletcher16_ssse3(&ctx->sum1, &ctx->sum2, data, len);
diff --git a/fletcher16.h b/fletcher16.h
index 1a7e321..59caa43 100644
--- a/fletcher16.h
+++ b/fletcher16.h
@@ -37,6 +37,7 @@ AARU_EXPORT void AARU_CALL fletcher16_free(fletcher16_ctx *ctx);
 
 #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
     defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
+AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL fletcher16_avx2(uint8_t* sum1, uint8_t* sum2, const uint8_t* data, long len);
AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL fletcher16_ssse3(uint8_t* sum1, uint8_t* sum2, const uint8_t* data, long len);
 #endif
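The AVX2 path is checked before SSSE3 in fletcher16_update() above, so the widest vector unit the CPU reports at runtime wins. Whichever kernel is dispatched has to reproduce the plain byte-at-a-time recurrence. For reference, a minimal scalar sketch of that recurrence, assuming FLETCHER16_MODULE is 255 (the conventional Fletcher-16 modulus; the constant's value is not shown in this patch):

    #include <stdint.h>

    #define FLETCHER16_MODULE 255 /* assumed value: conventional Fletcher-16 modulus */

    /* Scalar reference: any SIMD kernel must yield these exact sums for
     * every buffer content, alignment and length. */
    static void fletcher16_scalar(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
    {
        uint32_t s1 = *sum1;
        uint32_t s2 = *sum2;

        while(len--)
        {
            s1 = (s1 + *data++) % FLETCHER16_MODULE;
            s2 = (s2 + s1) % FLETCHER16_MODULE;
        }

        *sum1 = (uint8_t)s1;
        *sum2 = (uint8_t)s2;
    }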
diff --git a/fletcher16_avx2.c b/fletcher16_avx2.c
new file mode 100644
index 0000000..8a66e48
--- /dev/null
+++ b/fletcher16_avx2.c
@@ -0,0 +1,189 @@
+/*
+ * This file is part of the Aaru Data Preservation Suite.
+ * Copyright (c) 2019-2023 Natalia Portillo.
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) Jean-loup Gailly
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ *
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
+    defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
+
+#include <immintrin.h>
+#include <stdint.h>
+
+#include "library.h"
+#include "fletcher16.h"
+#include "simd.h"
+
+/**
+ * @brief Calculate Fletcher-16 checksum for a given data using AVX2 instructions.
+ *
+ * This function calculates the Fletcher-16 checksum for a block of data using AVX2 vector instructions.
+ *
+ * @param sum1 Pointer to the variable where the first 8-bit checksum value is stored.
+ * @param sum2 Pointer to the variable where the second 8-bit checksum value is stored.
+ * @param data Pointer to the data buffer.
+ * @param len Length of the data buffer in bytes.
+ */
+AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
+{
+    uint32_t s1 = *sum1;
+    uint32_t s2 = *sum2;
+
+    /*
+     * Process the data in 32-byte blocks.
+     */
+    const unsigned BLOCK_SIZE = 1 << 5;
+    long           blocks     = len / BLOCK_SIZE;
+    len -= blocks * BLOCK_SIZE;
+
+    while(blocks)
+    {
+        unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
+
+        if(n > blocks) n = (unsigned)blocks;
+        blocks -= n;
+
+        const __m256i tap = _mm256_set_epi8(1,
+                                            2,
+                                            3,
+                                            4,
+                                            5,
+                                            6,
+                                            7,
+                                            8,
+                                            9,
+                                            10,
+                                            11,
+                                            12,
+                                            13,
+                                            14,
+                                            15,
+                                            16,
+                                            17,
+                                            18,
+                                            19,
+                                            20,
+                                            21,
+                                            22,
+                                            23,
+                                            24,
+                                            25,
+                                            26,
+                                            27,
+                                            28,
+                                            29,
+                                            30,
+                                            31,
+                                            32);
+        const __m256i zero = _mm256_setzero_si256();
+        const __m256i ones = _mm256_set1_epi16(1);
+
+        /*
+         * Process n blocks of data. At most NMAX data bytes can be
+         * processed before s2 must be reduced modulo FLETCHER16_MODULE.
+         */
+        __m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n));
+        __m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2);
+        __m256i v_s1 = _mm256_setzero_si256();
+
+        do
+        {
+            /*
+             * Load 32 input bytes.
+             */
+            const __m256i bytes = _mm256_lddqu_si256((__m256i *)(data));
+
+            /*
+             * Add previous block byte sum to v_ps.
+             */
+            v_ps = _mm256_add_epi32(v_ps, v_s1);
+
+            /*
+             * Horizontally add the bytes for s1; multiply-add the
+             * bytes by [ 32, 31, 30, ... ] for s2.
+             */
+            v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes, zero));
+            const __m256i mad = _mm256_maddubs_epi16(bytes, tap);
+            v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones));
+
+            data += BLOCK_SIZE;
+        } while(--n);
+
+        /*
+         * Horizontally reduce the eight 32-bit lanes into s1 and s2.
+         */
+        __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1));
+        __m128i hi  = _mm_unpackhi_epi64(sum, sum);
+        sum         = _mm_add_epi32(hi, sum);
+        hi          = _mm_shuffle_epi32(sum, 177);
+        sum         = _mm_add_epi32(sum, hi);
+        s1 += _mm_cvtsi128_si32(sum);
+
+        v_s2 = _mm256_add_epi32(v_s2, _mm256_slli_epi32(v_ps, 5));
+        sum  = _mm_add_epi32(_mm256_castsi256_si128(v_s2), _mm256_extracti128_si256(v_s2, 1));
+        hi   = _mm_unpackhi_epi64(sum, sum);
+        sum  = _mm_add_epi32(hi, sum);
+        hi   = _mm_shuffle_epi32(sum, 177);
+        sum  = _mm_add_epi32(sum, hi);
+        s2   = _mm_cvtsi128_si32(sum);
+
+        /*
+         * Reduce.
+         */
+        s1 %= FLETCHER16_MODULE;
+        s2 %= FLETCHER16_MODULE;
+    }
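The block loop above leans on a closed form for advancing both sums over 32 bytes at once: _mm256_sad_epu8 yields the plain byte sum for s1, the multiply-add against the descending tap vector [32, 31, ..., 1] yields the weighted sum for s2, and v_ps carries the 32*s1 term that the final _mm256_slli_epi32(v_ps, 5) folds in. A scalar sketch of that identity (illustrative only, not part of the patch):

    #include <stdint.h>

    /* One 32-byte step of the identity the AVX2 loop exploits. Feeding
     * bytes d[0..31] through the byte-at-a-time recurrence from (s1, s2)
     * gives:
     *   s1' = s1 + d[0] + d[1] + ... + d[31]
     *   s2' = s2 + 32*s1 + 32*d[0] + 31*d[1] + ... + 1*d[31]
     * so one SAD, one tap multiply-add and one 32*s1 carry per block
     * suffice; the modulo reduction is deferred until NMAX bytes. */
    static void fletcher16_block_step(const uint8_t d[32], uint32_t *s1, uint32_t *s2)
    {
        uint32_t sum = 0, weighted = 0;

        for(int i = 0; i < 32; i++)
        {
            sum      += d[i];            /* the _mm256_sad_epu8 term        */
            weighted += (32 - i) * d[i]; /* the tap vector [32, 31, ..., 1] */
        }

        *s2 += 32 * *s1 + weighted; /* 32*s1 is the v_ps / slli-by-5 term */
        *s1 += sum;
    }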
+
+    /*
+     * Handle leftover data.
+     */
+    if(len)
+    {
+        if(len >= 16)
+        {
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            len -= 16;
+        }
+
+        while(len--) { s2 += (s1 += *data++); }
+
+        s1 %= FLETCHER16_MODULE;
+        s2 %= FLETCHER16_MODULE;
+    }
+
+    /*
+     * Return the recombined sums.
+     */
+    *sum1 = s1 & 0xFF;
+    *sum2 = s2 & 0xFF;
+}
+
+#endif
diff --git a/tests/fletcher16.cpp b/tests/fletcher16.cpp
index 2849dac..0a6a2dd 100644
--- a/tests/fletcher16.cpp
+++ b/tests/fletcher16.cpp
@@ -329,6 +329,24 @@ TEST_F(fletcher16Fixture, fletcher16_neon_2352bytes)
 #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
     defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
 
+TEST_F(fletcher16Fixture, fletcher16_avx2)
+{
+    if(!have_avx2()) return;
+
+    uint8_t  sum1;
+    uint8_t  sum2;
+    uint32_t fletcher16;
+
+    sum1 = 0xFF;
+    sum2 = 0xFF;
+
+    fletcher16_avx2(&sum1, &sum2, buffer, 1048576);
+
+    fletcher16 = (sum2 << 8) | sum1;
+
+    EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16);
+}
+
 TEST_F(fletcher16Fixture, fletcher16_ssse3)
 {
     if(!have_ssse3()) return;
@@ -347,12 +365,30 @@ TEST_F(fletcher16Fixture, fletcher16_ssse3)
     EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16);
 }
 
+TEST_F(fletcher16Fixture, fletcher16_avx2_misaligned)
+{
+    if(!have_avx2()) return;
+
+    uint8_t  sum1;
+    uint8_t  sum2;
+    uint32_t fletcher16;
+
+    sum1 = 0xFF;
+    sum2 = 0xFF;
+
+    fletcher16_avx2(&sum1, &sum2, buffer_misaligned + 1, 1048576);
+
+    fletcher16 = (sum2 << 8) | sum1;
+
+    EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16);
+}
+
 TEST_F(fletcher16Fixture, fletcher16_ssse3_misaligned)
 {
     if(!have_ssse3()) return;
 
-    uint8_t sum1;
-    uint8_t sum2;
+    uint8_t  sum1;
+    uint8_t  sum2;
     uint32_t fletcher16;
 
     sum1 = 0xFF;
@@ -365,6 +401,24 @@ TEST_F(fletcher16Fixture, fletcher16_ssse3_misaligned)
     EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16);
 }
 
+TEST_F(fletcher16Fixture, fletcher16_avx2_1byte)
+{
+    if(!have_avx2()) return;
+
+    uint8_t  sum1;
+    uint8_t  sum2;
+    uint32_t fletcher16;
+
+    sum1 = 0xFF;
+    sum2 = 0xFF;
+
+    fletcher16_avx2(&sum1, &sum2, buffer, 1);
+
+    fletcher16 = (sum2 << 8) | sum1;
+
+    EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16_1BYTE);
+}
+
 TEST_F(fletcher16Fixture, fletcher16_ssse3_1byte)
 {
     if(!have_ssse3()) return;
@@ -383,6 +437,24 @@ TEST_F(fletcher16Fixture, fletcher16_ssse3_1byte)
     EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16_1BYTE);
 }
 
+TEST_F(fletcher16Fixture, fletcher16_avx2_7bytes)
+{
+    if(!have_avx2()) return;
+
+    uint8_t  sum1;
+    uint8_t  sum2;
+    uint32_t fletcher16;
+
+    sum1 = 0xFF;
+    sum2 = 0xFF;
+
+    fletcher16_avx2(&sum1, &sum2, buffer, 7);
+
+    fletcher16 = (sum2 << 8) | sum1;
+
+    EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16_7BYTES);
+}
+
 TEST_F(fletcher16Fixture, fletcher16_ssse3_7bytes)
 {
     if(!have_ssse3()) return;
@@ -401,12 +473,30 @@ TEST_F(fletcher16Fixture, fletcher16_ssse3_7bytes)
     EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16_7BYTES);
 }
 
+TEST_F(fletcher16Fixture, fletcher16_avx2_15bytes)
+{
+    if(!have_avx2()) return;
+
+    uint8_t  sum1;
+    uint8_t  sum2;
+    uint32_t fletcher16;
+
+    sum1 = 0xFF;
+    sum2 = 0xFF;
+
+    fletcher16_avx2(&sum1, &sum2, buffer, 15);
+
+    fletcher16 = (sum2 << 8) | sum1;
+
+    EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16_15BYTES);
+}
+
 TEST_F(fletcher16Fixture, fletcher16_ssse3_15bytes)
 {
     if(!have_ssse3()) return;
 
-    uint8_t sum1;
-    uint8_t sum2;
+    uint8_t  sum1;
+    uint8_t  sum2;
     uint32_t fletcher16;
 
     sum1 = 0xFF;
@@ -419,12 +509,30 @@ TEST_F(fletcher16Fixture, fletcher16_ssse3_15bytes)
     EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16_15BYTES);
 }
 
+TEST_F(fletcher16Fixture, fletcher16_avx2_31bytes)
+{
+    if(!have_avx2()) return;
+
+    uint8_t  sum1;
+    uint8_t  sum2;
+    uint32_t fletcher16;
+
+    sum1 = 0xFF;
+    sum2 = 0xFF;
+
+    fletcher16_avx2(&sum1, &sum2, buffer, 31);
+
+    fletcher16 = (sum2 << 8) | sum1;
+
+    EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16_31BYTES);
+}
+
 TEST_F(fletcher16Fixture, fletcher16_ssse3_31bytes)
 {
     if(!have_ssse3()) return;
 
-    uint8_t sum1;
-    uint8_t sum2;
+    uint8_t  sum1;
+    uint8_t  sum2;
     uint32_t fletcher16;
 
     sum1 = 0xFF;
@@ -437,12 +545,30 @@ TEST_F(fletcher16Fixture, fletcher16_ssse3_31bytes)
     EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16_31BYTES);
 }
 
+TEST_F(fletcher16Fixture, fletcher16_avx2_63bytes)
+{
+    if(!have_avx2()) return;
+
+    uint8_t  sum1;
+    uint8_t  sum2;
+    uint32_t fletcher16;
+
+    sum1 = 0xFF;
+    sum2 = 0xFF;
+
+    fletcher16_avx2(&sum1, &sum2, buffer, 63);
+
+    fletcher16 = (sum2 << 8) | sum1;
+
+    EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16_63BYTES);
+}
+
 TEST_F(fletcher16Fixture, fletcher16_ssse3_63bytes)
 {
     if(!have_ssse3()) return;
 
-    uint8_t sum1;
-    uint8_t sum2;
+    uint8_t  sum1;
+    uint8_t  sum2;
     uint32_t fletcher16;
 
     sum1 = 0xFF;
@@ -455,12 +581,30 @@ TEST_F(fletcher16Fixture, fletcher16_ssse3_63bytes)
     EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16_63BYTES);
 }
 
+TEST_F(fletcher16Fixture, fletcher16_avx2_2352bytes)
+{
+    if(!have_avx2()) return;
+
+    uint8_t  sum1;
+    uint8_t  sum2;
+    uint32_t fletcher16;
+
+    sum1 = 0xFF;
+    sum2 = 0xFF;
+
+    fletcher16_avx2(&sum1, &sum2, buffer, 2352);
+
+    fletcher16 = (sum2 << 8) | sum1;
+
+    EXPECT_EQ(fletcher16, EXPECTED_FLETCHER16_2352BYTES);
+}
+
 TEST_F(fletcher16Fixture, fletcher16_ssse3_2352bytes)
 {
     if(!have_ssse3()) return;
 
-    uint8_t sum1;
-    uint8_t sum2;
+    uint8_t  sum1;
+    uint8_t  sum2;
     uint32_t fletcher16;
 
     sum1 = 0xFF;
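All of the new tests follow the fixture's existing convention: both running sums are seeded with 0xFF and the result is recombined as (sum2 << 8) | sum1, which exercises every remainder path of the kernel (1, 7, 15, 31, 63 bytes), the block path (2352 bytes and 1 MiB), and an unaligned load. As a worked example of what the expected constants encode (illustrative arithmetic only; the EXPECTED_* values and the buffer come from the test fixture), a single input byte b starting from those seeds reduces to the same residue in both sums:

    #include <stdint.h>

    /* With seeds s1 = s2 = 0xFF = 255 and one input byte b:
     *   s1 = (255 + b) % 255       = b % 255
     *   s2 = (255 + 255 + b) % 255 = b % 255
     * so EXPECTED_FLETCHER16_1BYTE should equal ((b % 255) << 8) | (b % 255),
     * where b is the first byte of the fixture's buffer. */
    static uint32_t fletcher16_one_byte_expected(uint8_t b)
    {
        uint32_t s1 = (0xFFu + b) % 255;
        uint32_t s2 = (0xFFu + 0xFFu + b) % 255;

        return (s2 << 8) | s1;
    }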