diff --git a/adler32.c b/adler32.c
index f83f7d9..6cb049d 100644
--- a/adler32.c
+++ b/adler32.c
@@ -78,7 +78,7 @@ AARU_EXPORT int AARU_CALL adler32_update(adler32_ctx* ctx, const uint8_t* data,
     return 0;
 }
 
-AARU_EXPORT void AARU_CALL adler32_slicing(uint16_t* sum1, uint16_t* sum2, const unsigned char* data, long len)
+AARU_EXPORT void AARU_CALL adler32_slicing(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len)
 {
     uint32_t s1 = *sum1;
     uint32_t s2 = *sum2;
diff --git a/adler32.h b/adler32.h
index 4751be7..4d5bb1a 100644
--- a/adler32.h
+++ b/adler32.h
@@ -33,19 +33,19 @@ AARU_EXPORT adler32_ctx* AARU_CALL adler32_init();
 AARU_EXPORT int AARU_CALL adler32_update(adler32_ctx* ctx, const uint8_t* data, uint32_t len);
 AARU_EXPORT int AARU_CALL adler32_final(adler32_ctx* ctx, uint32_t* checksum);
 AARU_EXPORT void AARU_CALL adler32_free(adler32_ctx* ctx);
-AARU_EXPORT void AARU_CALL adler32_slicing(uint16_t* sum1, uint16_t* sum2, const unsigned char* data, long len);
+AARU_EXPORT void AARU_CALL adler32_slicing(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len);
 
 #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
     defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
-AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, long len);
-AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, long len);
+AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len);
+AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len);
 #endif
 
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
-AARU_EXPORT void AARU_CALL adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, uint32_t len);
+AARU_EXPORT void AARU_CALL adler32_neon(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, uint32_t len);
 #endif
diff --git a/adler32_avx2.c b/adler32_avx2.c
index e0ba068..0bc7fe6 100644
--- a/adler32_avx2.c
+++ b/adler32_avx2.c
@@ -12,7 +12,7 @@
 #include "adler32.h"
 #include "simd.h"
 
-AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, long len)
+AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len)
 {
     uint32_t s1 = *sum1;
     uint32_t s2 = *sum2;
@@ -77,7 +77,7 @@ AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, con
             /*
              * Load 32 input bytes.
              */
-            const __m256i bytes = _mm256_lddqu_si256((__m256i*)(buf));
+            const __m256i bytes = _mm256_lddqu_si256((__m256i*)(data));
 
             /*
              * Add previous block byte sum to v_ps.
@@ -91,7 +91,7 @@ AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, con
             const __m256i mad = _mm256_maddubs_epi16(bytes, tap);
             v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones));
 
-            buf += BLOCK_SIZE;
+            data += BLOCK_SIZE;
         } while(--n);
 
         __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1));
@@ -123,25 +123,25 @@ AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, con
     {
         if(len >= 16)
         {
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
             len -= 16;
         }
-        while(len--) { s2 += (s1 += *buf++); }
+        while(len--) { s2 += (s1 += *data++); }
         if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
         s2 %= ADLER_MODULE;
     }
diff --git a/adler32_neon.c b/adler32_neon.c
index c401db2..5fabd75 100644
--- a/adler32_neon.c
+++ b/adler32_neon.c
@@ -10,7 +10,7 @@
 #include "adler32.h"
 #include "simd.h"
 
-TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, uint32_t len)
+TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, uint32_t len)
 {
     /*
      * Split Adler-32 into component sums.
@@ -20,11 +20,11 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
     /*
      * Serially compute s1 & s2, until the data is 16-byte aligned.
      */
-    if((uintptr_t)buf & 15)
+    if((uintptr_t)data & 15)
     {
-        while((uintptr_t)buf & 15)
+        while((uintptr_t)data & 15)
         {
-            s2 += (s1 += *buf++);
+            s2 += (s1 += *data++);
             --len;
         }
         if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
@@ -60,8 +60,8 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
             /*
              * Load 32 input bytes.
              */
-            const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf));
-            const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16));
+            const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(data));
+            const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(data + 16));
             /*
              * Add previous block byte sum to v_s2.
              */
@@ -77,7 +77,7 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
             v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
             v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2));
             v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
-            buf += BLOCK_SIZE;
+            data += BLOCK_SIZE;
         } while(--n);
         v_s2 = vshlq_n_u32(v_s2, 5);
         /*
@@ -123,25 +123,25 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
     {
         if(len >= 16)
         {
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
             len -= 16;
         }
-        while(len--) { s2 += (s1 += *buf++); }
+        while(len--) { s2 += (s1 += *data++); }
         if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
         s2 %= ADLER_MODULE;
     }
diff --git a/adler32_ssse3.c b/adler32_ssse3.c
index 8117f7d..ff9970b 100644
--- a/adler32_ssse3.c
+++ b/adler32_ssse3.c
@@ -51,7 +51,7 @@
 #include "library.h"
 #include "adler32.h"
 
-AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, long len)
+AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len)
 {
     uint32_t s1 = *sum1;
     uint32_t s2 = *sum2;
@@ -82,8 +82,8 @@ AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, c
             /*
              * Load 32 input bytes.
              */
-            const __m128i bytes1 = _mm_loadu_si128((__m128i*)(buf));
-            const __m128i bytes2 = _mm_loadu_si128((__m128i*)(buf + 16));
+            const __m128i bytes1 = _mm_loadu_si128((__m128i*)(data));
+            const __m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 16));
             /*
              * Add previous block byte sum to v_ps.
              */
@@ -98,7 +98,7 @@ AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, c
             v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
             const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
             v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
-            buf += BLOCK_SIZE;
+            data += BLOCK_SIZE;
         } while(--n);
         v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
         /*
@@ -127,25 +127,25 @@ AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, c
     {
         if(len >= 16)
         {
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
-            s2 += (s1 += *buf++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
+            s2 += (s1 += *data++);
             len -= 16;
         }
-        while(len--) { s2 += (s1 += *buf++); }
+        while(len--) { s2 += (s1 += *data++); }
         if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
         s2 %= ADLER_MODULE;
     }
diff --git a/crc32.c b/crc32.c
index 60bbd1d..d19bd8c 100644
--- a/crc32.c
+++ b/crc32.c
@@ -41,7 +41,7 @@ AARU_EXPORT int AARU_CALL crc32_update(crc32_ctx* ctx, const uint8_t* data, uint
     defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
     if(have_clmul())
     {
-        ctx->crc = ~crc32_clmul(data, (long)len, ~ctx->crc);
+        ctx->crc = ~crc32_clmul(~ctx->crc, data, (long)len);
 
         return 0;
     }
@@ -58,7 +58,7 @@ AARU_EXPORT int AARU_CALL crc32_update(crc32_ctx* ctx, const uint8_t* data, uint
 #endif
     if(have_neon())
     {
-        ctx->crc = ~crc32_vmull(data, len, ~ctx->crc);
+        ctx->crc = ~crc32_vmull(~ctx->crc, data, len);
         return 0;
     }
 #endif
@@ -67,7 +67,7 @@ AARU_EXPORT int AARU_CALL crc32_update(crc32_ctx* ctx, const uint8_t* data, uint
     return 0;
 }
 
-AARU_EXPORT void AARU_CALL crc32_slicing(uint32_t* crc, const unsigned char* data, long len)
+AARU_EXPORT void AARU_CALL crc32_slicing(uint32_t* previous_crc, const uint8_t* data, long len)
 {
     // Unroll according to Intel slicing by uint8_t
     // http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
@@ -79,7 +79,7 @@ AARU_EXPORT void AARU_CALL crc32_slicing(uint32_t* crc, const unsigned char* dat
     const size_t bytes_at_once = 8 * unroll;
     uintptr_t unaligned_length = (4 - (((uintptr_t)current_char) & 3)) & 3;
 
-    c = *crc;
+    c = *previous_crc;
 
     while((len != 0) && (unaligned_length != 0))
     {
@@ -110,7 +110,7 @@ AARU_EXPORT void AARU_CALL crc32_slicing(uint32_t* crc, const unsigned char* dat
 
     while(len-- != 0) c = (c >> 8) ^ crc32_table[0][(c & 0xFF) ^ *current_char++];
 
-    *crc = c;
+    *previous_crc = c;
 }
 
 AARU_EXPORT int AARU_CALL crc32_final(crc32_ctx* ctx, uint32_t* crc)
diff --git a/crc32.h b/crc32.h
index 1d7761f..dbf37b0 100644
--- a/crc32.h
+++ b/crc32.h
@@ -263,18 +263,18 @@ AARU_EXPORT crc32_ctx* AARU_CALL crc32_init();
 AARU_EXPORT int AARU_CALL crc32_update(crc32_ctx* ctx, const uint8_t* data, uint32_t len);
 AARU_EXPORT int AARU_CALL crc32_final(crc32_ctx* ctx, uint32_t* crc);
 AARU_EXPORT void AARU_CALL crc32_free(crc32_ctx* ctx);
-AARU_EXPORT void AARU_CALL crc32_slicing(uint32_t* crc, const unsigned char* data, long len);
+AARU_EXPORT void AARU_CALL crc32_slicing(uint32_t* previous_crc, const uint8_t* data, long len);
 
 #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
     defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
-AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, uint32_t initial_crc);
+AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const uint8_t* data, long len);
 #endif
 
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
 #if __ARM_ARCH >= 8
-AARU_EXPORT TARGET_ARMV8_WITH_CRC uint32_t AARU_CALL armv8_crc32_little(uint32_t crc,
-                                                                        const unsigned char* buf,
+AARU_EXPORT TARGET_ARMV8_WITH_CRC uint32_t AARU_CALL armv8_crc32_little(uint32_t previous_crc,
+                                                                        const uint8_t* data,
                                                                         uint32_t len);
 #endif
-AARU_EXPORT TARGET_WITH_SIMD uint32_t AARU_CALL crc32_vmull(const uint8_t* src, long len, uint32_t initial_crc);
+AARU_EXPORT TARGET_WITH_SIMD uint32_t AARU_CALL crc32_vmull(uint32_t previous_crc, const uint8_t* data, long len);
 #endif
\ No newline at end of file
diff --git a/crc32_arm_simd.c b/crc32_arm_simd.c
index 5e3db0b..023b925 100644
--- a/crc32_arm_simd.c
+++ b/crc32_arm_simd.c
@@ -9,17 +9,17 @@
 #include "library.h"
 #include "crc32.h"
 
-TARGET_ARMV8_WITH_CRC uint32_t armv8_crc32_little(uint32_t crc, const unsigned char* buf, uint32_t len)
+TARGET_ARMV8_WITH_CRC uint32_t armv8_crc32_little(uint32_t previous_crc, const uint8_t* data, uint32_t len)
 {
-    uint32_t c = (uint32_t)crc;
+    uint32_t c = (uint32_t)previous_crc;
 #if defined(__aarch64__) || defined(_M_ARM64)
-    while(len && ((uintptr_t)buf & 7))
+    while(len && ((uintptr_t)data & 7))
     {
-        c = __crc32b(c, *buf++);
+        c = __crc32b(c, *data++);
         --len;
     }
 
-    const uint64_t* buf8 = (const uint64_t*)buf;
+    const uint64_t* buf8 = (const uint64_t*)data;
     while(len >= 64)
     {
         c = __crc32d(c, *buf8++);
@@ -38,7 +38,7 @@ TARGET_ARMV8_WITH_CRC uint32_t armv8_crc32_little(uint32_t crc, const unsigned c
         len -= 8;
     }
 
-    buf = (const unsigned char*)buf8;
+    data = (const uint8_t*)buf8;
#else // AARCH64
     while(len && ((uintptr_t)buf & 3))
     {
@@ -64,10 +64,10 @@ TARGET_ARMV8_WITH_CRC uint32_t armv8_crc32_little(uint32_t crc, const unsigned c
         len -= 4;
     }
 
-    buf = (const unsigned char*)buf4;
+    buf = (const uint8_t*)buf4;
 #endif
 
-    while(len--) { c = __crc32b(c, *buf++); }
+    while(len--) { c = __crc32b(c, *data++); }
     return c;
 }
 #endif
diff --git a/crc32_clmul.c b/crc32_clmul.c
index 303f4fa..cb735cf 100644
--- a/crc32_clmul.c
+++ b/crc32_clmul.c
@@ -237,11 +237,11 @@ static void partial_fold(const size_t len,
  */
 #define XOR_INITIAL(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
 
-AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, uint32_t initial_crc)
+AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const uint8_t* data, long len)
 {
     unsigned long algn_diff;
     __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
-    __m128i xmm_initial = _mm_cvtsi32_si128(initial_crc);
+    __m128i xmm_initial = _mm_cvtsi32_si128(previous_crc);
     __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
     __m128i xmm_crc1 = _mm_setzero_si128();
     __m128i xmm_crc2 = _mm_setzero_si128();
@@ -259,34 +259,34 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
     if(len < 16)
     {
-        if(len == 0) return initial_crc;
+        if(len == 0) return previous_crc;
 
         if(len < 4)
         {
            /*
             * no idea how to do this for <4 bytes, delegate to classic impl.
            */
-            uint32_t crc = ~initial_crc;
+            uint32_t crc = ~previous_crc;
            switch(len)
            {
-                case 3: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
-                case 2: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
-                case 1: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
+                case 3: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
+                case 2: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
+                case 1: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
            }
            return ~crc;
         }
 
-        xmm_crc_part = _mm_loadu_si128((__m128i*)src);
+        xmm_crc_part = _mm_loadu_si128((__m128i*)data);
         XOR_INITIAL(xmm_crc_part);
         goto partial;
     }
 
     /* this alignment computation would be wrong for len<16 handled above */
-    algn_diff = (0 - (uintptr_t)src) & 0xF;
+    algn_diff = (0 - (uintptr_t)data) & 0xF;
     if(algn_diff)
     {
-        xmm_crc_part = _mm_loadu_si128((__m128i*)src);
+        xmm_crc_part = _mm_loadu_si128((__m128i*)data);
         XOR_INITIAL(xmm_crc_part);
-        src += algn_diff;
+        data += algn_diff;
         len -= algn_diff;
 
         partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
@@ -294,10 +294,10 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
 
     while((len -= 64) >= 0)
     {
-        xmm_t0 = _mm_load_si128((__m128i*)src);
-        xmm_t1 = _mm_load_si128((__m128i*)src + 1);
-        xmm_t2 = _mm_load_si128((__m128i*)src + 2);
-        xmm_t3 = _mm_load_si128((__m128i*)src + 3);
+        xmm_t0 = _mm_load_si128((__m128i*)data);
+        xmm_t1 = _mm_load_si128((__m128i*)data + 1);
+        xmm_t2 = _mm_load_si128((__m128i*)data + 2);
+        xmm_t3 = _mm_load_si128((__m128i*)data + 3);
 
         XOR_INITIAL(xmm_t0);
 
@@ -308,7 +308,7 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
         xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
 
-        src += 64;
+        data += 64;
     }
 
     /*
@@ -318,9 +318,9 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
     {
         len += 16;
 
-        xmm_t0 = _mm_load_si128((__m128i*)src);
-        xmm_t1 = _mm_load_si128((__m128i*)src + 1);
-        xmm_t2 = _mm_load_si128((__m128i*)src + 2);
+        xmm_t0 = _mm_load_si128((__m128i*)data);
+        xmm_t1 = _mm_load_si128((__m128i*)data + 1);
+        xmm_t2 = _mm_load_si128((__m128i*)data + 2);
 
         XOR_INITIAL(xmm_t0);
 
@@ -332,14 +332,14 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
 
         if(len == 0) goto done;
 
-        xmm_crc_part = _mm_load_si128((__m128i*)src + 3);
+        xmm_crc_part = _mm_load_si128((__m128i*)data + 3);
     }
     else if(len + 32 >= 0)
     {
         len += 32;
 
-        xmm_t0 = _mm_load_si128((__m128i*)src);
-        xmm_t1 = _mm_load_si128((__m128i*)src + 1);
+        xmm_t0 = _mm_load_si128((__m128i*)data);
+        xmm_t1 = _mm_load_si128((__m128i*)data + 1);
 
         XOR_INITIAL(xmm_t0);
 
@@ -350,13 +350,13 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
 
         if(len == 0) goto done;
 
-        xmm_crc_part = _mm_load_si128((__m128i*)src + 2);
+        xmm_crc_part = _mm_load_si128((__m128i*)data + 2);
     }
     else if(len + 48 >= 0)
     {
         len += 48;
 
-        xmm_t0 = _mm_load_si128((__m128i*)src);
+        xmm_t0 = _mm_load_si128((__m128i*)data);
 
         XOR_INITIAL(xmm_t0);
 
@@ -366,13 +366,13 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
 
         if(len == 0) goto done;
 
-        xmm_crc_part = _mm_load_si128((__m128i*)src + 1);
+        xmm_crc_part = _mm_load_si128((__m128i*)data + 1);
     }
     else
     {
         len += 64;
         if(len == 0) goto done;
-        xmm_crc_part = _mm_load_si128((__m128i*)src);
+        xmm_crc_part = _mm_load_si128((__m128i*)data);
 
         XOR_INITIAL(xmm_crc_part);
     }
diff --git a/crc32_vmull.c b/crc32_vmull.c
index 2f965fb..a69f592 100644
--- a/crc32_vmull.c
+++ b/crc32_vmull.c
@@ -212,14 +212,14 @@ TARGET_WITH_SIMD FORCE_INLINE void partial_fold(const size_t len,
     *q_crc3 = vreinterpretq_u64_u32(ps_res);
 }
 
-TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t initial_crc)
+TARGET_WITH_SIMD uint32_t crc32_vmull(uint32_t previous_crc, const uint8_t* data, long len)
 {
     unsigned long algn_diff;
     uint64x2_t q_t0;
     uint64x2_t q_t1;
     uint64x2_t q_t2;
     uint64x2_t q_t3;
-    uint64x2_t q_initial = vreinterpretq_u64_u32(vsetq_lane_u32(initial_crc, vdupq_n_u32(0), 0));
+    uint64x2_t q_initial = vreinterpretq_u64_u32(vsetq_lane_u32(previous_crc, vdupq_n_u32(0), 0));
     uint64x2_t q_crc0 = vreinterpretq_u64_u32(vsetq_lane_u32(0x9db42487, vdupq_n_u32(0), 0));
     uint64x2_t q_crc1 = vreinterpretq_u64_u32(vdupq_n_u32(0));
     uint64x2_t q_crc2 = vreinterpretq_u64_u32(vdupq_n_u32(0));
@@ -240,34 +240,34 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
     if(len < 16)
     {
-        if(len == 0) return initial_crc;
+        if(len == 0) return previous_crc;
 
         if(len < 4)
         {
            /*
             * no idea how to do this for <4 bytes, delegate to classic impl.
            */
-            uint32_t crc = ~initial_crc;
+            uint32_t crc = ~previous_crc;
            switch(len)
            {
-                case 3: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
-                case 2: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
-                case 1: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
+                case 3: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
+                case 2: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
+                case 1: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
            }
            return ~crc;
         }
 
-        q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
+        q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
         XOR_INITIAL(q_crc_part);
         goto partial;
     }
 
     /* this alignment computation would be wrong for len<16 handled above */
-    algn_diff = (0 - (uintptr_t)src) & 0xF;
+    algn_diff = (0 - (uintptr_t)data) & 0xF;
     if(algn_diff)
     {
-        q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
+        q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
         XOR_INITIAL(q_crc_part);
-        src += algn_diff;
+        data += algn_diff;
         len -= algn_diff;
 
         partial_fold(algn_diff, &q_crc0, &q_crc1, &q_crc2, &q_crc3, &q_crc_part);
@@ -275,10 +275,10 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
 
     while((len -= 64) >= 0)
     {
-        q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
-        q_t1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 4));
-        q_t2 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 8));
-        q_t3 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 12));
+        q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
+        q_t1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 4));
+        q_t2 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 8));
+        q_t3 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 12));
 
         XOR_INITIAL(q_t0);
 
@@ -289,7 +289,7 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
         q_crc2 = vreinterpretq_u64_u32(veorq_u32(vreinterpretq_u32_u64(q_crc2), vreinterpretq_u32_u64(q_t2)));
         q_crc3 = vreinterpretq_u64_u32(veorq_u32(vreinterpretq_u32_u64(q_crc3), vreinterpretq_u32_u64(q_t3)));
 
-        src += 64;
+        data += 64;
     }
 
     /*
@@ -299,9 +299,9 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
     {
         len += 16;
 
-        q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
-        q_t1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 4));
-        q_t2 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 8));
+        q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
+        q_t1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 4));
+        q_t2 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 8));
 
         XOR_INITIAL(q_t0);
 
@@ -313,14 +313,14 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
 
         if(len == 0) goto done;
 
-        q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 12));
+        q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 12));
     }
     else if(len + 32 >= 0)
     {
         len += 32;
 
-        q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
-        q_t1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 4));
+        q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
+        q_t1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 4));
 
         XOR_INITIAL(q_t0);
 
@@ -331,13 +331,13 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
 
         if(len == 0) goto done;
 
-        q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 8));
+        q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 8));
     }
     else if(len + 48 >= 0)
     {
         len += 48;
 
-        q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
+        q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
 
         XOR_INITIAL(q_t0);
 
@@ -347,13 +347,13 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
 
         if(len == 0) goto done;
 
-        q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 4));
+        q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 4));
     }
     else
     {
         len += 64;
         if(len == 0) goto done;
-        q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
+        q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
 
         XOR_INITIAL(q_crc_part);
     }
diff --git a/crc64.c b/crc64.c
index aa0e15d..e989b31 100644
--- a/crc64.c
+++ b/crc64.c
@@ -64,9 +64,9 @@ AARU_EXPORT int AARU_CALL crc64_update(crc64_ctx* ctx, const uint8_t* data, uint
     return 0;
 }
 
-AARU_EXPORT void AARU_CALL crc64_slicing(uint64_t* crc, const uint8_t* data, uint32_t len)
+AARU_EXPORT void AARU_CALL crc64_slicing(uint64_t* previous_crc, const uint8_t* data, uint32_t len)
 {
-    uint64_t c = *crc;
+    uint64_t c = *previous_crc;
 
     if(len > 4)
     {
@@ -93,7 +93,7 @@ AARU_EXPORT void AARU_CALL crc64_slicing(uint64_t* crc, const uint8_t* data, uin
 
     while(len-- != 0) c = crc64_table[0][*data++ ^ ((c)&0xFF)] ^ ((c) >> 8);
 
-    *crc = c;
+    *previous_crc = c;
 }
 
 AARU_EXPORT int AARU_CALL crc64_final(crc64_ctx* ctx, uint64_t* crc)
diff --git a/crc64.h b/crc64.h
index d122076..4b4248a 100644
--- a/crc64.h
+++ b/crc64.h
@@ -238,7 +238,7 @@ AARU_EXPORT crc64_ctx* AARU_CALL crc64_init();
 AARU_EXPORT int AARU_CALL crc64_update(crc64_ctx* ctx, const uint8_t* data, uint32_t len);
 AARU_EXPORT int AARU_CALL crc64_final(crc64_ctx* ctx, uint64_t* crc);
 AARU_EXPORT void AARU_CALL crc64_free(crc64_ctx* ctx);
-AARU_EXPORT void AARU_CALL crc64_slicing(uint64_t* crc, const uint8_t* data, uint32_t len);
+AARU_EXPORT void AARU_CALL crc64_slicing(uint64_t* previous_crc, const uint8_t* data, uint32_t len);
 
 #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
     defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
@@ -246,5 +246,5 @@ AARU_EXPORT CLMUL uint64_t AARU_CALL crc64_clmul(uint64_t crc, const uint8_t* da
 #endif
 
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
-AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const uint8_t* data, long length);
+AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t previous_crc, const uint8_t* data, long len);
 #endif
\ No newline at end of file
diff --git a/crc64_vmull.c b/crc64_vmull.c
index 611f13e..8577494 100644
--- a/crc64_vmull.c
+++ b/crc64_vmull.c
@@ -35,7 +35,7 @@ TARGET_WITH_SIMD FORCE_INLINE uint64x2_t fold(uint64x2_t in, uint64x2_t foldCons
                      sse2neon_vmull_p64(vget_high_u64(in), vget_high_u64(foldConstants)));
 }
 
-AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const uint8_t* data, long length)
+AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t previous_crc, const uint8_t* data, long len)
 {
     const uint64_t k1 = 0xe05dd497ca393ae4; // bitReflect(expMod65(128 + 64, poly, 1)) << 1;
     const uint64_t k2 = 0xdabe95afc7875f40; // bitReflect(expMod65(128, poly, 1)) << 1;
@@ -45,7 +45,7 @@ AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const
     const uint64x2_t foldConstants1 = vcombine_u64(vcreate_u64(k1), vcreate_u64(k2));
     const uint64x2_t foldConstants2 = vcombine_u64(vcreate_u64(mu), vcreate_u64(p));
 
-    const uint8_t* end = data + length;
+    const uint8_t* end = data + len;
 
     // Align pointers
     const uint64x2_t* alignedData = (const uint64x2_t*)((uintptr_t)data & ~(uintptr_t)15);
@@ -66,14 +66,14 @@ AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const
                                              vreinterpretq_u8_u64(b), vreinterpretq_u8_u64(a)));
 
-    const uint64x2_t initialCrc = vsetq_lane_u64(~crc, vdupq_n_u64(0), 0);
+    const uint64x2_t initialCrc = vsetq_lane_u64(~previous_crc, vdupq_n_u64(0), 0);
 
     uint64x2_t R;
     if(alignedLength == 1)
     {
         // Single data block, initial CRC possibly bleeds into zero padding
         uint64x2_t crc0, crc1;
-        shiftRight128(initialCrc, 16 - length, &crc0, &crc1);
+        shiftRight128(initialCrc, 16 - len, &crc0, &crc1);
 
         uint64x2_t A, B;
         shiftRight128(data0, leadOutSize, &A, &B);
@@ -86,11 +86,11 @@ AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const
     {
         const uint64x2_t data1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(alignedData + 1)));
 
-        if(length < 8)
+        if(len < 8)
         {
             // Initial CRC bleeds into the zero padding
             uint64x2_t crc0, crc1;
-            shiftRight128(initialCrc, 16 - length, &crc0, &crc1);
+            shiftRight128(initialCrc, 16 - len, &crc0, &crc1);
 
             uint64x2_t A, B, C, D;
             shiftRight128(data0, leadOutSize, &A, &B);
@@ -117,7 +117,7 @@ AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const
     else
     {
         alignedData++;
-        length -= 16 - leadInSize;
+        len -= 16 - leadInSize;
 
         // Initial CRC can simply be added to data
         uint64x2_t crc0, crc1;
@@ -125,17 +125,17 @@ AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const
 
         uint64x2_t accumulator = veorq_u64(fold(veorq_u64(crc0, data0), foldConstants1), crc1);
 
-        while(length >= 32)
+        while(len >= 32)
        {
            accumulator =
                fold(veorq_u64(vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)), accumulator),
                     foldConstants1);
-            length -= 16;
+            len -= 16;
            alignedData++;
        }
 
        uint64x2_t P;
-        if(length == 16) P = veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)));
+        if(len == 16) P = veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)));
        else
        {
            const uint64x2_t end0 =
diff --git a/tests/crc32.cpp b/tests/crc32.cpp
index d4fef34..1229ff7 100644
--- a/tests/crc32.cpp
+++ b/tests/crc32.cpp
@@ -211,7 +211,7 @@ TEST_F(crc32Fixture, crc32_clmul)
 
     uint32_t crc = CRC32_ISO_SEED;
 
-    crc = ~crc32_clmul(buffer, 1048576, ~crc);
+    crc = ~crc32_clmul(~crc, buffer, 1048576);
 
     crc ^= CRC32_ISO_SEED;
 
@@ -224,7 +224,7 @@ TEST_F(crc32Fixture, crc32_clmul_misaligned)
 
     uint32_t crc = CRC32_ISO_SEED;
 
-    crc = ~crc32_clmul(buffer_misaligned+1, 1048576, ~crc);
+    crc = ~crc32_clmul(~crc, buffer_misaligned + 1, 1048576);
 
     crc ^= CRC32_ISO_SEED;
 
@@ -237,7 +237,7 @@ TEST_F(crc32Fixture, crc32_clmul_15bytes)
 
     uint32_t crc = CRC32_ISO_SEED;
 
-    crc = ~crc32_clmul(buffer, 15, ~crc);
+    crc = ~crc32_clmul(~crc, buffer, 15);
 
     crc ^= CRC32_ISO_SEED;
 
@@ -250,7 +250,7 @@ TEST_F(crc32Fixture, crc32_clmul_31bytes)
 
     uint32_t crc = CRC32_ISO_SEED;
 
-    crc = ~crc32_clmul(buffer, 31, ~crc);
+    crc = ~crc32_clmul(~crc, buffer, 31);
 
     crc ^= CRC32_ISO_SEED;
 
@@ -263,7 +263,7 @@ TEST_F(crc32Fixture, crc32_clmul_63bytes)
 
     uint32_t crc = CRC32_ISO_SEED;
 
-    crc = ~crc32_clmul(buffer, 63, ~crc);
+    crc = ~crc32_clmul(~crc, buffer, 63);
 
     crc ^= CRC32_ISO_SEED;
 
@@ -276,7 +276,7 @@ TEST_F(crc32Fixture, crc32_clmul_2352bytes)
 
     uint32_t crc = CRC32_ISO_SEED;
 
-    crc = ~crc32_clmul(buffer, 2352, ~crc);
+    crc = ~crc32_clmul(~crc, buffer, 2352);
 
     crc ^= CRC32_ISO_SEED;
 
@@ -371,7 +371,7 @@ TEST_F(crc32Fixture, crc32_vmull)
 
     uint32_t crc = CRC32_ISO_SEED;
 
-    crc = ~crc32_vmull(buffer, 1048576, ~crc);
+    crc = ~crc32_vmull(~crc, buffer, 1048576);
 
     crc ^= CRC32_ISO_SEED;
 
@@ -384,7 +384,7 @@ TEST_F(crc32Fixture, crc32_vmull_misaligned)
 
     uint32_t crc = CRC32_ISO_SEED;
 
-    crc = ~crc32_vmull(buffer_misaligned+1, 1048576, ~crc);
+    crc = ~crc32_vmull(~crc, buffer_misaligned + 1, 1048576);
 
     crc ^= CRC32_ISO_SEED;
 
@@ -397,7 +397,7 @@ TEST_F(crc32Fixture, crc32_vmull_15bytes)
 
     uint32_t crc = CRC32_ISO_SEED;
 
-    crc = ~crc32_vmull(buffer, 15, ~crc);
+    crc = ~crc32_vmull(~crc, buffer, 15);
 
     crc ^= CRC32_ISO_SEED;
 
@@ -410,7 +410,7 @@ TEST_F(crc32Fixture, crc32_vmull_31bytes)
 
     uint32_t crc = CRC32_ISO_SEED;
 
-    crc = ~crc32_vmull(buffer, 31, ~crc);
+    crc = ~crc32_vmull(~crc, buffer, 31);
 
     crc ^= CRC32_ISO_SEED;
 
@@ -423,7 +423,7 @@ TEST_F(crc32Fixture, crc32_vmull_63bytes)
 
     uint32_t crc = CRC32_ISO_SEED;
 
-    crc = ~crc32_vmull(buffer, 63, ~crc);
+    crc = ~crc32_vmull(~crc, buffer, 63);
 
     crc ^= CRC32_ISO_SEED;
 
@@ -436,7 +436,7 @@ TEST_F(crc32Fixture, crc32_vmull_2352bytes)
 
     uint32_t crc = CRC32_ISO_SEED;
 
-    crc = ~crc32_vmull(buffer, 2352, ~crc);
+    crc = ~crc32_vmull(~crc, buffer, 2352);
 
     crc ^= CRC32_ISO_SEED;
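
Reviewer note: after this change the hardware CRC32 entry points take the previous CRC first, followed by the buffer and its length, matching the wrappers in crc32.c and the updated test fixtures above. The C sketch below is illustrative only and is not part of the patch: the helper name is hypothetical, it assumes an x86 build where crc32_clmul() is declared, and it assumes CRC32_ISO_SEED, have_clmul() and crc32_slicing() are reachable through the library headers, as they are for crc32.c and tests/crc32.cpp.

#include "library.h"
#include "crc32.h"

/* Hypothetical one-shot helper mirroring tests/crc32.cpp: seed, fold the
 * buffer with the new argument order (previous CRC first), then unseed. */
static uint32_t crc32_iso_oneshot(const uint8_t* data, long len)
{
    uint32_t crc = CRC32_ISO_SEED;

    if(have_clmul())
        crc = ~crc32_clmul(~crc, data, len); /* old call order was crc32_clmul(data, len, ~crc) */
    else
        crc32_slicing(&crc, data, len); /* portable fallback; its pointer parameter is now named previous_crc */

    return crc ^ CRC32_ISO_SEED;
}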