Consistency of method signatures.

2021-10-13 03:07:04 +01:00
parent 3797b44289
commit e63125ac04
14 changed files with 167 additions and 167 deletions


@@ -78,7 +78,7 @@ AARU_EXPORT int AARU_CALL adler32_update(adler32_ctx* ctx, const uint8_t* data,
return 0;
}
AARU_EXPORT void AARU_CALL adler32_slicing(uint16_t* sum1, uint16_t* sum2, const unsigned char* data, long len)
AARU_EXPORT void AARU_CALL adler32_slicing(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len)
{
uint32_t s1 = *sum1;
uint32_t s2 = *sum2;


@@ -33,19 +33,19 @@ AARU_EXPORT adler32_ctx* AARU_CALL adler32_init();
AARU_EXPORT int AARU_CALL adler32_update(adler32_ctx* ctx, const uint8_t* data, uint32_t len);
AARU_EXPORT int AARU_CALL adler32_final(adler32_ctx* ctx, uint32_t* checksum);
AARU_EXPORT void AARU_CALL adler32_free(adler32_ctx* ctx);
AARU_EXPORT void AARU_CALL adler32_slicing(uint16_t* sum1, uint16_t* sum2, const unsigned char* data, long len);
AARU_EXPORT void AARU_CALL adler32_slicing(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len);
#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, long len);
AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, long len);
AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len);
AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len);
#endif
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
AARU_EXPORT void AARU_CALL adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, uint32_t len);
AARU_EXPORT void AARU_CALL adler32_neon(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, uint32_t len);
#endif
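
For reference, a minimal caller-side sketch of the renamed kernel, assuming the conventional Adler-32 seeds (s1 = 1, s2 = 0) and illustrative names; it is not taken from the library itself:

#include <stdint.h>
#include "adler32.h"

static uint32_t adler32_of(const uint8_t* data, long len)
{
    uint16_t sum1 = 1; /* conventional Adler-32 low-word seed (assumption) */
    uint16_t sum2 = 0; /* conventional Adler-32 high-word seed (assumption) */
    adler32_slicing(&sum1, &sum2, data, len); /* buffer is now const uint8_t*, matching adler32_update */
    return ((uint32_t)sum2 << 16) | sum1;     /* Adler-32 packs s2 into the high 16 bits */
}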


@@ -12,7 +12,7 @@
#include "adler32.h"
#include "simd.h"
AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, long len)
AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len)
{
uint32_t s1 = *sum1;
uint32_t s2 = *sum2;
@@ -77,7 +77,7 @@ AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, con
/*
* Load 32 input bytes.
*/
const __m256i bytes = _mm256_lddqu_si256((__m256i*)(buf));
const __m256i bytes = _mm256_lddqu_si256((__m256i*)(data));
/*
* Add previous block byte sum to v_ps.
@@ -91,7 +91,7 @@ AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, con
const __m256i mad = _mm256_maddubs_epi16(bytes, tap);
v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones));
buf += BLOCK_SIZE;
data += BLOCK_SIZE;
} while(--n);
__m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1));
@@ -123,25 +123,25 @@ AARU_EXPORT AVX2 void AARU_CALL adler32_avx2(uint16_t* sum1, uint16_t* sum2, con
{
if(len >= 16)
{
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
len -= 16;
}
while(len--) { s2 += (s1 += *buf++); }
while(len--) { s2 += (s1 += *data++); }
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
s2 %= ADLER_MODULE;
}


@@ -10,7 +10,7 @@
#include "adler32.h"
#include "simd.h"
TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, uint32_t len)
TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, uint32_t len)
{
/*
* Split Adler-32 into component sums.
@@ -20,11 +20,11 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
/*
* Serially compute s1 & s2, until the data is 16-byte aligned.
*/
if((uintptr_t)buf & 15)
if((uintptr_t)data & 15)
{
while((uintptr_t)buf & 15)
while((uintptr_t)data & 15)
{
s2 += (s1 += *buf++);
s2 += (s1 += *data++);
--len;
}
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
@@ -60,8 +60,8 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
/*
* Load 32 input bytes.
*/
const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf));
const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16));
const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(data));
const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(data + 16));
/*
* Add previous block byte sum to v_s2.
*/
@@ -77,7 +77,7 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2));
v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
buf += BLOCK_SIZE;
data += BLOCK_SIZE;
} while(--n);
v_s2 = vshlq_n_u32(v_s2, 5);
/*
@@ -123,25 +123,25 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
{
if(len >= 16)
{
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
len -= 16;
}
while(len--) { s2 += (s1 += *buf++); }
while(len--) { s2 += (s1 += *data++); }
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
s2 %= ADLER_MODULE;
}


@@ -51,7 +51,7 @@
#include "library.h"
#include "adler32.h"
AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, const unsigned char* buf, long len)
AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len)
{
uint32_t s1 = *sum1;
uint32_t s2 = *sum2;
@@ -82,8 +82,8 @@ AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, c
/*
* Load 32 input bytes.
*/
const __m128i bytes1 = _mm_loadu_si128((__m128i*)(buf));
const __m128i bytes2 = _mm_loadu_si128((__m128i*)(buf + 16));
const __m128i bytes1 = _mm_loadu_si128((__m128i*)(data));
const __m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 16));
/*
* Add previous block byte sum to v_ps.
*/
@@ -98,7 +98,7 @@ AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, c
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
buf += BLOCK_SIZE;
data += BLOCK_SIZE;
} while(--n);
v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
/*
@@ -127,25 +127,25 @@ AARU_EXPORT SSSE3 void AARU_CALL adler32_ssse3(uint16_t* sum1, uint16_t* sum2, c
{
if(len >= 16)
{
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
s2 += (s1 += *data++);
len -= 16;
}
while(len--) { s2 += (s1 += *buf++); }
while(len--) { s2 += (s1 += *data++); }
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
s2 %= ADLER_MODULE;
}

crc32.c

@@ -41,7 +41,7 @@ AARU_EXPORT int AARU_CALL crc32_update(crc32_ctx* ctx, const uint8_t* data, uint
defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
if(have_clmul())
{
ctx->crc = ~crc32_clmul(data, (long)len, ~ctx->crc);
ctx->crc = ~crc32_clmul(~ctx->crc, data, (long)len);
return 0;
}
@@ -58,7 +58,7 @@ AARU_EXPORT int AARU_CALL crc32_update(crc32_ctx* ctx, const uint8_t* data, uint
#endif
if(have_neon())
{
ctx->crc = ~crc32_vmull(data, len, ~ctx->crc);
ctx->crc = ~crc32_vmull(~ctx->crc, data, len);
return 0;
}
#endif
@@ -67,7 +67,7 @@ AARU_EXPORT int AARU_CALL crc32_update(crc32_ctx* ctx, const uint8_t* data, uint
return 0;
}
AARU_EXPORT void AARU_CALL crc32_slicing(uint32_t* crc, const unsigned char* data, long len)
AARU_EXPORT void AARU_CALL crc32_slicing(uint32_t* previous_crc, const uint8_t* data, long len)
{
// Unroll according to Intel slicing by uint8_t
// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
@@ -79,7 +79,7 @@ AARU_EXPORT void AARU_CALL crc32_slicing(uint32_t* crc, const unsigned char* dat
const size_t bytes_at_once = 8 * unroll;
uintptr_t unaligned_length = (4 - (((uintptr_t)current_char) & 3)) & 3;
c = *crc;
c = *previous_crc;
while((len != 0) && (unaligned_length != 0))
{
@@ -110,7 +110,7 @@ AARU_EXPORT void AARU_CALL crc32_slicing(uint32_t* crc, const unsigned char* dat
while(len-- != 0) c = (c >> 8) ^ crc32_table[0][(c & 0xFF) ^ *current_char++];
*crc = c;
*previous_crc = c;
}
AARU_EXPORT int AARU_CALL crc32_final(crc32_ctx* ctx, uint32_t* crc)

crc32.h

@@ -263,18 +263,18 @@ AARU_EXPORT crc32_ctx* AARU_CALL crc32_init();
AARU_EXPORT int AARU_CALL crc32_update(crc32_ctx* ctx, const uint8_t* data, uint32_t len);
AARU_EXPORT int AARU_CALL crc32_final(crc32_ctx* ctx, uint32_t* crc);
AARU_EXPORT void AARU_CALL crc32_free(crc32_ctx* ctx);
AARU_EXPORT void AARU_CALL crc32_slicing(uint32_t* crc, const unsigned char* data, long len);
AARU_EXPORT void AARU_CALL crc32_slicing(uint32_t* previous_crc, const uint8_t* data, long len);
#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, uint32_t initial_crc);
AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const uint8_t* data, long len);
#endif
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
#if __ARM_ARCH >= 8
AARU_EXPORT TARGET_ARMV8_WITH_CRC uint32_t AARU_CALL armv8_crc32_little(uint32_t crc,
const unsigned char* buf,
AARU_EXPORT TARGET_ARMV8_WITH_CRC uint32_t AARU_CALL armv8_crc32_little(uint32_t previous_crc,
const uint8_t* data,
uint32_t len);
#endif
AARU_EXPORT TARGET_WITH_SIMD uint32_t AARU_CALL crc32_vmull(const uint8_t* src, long len, uint32_t initial_crc);
AARU_EXPORT TARGET_WITH_SIMD uint32_t AARU_CALL crc32_vmull(uint32_t previous_crc, const uint8_t* data, long len);
#endif
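
All accelerated CRC-32 entry points now take the running CRC first and the data pointer second, matching crc32_update. A hedged sketch of an updated call site for the x86 CLMUL path (the seed value 0xFFFFFFFF stands in for CRC32_ISO_SEED and the complement pattern mirrors the crc32_update hunk above; both are assumptions rather than code from the library):

#include <stdint.h>
#include "crc32.h"

static uint32_t crc32_clmul_of(const uint8_t* data, long len)
{
    uint32_t crc = 0xFFFFFFFFu;          /* assumed ISO CRC-32 seed */
    /* old order: crc32_clmul(data, len, ~crc); the previous CRC now comes first */
    crc = ~crc32_clmul(~crc, data, len);
    return crc ^ 0xFFFFFFFFu;            /* assumed final XOR */
}

crc32_vmull and armv8_crc32_little take their previous_crc argument in the same position on ARM builds.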


@@ -9,17 +9,17 @@
#include "library.h"
#include "crc32.h"
TARGET_ARMV8_WITH_CRC uint32_t armv8_crc32_little(uint32_t crc, const unsigned char* buf, uint32_t len)
TARGET_ARMV8_WITH_CRC uint32_t armv8_crc32_little(uint32_t previous_crc, const uint8_t* data, uint32_t len)
{
uint32_t c = (uint32_t)crc;
uint32_t c = (uint32_t)previous_crc;
#if defined(__aarch64__) || defined(_M_ARM64)
while(len && ((uintptr_t)buf & 7))
while(len && ((uintptr_t)data & 7))
{
c = __crc32b(c, *buf++);
c = __crc32b(c, *data++);
--len;
}
const uint64_t* buf8 = (const uint64_t*)buf;
const uint64_t* buf8 = (const uint64_t*)data;
while(len >= 64)
{
c = __crc32d(c, *buf8++);
@@ -38,7 +38,7 @@ TARGET_ARMV8_WITH_CRC uint32_t armv8_crc32_little(uint32_t crc, const unsigned c
len -= 8;
}
buf = (const unsigned char*)buf8;
data = (const uint8_t*)buf8;
#else // AARCH64
while(len && ((uintptr_t)buf & 3))
{
@@ -64,10 +64,10 @@ TARGET_ARMV8_WITH_CRC uint32_t armv8_crc32_little(uint32_t crc, const unsigned c
len -= 4;
}
buf = (const unsigned char*)buf4;
buf = (const uint8_t*)buf4;
#endif
while(len--) { c = __crc32b(c, *buf++); }
while(len--) { c = __crc32b(c, *data++); }
return c;
}
#endif


@@ -237,11 +237,11 @@ static void partial_fold(const size_t len,
*/
#define XOR_INITIAL(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, uint32_t initial_crc)
AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const uint8_t* data, long len)
{
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
__m128i xmm_initial = _mm_cvtsi32_si128(initial_crc);
__m128i xmm_initial = _mm_cvtsi32_si128(previous_crc);
__m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
__m128i xmm_crc1 = _mm_setzero_si128();
__m128i xmm_crc2 = _mm_setzero_si128();
@@ -259,34 +259,34 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
if(len < 16)
{
if(len == 0) return initial_crc;
if(len == 0) return previous_crc;
if(len < 4)
{
/*
* no idea how to do this for <4 bytes, delegate to classic impl.
*/
uint32_t crc = ~initial_crc;
uint32_t crc = ~previous_crc;
switch(len)
{
case 3: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
case 2: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
case 1: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
case 3: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
case 2: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
case 1: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
}
return ~crc;
}
xmm_crc_part = _mm_loadu_si128((__m128i*)src);
xmm_crc_part = _mm_loadu_si128((__m128i*)data);
XOR_INITIAL(xmm_crc_part);
goto partial;
}
/* this alignment computation would be wrong for len<16 handled above */
algn_diff = (0 - (uintptr_t)src) & 0xF;
algn_diff = (0 - (uintptr_t)data) & 0xF;
if(algn_diff)
{
xmm_crc_part = _mm_loadu_si128((__m128i*)src);
xmm_crc_part = _mm_loadu_si128((__m128i*)data);
XOR_INITIAL(xmm_crc_part);
src += algn_diff;
data += algn_diff;
len -= algn_diff;
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
@@ -294,10 +294,10 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
while((len -= 64) >= 0)
{
xmm_t0 = _mm_load_si128((__m128i*)src);
xmm_t1 = _mm_load_si128((__m128i*)src + 1);
xmm_t2 = _mm_load_si128((__m128i*)src + 2);
xmm_t3 = _mm_load_si128((__m128i*)src + 3);
xmm_t0 = _mm_load_si128((__m128i*)data);
xmm_t1 = _mm_load_si128((__m128i*)data + 1);
xmm_t2 = _mm_load_si128((__m128i*)data + 2);
xmm_t3 = _mm_load_si128((__m128i*)data + 3);
XOR_INITIAL(xmm_t0);
@@ -308,7 +308,7 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
src += 64;
data += 64;
}
/*
@@ -318,9 +318,9 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
{
len += 16;
xmm_t0 = _mm_load_si128((__m128i*)src);
xmm_t1 = _mm_load_si128((__m128i*)src + 1);
xmm_t2 = _mm_load_si128((__m128i*)src + 2);
xmm_t0 = _mm_load_si128((__m128i*)data);
xmm_t1 = _mm_load_si128((__m128i*)data + 1);
xmm_t2 = _mm_load_si128((__m128i*)data + 2);
XOR_INITIAL(xmm_t0);
@@ -332,14 +332,14 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
if(len == 0) goto done;
xmm_crc_part = _mm_load_si128((__m128i*)src + 3);
xmm_crc_part = _mm_load_si128((__m128i*)data + 3);
}
else if(len + 32 >= 0)
{
len += 32;
xmm_t0 = _mm_load_si128((__m128i*)src);
xmm_t1 = _mm_load_si128((__m128i*)src + 1);
xmm_t0 = _mm_load_si128((__m128i*)data);
xmm_t1 = _mm_load_si128((__m128i*)data + 1);
XOR_INITIAL(xmm_t0);
@@ -350,13 +350,13 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
if(len == 0) goto done;
xmm_crc_part = _mm_load_si128((__m128i*)src + 2);
xmm_crc_part = _mm_load_si128((__m128i*)data + 2);
}
else if(len + 48 >= 0)
{
len += 48;
xmm_t0 = _mm_load_si128((__m128i*)src);
xmm_t0 = _mm_load_si128((__m128i*)data);
XOR_INITIAL(xmm_t0);
@@ -366,13 +366,13 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(const uint8_t* src, long len, u
if(len == 0) goto done;
xmm_crc_part = _mm_load_si128((__m128i*)src + 1);
xmm_crc_part = _mm_load_si128((__m128i*)data + 1);
}
else
{
len += 64;
if(len == 0) goto done;
xmm_crc_part = _mm_load_si128((__m128i*)src);
xmm_crc_part = _mm_load_si128((__m128i*)data);
XOR_INITIAL(xmm_crc_part);
}


@@ -212,14 +212,14 @@ TARGET_WITH_SIMD FORCE_INLINE void partial_fold(const size_t len,
*q_crc3 = vreinterpretq_u64_u32(ps_res);
}
TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t initial_crc)
TARGET_WITH_SIMD uint32_t crc32_vmull(uint32_t previous_crc, const uint8_t* data, long len)
{
unsigned long algn_diff;
uint64x2_t q_t0;
uint64x2_t q_t1;
uint64x2_t q_t2;
uint64x2_t q_t3;
uint64x2_t q_initial = vreinterpretq_u64_u32(vsetq_lane_u32(initial_crc, vdupq_n_u32(0), 0));
uint64x2_t q_initial = vreinterpretq_u64_u32(vsetq_lane_u32(previous_crc, vdupq_n_u32(0), 0));
uint64x2_t q_crc0 = vreinterpretq_u64_u32(vsetq_lane_u32(0x9db42487, vdupq_n_u32(0), 0));
uint64x2_t q_crc1 = vreinterpretq_u64_u32(vdupq_n_u32(0));
uint64x2_t q_crc2 = vreinterpretq_u64_u32(vdupq_n_u32(0));
@@ -240,34 +240,34 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
if(len < 16)
{
if(len == 0) return initial_crc;
if(len == 0) return previous_crc;
if(len < 4)
{
/*
* no idea how to do this for <4 bytes, delegate to classic impl.
*/
uint32_t crc = ~initial_crc;
uint32_t crc = ~previous_crc;
switch(len)
{
case 3: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
case 2: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
case 1: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *src++];
case 3: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
case 2: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
case 1: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
}
return ~crc;
}
q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
XOR_INITIAL(q_crc_part);
goto partial;
}
/* this alignment computation would be wrong for len<16 handled above */
algn_diff = (0 - (uintptr_t)src) & 0xF;
algn_diff = (0 - (uintptr_t)data) & 0xF;
if(algn_diff)
{
q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
XOR_INITIAL(q_crc_part);
src += algn_diff;
data += algn_diff;
len -= algn_diff;
partial_fold(algn_diff, &q_crc0, &q_crc1, &q_crc2, &q_crc3, &q_crc_part);
@@ -275,10 +275,10 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
while((len -= 64) >= 0)
{
q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
q_t1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 4));
q_t2 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 8));
q_t3 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 12));
q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
q_t1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 4));
q_t2 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 8));
q_t3 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 12));
XOR_INITIAL(q_t0);
@@ -289,7 +289,7 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
q_crc2 = vreinterpretq_u64_u32(veorq_u32(vreinterpretq_u32_u64(q_crc2), vreinterpretq_u32_u64(q_t2)));
q_crc3 = vreinterpretq_u64_u32(veorq_u32(vreinterpretq_u32_u64(q_crc3), vreinterpretq_u32_u64(q_t3)));
src += 64;
data += 64;
}
/*
@@ -299,9 +299,9 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
{
len += 16;
q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
q_t1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 4));
q_t2 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 8));
q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
q_t1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 4));
q_t2 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 8));
XOR_INITIAL(q_t0);
@@ -313,14 +313,14 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
if(len == 0) goto done;
q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 12));
q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 12));
}
else if(len + 32 >= 0)
{
len += 32;
q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
q_t1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 4));
q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
q_t1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 4));
XOR_INITIAL(q_t0);
@@ -331,13 +331,13 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
if(len == 0) goto done;
q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 8));
q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 8));
}
else if(len + 48 >= 0)
{
len += 48;
q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
q_t0 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
XOR_INITIAL(q_t0);
@@ -347,13 +347,13 @@ TARGET_WITH_SIMD uint32_t crc32_vmull(const uint8_t* src, long len, uint32_t ini
if(len == 0) goto done;
q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src + 4));
q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data + 4));
}
else
{
len += 64;
if(len == 0) goto done;
q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)src));
q_crc_part = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)data));
XOR_INITIAL(q_crc_part);
}


@@ -64,9 +64,9 @@ AARU_EXPORT int AARU_CALL crc64_update(crc64_ctx* ctx, const uint8_t* data, uint
return 0;
}
AARU_EXPORT void AARU_CALL crc64_slicing(uint64_t* crc, const uint8_t* data, uint32_t len)
AARU_EXPORT void AARU_CALL crc64_slicing(uint64_t* previous_crc, const uint8_t* data, uint32_t len)
{
uint64_t c = *crc;
uint64_t c = *previous_crc;
if(len > 4)
{
@@ -93,7 +93,7 @@ AARU_EXPORT void AARU_CALL crc64_slicing(uint64_t* crc, const uint8_t* data, uin
while(len-- != 0) c = crc64_table[0][*data++ ^ ((c)&0xFF)] ^ ((c) >> 8);
*crc = c;
*previous_crc = c;
}
AARU_EXPORT int AARU_CALL crc64_final(crc64_ctx* ctx, uint64_t* crc)


@@ -238,7 +238,7 @@ AARU_EXPORT crc64_ctx* AARU_CALL crc64_init();
AARU_EXPORT int AARU_CALL crc64_update(crc64_ctx* ctx, const uint8_t* data, uint32_t len);
AARU_EXPORT int AARU_CALL crc64_final(crc64_ctx* ctx, uint64_t* crc);
AARU_EXPORT void AARU_CALL crc64_free(crc64_ctx* ctx);
AARU_EXPORT void AARU_CALL crc64_slicing(uint64_t* crc, const uint8_t* data, uint32_t len);
AARU_EXPORT void AARU_CALL crc64_slicing(uint64_t* previous_crc, const uint8_t* data, uint32_t len);
#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
@@ -246,5 +246,5 @@ AARU_EXPORT CLMUL uint64_t AARU_CALL crc64_clmul(uint64_t crc, const uint8_t* da
#endif
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const uint8_t* data, long length);
AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t previous_crc, const uint8_t* data, long len);
#endif
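
crc64_slicing and crc64_vmull follow the same convention: the running CRC comes first (through previous_crc for the slicing kernel) and the buffer is const uint8_t*. A hedged sketch of streaming two chunks through the portable kernel; the seed and final complement are assumptions about Aaru's CRC-64 variant, not taken from the source:

#include <stdint.h>
#include "crc64.h"

static uint64_t crc64_of_two_chunks(const uint8_t* a, uint32_t a_len,
                                    const uint8_t* b, uint32_t b_len)
{
    uint64_t crc = ~UINT64_C(0);   /* assumed seed; crc64_init() may use a different value */
    crc64_slicing(&crc, a, a_len); /* running value is read from and written back through previous_crc */
    crc64_slicing(&crc, b, b_len);
    return ~crc;                   /* assumed final complement */
}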


@@ -35,7 +35,7 @@ TARGET_WITH_SIMD FORCE_INLINE uint64x2_t fold(uint64x2_t in, uint64x2_t foldCons
sse2neon_vmull_p64(vget_high_u64(in), vget_high_u64(foldConstants)));
}
AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const uint8_t* data, long length)
AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t previous_crc, const uint8_t* data, long len)
{
const uint64_t k1 = 0xe05dd497ca393ae4; // bitReflect(expMod65(128 + 64, poly, 1)) << 1;
const uint64_t k2 = 0xdabe95afc7875f40; // bitReflect(expMod65(128, poly, 1)) << 1;
@@ -45,7 +45,7 @@ AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const
const uint64x2_t foldConstants1 = vcombine_u64(vcreate_u64(k1), vcreate_u64(k2));
const uint64x2_t foldConstants2 = vcombine_u64(vcreate_u64(mu), vcreate_u64(p));
const uint8_t* end = data + length;
const uint8_t* end = data + len;
// Align pointers
const uint64x2_t* alignedData = (const uint64x2_t*)((uintptr_t)data & ~(uintptr_t)15);
@@ -66,14 +66,14 @@ AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const
vreinterpretq_u8_u64(b),
vreinterpretq_u8_u64(a)));
const uint64x2_t initialCrc = vsetq_lane_u64(~crc, vdupq_n_u64(0), 0);
const uint64x2_t initialCrc = vsetq_lane_u64(~previous_crc, vdupq_n_u64(0), 0);
uint64x2_t R;
if(alignedLength == 1)
{
// Single data block, initial CRC possibly bleeds into zero padding
uint64x2_t crc0, crc1;
shiftRight128(initialCrc, 16 - length, &crc0, &crc1);
shiftRight128(initialCrc, 16 - len, &crc0, &crc1);
uint64x2_t A, B;
shiftRight128(data0, leadOutSize, &A, &B);
@@ -86,11 +86,11 @@ AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const
{
const uint64x2_t data1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(alignedData + 1)));
if(length < 8)
if(len < 8)
{
// Initial CRC bleeds into the zero padding
uint64x2_t crc0, crc1;
shiftRight128(initialCrc, 16 - length, &crc0, &crc1);
shiftRight128(initialCrc, 16 - len, &crc0, &crc1);
uint64x2_t A, B, C, D;
shiftRight128(data0, leadOutSize, &A, &B);
@@ -117,7 +117,7 @@ AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const
else
{
alignedData++;
length -= 16 - leadInSize;
len -= 16 - leadInSize;
// Initial CRC can simply be added to data
uint64x2_t crc0, crc1;
@@ -125,17 +125,17 @@ AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const
uint64x2_t accumulator = veorq_u64(fold(veorq_u64(crc0, data0), foldConstants1), crc1);
while(length >= 32)
while(len >= 32)
{
accumulator = fold(veorq_u64(vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)), accumulator),
foldConstants1);
length -= 16;
len -= 16;
alignedData++;
}
uint64x2_t P;
if(length == 16) P = veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)));
if(len == 16) P = veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)));
else
{
const uint64x2_t end0 =


@@ -211,7 +211,7 @@ TEST_F(crc32Fixture, crc32_clmul)
uint32_t crc = CRC32_ISO_SEED;
crc = ~crc32_clmul(buffer, 1048576, ~crc);
crc = ~crc32_clmul(~crc, buffer, 1048576);
crc ^= CRC32_ISO_SEED;
@@ -224,7 +224,7 @@ TEST_F(crc32Fixture, crc32_clmul_misaligned)
uint32_t crc = CRC32_ISO_SEED;
crc = ~crc32_clmul(buffer_misaligned+1, 1048576, ~crc);
crc = ~crc32_clmul(~crc, buffer_misaligned + 1, 1048576);
crc ^= CRC32_ISO_SEED;
@@ -237,7 +237,7 @@ TEST_F(crc32Fixture, crc32_clmul_15bytes)
uint32_t crc = CRC32_ISO_SEED;
crc = ~crc32_clmul(buffer, 15, ~crc);
crc = ~crc32_clmul(~crc, buffer, 15);
crc ^= CRC32_ISO_SEED;
@@ -250,7 +250,7 @@ TEST_F(crc32Fixture, crc32_clmul_31bytes)
uint32_t crc = CRC32_ISO_SEED;
crc = ~crc32_clmul(buffer, 31, ~crc);
crc = ~crc32_clmul(~crc, buffer, 31);
crc ^= CRC32_ISO_SEED;
@@ -263,7 +263,7 @@ TEST_F(crc32Fixture, crc32_clmul_63bytes)
uint32_t crc = CRC32_ISO_SEED;
crc = ~crc32_clmul(buffer, 63, ~crc);
crc = ~crc32_clmul(~crc, buffer, 63);
crc ^= CRC32_ISO_SEED;
@@ -276,7 +276,7 @@ TEST_F(crc32Fixture, crc32_clmul_2352bytes)
uint32_t crc = CRC32_ISO_SEED;
crc = ~crc32_clmul(buffer, 2352, ~crc);
crc = ~crc32_clmul(~crc, buffer, 2352);
crc ^= CRC32_ISO_SEED;
@@ -371,7 +371,7 @@ TEST_F(crc32Fixture, crc32_vmull)
uint32_t crc = CRC32_ISO_SEED;
crc = ~crc32_vmull(buffer, 1048576, ~crc);
crc = ~crc32_vmull(~crc, buffer, 1048576);
crc ^= CRC32_ISO_SEED;
@@ -384,7 +384,7 @@ TEST_F(crc32Fixture, crc32_vmull_misaligned)
uint32_t crc = CRC32_ISO_SEED;
crc = ~crc32_vmull(buffer_misaligned+1, 1048576, ~crc);
crc = ~crc32_vmull(~crc, buffer_misaligned + 1, 1048576);
crc ^= CRC32_ISO_SEED;
@@ -397,7 +397,7 @@ TEST_F(crc32Fixture, crc32_vmull_15bytes)
uint32_t crc = CRC32_ISO_SEED;
crc = ~crc32_vmull(buffer, 15, ~crc);
crc = ~crc32_vmull(~crc, buffer, 15);
crc ^= CRC32_ISO_SEED;
@@ -410,7 +410,7 @@ TEST_F(crc32Fixture, crc32_vmull_31bytes)
uint32_t crc = CRC32_ISO_SEED;
crc = ~crc32_vmull(buffer, 31, ~crc);
crc = ~crc32_vmull(~crc, buffer, 31);
crc ^= CRC32_ISO_SEED;
@@ -423,7 +423,7 @@ TEST_F(crc32Fixture, crc32_vmull_63bytes)
uint32_t crc = CRC32_ISO_SEED;
crc = ~crc32_vmull(buffer, 63, ~crc);
crc = ~crc32_vmull(~crc, buffer, 63);
crc ^= CRC32_ISO_SEED;
@@ -436,7 +436,7 @@ TEST_F(crc32Fixture, crc32_vmull_2352bytes)
uint32_t crc = CRC32_ISO_SEED;
crc = ~crc32_vmull(buffer, 2352, ~crc);
crc = ~crc32_vmull(~crc, buffer, 2352);
crc ^= CRC32_ISO_SEED;