mirror of
https://github.com/aaru-dps/Aaru.Checksums.Native.git
synced 2025-12-16 19:24:29 +00:00
General refactor and cleanup.
@@ -6,9 +6,11 @@ The purpose of this library is to provide checksums and hashing algorithms for A

 No archiver processing code should fall here, those go in [Aaru.Checksums](https://github.com/aaru-dps/Aaru.Checksums).

-To build you just need Docker on Linux and run `build.sh`, that will generate a NuGet package for use with Aaru.Checksums.
+To build you just need Docker on Linux and run `build.sh`, that will generate a NuGet package for use with
+Aaru.Checksums.

 Currently implemented algorithms are:

 - Adler-32
 - CRC-16 (CCITT and IBM polynomials)
 - CRC-32 (ISO polynomial)
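As background for the code hunks that follow: Adler-32 (and the Fletcher variants the library also implements) is built on two running sums, s1 over the input bytes and s2 over the successive values of s1. A minimal scalar sketch of that recurrence, for orientation only and not the library's exported context-based API:

#include <stddef.h>
#include <stdint.h>

/* Scalar Adler-32 sketch: s1 accumulates the bytes, s2 accumulates the
   running s1 values, both reduced modulo 65521 (largest prime below 2^16). */
static uint32_t adler32_scalar(const uint8_t *data, size_t len)
{
    uint32_t s1 = 1, s2 = 0;

    while(len--)
    {
        s1 = (s1 + *data++) % 65521;
        s2 = (s2 + s1) % 65521;
    }

    return (s2 << 16) | s1;
}

The slicing and SIMD paths touched below compute exactly these two sums, just block-wise and with the modulo deferred.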
@@ -147,8 +147,7 @@ AARU_EXPORT void AARU_CALL adler32_slicing(uint16_t *sum1, uint16_t *sum2, const
 {
 len -= NMAX;
 n = NMAX / 16; /* NMAX is divisible by 16 */
-do
-{
+do {
 s1 += data[0];
 s2 += s1;
 s1 += data[0 + 1];
@@ -184,8 +183,7 @@ AARU_EXPORT void AARU_CALL adler32_slicing(uint16_t *sum1, uint16_t *sum2, const

 /* 16 sums unrolled */
 data += 16;
-}
-while(--n);
+} while(--n);
 s1 %= ADLER_MODULE;
 s2 %= ADLER_MODULE;
 }
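The NMAX used in this loop is the usual zlib-style bound: the largest number of bytes whose sums cannot overflow an unsigned 32-bit accumulator, so the expensive modulo is deferred to once per NMAX-byte chunk instead of once per byte. A small self-check of that bound, assuming the customary value 5552 (consistent with the "divisible by 16" comment above, but an assumption, since the header defining NMAX is not part of this diff):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    const uint64_t n    = 5552;  /* assumed NMAX */
    const uint64_t base = 65521; /* ADLER_MODULE */

    /* Worst case: every byte is 0xFF. s2 after n bytes, plus the carried-in
       values, must stay within 32 bits. */
    assert(255 * n * (n + 1) / 2 + (n + 1) * (base - 1) <= UINT32_MAX);
    /* One more byte per chunk would already be able to overflow. */
    assert(255 * (n + 1) * (n + 2) / 2 + (n + 2) * (base - 1) > UINT32_MAX);
    return 0;
}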
@@ -38,8 +38,8 @@ AARU_EXPORT void AARU_CALL adler32_slicing(uint16_t *sum1, uint16_t *sum2, const
 #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
 defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)

-AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL
-adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len);
+AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data,
+long len);
 AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len);

 #endif
@@ -63,38 +63,8 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
 if(n > blocks) n = (unsigned)blocks;
 blocks -= n;

-const __m256i tap = _mm256_set_epi8(1,
-2,
-3,
-4,
-5,
-6,
-7,
-8,
-9,
-10,
-11,
-12,
-13,
-14,
-15,
-16,
-17,
-18,
-19,
-20,
-21,
-22,
-23,
-24,
-25,
-26,
-27,
-28,
-29,
-30,
-31,
-32);
+const __m256i tap = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
 const __m256i zero = _mm256_setzero_si256();
 const __m256i ones = _mm256_set1_epi16(1);

@@ -105,8 +75,7 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
 __m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n));
 __m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2);
 __m256i v_s1 = _mm256_setzero_si256();
-do
-{
+do {
 /*
 * Load 32 input bytes.
 */
@@ -125,8 +94,7 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
 v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones));

 data += BLOCK_SIZE;
-}
-while(--n);
+} while(--n);

 __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1));
 __m128i hi = _mm_unpackhi_epi64(sum, sum);
@@ -176,8 +144,7 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
 s2 += (s1 += *data++);
 len -= 16;
 }
-while(len--)
-{ s2 += (s1 += *data++); }
+while(len--) { s2 += (s1 += *data++); }
 if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
 s2 %= ADLER_MODULE;
 }
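The tap constant reflowed above encodes the per-byte weights for the second sum: over one 32-byte block, s1 grows by the plain byte sum while s2 grows by 32 times the old s1 plus the bytes weighted by 32, 31, ..., 1. The multiply-add intrinsics apply those weights, and the 32*s1 contribution is why a shift left by 5 (multiply by 32) shows up in these SIMD paths, assuming BLOCK_SIZE is 32 here. A scalar model of that identity, not the library code:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint8_t  block[32];
    uint32_t s1 = 1, s2 = 0; /* blockwise update */
    uint32_t n1 = 1, n2 = 0; /* naive byte-at-a-time update */
    uint32_t byte_sum = 0, weighted = 0;

    for(int i = 0; i < 32; i++) block[i] = (uint8_t)(i * 7 + 3);

    /* Naive update: s2 accumulates every intermediate s1. */
    for(int i = 0; i < 32; i++)
    {
        n1 += block[i];
        n2 += n1;
    }

    /* Blockwise update: plain sum plus weighted sum, as the SIMD code does. */
    for(int i = 0; i < 32; i++)
    {
        byte_sum += block[i];
        weighted += (uint32_t)(32 - i) * block[i];
    }

    s2 += 32 * s1 + weighted;
    s1 += byte_sum;

    assert(s1 == n1 && s2 == n2);
    return 0;
}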
@@ -88,8 +88,12 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
 * processed before s2 must be reduced modulo ADLER_MODULE.
 */
 #ifdef _MSC_VER
-uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}};
-uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}};
+uint32x4_t v_s2 = {
+.n128_u32 = {0, 0, 0, s1 * n}
+};
+uint32x4_t v_s1 = {
+.n128_u32 = {0, 0, 0, 0}
+};
 #else
 uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n};
 uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0};
@@ -98,8 +102,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
 uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
 uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
 uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
-do
-{
+do {
 /*
 * Load 32 input bytes.
 */
@@ -121,8 +124,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
 v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2));
 v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
 data += BLOCK_SIZE;
-}
-while(--n);
+} while(--n);
 v_s2 = vshlq_n_u32(v_s2, 5);
 /*
 * Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
@@ -198,8 +200,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
 s2 += (s1 += *data++);
 len -= 16;
 }
-while(len--)
-{ s2 += (s1 += *data++); }
+while(len--) { s2 += (s1 += *data++); }
 if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
 s2 %= ADLER_MODULE;
 }
@@ -39,7 +39,6 @@
 #include "library.h"
 #include "adler32.h"

-
 /**
 * @brief Calculate Adler-32 checksum for a given data using SSSE3 instructions.
 *
@@ -50,8 +49,8 @@
 * @param data Pointer to the data buffer.
 * @param len Length of the data buffer in bytes.
 */
-AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL
-adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
+AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data,
+long len)
 {
 uint32_t s1 = *sum1;
 uint32_t s2 = *sum2;
@@ -80,8 +79,7 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
 __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
 __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
 __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);
-do
-{
+do {
 /*
 * Load 32 input bytes.
 */
@@ -102,8 +100,7 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
 const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
 v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
 data += BLOCK_SIZE;
-}
-while(--n);
+} while(--n);
 v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
 /*
 * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
@@ -151,8 +148,7 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
 s2 += (s1 += *data++);
 len -= 16;
 }
-while(len--)
-{ s2 += (s1 += *data++); }
+while(len--) { s2 += (s1 += *data++); }
 if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
 s2 %= ADLER_MODULE;
 }
crc16.h
@@ -176,7 +176,8 @@ const uint16_t crc16_table[8][256] = {
 0x858C, 0x494D, 0xAF0C, 0x63CD, 0x768D, 0xBA4C, 0xFA0C, 0x36CD, 0x238D, 0xEF4C, 0x090D, 0xC5CC, 0xD08C, 0x1C4D,
 0x480E, 0x84CF, 0x918F, 0x5D4E, 0xBB0F, 0x77CE, 0x628E, 0xAE4F, 0xEE0F, 0x22CE, 0x378E, 0xFB4F, 0x1D0E, 0xD1CF,
 0xC48F, 0x084E, 0x440F, 0x88CE, 0x9D8E, 0x514F, 0xB70E, 0x7BCF, 0x6E8F, 0xA24E, 0xE20E, 0x2ECF, 0x3B8F, 0xF74E,
-0x110F, 0xDDCE, 0xC88E, 0x044F}};
+0x110F, 0xDDCE, 0xC88E, 0x044F}
+};

 AARU_EXPORT crc16_ctx *AARU_CALL crc16_init();
 AARU_EXPORT int AARU_CALL crc16_update(crc16_ctx *ctx, const uint8_t *data, uint32_t len);
@@ -176,7 +176,8 @@ const uint16_t crc16_ccitt_table[8][256] = {
 0xB943, 0xFE90, 0x3988, 0x7E5B, 0xB62E, 0xF1FD, 0x283F, 0x6FEC, 0xA799, 0xE04A, 0x2752, 0x6081, 0xA8F4, 0xEF27,
 0x7039, 0x37EA, 0xFF9F, 0xB84C, 0x7F54, 0x3887, 0xF0F2, 0xB721, 0x6EE3, 0x2930, 0xE145, 0xA696, 0x618E, 0x265D,
 0xEE28, 0xA9FB, 0x4D8D, 0x0A5E, 0xC22B, 0x85F8, 0x42E0, 0x0533, 0xCD46, 0x8A95, 0x5357, 0x1484, 0xDCF1, 0x9B22,
-0x5C3A, 0x1BE9, 0xD39C, 0x944F}};
+0x5C3A, 0x1BE9, 0xD39C, 0x944F}
+};

 AARU_EXPORT crc16_ccitt_ctx *AARU_CALL crc16_ccitt_init();
 AARU_EXPORT int AARU_CALL crc16_ccitt_update(crc16_ccitt_ctx *ctx, const uint8_t *data, uint32_t len);
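The 8x256 tables whose closing braces are reflowed here presumably drive a slicing-by-8 update that consumes eight bytes per iteration; the first row alone is enough for the classic byte-at-a-time form. A hedged sketch of that bytewise recurrence (assuming the common reflected, LSB-first table convention used with the IBM/ARC polynomial; the CCITT variant and the library's real entry points, crc16_init/crc16_update, may differ in detail):

#include <stddef.h>
#include <stdint.h>

/* Generic table-driven CRC-16, one byte at a time, using a single 256-entry
   row. Slicing-by-8 applies the same recurrence but looks up eight rows to
   handle eight input bytes per step. */
static uint16_t crc16_bytewise(uint16_t crc, const uint8_t *data, size_t len,
                               const uint16_t table[256])
{
    while(len--) crc = (uint16_t)((crc >> 8) ^ table[(crc ^ *data++) & 0xFF]);

    return crc;
}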
crc32.h
@@ -275,8 +275,7 @@ AARU_EXPORT TARGET_WITH_CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_c

 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
 #if __ARM_ARCH >= 7
-AARU_EXPORT TARGET_ARMV8_WITH_CRC uint32_t AARU_CALL armv8_crc32_little(uint32_t previous_crc,
-const uint8_t* data,
+AARU_EXPORT TARGET_ARMV8_WITH_CRC uint32_t AARU_CALL armv8_crc32_little(uint32_t previous_crc, const uint8_t *data,
 uint32_t len);
 #endif
 AARU_EXPORT TARGET_WITH_NEON uint32_t AARU_CALL crc32_vmull(uint32_t previous_crc, const uint8_t *data, long len);
@@ -110,8 +110,7 @@ TARGET_ARMV8_WITH_CRC uint32_t armv8_crc32_little(uint32_t previous_crc, const u
 data = (const uint8_t *)buf4;
 #endif

-while(len--)
-{ c = __crc32b(c, *data++); }
+while(len--) { c = __crc32b(c, *data++); }
 return c;
 }

@@ -166,12 +166,8 @@ TARGET_WITH_CLMUL static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m12
 *xmm_crc3 = _mm_castps_si128(ps_res3);
 }

-TARGET_WITH_CLMUL static void partial_fold(const size_t len,
-__m128i *xmm_crc0,
-__m128i *xmm_crc1,
-__m128i *xmm_crc2,
-__m128i *xmm_crc3,
-__m128i *xmm_crc_part)
+TARGET_WITH_CLMUL static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
+__m128i *xmm_crc3, __m128i *xmm_crc_part)
 {
 const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
 const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
crc32_simd.h
@@ -38,18 +38,12 @@ static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
 };

 static const uint32_t ALIGNED_(16) crc_k[] = {
-0xccaa009e,
-0x00000000, /* rk1 */
-0x751997d0,
-0x00000001, /* rk2 */
-0xccaa009e,
-0x00000000, /* rk5 */
-0x63cd6124,
-0x00000001, /* rk6 */
-0xf7011640,
-0x00000001, /* rk7 */
-0xdb710640,
-0x00000001 /* rk8 */
+0xccaa009e, 0x00000000, /* rk1 */
+0x751997d0, 0x00000001, /* rk2 */
+0xccaa009e, 0x00000000, /* rk5 */
+0x63cd6124, 0x00000001, /* rk6 */
+0xf7011640, 0x00000001, /* rk7 */
+0xdb710640, 0x00000001 /* rk8 */
 };

 static const unsigned ALIGNED_(16) crc_mask[4] = {0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000};
@@ -43,8 +43,8 @@
 #define XOR_INITIAL(where) \
 ONCE(where = vreinterpretq_u64_u32(veorq_u32(vreinterpretq_u32_u64(where), vreinterpretq_u32_u64(q_initial))))

-TARGET_WITH_NEON FORCE_INLINE void
-fold_1(uint64x2_t *q_crc0, uint64x2_t *q_crc1, uint64x2_t *q_crc2, uint64x2_t *q_crc3)
+TARGET_WITH_NEON FORCE_INLINE void fold_1(uint64x2_t *q_crc0, uint64x2_t *q_crc1, uint64x2_t *q_crc2,
+uint64x2_t *q_crc3)
 {
 uint32_t ALIGNED_(16) data[4] = {0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001};
 const uint64x2_t q_fold4 = vreinterpretq_u64_u32(vld1q_u32(data));
@@ -67,8 +67,8 @@ fold_1(uint64x2_t *q_crc0, uint64x2_t *q_crc1, uint64x2_t *q_crc2, uint64x2_t *q
 *q_crc3 = vreinterpretq_u64_u32(ps_res);
 }

-TARGET_WITH_NEON FORCE_INLINE void
-fold_2(uint64x2_t *q_crc0, uint64x2_t *q_crc1, uint64x2_t *q_crc2, uint64x2_t *q_crc3)
+TARGET_WITH_NEON FORCE_INLINE void fold_2(uint64x2_t *q_crc0, uint64x2_t *q_crc1, uint64x2_t *q_crc2,
+uint64x2_t *q_crc3)
 {
 uint32_t ALIGNED_(16) data[4] = {0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001};
 const uint64x2_t q_fold4 = vreinterpretq_u64_u32(vld1q_u32(data));
@@ -99,8 +99,8 @@ fold_2(uint64x2_t *q_crc0, uint64x2_t *q_crc1, uint64x2_t *q_crc2, uint64x2_t *q
 *q_crc3 = vreinterpretq_u64_u32(ps_res31);
 }

-TARGET_WITH_NEON FORCE_INLINE void
-fold_3(uint64x2_t *q_crc0, uint64x2_t *q_crc1, uint64x2_t *q_crc2, uint64x2_t *q_crc3)
+TARGET_WITH_NEON FORCE_INLINE void fold_3(uint64x2_t *q_crc0, uint64x2_t *q_crc1, uint64x2_t *q_crc2,
+uint64x2_t *q_crc3)
 {
 uint32_t ALIGNED_(16) data[4] = {0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001};
 const uint64x2_t q_fold4 = vreinterpretq_u64_u32(vld1q_u32(data));
@@ -137,8 +137,8 @@ fold_3(uint64x2_t *q_crc0, uint64x2_t *q_crc1, uint64x2_t *q_crc2, uint64x2_t *q
 *q_crc3 = vreinterpretq_u64_u32(ps_res32);
 }

-TARGET_WITH_NEON FORCE_INLINE void
-fold_4(uint64x2_t *q_crc0, uint64x2_t *q_crc1, uint64x2_t *q_crc2, uint64x2_t *q_crc3)
+TARGET_WITH_NEON FORCE_INLINE void fold_4(uint64x2_t *q_crc0, uint64x2_t *q_crc1, uint64x2_t *q_crc2,
+uint64x2_t *q_crc3)
 {
 uint32_t ALIGNED_(16) data[4] = {0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001};
 const uint64x2_t q_fold4 = vreinterpretq_u64_u32(vld1q_u32(data));
@@ -184,12 +184,8 @@ fold_4(uint64x2_t *q_crc0, uint64x2_t *q_crc1, uint64x2_t *q_crc2, uint64x2_t *q
 *q_crc3 = vreinterpretq_u64_u32(ps_res3);
 }

-TARGET_WITH_NEON FORCE_INLINE void partial_fold(const size_t len,
-uint64x2_t *q_crc0,
-uint64x2_t *q_crc1,
-uint64x2_t *q_crc2,
-uint64x2_t *q_crc3,
-uint64x2_t *q_crc_part)
+TARGET_WITH_NEON FORCE_INLINE void partial_fold(const size_t len, uint64x2_t *q_crc0, uint64x2_t *q_crc1,
+uint64x2_t *q_crc2, uint64x2_t *q_crc3, uint64x2_t *q_crc_part)
 {
 uint32_t ALIGNED_(16) data[4] = {0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001};
 const uint64x2_t q_fold4 = vreinterpretq_u64_u32(vld1q_u32(data));
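For readers following the fold_1 through fold_4 and partial_fold hunks: they all rest on the same carry-less multiplication identity over GF(2)[x] (a sketch of the standard folding technique; the concrete constants, such as 0x54442bd4/0xc6e41596 and the rk values above, are precomputed x^k mod P(x) factors whose exact exponents depend on how far each lane is being folded):

M(x) = A(x) * x^(128k) + B(x)  =>  M(x) mod P(x) = (A(x) * (x^(128k) mod P(x)) + B(x)) mod P(x)

so each 128-bit lane of running state can be folded forward with two carry-less multiplies (one per 64-bit half) and a few XORs, instead of a full polynomial division.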
crc64.h
@@ -232,7 +232,8 @@ const static uint64_t crc64_table[4][256] = {
 0xA0A13C6791602FF9, 0xBD4FB639B34C8E25, 0x9B7C28DBD5396C41, 0x8692A285F715CD9D, 0xD71B151F19D2A889,
 0xCAF59F413BFE0955, 0xECC601A35D8BEB31, 0xF1288BFD7FA74AED, 0x4FD56E9680052119, 0x523BE4C8A22980C5,
 0x74087A2AC45C62A1, 0x69E6F074E670C37D, 0x386F47EE08B7A669, 0x2581CDB02A9B07B5, 0x03B253524CEEE5D1,
-0x1E5CD90C6EC2440D}};
+0x1E5CD90C6EC2440D}
+};

 #define CRC64_ECMA_POLY 0xC96C5795D7870F42
 #define CRC64_ECMA_SEED 0xFFFFFFFFFFFFFFFF
@@ -179,8 +179,7 @@ AARU_EXPORT TARGET_WITH_CLMUL uint64_t AARU_CALL crc64_clmul(uint64_t crc, const
 }

 __m128i P;
-if(length == 16)
-{ P = _mm_xor_si128(accumulator, _mm_load_si128(alignedData)); }
+if(length == 16) { P = _mm_xor_si128(accumulator, _mm_load_si128(alignedData)); }
 else
 {
 const __m128i end0 = _mm_xor_si128(accumulator, _mm_load_si128(alignedData));
@@ -199,9 +198,7 @@ AARU_EXPORT TARGET_WITH_CLMUL uint64_t AARU_CALL crc64_clmul(uint64_t crc, const
 // Final Barrett reduction
 const __m128i T1 = _mm_clmulepi64_si128(R, foldConstants2, 0x00);
 const __m128i T2 =
-_mm_xor_si128(
-_mm_xor_si128(_mm_clmulepi64_si128(T1, foldConstants2, 0x10), _mm_slli_si128(T1, 8)),
-R);
+_mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(T1, foldConstants2, 0x10), _mm_slli_si128(T1, 8)), R);

 #if defined(_WIN64)
 return ~_mm_extract_epi64(T2, 1);
@@ -24,22 +24,16 @@ static const uint8_t shuffleMasks[] = {
 TARGET_WITH_NEON FORCE_INLINE void shiftRight128(uint64x2_t in, size_t n, uint64x2_t *outLeft, uint64x2_t *outRight)
 {
 const uint64x2_t maskA =
-vreinterpretq_u64_u32(
-vld1q_u32((const uint32_t *)(const uint64x2_t *)(shuffleMasks + (16 - n))));
+vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(const uint64x2_t *)(shuffleMasks + (16 - n))));
 uint64x2_t b = vreinterpretq_u64_u8(vceqq_u8(vreinterpretq_u8_u64(vreinterpretq_u64_u32(vdupq_n_u32(0))),
-vreinterpretq_u8_u64(
-vreinterpretq_u64_u32(vdupq_n_u32(0)))));
+vreinterpretq_u8_u64(vreinterpretq_u64_u32(vdupq_n_u32(0)))));
 const uint64x2_t maskB = vreinterpretq_u64_u32(veorq_u32(vreinterpretq_u32_u64(maskA), vreinterpretq_u32_u64(b)));

 *outLeft = mm_shuffle_epi8(in, maskB);
 *outRight = mm_shuffle_epi8(in, maskA);
 }

-TARGET_WITH_NEON FORCE_INLINE uint64x2_t
-fold (uint64x2_t
-in,
-uint64x2_t foldConstants
-)
+TARGET_WITH_NEON FORCE_INLINE uint64x2_t fold(uint64x2_t in, uint64x2_t foldConstants)
 {
 return

@@ -85,16 +79,13 @@ AARU_EXPORT TARGET_WITH_NEON uint64_t AARU_CALL crc64_vmull(uint64_t previous_cr
 const size_t alignedLength = alignedEnd - alignedData;

 const uint64x2_t leadInMask =
-vreinterpretq_u64_u32(vld1q_u32(
-(const uint32_t *)(const uint64x2_t *)(shuffleMasks + (16 - leadInSize))));
+vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(const uint64x2_t *)(shuffleMasks + (16 - leadInSize))));
 uint64x2_t a = vreinterpretq_u64_u32(vdupq_n_u32(0));
 uint64x2_t b = vreinterpretq_u64_u32(
 vld1q_u32((const uint32_t *)alignedData)); // Use a signed shift right to create a mask with the sign bit
 const uint64x2_t data0 =
-vreinterpretq_u64_u8(
-vbslq_u8(vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u64(leadInMask), 7)),
-vreinterpretq_u8_u64(b),
-vreinterpretq_u8_u64(a)));
+vreinterpretq_u64_u8(vbslq_u8(vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u64(leadInMask), 7)),
+vreinterpretq_u8_u64(b), vreinterpretq_u8_u64(a)));

 const uint64x2_t initialCrc = vsetq_lane_u64(~previous_crc, vdupq_n_u64(0), 0);

@@ -165,12 +156,12 @@ AARU_EXPORT TARGET_WITH_NEON uint64_t AARU_CALL crc64_vmull(uint64_t previous_cr
 }

 uint64x2_t P;
-if(len == 16) P = veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)));
+if(len == 16)
+P = veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)));
 else
 {
 const uint64x2_t end0 =
-veorq_u64(accumulator,
-vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)));
+veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)));
 const uint64x2_t end1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(alignedData + 1)));

 uint64x2_t A, B, C, D;
@@ -129,8 +129,7 @@ AARU_EXPORT int AARU_CALL fletcher16_update(fletcher16_ctx *ctx, const uint8_t *
 {
 len -= NMAX;
 n = NMAX / 6; /* NMAX is divisible by 6 */
-do
-{
+do {
 sum1 += data[0];
 sum2 += sum1;
 sum1 += data[0 + 1];
@@ -146,8 +145,7 @@ AARU_EXPORT int AARU_CALL fletcher16_update(fletcher16_ctx *ctx, const uint8_t *

 /* 6 sums unrolled */
 data += 6;
-}
-while(--n);
+} while(--n);
 sum1 %= FLETCHER16_MODULE;
 sum2 %= FLETCHER16_MODULE;
 }
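Fletcher-16 follows the same two-sum pattern as Adler-32 but reduces modulo FLETCHER16_MODULE (customarily 255) and, as the comment above notes, uses an NMAX divisible by 6 rather than 16, so its unrolled inner loop handles six bytes per pass. A minimal scalar sketch of the recurrence (a model only; the library's exported API is the fletcher16_* context functions, and their seed values are not shown in this diff):

#include <stddef.h>
#include <stdint.h>

/* Scalar Fletcher-16 sketch: two running sums reduced modulo 255 and packed
   into an 8+8-bit result. */
static uint16_t fletcher16_scalar(const uint8_t *data, size_t len)
{
    uint32_t sum1 = 0, sum2 = 0;

    while(len--)
    {
        sum1 = (sum1 + *data++) % 255;
        sum2 = (sum2 + sum1) % 255;
    }

    return (uint16_t)((sum2 << 8) | sum1);
}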
@@ -37,14 +37,17 @@ AARU_EXPORT void AARU_CALL fletcher16_free(fletcher16_ctx *ctx);
 #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
 defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)

-AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL fletcher16_avx2(uint8_t* sum1, uint8_t* sum2, const uint8_t* data, long len);
-AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL fletcher16_ssse3(uint8_t* sum1, uint8_t* sum2, const uint8_t* data, long len);
+AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data,
+long len);
+AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data,
+long len);

 #endif

 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)

-AARU_EXPORT TARGET_WITH_NEON void AARU_CALL fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_t* data, uint32_t len);
+AARU_EXPORT TARGET_WITH_NEON void AARU_CALL fletcher16_neon(uint8_t *sum1, uint8_t *sum2, const uint8_t *data,
+uint32_t len);

 #endif

@@ -42,8 +42,7 @@
 * @param data Pointer to the data buffer.
 * @param len Length of the data buffer in bytes.
 */
-AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL
-fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
+AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
 {
 uint32_t s1 = *sum1;
 uint32_t s2 = *sum2;
@@ -64,38 +63,8 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
 if(n > blocks) n = (unsigned)blocks;
 blocks -= n;

-const __m256i tap = _mm256_set_epi8(1,
-2,
-3,
-4,
-5,
-6,
-7,
-8,
-9,
-10,
-11,
-12,
-13,
-14,
-15,
-16,
-17,
-18,
-19,
-20,
-21,
-22,
-23,
-24,
-25,
-26,
-27,
-28,
-29,
-30,
-31,
-32);
+const __m256i tap = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
 const __m256i zero = _mm256_setzero_si256();
 const __m256i ones = _mm256_set1_epi16(1);

@@ -106,8 +75,7 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
 __m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n));
 __m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2);
 __m256i v_s1 = _mm256_setzero_si256();
-do
-{
+do {
 /*
 * Load 32 input bytes.
 */
@@ -126,8 +94,7 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
 v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones));

 data += BLOCK_SIZE;
-}
-while(--n);
+} while(--n);

 __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1));
 __m128i hi = _mm_unpackhi_epi64(sum, sum);
@@ -177,8 +144,7 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
 s2 += (s1 += *data++);
 len -= 16;
 }
-while(len--)
-{ s2 += (s1 += *data++); }
+while(len--) { s2 += (s1 += *data++); }
 s1 %= FLETCHER16_MODULE;
 s2 %= FLETCHER16_MODULE;
 }
@@ -88,8 +88,12 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t *sum1, uint8_t *sum2, const uint8_
 * processed before s2 must be reduced modulo FLETCHER16_MODULE.
 */
 #ifdef _MSC_VER
-uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}};
-uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}};
+uint32x4_t v_s2 = {
+.n128_u32 = {0, 0, 0, s1 * n}
+};
+uint32x4_t v_s1 = {
+.n128_u32 = {0, 0, 0, 0}
+};
 #else
 uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n};
 uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0};
@@ -98,8 +102,7 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t *sum1, uint8_t *sum2, const uint8_
 uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
 uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
 uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
-do
-{
+do {
 /*
 * Load 32 input bytes.
 */
@@ -121,8 +124,7 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t *sum1, uint8_t *sum2, const uint8_
 v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2));
 v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
 data += BLOCK_SIZE;
-}
-while(--n);
+} while(--n);
 v_s2 = vshlq_n_u32(v_s2, 5);
 /*
 * Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
@@ -198,8 +200,7 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t *sum1, uint8_t *sum2, const uint8_
 s2 += (s1 += *data++);
 len -= 16;
 }
-while(len--)
-{ s2 += (s1 += *data++); }
+while(len--) { s2 += (s1 += *data++); }
 s1 %= FLETCHER16_MODULE;
 s2 %= FLETCHER16_MODULE;
 }
@@ -49,8 +49,8 @@
 * @param data Pointer to the data buffer.
 * @param len Length of the data buffer in bytes.
 */
-AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL
-fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
+AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data,
+long len)
 {
 uint32_t s1 = *sum1;
 uint32_t s2 = *sum2;
@@ -79,8 +79,7 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
 __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
 __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
 __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);
-do
-{
+do {
 /*
 * Load 32 input bytes.
 */
@@ -101,8 +100,7 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
 const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
 v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
 data += BLOCK_SIZE;
-}
-while(--n);
+} while(--n);
 v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
 /*
 * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
@@ -150,8 +148,7 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
 s2 += (s1 += *data++);
 len -= 16;
 }
-while(len--)
-{ s2 += (s1 += *data++); }
+while(len--) { s2 += (s1 += *data++); }
 s1 %= FLETCHER16_MODULE;
 s2 %= FLETCHER16_MODULE;
 }
@@ -92,7 +92,6 @@ AARU_EXPORT int AARU_CALL fletcher32_update(fletcher32_ctx *ctx, const uint8_t *
 }
 #endif

-
 uint32_t sum1 = ctx->sum1;
 uint32_t sum2 = ctx->sum2;
 unsigned n;
@@ -130,8 +129,7 @@ AARU_EXPORT int AARU_CALL fletcher32_update(fletcher32_ctx *ctx, const uint8_t *
 {
 len -= NMAX;
 n = NMAX / 16; /* NMAX is divisible by 16 */
-do
-{
+do {
 sum1 += data[0];
 sum2 += sum1;
 sum1 += data[0 + 1];
@@ -167,8 +165,7 @@ AARU_EXPORT int AARU_CALL fletcher32_update(fletcher32_ctx *ctx, const uint8_t *

 /* 16 sums unrolled */
 data += 16;
-}
-while(--n);
+} while(--n);
 sum1 %= FLETCHER32_MODULE;
 sum2 %= FLETCHER32_MODULE;
 }
@@ -37,8 +37,10 @@ AARU_EXPORT void AARU_CALL fletcher32_free(fletcher32_ctx* ctx);
 #if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
 defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)

-AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL fletcher32_avx2(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len);
-AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL fletcher32_ssse3(uint16_t* sum1, uint16_t* sum2, const uint8_t* data, long len);
+AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data,
+long len);
+AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data,
+long len);

 #endif

@@ -42,8 +42,8 @@
 * @param data Pointer to the data buffer.
 * @param len Length of the data buffer in bytes.
 */
-AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL
-fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
+AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data,
+long len)
 {
 uint32_t s1 = *sum1;
 uint32_t s2 = *sum2;
@@ -64,38 +64,8 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
 if(n > blocks) n = (unsigned)blocks;
 blocks -= n;

-const __m256i tap = _mm256_set_epi8(1,
-2,
-3,
-4,
-5,
-6,
-7,
-8,
-9,
-10,
-11,
-12,
-13,
-14,
-15,
-16,
-17,
-18,
-19,
-20,
-21,
-22,
-23,
-24,
-25,
-26,
-27,
-28,
-29,
-30,
-31,
-32);
+const __m256i tap = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
 const __m256i zero = _mm256_setzero_si256();
 const __m256i ones = _mm256_set1_epi16(1);

@@ -106,8 +76,7 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
 __m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n));
 __m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2);
 __m256i v_s1 = _mm256_setzero_si256();
-do
-{
+do {
 /*
 * Load 32 input bytes.
 */
@@ -126,8 +95,7 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
 v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones));

 data += BLOCK_SIZE;
-}
-while(--n);
+} while(--n);

 __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1));
 __m128i hi = _mm_unpackhi_epi64(sum, sum);
@@ -177,8 +145,7 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
 s2 += (s1 += *data++);
 len -= 16;
 }
-while(len--)
-{ s2 += (s1 += *data++); }
+while(len--) { s2 += (s1 += *data++); }
 if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
 s2 %= FLETCHER32_MODULE;
 }
@@ -88,8 +88,12 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
 * processed before s2 must be reduced modulo FLETCHER32_MODULE.
 */
 #ifdef _MSC_VER
-uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}};
-uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}};
+uint32x4_t v_s2 = {
+.n128_u32 = {0, 0, 0, s1 * n}
+};
+uint32x4_t v_s1 = {
+.n128_u32 = {0, 0, 0, 0}
+};
 #else
 uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n};
 uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0};
@@ -98,8 +102,7 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
 uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
 uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
 uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
-do
-{
+do {
 /*
 * Load 32 input bytes.
 */
@@ -121,8 +124,7 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
 v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2));
 v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
 data += BLOCK_SIZE;
-}
-while(--n);
+} while(--n);
 v_s2 = vshlq_n_u32(v_s2, 5);
 /*
 * Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
@@ -198,8 +200,7 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
 s2 += (s1 += *data++);
 len -= 16;
 }
-while(len--)
-{ s2 += (s1 += *data++); }
+while(len--) { s2 += (s1 += *data++); }
 if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
 s2 %= FLETCHER32_MODULE;
 }
@@ -49,8 +49,8 @@
 * @param data Pointer to the data buffer.
 * @param len Length of the data buffer in bytes.
 */
-AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL
-fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
+AARU_EXPORT TARGET_WITH_SSSE3 void AARU_CALL fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data,
+long len)
 {
 uint32_t s1 = *sum1;
 uint32_t s2 = *sum2;
@@ -79,8 +79,7 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
 __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
 __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
 __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);
-do
-{
+do {
 /*
 * Load 32 input bytes.
 */
@@ -101,8 +100,7 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
 const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
 v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
 data += BLOCK_SIZE;
-}
-while(--n);
+} while(--n);
 v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
 /*
 * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
@@ -150,8 +148,7 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
 s2 += (s1 += *data++);
 len -= 16;
 }
-while(len--)
-{ s2 += (s1 += *data++); }
+while(len--) { s2 += (s1 += *data++); }
 if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
 s2 %= FLETCHER32_MODULE;
 }
@@ -20,5 +20,4 @@

 #include "library.h"

-AARU_EXPORT uint64_t AARU_CALL get_acn_version()
-{ return AARU_CHECKUMS_NATIVE_VERSION; }
+AARU_EXPORT uint64_t AARU_CALL get_acn_version() { return AARU_CHECKUMS_NATIVE_VERSION; }
simd.c
@@ -264,8 +264,7 @@ int have_crc32_apple()
 *
 * @return true if the current processor supports cryptographic instructions, false otherwise.
 */
-int have_crypto_apple()
-{ return 0; }
+int have_crypto_apple() { return 0; }

 #endif

@@ -297,7 +297,8 @@ AARU_EXPORT int AARU_CALL spamsum_final(spamsum_ctx *ctx, uint8_t *result)
 ++bi;
 i = (int)ctx->bh[bi].d_len;

-if(i <= remain);
+if(i <= remain)
+;

 memcpy(result, ctx->bh[bi].digest, (size_t)i);
 result += i;