Mirror of https://github.com/aaru-dps/Aaru.Checksums.Native.git
Synced 2025-12-16 11:14:29 +00:00
Commit: Fix Adler and Fletcher calculations using SIMD when dataset is smaller than block size.

The SIMD block-processing paths of the Adler-32, Fletcher-16 and Fletcher-32 implementations (AVX2, SSSE3 and NEON) are now wrapped in an if(len >= BLOCK_SIZE) guard, so buffers shorter than one 32-byte block never enter the vector path and are handled entirely by the existing scalar leftover code. The block-loop locals move inside the new guard and the loop bodies are re-indented; in the NEON variants the serial 16-byte-alignment pre-loop moves inside the guard as well.
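In essence the commit adds one length check around each SIMD block loop. The following is a minimal sketch of that shape, not code from the repository: the scalar per-block loop stands in for the AVX2/SSSE3/NEON kernels, and the 65521 Adler modulus, the function name and the demo in main() are assumptions made for illustration only.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define ADLER_MODULE 65521u
    #define BLOCK_SIZE   (1u << 5)

    /* Same calling convention as the library routines: running sums in/out, buffer, length. */
    static void adler32_blocked(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, uint32_t len)
    {
        uint32_t s1 = *sum1, s2 = *sum2;

        if(len >= BLOCK_SIZE) /* the guard this commit introduces */
        {
            uint32_t blocks = len / BLOCK_SIZE;
            len -= blocks * BLOCK_SIZE;

            while(blocks--)
            {
                /* stand-in for the SIMD inner loop: one 32-byte block at a time */
                for(unsigned i = 0; i < BLOCK_SIZE; i++) s2 += (s1 += *data++);
                s1 %= ADLER_MODULE;
                s2 %= ADLER_MODULE;
            }
        }

        /* leftover handling stays scalar and now also covers short inputs */
        while(len--) s2 += (s1 += *data++);
        s1 %= ADLER_MODULE;
        s2 %= ADLER_MODULE;

        *sum1 = (uint16_t)s1;
        *sum2 = (uint16_t)s2;
    }

    int main(void)
    {
        const uint8_t msg[] = "short"; /* 5 bytes, well below BLOCK_SIZE */
        uint16_t s1 = 1, s2 = 0;       /* Adler-32 starts with A = 1, B = 0 */

        adler32_blocked(&s1, &s2, msg, (uint32_t)strlen((const char *)msg));
        printf("adler32 = 0x%08x\n", (unsigned)(((uint32_t)s2 << 16) | s1));
        return 0;
    }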
adler32_avx2.c  (178 changed lines)

@@ -51,101 +51,104 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_

The block loop is wrapped in the new length guard; its locals move inside the
guard and the AVX2 body is re-indented one level:

     /*
      * Process the data in blocks.
      */
     const unsigned BLOCK_SIZE = 1 << 5;
-    long blocks = len / BLOCK_SIZE;
-    len -= blocks * BLOCK_SIZE;
-
-    while(blocks)
+    if(len >= BLOCK_SIZE)
     {
-        unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
-        if(n > blocks) n = (unsigned)blocks;
-        blocks -= n;
+        long blocks = len / BLOCK_SIZE;
+        len -= blocks * BLOCK_SIZE;
+
+        while(blocks)
+        {
+            unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
+            if(n > blocks) n = (unsigned)blocks;
+            blocks -= n;

             [ the tap (32..1), zero and ones constants, the v_ps/v_s2/v_s1
               accumulators, the do { ... } while(--n) loop over 32-byte blocks
               (_mm256_lddqu_si256, _mm256_sad_epu8, _mm256_maddubs_epi16,
               _mm256_madd_epi16) and the horizontal sums into s1 and s2 are
               re-indented but otherwise unchanged ]

-        s1 %= ADLER_MODULE;
-        s2 %= ADLER_MODULE;
-    }
+            s1 %= ADLER_MODULE;
+            s2 %= ADLER_MODULE;
+        }
+    }

@@ -178,6 +181,7 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_

One line is added in the leftover-handling tail; the visible context (the
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE; and s2 %= ADLER_MODULE; reductions
and the "Return the recombined sums." comment) is unchanged.
adler32_neon.c  (199 changed lines)

@@ -55,118 +55,124 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t

The same guard is added here, and the serial 16-byte-alignment pre-loop moves
inside it as well, so neither the pre-loop nor the block loop touches inputs
shorter than one block:

     uint32_t s1 = *sum1;
     uint32_t s2 = *sum2;

-    /*
-     * Serially compute s1 & s2, until the data is 16-byte aligned.
-     */
-    if((uintptr_t)data & 15)
-    {
-        while((uintptr_t)data & 15)
-        {
-            s2 += (s1 += *data++);
-            --len;
-        }
-        if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
-        s2 %= ADLER_MODULE;
-    }
-
     /*
      * Process the data in blocks.
      */
     const unsigned BLOCK_SIZE = 1 << 5;
-    uint32_t blocks = len / BLOCK_SIZE;
-    len -= blocks * BLOCK_SIZE;
-    while(blocks)
+    if(len >= BLOCK_SIZE)
     {
-        unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
-        if(n > blocks) n = (unsigned)blocks;
-        blocks -= n;
+        /*
+         * Serially compute s1 & s2, until the data is 16-byte aligned.
+         */
+        if((uintptr_t)data & 15)
+        {
+            while((uintptr_t)data & 15)
+            {
+                s2 += (s1 += *data++);
+                --len;
+            }
+            if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
+            s2 %= ADLER_MODULE;
+        }
+
+        uint32_t blocks = len / BLOCK_SIZE;
+        len -= blocks * BLOCK_SIZE;
+        while(blocks)
+        {
+            unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
+            if(n > blocks) n = (unsigned)blocks;
+            blocks -= n;

             [ the v_s1/v_s2 accumulators, the four uint16x8_t column sums, the
               do { ... } while(--n) loop (vld1q_u8, vpaddlq_u8/vpadalq_u8/
               vpadalq_u16, vaddw_u8), the vshlq_n_u32 shift and the vmlal_u16
               multiply-add tail, and the horizontal sums into s1 and s2 are
               re-indented but otherwise unchanged ]

-        s1 %= ADLER_MODULE;
-        s2 %= ADLER_MODULE;
-    }
+            s1 %= ADLER_MODULE;
+            s2 %= ADLER_MODULE;
+        }
+    }

@@ -197,6 +203,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t

One line is added in the leftover-handling tail; the visible context (the
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE; and s2 %= ADLER_MODULE; reductions
and the "Return the recombined sums." comment) is unchanged.
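Why the NEON variants in particular needed the alignment pre-loop inside the guard is my reading of the diff rather than anything stated in the commit message, but the hazard is easy to see: the pre-loop decrements an unsigned length once per misaligned byte, so an input shorter than the alignment distance underflows len and later turns into an enormous block count. A standalone, hypothetical illustration of that wrap-around (not repository code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        _Alignas(16) uint8_t buf[64] = {0};
        const uint8_t *data = buf + 1; /* deliberately misaligned by one byte */
        uint32_t len = 3;              /* shorter than the 15 bytes needed to realign */

        while((uintptr_t)data & 15)    /* shape of the old, unguarded pre-loop */
        {
            data++;
            --len;                     /* wraps around once len passes zero */
        }

        printf("len after pre-loop: %u\n", (unsigned)len);        /* 4294967284 */
        printf("blocks = len / 32:  %u\n", (unsigned)(len / 32u)); /* nonsense block count */
        return 0;
    }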
adler32_ssse3.c  (107 changed lines)

@@ -60,68 +60,72 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)

Same change as in adler32_avx2.c: the block loop and its locals (blocks, n,
the tap1/tap2/zero/ones constants and the v_ps/v_s2/v_s1 accumulators) move
inside a new if(len >= BLOCK_SIZE) guard and are re-indented; the SSSE3 body
(_mm_loadu_si128, _mm_sad_epu8, _mm_maddubs_epi16, _mm_madd_epi16 and the
shuffle-based horizontal sums) is otherwise unchanged.

@@ -152,6 +156,7 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)

One line is added in the leftover-handling tail; the visible context (the
ADLER_MODULE reductions and the "Return the recombined sums." comment) is
unchanged.
fletcher16_avx2.c

@@ -52,101 +52,104 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)

Same change as in adler32_avx2.c, with the reductions using FLETCHER16_MODULE:
the block loop, its locals and the AVX2 body move inside a new
if(len >= BLOCK_SIZE) guard and are re-indented.

@@ -179,6 +182,7 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)

One line is added in the leftover-handling tail; the visible context (the
FLETCHER16_MODULE reductions and the "Return the recombined sums." comment) is
unchanged.
fletcher16_neon.c

@@ -48,123 +48,131 @@

Same change as in adler32_neon.c, with the reductions using FLETCHER16_MODULE:
the serial 16-byte-alignment pre-loop and the NEON block loop move inside a
new if(len >= BLOCK_SIZE) guard and are re-indented. The hunk also carries a
few formatting cleanups:

-TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_t* data, uint32_t len)
+TARGET_WITH_NEON void fletcher16_neon(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, uint32_t len)

the (uint8_t*) casts in the vld1q_u8 calls become (uint8_t *), and the
do { ... } while(--n) construct is reformatted with the braces on their own
lines.

@@ -190,10 +198,12 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_

In the leftover-handling tail the single-line loop is split across two lines:

-    while(len--) { s2 += (s1 += *data++); }
+    while(len--)
+    { s2 += (s1 += *data++); }

The surrounding FLETCHER16_MODULE reductions and the "Return the recombined
sums." comment are unchanged.
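A quick way to exercise the short-input path this commit fixes is to compare the NEON routine with a plain scalar Fletcher-16 on a buffer shorter than one block. The sketch below has to be linked against the built native library; the fletcher16_neon prototype is taken from the diff (its TARGET_WITH_NEON attribute macro omitted), while the zero initial sums, the conventional 255 modulus in the reference and the test string are assumptions of this sketch, not taken from the repository's test suite.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Prototype of the library routine, as shown in the diff. */
    void fletcher16_neon(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, uint32_t len);

    /* Textbook Fletcher-16 (mod 255) used as a reference. */
    static void fletcher16_scalar(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, uint32_t len)
    {
        uint32_t s1 = *sum1, s2 = *sum2;
        while(len--)
        {
            s1 = (s1 + *data++) % 255;
            s2 = (s2 + s1) % 255;
        }
        *sum1 = (uint8_t)s1;
        *sum2 = (uint8_t)s2;
    }

    int main(void)
    {
        const uint8_t msg[] = "abcde"; /* five bytes: smaller than the 32-byte block size */
        uint8_t n1 = 0, n2 = 0, r1 = 0, r2 = 0;
        uint32_t len = (uint32_t)strlen((const char *)msg);

        fletcher16_neon(&n1, &n2, msg, len);
        fletcher16_scalar(&r1, &r2, msg, len);

        printf("neon   : s1=%u s2=%u\n", (unsigned)n1, (unsigned)n2);
        printf("scalar : s1=%u s2=%u\n", (unsigned)r1, (unsigned)r2);
        return (n1 == r1 && n2 == r2) ? 0 : 1;
    }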
fletcher16_ssse3.c

@@ -59,68 +59,72 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)

Same change as in adler32_ssse3.c, with the reductions using FLETCHER16_MODULE:
the block loop, its locals (blocks, n, tap1, tap2, zero, ones, v_ps, v_s2,
v_s1) and the SSSE3 body move inside a new if(len >= BLOCK_SIZE) guard and are
re-indented.

@@ -151,6 +155,7 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)

One line is added in the leftover-handling tail; the visible context (the
FLETCHER16_MODULE reductions and the "Return the recombined sums." comment) is
unchanged.
fletcher32_avx2.c

@@ -52,101 +52,104 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)

Same change as in adler32_avx2.c, with the reductions using FLETCHER32_MODULE:
the block loop, its locals and the AVX2 body move inside a new
if(len >= BLOCK_SIZE) guard and are re-indented.

@@ -179,6 +182,7 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)

One line is added in the leftover-handling tail; the visible context (the
if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE; and s2 %= FLETCHER32_MODULE;
reductions and the "Return the recombined sums." comment) is unchanged.
@@ -55,118 +55,124 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
      */
     uint32_t s1 = *sum1;
     uint32_t s2 = *sum2;
-    /*
-     * Serially compute s1 & s2, until the data is 16-byte aligned.
-     */
-    if((uintptr_t)data & 15)
-    {
-        while((uintptr_t)data & 15)
-        {
-            s2 += (s1 += *data++);
-            --len;
-        }
-        if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
-        s2 %= FLETCHER32_MODULE;
-    }
     /*
      * Process the data in blocks.
      */
     const unsigned BLOCK_SIZE = 1 << 5;
+    if(len >= BLOCK_SIZE)
+    {
+        /*
+         * Serially compute s1 & s2, until the data is 16-byte aligned.
+         */
+        if((uintptr_t)data & 15)
+        {
+            while((uintptr_t)data & 15)
+            {
+                s2 += (s1 += *data++);
+                --len;
+            }
+            if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
+            s2 %= FLETCHER32_MODULE;
+        }
         uint32_t blocks = len / BLOCK_SIZE;
         len -= blocks * BLOCK_SIZE;
         while(blocks)
         {
             unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
             if(n > blocks) n = (unsigned)blocks;
             blocks -= n;
             /*
              * Process n blocks of data. At most NMAX data bytes can be
              * processed before s2 must be reduced modulo FLETCHER32_MODULE.
              */
 #ifdef _MSC_VER
             uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}};
             uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}};
 #else
             uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n};
             uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0};
 #endif
             uint16x8_t v_column_sum_1 = vdupq_n_u16(0);
             uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
             uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
             uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
             do
             {
                 /*
                  * Load 32 input bytes.
                  */
                 const uint8x16_t bytes1 = vld1q_u8((uint8_t *)(data));
                 const uint8x16_t bytes2 = vld1q_u8((uint8_t *)(data + 16));
                 /*
                  * Add previous block byte sum to v_s2.
                  */
                 v_s2 = vaddq_u32(v_s2, v_s1);
                 /*
                  * Horizontally add the bytes for s1.
                  */
                 v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2));
                 /*
                  * Vertically add the bytes for s2.
                  */
                 v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8(bytes1));
                 v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
                 v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2));
                 v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
                 data += BLOCK_SIZE;
             }
             while(--n);
             v_s2 = vshlq_n_u32(v_s2, 5);
             /*
              * Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
              */
 #ifdef _MSC_VER
 #ifdef _M_ARM64
             v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){32, 31, 30, 29}));
             v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){28, 27, 26, 25}));
             v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){24, 23, 22, 21}));
             v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){20, 19, 18, 17}));
             v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){16, 15, 14, 13}));
             v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){12, 11, 10, 9}));
             v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){8, 7, 6, 5}));
             v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){4, 3, 2, 1}));
 #else
             v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), vld1_u16(((uint16_t[]){32, 31, 30, 29})));
             v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), vld1_u16(((uint16_t[]){28, 27, 26, 25})));
             v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), vld1_u16(((uint16_t[]){24, 23, 22, 21})));
             v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), vld1_u16(((uint16_t[]){20, 19, 18, 17})));
             v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), vld1_u16(((uint16_t[]){16, 15, 14, 13})));
             v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), vld1_u16(((uint16_t[]){12, 11, 10, 9})));
             v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), vld1_u16(((uint16_t[]){8, 7, 6, 5})));
             v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), vld1_u16(((uint16_t[]){4, 3, 2, 1})));
 #endif
 #else
             v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), (uint16x4_t){32, 31, 30, 29});
             v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), (uint16x4_t){28, 27, 26, 25});
             v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), (uint16x4_t){24, 23, 22, 21});
             v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), (uint16x4_t){20, 19, 18, 17});
             v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), (uint16x4_t){16, 15, 14, 13});
             v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), (uint16x4_t){12, 11, 10, 9});
             v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), (uint16x4_t){8, 7, 6, 5});
             v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), (uint16x4_t){4, 3, 2, 1});
 #endif
             /*
              * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
              */
             uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1));
             uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2));
             uint32x2_t s1s2 = vpadd_u32(sum1, sum2);
             s1 += vget_lane_u32(s1s2, 0);
             s2 += vget_lane_u32(s1s2, 1);
             /*
              * Reduce.
              */
             s1 %= FLETCHER32_MODULE;
             s2 %= FLETCHER32_MODULE;
         }
+    }

     /*
      * Handle leftover data.
      */
@@ -197,6 +203,7 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
         if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
         s2 %= FLETCHER32_MODULE;
     }

     /*
      * Return the recombined sums.
      */
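The NEON hunks above also move the serial 16-byte alignment loop inside the new length guard. That loop decrements len without checking it, so a buffer shorter than the distance to the next 16-byte boundary could leave len negative before the scalar leftover handling ran, which is presumably the small-dataset breakage the commit message describes. A hedged test sketch for lengths around one block; the fletcher32_neon prototype is copied from the hunk header, while the reference function, seeds and buffer contents are assumptions of the sketch, and it has to be linked against the library on an ARM host to run:

#include <stdint.h>
#include <stdio.h>

/* Prototype copied from the hunk header above; resolving the symbol requires
 * linking against Aaru.Checksums.Native. */
void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len);

/* Byte-at-a-time reference, same as the earlier sketch (0xFFFF modulus assumed). */
static void fletcher32_reference(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
{
    uint32_t s1 = *sum1, s2 = *sum2;

    while(len-- > 0)
    {
        s1 = (s1 + *data++) % 0xFFFF;
        s2 = (s2 + s1) % 0xFFFF;
    }

    *sum1 = (uint16_t)s1;
    *sum2 = (uint16_t)s2;
}

int main(void)
{
    uint8_t buffer[96];

    for(unsigned i = 0; i < sizeof(buffer); i++) buffer[i] = (uint8_t)(i * 37 + 1);

    /* Offset by 3 so the 16-byte alignment pre-loop has work to do, then try
     * every length from empty up to a few blocks. */
    for(long len = 0; (unsigned)len <= sizeof(buffer) - 3; len++)
    {
        uint16_t a1 = 0xFFFF, a2 = 0xFFFF; /* seed values are arbitrary here   */
        uint16_t b1 = 0xFFFF, b2 = 0xFFFF; /* both paths get the same seed     */

        fletcher32_neon(&a1, &a2, buffer + 3, len);
        fletcher32_reference(&b1, &b2, buffer + 3, len);

        if(a1 != b1 || a2 != b2) printf("mismatch at len=%ld\n", len);
    }

    return 0;
}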
@@ -59,68 +59,72 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
      * Process the data in blocks.
      */
     const unsigned BLOCK_SIZE = 1 << 5;
+    if(len >= BLOCK_SIZE)
+    {
         long blocks = len / BLOCK_SIZE;
         len -= blocks * BLOCK_SIZE;
         while(blocks)
         {
             unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
             if(n > blocks) n = (unsigned)blocks;
             blocks -= n;
             const __m128i tap1 = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
             const __m128i tap2 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
             const __m128i zero = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
             const __m128i ones = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1);
             /*
              * Process n blocks of data. At most NMAX data bytes can be
              * processed before s2 must be reduced modulo BASE.
              */
             __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
             __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
             __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);
             do
             {
                 /*
                  * Load 32 input bytes.
                  */
                 const __m128i bytes1 = _mm_loadu_si128((__m128i *)(data));
                 const __m128i bytes2 = _mm_loadu_si128((__m128i *)(data + 16));
                 /*
                  * Add previous block byte sum to v_ps.
                  */
                 v_ps = _mm_add_epi32(v_ps, v_s1);
                 /*
                  * Horizontally add the bytes for s1, multiply-adds the
                  * bytes by [ 32, 31, 30, ... ] for s2.
                  */
                 v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero));
                 const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1);
                 v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones));
                 v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
                 const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
                 v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
                 data += BLOCK_SIZE;
             }
             while(--n);
             v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
             /*
              * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
              */
 #define S23O1 _MM_SHUFFLE(2, 3, 0, 1) /* A B C D -> B A D C */
 #define S1O32 _MM_SHUFFLE(1, 0, 3, 2) /* A B C D -> C D A B */
             v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1));
             v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32));
             s1 += _mm_cvtsi128_si32(v_s1);
             v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1));
             v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32));
             s2 = _mm_cvtsi128_si32(v_s2);
 #undef S23O1
 #undef S1O32
             /*
              * Reduce.
              */
             s1 %= FLETCHER32_MODULE;
             s2 %= FLETCHER32_MODULE;
         }
+    }

     /*
      * Handle leftover data.
      */
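The tap vectors [32, 31, ..., 1] together with the final _mm_slli_epi32(v_ps, 5) encode how one 32-byte block advances the running sums: s1 grows by the plain byte sum, and s2 grows by 32 times the s1 value seen before the block plus each byte at offset i weighted by 32 - i. A small self-contained check of that identity (all names and values are local to the sketch; the modular reduction is omitted because the values used here stay well below 32 bits):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BLOCK 32

/* Advance the sums one byte at a time, the way a scalar leftover loop does. */
static void step_serial(uint32_t *s1, uint32_t *s2, const uint8_t *b)
{
    for(int i = 0; i < BLOCK; i++)
    {
        *s1 += b[i];
        *s2 += *s1;
    }
}

/* Advance the sums a whole block at a time: this mirrors the arithmetic the
 * SSSE3 code performs with _mm_sad_epu8 (byte sum), _mm_maddubs_epi16 against
 * the taps (weighted sum) and the shifted v_ps term (32 * old s1 per block). */
static void step_blockwise(uint32_t *s1, uint32_t *s2, const uint8_t *b)
{
    uint32_t byte_sum = 0, weighted = 0;

    for(int i = 0; i < BLOCK; i++)
    {
        byte_sum += b[i];
        weighted += (uint32_t)(BLOCK - i) * b[i]; /* taps 32, 31, ..., 1 */
    }

    *s2 += (*s1 << 5) + weighted; /* uses the s1 value from before the block */
    *s1 += byte_sum;
}

int main(void)
{
    uint8_t block[BLOCK];

    for(int i = 0; i < BLOCK; i++) block[i] = (uint8_t)(i * 11 + 3);

    uint32_t a1 = 1, a2 = 2, b1 = 1, b2 = 2;

    step_serial(&a1, &a2, block);
    step_blockwise(&b1, &b2, block);
    assert(a1 == b1 && a2 == b2);
    printf("s1=%u s2=%u\n", a1, a2);

    return 0;
}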
@@ -151,6 +155,7 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
         if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
         s2 %= FLETCHER32_MODULE;
     }

     /*
      * Return the recombined sums.
      */