Fix Adler and Fletcher calculations using SIMD when dataset is smaller than block size.

This commit is contained in:
2023-09-24 19:33:25 +01:00
parent 89382334ec
commit 0d9d1d92eb
9 changed files with 750 additions and 699 deletions

View File

@@ -52,101 +52,104 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
long blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while(blocks)
if(len >= BLOCK_SIZE)
{
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
long blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
if(n > blocks) n = (unsigned)blocks;
blocks -= n;
const __m256i tap = _mm256_set_epi8(1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32);
const __m256i zero = _mm256_setzero_si256();
const __m256i ones = _mm256_set1_epi16(1);
/*
* Process n blocks of data. At most NMAX data bytes can be
* processed before s2 must be reduced modulo BASE.
*/
__m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n));
__m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2);
__m256i v_s1 = _mm256_setzero_si256();
do
while(blocks)
{
/*
* Load 32 input bytes.
*/
const __m256i bytes = _mm256_lddqu_si256((__m256i *)(data));
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
if(n > blocks) n = (unsigned)blocks;
blocks -= n;
const __m256i tap = _mm256_set_epi8(1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32);
const __m256i zero = _mm256_setzero_si256();
const __m256i ones = _mm256_set1_epi16(1);
/*
* Add previous block byte sum to v_ps.
* Process n blocks of data. At most NMAX data bytes can be
* processed before s2 must be reduced modulo BASE.
*/
v_ps = _mm256_add_epi32(v_ps, v_s1);
/*
* Horizontally add the bytes for s1, multiply-adds the
* bytes by [ 32, 31, 30, ... ] for s2.
*/
v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes, zero));
const __m256i mad = _mm256_maddubs_epi16(bytes, tap);
v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones));
__m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n));
__m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2);
__m256i v_s1 = _mm256_setzero_si256();
do
{
/*
* Load 32 input bytes.
*/
const __m256i bytes = _mm256_lddqu_si256((__m256i *)(data));
data += BLOCK_SIZE;
/*
* Add previous block byte sum to v_ps.
*/
v_ps = _mm256_add_epi32(v_ps, v_s1);
/*
* Horizontally add the bytes for s1, multiply-adds the
* bytes by [ 32, 31, 30, ... ] for s2.
*/
v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes, zero));
const __m256i mad = _mm256_maddubs_epi16(bytes, tap);
v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones));
data += BLOCK_SIZE;
}
while(--n);
__m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1));
__m128i hi = _mm_unpackhi_epi64(sum, sum);
sum = _mm_add_epi32(hi, sum);
hi = _mm_shuffle_epi32(sum, 177);
sum = _mm_add_epi32(sum, hi);
s1 += _mm_cvtsi128_si32(sum);
v_s2 = _mm256_add_epi32(v_s2, _mm256_slli_epi32(v_ps, 5));
sum = _mm_add_epi32(_mm256_castsi256_si128(v_s2), _mm256_extracti128_si256(v_s2, 1));
hi = _mm_unpackhi_epi64(sum, sum);
sum = _mm_add_epi32(hi, sum);
hi = _mm_shuffle_epi32(sum, 177);
sum = _mm_add_epi32(sum, hi);
s2 = _mm_cvtsi128_si32(sum);
/*
* Reduce.
*/
s1 %= FLETCHER32_MODULE;
s2 %= FLETCHER32_MODULE;
}
while(--n);
__m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1));
__m128i hi = _mm_unpackhi_epi64(sum, sum);
sum = _mm_add_epi32(hi, sum);
hi = _mm_shuffle_epi32(sum, 177);
sum = _mm_add_epi32(sum, hi);
s1 += _mm_cvtsi128_si32(sum);
v_s2 = _mm256_add_epi32(v_s2, _mm256_slli_epi32(v_ps, 5));
sum = _mm_add_epi32(_mm256_castsi256_si128(v_s2), _mm256_extracti128_si256(v_s2, 1));
hi = _mm_unpackhi_epi64(sum, sum);
sum = _mm_add_epi32(hi, sum);
hi = _mm_shuffle_epi32(sum, 177);
sum = _mm_add_epi32(sum, hi);
s2 = _mm_cvtsi128_si32(sum);
/*
* Reduce.
*/
s1 %= FLETCHER32_MODULE;
s2 %= FLETCHER32_MODULE;
}
/*
@@ -179,6 +182,7 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
s2 %= FLETCHER32_MODULE;
}
/*
* Return the recombined sums.
*/