Fix the SIMD implementations of the Adler-32, Fletcher-16 and Fletcher-32 calculations when the dataset is smaller than the block size.

This commit is contained in:
2023-09-24 19:33:25 +01:00
parent 89382334ec
commit 0d9d1d92eb
9 changed files with 750 additions and 699 deletions

View File

@@ -51,6 +51,8 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
if(len >= BLOCK_SIZE)
{
long blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
@@ -147,6 +149,7 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
s1 %= ADLER_MODULE;
s2 %= ADLER_MODULE;
}
}
/*
* Handle leftover data.
@@ -178,6 +181,7 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
s2 %= ADLER_MODULE;
}
/*
* Return the recombined sums.
*/

View File

@@ -55,6 +55,13 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
*/
uint32_t s1 = *sum1;
uint32_t s2 = *sum2;
/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
if(len >= BLOCK_SIZE)
{
/*
* Serially compute s1 & s2, until the data is 16-byte aligned.
*/
@@ -68,10 +75,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
s2 %= ADLER_MODULE;
}
/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
uint32_t blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while(blocks)
@@ -167,6 +171,8 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
s1 %= ADLER_MODULE;
s2 %= ADLER_MODULE;
}
}
/*
* Handle leftover data.
*/
@@ -197,6 +203,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
s2 %= ADLER_MODULE;
}
/*
* Return the recombined sums.
*/

View File

@@ -60,6 +60,8 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
if(len >= BLOCK_SIZE)
{
long blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while(blocks)
@@ -122,6 +124,8 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
s1 %= ADLER_MODULE;
s2 %= ADLER_MODULE;
}
}
/*
* Handle leftover data.
*/
@@ -152,6 +156,7 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
s2 %= ADLER_MODULE;
}
/*
* Return the recombined sums.
*/

View File

@@ -52,6 +52,8 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
if(len >= BLOCK_SIZE)
{
long blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
@@ -148,6 +150,7 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
s1 %= FLETCHER16_MODULE;
s2 %= FLETCHER16_MODULE;
}
}
/*
* Handle leftover data.
@@ -179,6 +182,7 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
s1 %= FLETCHER16_MODULE;
s2 %= FLETCHER16_MODULE;
}
/*
* Return the recombined sums.
*/

View File

@@ -55,6 +55,13 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
*/
uint32_t s1 = *sum1;
uint32_t s2 = *sum2;
/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
if(len >= BLOCK_SIZE)
{
/*
* Serially compute s1 & s2, until the data is 16-byte aligned.
*/
@@ -68,10 +75,7 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
s1 %= FLETCHER16_MODULE;
s2 %= FLETCHER16_MODULE;
}
/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
uint32_t blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while(blocks)
@@ -94,7 +98,8 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
do {
do
{
/*
* Load 32 input bytes.
*/
@@ -116,7 +121,8 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2));
v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
data += BLOCK_SIZE;
} while(--n);
}
while(--n);
v_s2 = vshlq_n_u32(v_s2, 5);
/*
* Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
@@ -165,6 +171,8 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
s1 %= FLETCHER16_MODULE;
s2 %= FLETCHER16_MODULE;
}
}
/*
* Handle leftover data.
*/
@@ -190,10 +198,12 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
s2 += (s1 += *data++);
len -= 16;
}
while(len--) { s2 += (s1 += *data++); }
while(len--)
{ s2 += (s1 += *data++); }
s1 %= FLETCHER16_MODULE;
s2 %= FLETCHER16_MODULE;
}
/*
* Return the recombined sums.
*/

View File

@@ -59,6 +59,8 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
if(len >= BLOCK_SIZE)
{
long blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while(blocks)
@@ -121,6 +123,8 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
s1 %= FLETCHER16_MODULE;
s2 %= FLETCHER16_MODULE;
}
}
/*
* Handle leftover data.
*/
@@ -151,6 +155,7 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
s1 %= FLETCHER16_MODULE;
s2 %= FLETCHER16_MODULE;
}
/*
* Return the recombined sums.
*/

View File

@@ -52,6 +52,8 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
if(len >= BLOCK_SIZE)
{
long blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
@@ -148,6 +150,7 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
s1 %= FLETCHER32_MODULE;
s2 %= FLETCHER32_MODULE;
}
}
/*
* Handle leftover data.
@@ -179,6 +182,7 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
s2 %= FLETCHER32_MODULE;
}
/*
* Return the recombined sums.
*/

View File

@@ -55,6 +55,13 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
*/
uint32_t s1 = *sum1;
uint32_t s2 = *sum2;
/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
if(len >= BLOCK_SIZE)
{
/*
* Serially compute s1 & s2, until the data is 16-byte aligned.
*/
@@ -68,10 +75,7 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
s2 %= FLETCHER32_MODULE;
}
/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
uint32_t blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while(blocks)
@@ -167,6 +171,8 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
s1 %= FLETCHER32_MODULE;
s2 %= FLETCHER32_MODULE;
}
}
/*
* Handle leftover data.
*/
@@ -197,6 +203,7 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
s2 %= FLETCHER32_MODULE;
}
/*
* Return the recombined sums.
*/

View File

@@ -59,6 +59,8 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
if(len >= BLOCK_SIZE)
{
long blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while(blocks)
@@ -121,6 +123,8 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
s1 %= FLETCHER32_MODULE;
s2 %= FLETCHER32_MODULE;
}
}
/*
* Handle leftover data.
*/
@@ -151,6 +155,7 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
s2 %= FLETCHER32_MODULE;
}
/*
* Return the recombined sums.
*/