mirror of
https://github.com/aaru-dps/Aaru.Checksums.Native.git
synced 2025-12-16 11:14:29 +00:00
Fix the SIMD implementations of the Adler and Fletcher checksums when the dataset is smaller than the block size.
This commit is contained in:
@@ -51,6 +51,8 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
if(len >= BLOCK_SIZE)
|
||||
{
|
||||
long blocks = len / BLOCK_SIZE;
|
||||
len -= blocks * BLOCK_SIZE;
|
||||
|
||||
@@ -147,6 +149,7 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
|
||||
s1 %= ADLER_MODULE;
|
||||
s2 %= ADLER_MODULE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle leftover data.
|
||||
@@ -178,6 +181,7 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
|
||||
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
|
||||
s2 %= ADLER_MODULE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the recombined sums.
|
||||
*/
|
||||
|
||||
@@ -55,6 +55,13 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
|
||||
*/
|
||||
uint32_t s1 = *sum1;
|
||||
uint32_t s2 = *sum2;
|
||||
|
||||
/*
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
if(len >= BLOCK_SIZE)
|
||||
{
|
||||
/*
|
||||
* Serially compute s1 & s2, until the data is 16-byte aligned.
|
||||
*/
|
||||
@@ -68,10 +75,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
|
||||
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
|
||||
s2 %= ADLER_MODULE;
|
||||
}
|
||||
/*
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
|
||||
uint32_t blocks = len / BLOCK_SIZE;
|
||||
len -= blocks * BLOCK_SIZE;
|
||||
while(blocks)
|
||||
@@ -167,6 +171,8 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
|
||||
s1 %= ADLER_MODULE;
|
||||
s2 %= ADLER_MODULE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle leftover data.
|
||||
*/
|
||||
@@ -197,6 +203,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
|
||||
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
|
||||
s2 %= ADLER_MODULE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the recombined sums.
|
||||
*/
|
||||
|
||||
@@ -60,6 +60,8 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
if(len >= BLOCK_SIZE)
|
||||
{
|
||||
long blocks = len / BLOCK_SIZE;
|
||||
len -= blocks * BLOCK_SIZE;
|
||||
while(blocks)
|
||||
@@ -122,6 +124,8 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
|
||||
s1 %= ADLER_MODULE;
|
||||
s2 %= ADLER_MODULE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle leftover data.
|
||||
*/
|
||||
@@ -152,6 +156,7 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
|
||||
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
|
||||
s2 %= ADLER_MODULE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the recombined sums.
|
||||
*/
|
||||
|
||||
@@ -52,6 +52,8 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
if(len >= BLOCK_SIZE)
|
||||
{
|
||||
long blocks = len / BLOCK_SIZE;
|
||||
len -= blocks * BLOCK_SIZE;
|
||||
|
||||
@@ -148,6 +150,7 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
|
||||
s1 %= FLETCHER16_MODULE;
|
||||
s2 %= FLETCHER16_MODULE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle leftover data.
|
||||
@@ -179,6 +182,7 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
|
||||
s1 %= FLETCHER16_MODULE;
|
||||
s2 %= FLETCHER16_MODULE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the recombined sums.
|
||||
*/
|
||||
|
||||
@@ -48,13 +48,20 @@
|
||||
* @param data Pointer to the data buffer.
|
||||
* @param len Length of the data buffer in bytes.
|
||||
*/
|
||||
TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_t* data, uint32_t len)
|
||||
TARGET_WITH_NEON void fletcher16_neon(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, uint32_t len)
|
||||
{
|
||||
/*
|
||||
* Split Fletcher-16 into component sums.
|
||||
*/
|
||||
uint32_t s1 = *sum1;
|
||||
uint32_t s2 = *sum2;
|
||||
|
||||
/*
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
if(len >= BLOCK_SIZE)
|
||||
{
|
||||
/*
|
||||
* Serially compute s1 & s2, until the data is 16-byte aligned.
|
||||
*/
|
||||
@@ -68,10 +75,7 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
|
||||
s1 %= FLETCHER16_MODULE;
|
||||
s2 %= FLETCHER16_MODULE;
|
||||
}
|
||||
/*
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
|
||||
uint32_t blocks = len / BLOCK_SIZE;
|
||||
len -= blocks * BLOCK_SIZE;
|
||||
while(blocks)
|
||||
@@ -94,12 +98,13 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
|
||||
uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
|
||||
uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
|
||||
uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
|
||||
do {
|
||||
do
|
||||
{
|
||||
/*
|
||||
* Load 32 input bytes.
|
||||
*/
|
||||
const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(data));
|
||||
const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(data + 16));
|
||||
const uint8x16_t bytes1 = vld1q_u8((uint8_t *)(data));
|
||||
const uint8x16_t bytes2 = vld1q_u8((uint8_t *)(data + 16));
|
||||
/*
|
||||
* Add previous block byte sum to v_s2.
|
||||
*/
|
||||
@@ -116,7 +121,8 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
|
||||
v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2));
|
||||
v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
|
||||
data += BLOCK_SIZE;
|
||||
} while(--n);
|
||||
}
|
||||
while(--n);
|
||||
v_s2 = vshlq_n_u32(v_s2, 5);
|
||||
/*
|
||||
* Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
|
||||
@@ -165,6 +171,8 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
|
||||
s1 %= FLETCHER16_MODULE;
|
||||
s2 %= FLETCHER16_MODULE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle leftover data.
|
||||
*/
|
||||
@@ -190,10 +198,12 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
|
||||
s2 += (s1 += *data++);
|
||||
len -= 16;
|
||||
}
|
||||
while(len--) { s2 += (s1 += *data++); }
|
||||
while(len--)
|
||||
{ s2 += (s1 += *data++); }
|
||||
s1 %= FLETCHER16_MODULE;
|
||||
s2 %= FLETCHER16_MODULE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the recombined sums.
|
||||
*/
|
||||
|
||||
@@ -59,6 +59,8 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
if(len >= BLOCK_SIZE)
|
||||
{
|
||||
long blocks = len / BLOCK_SIZE;
|
||||
len -= blocks * BLOCK_SIZE;
|
||||
while(blocks)
|
||||
@@ -121,6 +123,8 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
|
||||
s1 %= FLETCHER16_MODULE;
|
||||
s2 %= FLETCHER16_MODULE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle leftover data.
|
||||
*/
|
||||
@@ -151,6 +155,7 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
|
||||
s1 %= FLETCHER16_MODULE;
|
||||
s2 %= FLETCHER16_MODULE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the recombined sums.
|
||||
*/
|
||||
|
||||
@@ -52,6 +52,8 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
if(len >= BLOCK_SIZE)
|
||||
{
|
||||
long blocks = len / BLOCK_SIZE;
|
||||
len -= blocks * BLOCK_SIZE;
|
||||
|
||||
@@ -148,6 +150,7 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
|
||||
s1 %= FLETCHER32_MODULE;
|
||||
s2 %= FLETCHER32_MODULE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle leftover data.
|
||||
@@ -179,6 +182,7 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
|
||||
if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
|
||||
s2 %= FLETCHER32_MODULE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the recombined sums.
|
||||
*/
|
||||
|
||||
@@ -55,6 +55,13 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
|
||||
*/
|
||||
uint32_t s1 = *sum1;
|
||||
uint32_t s2 = *sum2;
|
||||
|
||||
/*
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
if(len >= BLOCK_SIZE)
|
||||
{
|
||||
/*
|
||||
* Serially compute s1 & s2, until the data is 16-byte aligned.
|
||||
*/
|
||||
@@ -68,10 +75,7 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
|
||||
if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
|
||||
s2 %= FLETCHER32_MODULE;
|
||||
}
|
||||
/*
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
|
||||
uint32_t blocks = len / BLOCK_SIZE;
|
||||
len -= blocks * BLOCK_SIZE;
|
||||
while(blocks)
|
||||
@@ -167,6 +171,8 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
|
||||
s1 %= FLETCHER32_MODULE;
|
||||
s2 %= FLETCHER32_MODULE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle leftover data.
|
||||
*/
|
||||
@@ -197,6 +203,7 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
|
||||
if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
|
||||
s2 %= FLETCHER32_MODULE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the recombined sums.
|
||||
*/
|
||||
|
||||
@@ -59,6 +59,8 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
if(len >= BLOCK_SIZE)
|
||||
{
|
||||
long blocks = len / BLOCK_SIZE;
|
||||
len -= blocks * BLOCK_SIZE;
|
||||
while(blocks)
|
||||
@@ -121,6 +123,8 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
|
||||
s1 %= FLETCHER32_MODULE;
|
||||
s2 %= FLETCHER32_MODULE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle leftover data.
|
||||
*/
|
||||
@@ -151,6 +155,7 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
|
||||
if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
|
||||
s2 %= FLETCHER32_MODULE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the recombined sums.
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user