General refactor and cleanup.

This commit is contained in:
2024-04-30 15:12:48 +01:00
parent 552aa9da02
commit bd5051ce18
48 changed files with 1157 additions and 1290 deletions

View File

@@ -88,8 +88,12 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
* processed before s2 must be reduced modulo ADLER_MODULE.
*/
#ifdef _MSC_VER
uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}};
uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}};
uint32x4_t v_s2 = {
.n128_u32 = {0, 0, 0, s1 * n}
};
uint32x4_t v_s1 = {
.n128_u32 = {0, 0, 0, 0}
};
#else
uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n};
uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0};
@@ -98,8 +102,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
do
{
do {
/*
* Load 32 input bytes.
*/
@@ -108,22 +111,21 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
/*
* Add previous block byte sum to v_s2.
*/
v_s2 = vaddq_u32(v_s2, v_s1);
v_s2 = vaddq_u32(v_s2, v_s1);
/*
* Horizontally add the bytes for s1.
*/
v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2));
v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2));
/*
* Vertically add the bytes for s2.
*/
v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8(bytes1));
v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2));
v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8(bytes1));
v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2));
v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
data += BLOCK_SIZE;
}
while(--n);
v_s2 = vshlq_n_u32(v_s2, 5);
} while(--n);
v_s2 = vshlq_n_u32(v_s2, 5);
/*
* Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
*/
@@ -198,8 +200,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
s2 += (s1 += *data++);
len -= 16;
}
while(len--)
{ s2 += (s1 += *data++); }
while(len--) { s2 += (s1 += *data++); }
if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
s2 %= ADLER_MODULE;
}