From 0d9d1d92ebb8ef8f3bd6c001edf435c620671aca Mon Sep 17 00:00:00 2001 From: Natalia Portillo Date: Sun, 24 Sep 2023 19:33:25 +0100 Subject: [PATCH] Fix Adler and Fletcher calculations using SIMD when dataset is smaller than block size. --- adler32_avx2.c | 178 ++++++++++++++++++++-------------------- adler32_neon.c | 199 +++++++++++++++++++++++---------------------- adler32_ssse3.c | 107 ++++++++++++------------ fletcher16_avx2.c | 178 ++++++++++++++++++++-------------------- fletcher16_neon.c | 196 +++++++++++++++++++++++--------------------- fletcher16_ssse3.c | 107 ++++++++++++------------ fletcher32_avx2.c | 178 ++++++++++++++++++++-------------------- fletcher32_neon.c | 199 +++++++++++++++++++++++---------------------- fletcher32_ssse3.c | 107 ++++++++++++------------ 9 files changed, 750 insertions(+), 699 deletions(-) diff --git a/adler32_avx2.c b/adler32_avx2.c index d2f1f5a..cd74f04 100644 --- a/adler32_avx2.c +++ b/adler32_avx2.c @@ -51,101 +51,104 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_ * Process the data in blocks. */ const unsigned BLOCK_SIZE = 1 << 5; - long blocks = len / BLOCK_SIZE; - len -= blocks * BLOCK_SIZE; - - while(blocks) + if(len >= BLOCK_SIZE) { - unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ + long blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; - if(n > blocks) n = (unsigned)blocks; - blocks -= n; - - const __m256i tap = _mm256_set_epi8(1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32); - const __m256i zero = _mm256_setzero_si256(); - const __m256i ones = _mm256_set1_epi16(1); - - /* - * Process n blocks of data. At most NMAX data bytes can be - * processed before s2 must be reduced modulo BASE. - */ - __m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n)); - __m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2); - __m256i v_s1 = _mm256_setzero_si256(); - do + while(blocks) { - /* - * Load 32 input bytes. - */ - const __m256i bytes = _mm256_lddqu_si256((__m256i *)(data)); + unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ + + if(n > blocks) n = (unsigned)blocks; + blocks -= n; + + const __m256i tap = _mm256_set_epi8(1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32); + const __m256i zero = _mm256_setzero_si256(); + const __m256i ones = _mm256_set1_epi16(1); /* - * Add previous block byte sum to v_ps. + * Process n blocks of data. At most NMAX data bytes can be + * processed before s2 must be reduced modulo BASE. */ - v_ps = _mm256_add_epi32(v_ps, v_s1); - /* - * Horizontally add the bytes for s1, multiply-adds the - * bytes by [ 32, 31, 30, ... ] for s2. - */ - v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes, zero)); - const __m256i mad = _mm256_maddubs_epi16(bytes, tap); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones)); + __m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n)); + __m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2); + __m256i v_s1 = _mm256_setzero_si256(); + do + { + /* + * Load 32 input bytes. + */ + const __m256i bytes = _mm256_lddqu_si256((__m256i *)(data)); - data += BLOCK_SIZE; + /* + * Add previous block byte sum to v_ps. + */ + v_ps = _mm256_add_epi32(v_ps, v_s1); + /* + * Horizontally add the bytes for s1, multiply-adds the + * bytes by [ 32, 31, 30, ... ] for s2. 
+ */ + v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes, zero)); + const __m256i mad = _mm256_maddubs_epi16(bytes, tap); + v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones)); + + data += BLOCK_SIZE; + } + while(--n); + + __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1)); + __m128i hi = _mm_unpackhi_epi64(sum, sum); + sum = _mm_add_epi32(hi, sum); + hi = _mm_shuffle_epi32(sum, 177); + sum = _mm_add_epi32(sum, hi); + s1 += _mm_cvtsi128_si32(sum); + + v_s2 = _mm256_add_epi32(v_s2, _mm256_slli_epi32(v_ps, 5)); + sum = _mm_add_epi32(_mm256_castsi256_si128(v_s2), _mm256_extracti128_si256(v_s2, 1)); + hi = _mm_unpackhi_epi64(sum, sum); + sum = _mm_add_epi32(hi, sum); + hi = _mm_shuffle_epi32(sum, 177); + sum = _mm_add_epi32(sum, hi); + s2 = _mm_cvtsi128_si32(sum); + + /* + * Reduce. + */ + s1 %= ADLER_MODULE; + s2 %= ADLER_MODULE; } - while(--n); - - __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1)); - __m128i hi = _mm_unpackhi_epi64(sum, sum); - sum = _mm_add_epi32(hi, sum); - hi = _mm_shuffle_epi32(sum, 177); - sum = _mm_add_epi32(sum, hi); - s1 += _mm_cvtsi128_si32(sum); - - v_s2 = _mm256_add_epi32(v_s2, _mm256_slli_epi32(v_ps, 5)); - sum = _mm_add_epi32(_mm256_castsi256_si128(v_s2), _mm256_extracti128_si256(v_s2, 1)); - hi = _mm_unpackhi_epi64(sum, sum); - sum = _mm_add_epi32(hi, sum); - hi = _mm_shuffle_epi32(sum, 177); - sum = _mm_add_epi32(sum, hi); - s2 = _mm_cvtsi128_si32(sum); - - /* - * Reduce. - */ - s1 %= ADLER_MODULE; - s2 %= ADLER_MODULE; } /* @@ -178,6 +181,7 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_ if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE; s2 %= ADLER_MODULE; } + /* * Return the recombined sums. */ diff --git a/adler32_neon.c b/adler32_neon.c index c203573..523ce1d 100644 --- a/adler32_neon.c +++ b/adler32_neon.c @@ -55,118 +55,124 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t */ uint32_t s1 = *sum1; uint32_t s2 = *sum2; - /* - * Serially compute s1 & s2, until the data is 16-byte aligned. - */ - if((uintptr_t)data & 15) - { - while((uintptr_t)data & 15) - { - s2 += (s1 += *data++); - --len; - } - if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE; - s2 %= ADLER_MODULE; - } + /* * Process the data in blocks. */ const unsigned BLOCK_SIZE = 1 << 5; - uint32_t blocks = len / BLOCK_SIZE; - len -= blocks * BLOCK_SIZE; - while(blocks) + if(len >= BLOCK_SIZE) { - unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ - if(n > blocks) n = (unsigned)blocks; - blocks -= n; /* - * Process n blocks of data. At most NMAX data bytes can be - * processed before s2 must be reduced modulo ADLER_MODULE. + * Serially compute s1 & s2, until the data is 16-byte aligned. */ -#ifdef _MSC_VER - uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}}; - uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}}; -#else - uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n}; - uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0}; -#endif - uint16x8_t v_column_sum_1 = vdupq_n_u16(0); - uint16x8_t v_column_sum_2 = vdupq_n_u16(0); - uint16x8_t v_column_sum_3 = vdupq_n_u16(0); - uint16x8_t v_column_sum_4 = vdupq_n_u16(0); - do + if((uintptr_t)data & 15) { - /* - * Load 32 input bytes. - */ - const uint8x16_t bytes1 = vld1q_u8((uint8_t *)(data)); - const uint8x16_t bytes2 = vld1q_u8((uint8_t *)(data + 16)); - /* - * Add previous block byte sum to v_s2. - */ - v_s2 = vaddq_u32(v_s2, v_s1); - /* - * Horizontally add the bytes for s1. 
- */ - v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2)); - /* - * Vertically add the bytes for s2. - */ - v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8(bytes1)); - v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1)); - v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2)); - v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2)); - data += BLOCK_SIZE; + while((uintptr_t)data & 15) + { + s2 += (s1 += *data++); + --len; + } + if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE; + s2 %= ADLER_MODULE; } - while(--n); - v_s2 = vshlq_n_u32(v_s2, 5); - /* - * Multiply-add bytes by [ 32, 31, 30, ... ] for s2. - */ + + uint32_t blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; + while(blocks) + { + unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ + if(n > blocks) n = (unsigned)blocks; + blocks -= n; + /* + * Process n blocks of data. At most NMAX data bytes can be + * processed before s2 must be reduced modulo ADLER_MODULE. + */ +#ifdef _MSC_VER + uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}}; + uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}}; +#else + uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n}; + uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0}; +#endif + uint16x8_t v_column_sum_1 = vdupq_n_u16(0); + uint16x8_t v_column_sum_2 = vdupq_n_u16(0); + uint16x8_t v_column_sum_3 = vdupq_n_u16(0); + uint16x8_t v_column_sum_4 = vdupq_n_u16(0); + do + { + /* + * Load 32 input bytes. + */ + const uint8x16_t bytes1 = vld1q_u8((uint8_t *)(data)); + const uint8x16_t bytes2 = vld1q_u8((uint8_t *)(data + 16)); + /* + * Add previous block byte sum to v_s2. + */ + v_s2 = vaddq_u32(v_s2, v_s1); + /* + * Horizontally add the bytes for s1. + */ + v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2)); + /* + * Vertically add the bytes for s2. + */ + v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8(bytes1)); + v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1)); + v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2)); + v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2)); + data += BLOCK_SIZE; + } + while(--n); + v_s2 = vshlq_n_u32(v_s2, 5); + /* + * Multiply-add bytes by [ 32, 31, 30, ... ] for s2. 
+ */ #ifdef _MSC_VER #ifdef _M_ARM64 - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){32, 31, 30, 29})); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){28, 27, 26, 25})); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){24, 23, 22, 21})); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){20, 19, 18, 17})); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){16, 15, 14, 13})); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){12, 11, 10, 9})); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){8, 7, 6, 5})); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){4, 3, 2, 1})); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){32, 31, 30, 29})); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){28, 27, 26, 25})); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){24, 23, 22, 21})); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){20, 19, 18, 17})); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){16, 15, 14, 13})); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){12, 11, 10, 9})); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){8, 7, 6, 5})); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){4, 3, 2, 1})); #else - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), vld1_u16(((uint16_t[]){32, 31, 30, 29}))); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), vld1_u16(((uint16_t[]){28, 27, 26, 25}))); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), vld1_u16(((uint16_t[]){24, 23, 22, 21}))); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), vld1_u16(((uint16_t[]){20, 19, 18, 17}))); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), vld1_u16(((uint16_t[]){16, 15, 14, 13}))); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), vld1_u16(((uint16_t[]){12, 11, 10, 9}))); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), vld1_u16(((uint16_t[]){8, 7, 6, 5}))); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), vld1_u16(((uint16_t[]){4, 3, 2, 1}))); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), vld1_u16(((uint16_t[]){32, 31, 30, 29}))); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), vld1_u16(((uint16_t[]){28, 27, 26, 25}))); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), vld1_u16(((uint16_t[]){24, 23, 22, 21}))); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), vld1_u16(((uint16_t[]){20, 19, 18, 17}))); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), vld1_u16(((uint16_t[]){16, 15, 14, 13}))); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), vld1_u16(((uint16_t[]){12, 11, 10, 9}))); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), vld1_u16(((uint16_t[]){8, 7, 6, 5}))); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), vld1_u16(((uint16_t[]){4, 3, 2, 1}))); #endif #else - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), (uint16x4_t){32, 31, 30, 29}); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), (uint16x4_t){28, 27, 26, 25}); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), (uint16x4_t){24, 23, 22, 21}); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), (uint16x4_t){20, 19, 18, 17}); - 
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), (uint16x4_t){16, 15, 14, 13}); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), (uint16x4_t){12, 11, 10, 9}); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), (uint16x4_t){8, 7, 6, 5}); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), (uint16x4_t){4, 3, 2, 1}); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), (uint16x4_t){32, 31, 30, 29}); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), (uint16x4_t){28, 27, 26, 25}); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), (uint16x4_t){24, 23, 22, 21}); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), (uint16x4_t){20, 19, 18, 17}); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), (uint16x4_t){16, 15, 14, 13}); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), (uint16x4_t){12, 11, 10, 9}); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), (uint16x4_t){8, 7, 6, 5}); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), (uint16x4_t){4, 3, 2, 1}); #endif - /* - * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). - */ - uint32x2_t t_s1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1)); - uint32x2_t t_s2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2)); - uint32x2_t s1s2 = vpadd_u32(t_s1, t_s2); - s1 += vget_lane_u32(s1s2, 0); - s2 += vget_lane_u32(s1s2, 1); - /* - * Reduce. - */ - s1 %= ADLER_MODULE; - s2 %= ADLER_MODULE; + /* + * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). + */ + uint32x2_t t_s1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1)); + uint32x2_t t_s2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2)); + uint32x2_t s1s2 = vpadd_u32(t_s1, t_s2); + s1 += vget_lane_u32(s1s2, 0); + s2 += vget_lane_u32(s1s2, 1); + /* + * Reduce. + */ + s1 %= ADLER_MODULE; + s2 %= ADLER_MODULE; + } } + /* * Handle leftover data. */ @@ -197,6 +203,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE; s2 %= ADLER_MODULE; } + /* * Return the recombined sums. */ diff --git a/adler32_ssse3.c b/adler32_ssse3.c index 870a7e8..8ebfb9e 100644 --- a/adler32_ssse3.c +++ b/adler32_ssse3.c @@ -60,68 +60,72 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len) * Process the data in blocks. */ const unsigned BLOCK_SIZE = 1 << 5; - long blocks = len / BLOCK_SIZE; - len -= blocks * BLOCK_SIZE; - while(blocks) + if(len >= BLOCK_SIZE) { - unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ - if(n > blocks) n = (unsigned)blocks; - blocks -= n; - const __m128i tap1 = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); - const __m128i tap2 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - const __m128i zero = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - const __m128i ones = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); - /* - * Process n blocks of data. At most NMAX data bytes can be - * processed before s2 must be reduced modulo BASE. - */ - __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n); - __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2); - __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0); - do + long blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; + while(blocks) { + unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. 
*/ + if(n > blocks) n = (unsigned)blocks; + blocks -= n; + const __m128i tap1 = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); + const __m128i tap2 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m128i zero = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + const __m128i ones = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); /* - * Load 32 input bytes. + * Process n blocks of data. At most NMAX data bytes can be + * processed before s2 must be reduced modulo BASE. */ - const __m128i bytes1 = _mm_loadu_si128((__m128i *)(data)); - const __m128i bytes2 = _mm_loadu_si128((__m128i *)(data + 16)); + __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n); + __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2); + __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0); + do + { + /* + * Load 32 input bytes. + */ + const __m128i bytes1 = _mm_loadu_si128((__m128i *)(data)); + const __m128i bytes2 = _mm_loadu_si128((__m128i *)(data + 16)); + /* + * Add previous block byte sum to v_ps. + */ + v_ps = _mm_add_epi32(v_ps, v_s1); + /* + * Horizontally add the bytes for s1, multiply-adds the + * bytes by [ 32, 31, 30, ... ] for s2. + */ + v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero)); + const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones)); + v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero)); + const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones)); + data += BLOCK_SIZE; + } + while(--n); + v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5)); /* - * Add previous block byte sum to v_ps. + * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). */ - v_ps = _mm_add_epi32(v_ps, v_s1); - /* - * Horizontally add the bytes for s1, multiply-adds the - * bytes by [ 32, 31, 30, ... ] for s2. - */ - v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero)); - const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones)); - v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero)); - const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones)); - data += BLOCK_SIZE; - } - while(--n); - v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5)); - /* - * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). - */ #define S23O1 _MM_SHUFFLE(2, 3, 0, 1) /* A B C D -> B A D C */ #define S1O32 _MM_SHUFFLE(1, 0, 3, 2) /* A B C D -> C D A B */ - v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1)); - v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32)); - s1 += _mm_cvtsi128_si32(v_s1); - v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1)); - v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32)); - s2 = _mm_cvtsi128_si32(v_s2); + v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1)); + v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32)); + s1 += _mm_cvtsi128_si32(v_s1); + v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1)); + v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32)); + s2 = _mm_cvtsi128_si32(v_s2); #undef S23O1 #undef S1O32 - /* - * Reduce. - */ - s1 %= ADLER_MODULE; - s2 %= ADLER_MODULE; + /* + * Reduce. + */ + s1 %= ADLER_MODULE; + s2 %= ADLER_MODULE; + } } + /* * Handle leftover data. */ @@ -152,6 +156,7 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len) if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE; s2 %= ADLER_MODULE; } + /* * Return the recombined sums. 
*/ diff --git a/fletcher16_avx2.c b/fletcher16_avx2.c index 8a66e48..154673a 100644 --- a/fletcher16_avx2.c +++ b/fletcher16_avx2.c @@ -52,101 +52,104 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len) * Process the data in blocks. */ const unsigned BLOCK_SIZE = 1 << 5; - long blocks = len / BLOCK_SIZE; - len -= blocks * BLOCK_SIZE; - - while(blocks) + if(len >= BLOCK_SIZE) { - unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ + long blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; - if(n > blocks) n = (unsigned)blocks; - blocks -= n; - - const __m256i tap = _mm256_set_epi8(1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32); - const __m256i zero = _mm256_setzero_si256(); - const __m256i ones = _mm256_set1_epi16(1); - - /* - * Process n blocks of data. At most NMAX data bytes can be - * processed before s2 must be reduced modulo BASE. - */ - __m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n)); - __m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2); - __m256i v_s1 = _mm256_setzero_si256(); - do + while(blocks) { - /* - * Load 32 input bytes. - */ - const __m256i bytes = _mm256_lddqu_si256((__m256i *)(data)); + unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ + + if(n > blocks) n = (unsigned)blocks; + blocks -= n; + + const __m256i tap = _mm256_set_epi8(1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32); + const __m256i zero = _mm256_setzero_si256(); + const __m256i ones = _mm256_set1_epi16(1); /* - * Add previous block byte sum to v_ps. + * Process n blocks of data. At most NMAX data bytes can be + * processed before s2 must be reduced modulo BASE. */ - v_ps = _mm256_add_epi32(v_ps, v_s1); - /* - * Horizontally add the bytes for s1, multiply-adds the - * bytes by [ 32, 31, 30, ... ] for s2. - */ - v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes, zero)); - const __m256i mad = _mm256_maddubs_epi16(bytes, tap); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones)); + __m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n)); + __m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2); + __m256i v_s1 = _mm256_setzero_si256(); + do + { + /* + * Load 32 input bytes. + */ + const __m256i bytes = _mm256_lddqu_si256((__m256i *)(data)); - data += BLOCK_SIZE; + /* + * Add previous block byte sum to v_ps. + */ + v_ps = _mm256_add_epi32(v_ps, v_s1); + /* + * Horizontally add the bytes for s1, multiply-adds the + * bytes by [ 32, 31, 30, ... ] for s2. 
+ */ + v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes, zero)); + const __m256i mad = _mm256_maddubs_epi16(bytes, tap); + v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones)); + + data += BLOCK_SIZE; + } + while(--n); + + __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1)); + __m128i hi = _mm_unpackhi_epi64(sum, sum); + sum = _mm_add_epi32(hi, sum); + hi = _mm_shuffle_epi32(sum, 177); + sum = _mm_add_epi32(sum, hi); + s1 += _mm_cvtsi128_si32(sum); + + v_s2 = _mm256_add_epi32(v_s2, _mm256_slli_epi32(v_ps, 5)); + sum = _mm_add_epi32(_mm256_castsi256_si128(v_s2), _mm256_extracti128_si256(v_s2, 1)); + hi = _mm_unpackhi_epi64(sum, sum); + sum = _mm_add_epi32(hi, sum); + hi = _mm_shuffle_epi32(sum, 177); + sum = _mm_add_epi32(sum, hi); + s2 = _mm_cvtsi128_si32(sum); + + /* + * Reduce. + */ + s1 %= FLETCHER16_MODULE; + s2 %= FLETCHER16_MODULE; } - while(--n); - - __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1)); - __m128i hi = _mm_unpackhi_epi64(sum, sum); - sum = _mm_add_epi32(hi, sum); - hi = _mm_shuffle_epi32(sum, 177); - sum = _mm_add_epi32(sum, hi); - s1 += _mm_cvtsi128_si32(sum); - - v_s2 = _mm256_add_epi32(v_s2, _mm256_slli_epi32(v_ps, 5)); - sum = _mm_add_epi32(_mm256_castsi256_si128(v_s2), _mm256_extracti128_si256(v_s2, 1)); - hi = _mm_unpackhi_epi64(sum, sum); - sum = _mm_add_epi32(hi, sum); - hi = _mm_shuffle_epi32(sum, 177); - sum = _mm_add_epi32(sum, hi); - s2 = _mm_cvtsi128_si32(sum); - - /* - * Reduce. - */ - s1 %= FLETCHER16_MODULE; - s2 %= FLETCHER16_MODULE; } /* @@ -179,6 +182,7 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len) s1 %= FLETCHER16_MODULE; s2 %= FLETCHER16_MODULE; } + /* * Return the recombined sums. */ diff --git a/fletcher16_neon.c b/fletcher16_neon.c index 2d30d7f..dd3a71d 100644 --- a/fletcher16_neon.c +++ b/fletcher16_neon.c @@ -48,123 +48,131 @@ * @param data Pointer to the data buffer. * @param len Length of the data buffer in bytes. */ -TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_t* data, uint32_t len) +TARGET_WITH_NEON void fletcher16_neon(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, uint32_t len) { /* * Split Fletcher-16 into component sums. */ uint32_t s1 = *sum1; uint32_t s2 = *sum2; - /* - * Serially compute s1 & s2, until the data is 16-byte aligned. - */ - if((uintptr_t)data & 15) - { - while((uintptr_t)data & 15) - { - s2 += (s1 += *data++); - --len; - } - s1 %= FLETCHER16_MODULE; - s2 %= FLETCHER16_MODULE; - } + /* * Process the data in blocks. */ const unsigned BLOCK_SIZE = 1 << 5; - uint32_t blocks = len / BLOCK_SIZE; - len -= blocks * BLOCK_SIZE; - while(blocks) + if(len >= BLOCK_SIZE) { - unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ - if(n > blocks) n = (unsigned)blocks; - blocks -= n; /* - * Process n blocks of data. At most NMAX data bytes can be - * processed before s2 must be reduced modulo FLETCHER16_MODULE. + * Serially compute s1 & s2, until the data is 16-byte aligned. */ + if((uintptr_t)data & 15) + { + while((uintptr_t)data & 15) + { + s2 += (s1 += *data++); + --len; + } + s1 %= FLETCHER16_MODULE; + s2 %= FLETCHER16_MODULE; + } + + uint32_t blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; + while(blocks) + { + unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ + if(n > blocks) n = (unsigned)blocks; + blocks -= n; + /* + * Process n blocks of data. 
At most NMAX data bytes can be + * processed before s2 must be reduced modulo FLETCHER16_MODULE. + */ #ifdef _MSC_VER - uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}}; - uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}}; + uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}}; + uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}}; #else - uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n}; - uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0}; + uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n}; + uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0}; #endif - uint16x8_t v_column_sum_1 = vdupq_n_u16(0); - uint16x8_t v_column_sum_2 = vdupq_n_u16(0); - uint16x8_t v_column_sum_3 = vdupq_n_u16(0); - uint16x8_t v_column_sum_4 = vdupq_n_u16(0); - do { + uint16x8_t v_column_sum_1 = vdupq_n_u16(0); + uint16x8_t v_column_sum_2 = vdupq_n_u16(0); + uint16x8_t v_column_sum_3 = vdupq_n_u16(0); + uint16x8_t v_column_sum_4 = vdupq_n_u16(0); + do + { + /* + * Load 32 input bytes. + */ + const uint8x16_t bytes1 = vld1q_u8((uint8_t *)(data)); + const uint8x16_t bytes2 = vld1q_u8((uint8_t *)(data + 16)); + /* + * Add previous block byte sum to v_s2. + */ + v_s2 = vaddq_u32(v_s2, v_s1); + /* + * Horizontally add the bytes for s1. + */ + v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2)); + /* + * Vertically add the bytes for s2. + */ + v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8(bytes1)); + v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1)); + v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2)); + v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2)); + data += BLOCK_SIZE; + } + while(--n); + v_s2 = vshlq_n_u32(v_s2, 5); /* - * Load 32 input bytes. + * Multiply-add bytes by [ 32, 31, 30, ... ] for s2. */ - const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(data)); - const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(data + 16)); - /* - * Add previous block byte sum to v_s2. - */ - v_s2 = vaddq_u32(v_s2, v_s1); - /* - * Horizontally add the bytes for s1. - */ - v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2)); - /* - * Vertically add the bytes for s2. - */ - v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8(bytes1)); - v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1)); - v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2)); - v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2)); - data += BLOCK_SIZE; - } while(--n); - v_s2 = vshlq_n_u32(v_s2, 5); - /* - * Multiply-add bytes by [ 32, 31, 30, ... ] for s2. 
- */ #ifdef _MSC_VER #ifdef _M_ARM64 - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){32, 31, 30, 29})); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){28, 27, 26, 25})); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){24, 23, 22, 21})); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){20, 19, 18, 17})); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){16, 15, 14, 13})); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){12, 11, 10, 9})); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){8, 7, 6, 5})); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){4, 3, 2, 1})); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){32, 31, 30, 29})); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){28, 27, 26, 25})); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){24, 23, 22, 21})); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){20, 19, 18, 17})); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){16, 15, 14, 13})); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){12, 11, 10, 9})); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){8, 7, 6, 5})); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){4, 3, 2, 1})); #else - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), vld1_u16(((uint16_t[]){32, 31, 30, 29}))); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), vld1_u16(((uint16_t[]){28, 27, 26, 25}))); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), vld1_u16(((uint16_t[]){24, 23, 22, 21}))); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), vld1_u16(((uint16_t[]){20, 19, 18, 17}))); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), vld1_u16(((uint16_t[]){16, 15, 14, 13}))); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), vld1_u16(((uint16_t[]){12, 11, 10, 9}))); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), vld1_u16(((uint16_t[]){8, 7, 6, 5}))); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), vld1_u16(((uint16_t[]){4, 3, 2, 1}))); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), vld1_u16(((uint16_t[]){32, 31, 30, 29}))); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), vld1_u16(((uint16_t[]){28, 27, 26, 25}))); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), vld1_u16(((uint16_t[]){24, 23, 22, 21}))); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), vld1_u16(((uint16_t[]){20, 19, 18, 17}))); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), vld1_u16(((uint16_t[]){16, 15, 14, 13}))); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), vld1_u16(((uint16_t[]){12, 11, 10, 9}))); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), vld1_u16(((uint16_t[]){8, 7, 6, 5}))); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), vld1_u16(((uint16_t[]){4, 3, 2, 1}))); #endif #else - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), (uint16x4_t){32, 31, 30, 29}); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), (uint16x4_t){28, 27, 26, 25}); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), (uint16x4_t){24, 23, 22, 21}); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), (uint16x4_t){20, 19, 18, 17}); - 
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), (uint16x4_t){16, 15, 14, 13}); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), (uint16x4_t){12, 11, 10, 9}); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), (uint16x4_t){8, 7, 6, 5}); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), (uint16x4_t){4, 3, 2, 1}); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), (uint16x4_t){32, 31, 30, 29}); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), (uint16x4_t){28, 27, 26, 25}); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), (uint16x4_t){24, 23, 22, 21}); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), (uint16x4_t){20, 19, 18, 17}); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), (uint16x4_t){16, 15, 14, 13}); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), (uint16x4_t){12, 11, 10, 9}); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), (uint16x4_t){8, 7, 6, 5}); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), (uint16x4_t){4, 3, 2, 1}); #endif - /* - * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). - */ - uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1)); - uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2)); - uint32x2_t s1s2 = vpadd_u32(sum1, sum2); - s1 += vget_lane_u32(s1s2, 0); - s2 += vget_lane_u32(s1s2, 1); - /* - * Reduce. - */ - s1 %= FLETCHER16_MODULE; - s2 %= FLETCHER16_MODULE; + /* + * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). + */ + uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1)); + uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2)); + uint32x2_t s1s2 = vpadd_u32(sum1, sum2); + s1 += vget_lane_u32(s1s2, 0); + s2 += vget_lane_u32(s1s2, 1); + /* + * Reduce. + */ + s1 %= FLETCHER16_MODULE; + s2 %= FLETCHER16_MODULE; + } } + /* * Handle leftover data. */ @@ -190,10 +198,12 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_ s2 += (s1 += *data++); len -= 16; } - while(len--) { s2 += (s1 += *data++); } + while(len--) + { s2 += (s1 += *data++); } s1 %= FLETCHER16_MODULE; s2 %= FLETCHER16_MODULE; } + /* * Return the recombined sums. */ diff --git a/fletcher16_ssse3.c b/fletcher16_ssse3.c index 838e213..916fb0b 100644 --- a/fletcher16_ssse3.c +++ b/fletcher16_ssse3.c @@ -59,68 +59,72 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len) * Process the data in blocks. */ const unsigned BLOCK_SIZE = 1 << 5; - long blocks = len / BLOCK_SIZE; - len -= blocks * BLOCK_SIZE; - while(blocks) + if(len >= BLOCK_SIZE) { - unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ - if(n > blocks) n = (unsigned)blocks; - blocks -= n; - const __m128i tap1 = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); - const __m128i tap2 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - const __m128i zero = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - const __m128i ones = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); - /* - * Process n blocks of data. At most NMAX data bytes can be - * processed before s2 must be reduced modulo BASE. - */ - __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n); - __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2); - __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0); - do + long blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; + while(blocks) { + unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. 
*/ + if(n > blocks) n = (unsigned)blocks; + blocks -= n; + const __m128i tap1 = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); + const __m128i tap2 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m128i zero = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + const __m128i ones = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); /* - * Load 32 input bytes. + * Process n blocks of data. At most NMAX data bytes can be + * processed before s2 must be reduced modulo BASE. */ - const __m128i bytes1 = _mm_loadu_si128((__m128i *)(data)); - const __m128i bytes2 = _mm_loadu_si128((__m128i *)(data + 16)); + __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n); + __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2); + __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0); + do + { + /* + * Load 32 input bytes. + */ + const __m128i bytes1 = _mm_loadu_si128((__m128i *)(data)); + const __m128i bytes2 = _mm_loadu_si128((__m128i *)(data + 16)); + /* + * Add previous block byte sum to v_ps. + */ + v_ps = _mm_add_epi32(v_ps, v_s1); + /* + * Horizontally add the bytes for s1, multiply-adds the + * bytes by [ 32, 31, 30, ... ] for s2. + */ + v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero)); + const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones)); + v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero)); + const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones)); + data += BLOCK_SIZE; + } + while(--n); + v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5)); /* - * Add previous block byte sum to v_ps. + * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). */ - v_ps = _mm_add_epi32(v_ps, v_s1); - /* - * Horizontally add the bytes for s1, multiply-adds the - * bytes by [ 32, 31, 30, ... ] for s2. - */ - v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero)); - const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones)); - v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero)); - const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones)); - data += BLOCK_SIZE; - } - while(--n); - v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5)); - /* - * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). - */ #define S23O1 _MM_SHUFFLE(2, 3, 0, 1) /* A B C D -> B A D C */ #define S1O32 _MM_SHUFFLE(1, 0, 3, 2) /* A B C D -> C D A B */ - v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1)); - v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32)); - s1 += _mm_cvtsi128_si32(v_s1); - v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1)); - v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32)); - s2 = _mm_cvtsi128_si32(v_s2); + v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1)); + v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32)); + s1 += _mm_cvtsi128_si32(v_s1); + v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1)); + v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32)); + s2 = _mm_cvtsi128_si32(v_s2); #undef S23O1 #undef S1O32 - /* - * Reduce. - */ - s1 %= FLETCHER16_MODULE; - s2 %= FLETCHER16_MODULE; + /* + * Reduce. + */ + s1 %= FLETCHER16_MODULE; + s2 %= FLETCHER16_MODULE; + } } + /* * Handle leftover data. */ @@ -151,6 +155,7 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len) s1 %= FLETCHER16_MODULE; s2 %= FLETCHER16_MODULE; } + /* * Return the recombined sums. 
*/ diff --git a/fletcher32_avx2.c b/fletcher32_avx2.c index 3e17120..20fcbc4 100644 --- a/fletcher32_avx2.c +++ b/fletcher32_avx2.c @@ -52,101 +52,104 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len) * Process the data in blocks. */ const unsigned BLOCK_SIZE = 1 << 5; - long blocks = len / BLOCK_SIZE; - len -= blocks * BLOCK_SIZE; - - while(blocks) + if(len >= BLOCK_SIZE) { - unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ + long blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; - if(n > blocks) n = (unsigned)blocks; - blocks -= n; - - const __m256i tap = _mm256_set_epi8(1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32); - const __m256i zero = _mm256_setzero_si256(); - const __m256i ones = _mm256_set1_epi16(1); - - /* - * Process n blocks of data. At most NMAX data bytes can be - * processed before s2 must be reduced modulo BASE. - */ - __m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n)); - __m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2); - __m256i v_s1 = _mm256_setzero_si256(); - do + while(blocks) { - /* - * Load 32 input bytes. - */ - const __m256i bytes = _mm256_lddqu_si256((__m256i *)(data)); + unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ + + if(n > blocks) n = (unsigned)blocks; + blocks -= n; + + const __m256i tap = _mm256_set_epi8(1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32); + const __m256i zero = _mm256_setzero_si256(); + const __m256i ones = _mm256_set1_epi16(1); /* - * Add previous block byte sum to v_ps. + * Process n blocks of data. At most NMAX data bytes can be + * processed before s2 must be reduced modulo BASE. */ - v_ps = _mm256_add_epi32(v_ps, v_s1); - /* - * Horizontally add the bytes for s1, multiply-adds the - * bytes by [ 32, 31, 30, ... ] for s2. - */ - v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes, zero)); - const __m256i mad = _mm256_maddubs_epi16(bytes, tap); - v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones)); + __m256i v_ps = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, (s1 * n)); + __m256i v_s2 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, s2); + __m256i v_s1 = _mm256_setzero_si256(); + do + { + /* + * Load 32 input bytes. + */ + const __m256i bytes = _mm256_lddqu_si256((__m256i *)(data)); - data += BLOCK_SIZE; + /* + * Add previous block byte sum to v_ps. + */ + v_ps = _mm256_add_epi32(v_ps, v_s1); + /* + * Horizontally add the bytes for s1, multiply-adds the + * bytes by [ 32, 31, 30, ... ] for s2. 
+ */ + v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes, zero)); + const __m256i mad = _mm256_maddubs_epi16(bytes, tap); + v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(mad, ones)); + + data += BLOCK_SIZE; + } + while(--n); + + __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1)); + __m128i hi = _mm_unpackhi_epi64(sum, sum); + sum = _mm_add_epi32(hi, sum); + hi = _mm_shuffle_epi32(sum, 177); + sum = _mm_add_epi32(sum, hi); + s1 += _mm_cvtsi128_si32(sum); + + v_s2 = _mm256_add_epi32(v_s2, _mm256_slli_epi32(v_ps, 5)); + sum = _mm_add_epi32(_mm256_castsi256_si128(v_s2), _mm256_extracti128_si256(v_s2, 1)); + hi = _mm_unpackhi_epi64(sum, sum); + sum = _mm_add_epi32(hi, sum); + hi = _mm_shuffle_epi32(sum, 177); + sum = _mm_add_epi32(sum, hi); + s2 = _mm_cvtsi128_si32(sum); + + /* + * Reduce. + */ + s1 %= FLETCHER32_MODULE; + s2 %= FLETCHER32_MODULE; } - while(--n); - - __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v_s1), _mm256_extracti128_si256(v_s1, 1)); - __m128i hi = _mm_unpackhi_epi64(sum, sum); - sum = _mm_add_epi32(hi, sum); - hi = _mm_shuffle_epi32(sum, 177); - sum = _mm_add_epi32(sum, hi); - s1 += _mm_cvtsi128_si32(sum); - - v_s2 = _mm256_add_epi32(v_s2, _mm256_slli_epi32(v_ps, 5)); - sum = _mm_add_epi32(_mm256_castsi256_si128(v_s2), _mm256_extracti128_si256(v_s2, 1)); - hi = _mm_unpackhi_epi64(sum, sum); - sum = _mm_add_epi32(hi, sum); - hi = _mm_shuffle_epi32(sum, 177); - sum = _mm_add_epi32(sum, hi); - s2 = _mm_cvtsi128_si32(sum); - - /* - * Reduce. - */ - s1 %= FLETCHER32_MODULE; - s2 %= FLETCHER32_MODULE; } /* @@ -179,6 +182,7 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len) if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE; s2 %= FLETCHER32_MODULE; } + /* * Return the recombined sums. */ diff --git a/fletcher32_neon.c b/fletcher32_neon.c index 43832e0..8a59864 100644 --- a/fletcher32_neon.c +++ b/fletcher32_neon.c @@ -55,118 +55,124 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint */ uint32_t s1 = *sum1; uint32_t s2 = *sum2; - /* - * Serially compute s1 & s2, until the data is 16-byte aligned. - */ - if((uintptr_t)data & 15) - { - while((uintptr_t)data & 15) - { - s2 += (s1 += *data++); - --len; - } - if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE; - s2 %= FLETCHER32_MODULE; - } + /* * Process the data in blocks. */ const unsigned BLOCK_SIZE = 1 << 5; - uint32_t blocks = len / BLOCK_SIZE; - len -= blocks * BLOCK_SIZE; - while(blocks) + if(len >= BLOCK_SIZE) { - unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ - if(n > blocks) n = (unsigned)blocks; - blocks -= n; /* - * Process n blocks of data. At most NMAX data bytes can be - * processed before s2 must be reduced modulo FLETCHER32_MODULE. + * Serially compute s1 & s2, until the data is 16-byte aligned. */ -#ifdef _MSC_VER - uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}}; - uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}}; -#else - uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n}; - uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0}; -#endif - uint16x8_t v_column_sum_1 = vdupq_n_u16(0); - uint16x8_t v_column_sum_2 = vdupq_n_u16(0); - uint16x8_t v_column_sum_3 = vdupq_n_u16(0); - uint16x8_t v_column_sum_4 = vdupq_n_u16(0); - do + if((uintptr_t)data & 15) { - /* - * Load 32 input bytes. - */ - const uint8x16_t bytes1 = vld1q_u8((uint8_t *)(data)); - const uint8x16_t bytes2 = vld1q_u8((uint8_t *)(data + 16)); - /* - * Add previous block byte sum to v_s2. 
- */ - v_s2 = vaddq_u32(v_s2, v_s1); - /* - * Horizontally add the bytes for s1. - */ - v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2)); - /* - * Vertically add the bytes for s2. - */ - v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8(bytes1)); - v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1)); - v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2)); - v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2)); - data += BLOCK_SIZE; + while((uintptr_t)data & 15) + { + s2 += (s1 += *data++); + --len; + } + if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE; + s2 %= FLETCHER32_MODULE; } - while(--n); - v_s2 = vshlq_n_u32(v_s2, 5); - /* - * Multiply-add bytes by [ 32, 31, 30, ... ] for s2. - */ + + uint32_t blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; + while(blocks) + { + unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ + if(n > blocks) n = (unsigned)blocks; + blocks -= n; + /* + * Process n blocks of data. At most NMAX data bytes can be + * processed before s2 must be reduced modulo FLETCHER32_MODULE. + */ +#ifdef _MSC_VER + uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}}; + uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}}; +#else + uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n}; + uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0}; +#endif + uint16x8_t v_column_sum_1 = vdupq_n_u16(0); + uint16x8_t v_column_sum_2 = vdupq_n_u16(0); + uint16x8_t v_column_sum_3 = vdupq_n_u16(0); + uint16x8_t v_column_sum_4 = vdupq_n_u16(0); + do + { + /* + * Load 32 input bytes. + */ + const uint8x16_t bytes1 = vld1q_u8((uint8_t *)(data)); + const uint8x16_t bytes2 = vld1q_u8((uint8_t *)(data + 16)); + /* + * Add previous block byte sum to v_s2. + */ + v_s2 = vaddq_u32(v_s2, v_s1); + /* + * Horizontally add the bytes for s1. + */ + v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2)); + /* + * Vertically add the bytes for s2. + */ + v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8(bytes1)); + v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1)); + v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2)); + v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2)); + data += BLOCK_SIZE; + } + while(--n); + v_s2 = vshlq_n_u32(v_s2, 5); + /* + * Multiply-add bytes by [ 32, 31, 30, ... ] for s2. 
+ */ #ifdef _MSC_VER #ifdef _M_ARM64 - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){32, 31, 30, 29})); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){28, 27, 26, 25})); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){24, 23, 22, 21})); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){20, 19, 18, 17})); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){16, 15, 14, 13})); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){12, 11, 10, 9})); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){8, 7, 6, 5})); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){4, 3, 2, 1})); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){32, 31, 30, 29})); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){28, 27, 26, 25})); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){24, 23, 22, 21})); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){20, 19, 18, 17})); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){16, 15, 14, 13})); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){12, 11, 10, 9})); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){8, 7, 6, 5})); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){4, 3, 2, 1})); #else - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), vld1_u16(((uint16_t[]){32, 31, 30, 29}))); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), vld1_u16(((uint16_t[]){28, 27, 26, 25}))); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), vld1_u16(((uint16_t[]){24, 23, 22, 21}))); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), vld1_u16(((uint16_t[]){20, 19, 18, 17}))); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), vld1_u16(((uint16_t[]){16, 15, 14, 13}))); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), vld1_u16(((uint16_t[]){12, 11, 10, 9}))); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), vld1_u16(((uint16_t[]){8, 7, 6, 5}))); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), vld1_u16(((uint16_t[]){4, 3, 2, 1}))); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), vld1_u16(((uint16_t[]){32, 31, 30, 29}))); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), vld1_u16(((uint16_t[]){28, 27, 26, 25}))); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), vld1_u16(((uint16_t[]){24, 23, 22, 21}))); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), vld1_u16(((uint16_t[]){20, 19, 18, 17}))); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), vld1_u16(((uint16_t[]){16, 15, 14, 13}))); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), vld1_u16(((uint16_t[]){12, 11, 10, 9}))); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), vld1_u16(((uint16_t[]){8, 7, 6, 5}))); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), vld1_u16(((uint16_t[]){4, 3, 2, 1}))); #endif #else - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), (uint16x4_t){32, 31, 30, 29}); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), (uint16x4_t){28, 27, 26, 25}); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), (uint16x4_t){24, 23, 22, 21}); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), (uint16x4_t){20, 19, 18, 17}); - 
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), (uint16x4_t){16, 15, 14, 13}); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), (uint16x4_t){12, 11, 10, 9}); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), (uint16x4_t){8, 7, 6, 5}); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), (uint16x4_t){4, 3, 2, 1}); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), (uint16x4_t){32, 31, 30, 29}); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), (uint16x4_t){28, 27, 26, 25}); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), (uint16x4_t){24, 23, 22, 21}); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), (uint16x4_t){20, 19, 18, 17}); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), (uint16x4_t){16, 15, 14, 13}); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), (uint16x4_t){12, 11, 10, 9}); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), (uint16x4_t){8, 7, 6, 5}); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), (uint16x4_t){4, 3, 2, 1}); #endif - /* - * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). - */ - uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1)); - uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2)); - uint32x2_t s1s2 = vpadd_u32(sum1, sum2); - s1 += vget_lane_u32(s1s2, 0); - s2 += vget_lane_u32(s1s2, 1); - /* - * Reduce. - */ - s1 %= FLETCHER32_MODULE; - s2 %= FLETCHER32_MODULE; + /* + * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). + */ + uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1)); + uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2)); + uint32x2_t s1s2 = vpadd_u32(sum1, sum2); + s1 += vget_lane_u32(s1s2, 0); + s2 += vget_lane_u32(s1s2, 1); + /* + * Reduce. + */ + s1 %= FLETCHER32_MODULE; + s2 %= FLETCHER32_MODULE; + } } + /* * Handle leftover data. */ @@ -197,6 +203,7 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE; s2 %= FLETCHER32_MODULE; } + /* * Return the recombined sums. */ diff --git a/fletcher32_ssse3.c b/fletcher32_ssse3.c index 20cc08c..b9983b8 100644 --- a/fletcher32_ssse3.c +++ b/fletcher32_ssse3.c @@ -59,68 +59,72 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len) * Process the data in blocks. */ const unsigned BLOCK_SIZE = 1 << 5; - long blocks = len / BLOCK_SIZE; - len -= blocks * BLOCK_SIZE; - while(blocks) + if(len >= BLOCK_SIZE) { - unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ - if(n > blocks) n = (unsigned)blocks; - blocks -= n; - const __m128i tap1 = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); - const __m128i tap2 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - const __m128i zero = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - const __m128i ones = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); - /* - * Process n blocks of data. At most NMAX data bytes can be - * processed before s2 must be reduced modulo BASE. - */ - __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n); - __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2); - __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0); - do + long blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; + while(blocks) { + unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. 
*/ + if(n > blocks) n = (unsigned)blocks; + blocks -= n; + const __m128i tap1 = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); + const __m128i tap2 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m128i zero = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + const __m128i ones = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); /* - * Load 32 input bytes. + * Process n blocks of data. At most NMAX data bytes can be + * processed before s2 must be reduced modulo BASE. */ - const __m128i bytes1 = _mm_loadu_si128((__m128i *)(data)); - const __m128i bytes2 = _mm_loadu_si128((__m128i *)(data + 16)); + __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n); + __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2); + __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0); + do + { + /* + * Load 32 input bytes. + */ + const __m128i bytes1 = _mm_loadu_si128((__m128i *)(data)); + const __m128i bytes2 = _mm_loadu_si128((__m128i *)(data + 16)); + /* + * Add previous block byte sum to v_ps. + */ + v_ps = _mm_add_epi32(v_ps, v_s1); + /* + * Horizontally add the bytes for s1, multiply-adds the + * bytes by [ 32, 31, 30, ... ] for s2. + */ + v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero)); + const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones)); + v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero)); + const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones)); + data += BLOCK_SIZE; + } + while(--n); + v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5)); /* - * Add previous block byte sum to v_ps. + * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). */ - v_ps = _mm_add_epi32(v_ps, v_s1); - /* - * Horizontally add the bytes for s1, multiply-adds the - * bytes by [ 32, 31, 30, ... ] for s2. - */ - v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero)); - const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones)); - v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero)); - const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2); - v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones)); - data += BLOCK_SIZE; - } - while(--n); - v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5)); - /* - * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). - */ #define S23O1 _MM_SHUFFLE(2, 3, 0, 1) /* A B C D -> B A D C */ #define S1O32 _MM_SHUFFLE(1, 0, 3, 2) /* A B C D -> C D A B */ - v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1)); - v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32)); - s1 += _mm_cvtsi128_si32(v_s1); - v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1)); - v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32)); - s2 = _mm_cvtsi128_si32(v_s2); + v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1)); + v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32)); + s1 += _mm_cvtsi128_si32(v_s1); + v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1)); + v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32)); + s2 = _mm_cvtsi128_si32(v_s2); #undef S23O1 #undef S1O32 - /* - * Reduce. - */ - s1 %= FLETCHER32_MODULE; - s2 %= FLETCHER32_MODULE; + /* + * Reduce. + */ + s1 %= FLETCHER32_MODULE; + s2 %= FLETCHER32_MODULE; + } } + /* * Handle leftover data. */ @@ -151,6 +155,7 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len) if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE; s2 %= FLETCHER32_MODULE; } + /* * Return the recombined sums. */
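
Every SIMD variant touched by this patch applies the same shape of guard. The scalar sketch below is illustrative only (hypothetical names such as fletcher16_sketch, SKETCH_MODULE and SKETCH_BLOCK, simplified Fletcher-16 style with per-block reduction instead of the NMAX batching the real routines use); it shows the control flow the patch enforces, not the actual intrinsics code above.

#include <stdint.h>

#define SKETCH_MODULE 0xFF      /* stands in for FLETCHER16_MODULE / ADLER_MODULE  */
#define SKETCH_BLOCK  (1 << 5)  /* BLOCK_SIZE: 32 bytes per SIMD iteration         */

static void fletcher16_sketch(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, uint32_t len)
{
    uint32_t s1 = *sum1;
    uint32_t s2 = *sum2;

    /*
     * The guard added by this patch: the alignment pre-loop and the block
     * loop only run when at least one full block is available.  Without it,
     * a buffer shorter than one block lets the pre-loop consume more bytes
     * than the buffer holds, wrapping an unsigned len (or driving a signed
     * one negative) and corrupting the block count.
     */
    if(len >= SKETCH_BLOCK)
    {
        /* Serially compute s1 & s2 until the data is 16-byte aligned. */
        while((uintptr_t)data & 15)
        {
            s2 += (s1 += *data++);
            --len;
        }
        s1 %= SKETCH_MODULE;
        s2 %= SKETCH_MODULE;

        uint32_t blocks = len / SKETCH_BLOCK;
        len -= blocks * SKETCH_BLOCK;

        while(blocks--)
        {
            /*
             * Stand-in for the 32-byte AVX2/SSSE3/NEON block body; the real
             * code defers this reduction for up to NMAX bytes.
             */
            for(unsigned i = 0; i < SKETCH_BLOCK; i++) s2 += (s1 += *data++);

            s1 %= SKETCH_MODULE;
            s2 %= SKETCH_MODULE;
        }
    }

    /* Leftover data, and the whole buffer when len < SKETCH_BLOCK. */
    while(len--) s2 += (s1 += *data++);

    s1 %= SKETCH_MODULE;
    s2 %= SKETCH_MODULE;

    *sum1 = (uint8_t)s1;
    *sum2 = (uint8_t)s2;
}

Called with a buffer shorter than one block (say 7 bytes), the guarded section is skipped entirely and the checksum comes purely from the scalar tail, which is the behaviour all nine SIMD variants now share.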