Fix SIMD Adler and Fletcher calculations when the dataset is smaller than the block size.

2025-12-16 11:14:29 +00:00 · 2023-09-24 19:33:25 +01:00
parent 89382334ec
commit 0d9d1d92eb
9 changed files with 750 additions and 699 deletions
--- a/adler32_avx2.c
+++ b/adler32_avx2.c
@@ -51,6 +51,8 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
     * Process the data in blocks.
     */
    const unsigned BLOCK_SIZE = 1 << 5;
+    if(len >= BLOCK_SIZE)
+    {
        long blocks = len / BLOCK_SIZE;
        len -= blocks * BLOCK_SIZE;

@@ -147,6 +149,7 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
            s1 %= ADLER_MODULE;
            s2 %= ADLER_MODULE;
        }
+    }

    /*
     * Handle leftover data.
@@ -178,6 +181,7 @@ AARU_EXPORT TARGET_WITH_AVX2 void AARU_CALL adler32_avx2(uint16_t *sum1, uint16_
        if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
        s2 %= ADLER_MODULE;
    }
+
    /*
     * Return the recombined sums.
     */
--- a/adler32_neon.c
+++ b/adler32_neon.c
@@ -55,6 +55,13 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
     */
    uint32_t s1 = *sum1;
    uint32_t s2 = *sum2;
+
+    /*
+     * Process the data in blocks.
+     */
+    const unsigned BLOCK_SIZE = 1 << 5;
+    if(len >= BLOCK_SIZE)
+    {
        /*
         * Serially compute s1 & s2, until the data is 16-byte aligned.
         */
@@ -68,10 +75,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
            if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
            s2 %= ADLER_MODULE;
        }
-    /*
-     * Process the data in blocks.
-     */
-    const unsigned BLOCK_SIZE = 1 << 5;
+
        uint32_t blocks = len / BLOCK_SIZE;
        len -= blocks * BLOCK_SIZE;
        while(blocks)
@@ -167,6 +171,8 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
            s1 %= ADLER_MODULE;
            s2 %= ADLER_MODULE;
        }
+    }
+
    /*
     * Handle leftover data.
     */
@@ -197,6 +203,7 @@ TARGET_WITH_NEON void adler32_neon(uint16_t *sum1, uint16_t *sum2, const uint8_t
        if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
        s2 %= ADLER_MODULE;
    }
+
    /*
     * Return the recombined sums.
     */
--- a/adler32_ssse3.c
+++ b/adler32_ssse3.c
@@ -60,6 +60,8 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
     * Process the data in blocks.
     */
    const unsigned BLOCK_SIZE = 1 << 5;
+    if(len >= BLOCK_SIZE)
+    {
        long blocks = len / BLOCK_SIZE;
        len -= blocks * BLOCK_SIZE;
        while(blocks)
@@ -122,6 +124,8 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
            s1 %= ADLER_MODULE;
            s2 %= ADLER_MODULE;
        }
+    }
+
    /*
     * Handle leftover data.
     */
@@ -152,6 +156,7 @@ adler32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
        if(s1 >= ADLER_MODULE) s1 -= ADLER_MODULE;
        s2 %= ADLER_MODULE;
    }
+
    /*
     * Return the recombined sums.
     */
--- a/fletcher16_avx2.c
+++ b/fletcher16_avx2.c
@@ -52,6 +52,8 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
     * Process the data in blocks.
     */
    const unsigned BLOCK_SIZE = 1 << 5;
+    if(len >= BLOCK_SIZE)
+    {
        long           blocks     = len / BLOCK_SIZE;
        len -= blocks * BLOCK_SIZE;

@@ -148,6 +150,7 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
            s1 %= FLETCHER16_MODULE;
            s2 %= FLETCHER16_MODULE;
        }
+    }

    /*
     * Handle leftover data.
@@ -179,6 +182,7 @@ fletcher16_avx2(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
        s1 %= FLETCHER16_MODULE;
        s2 %= FLETCHER16_MODULE;
    }
+
    /*
     * Return the recombined sums.
     */
--- a/fletcher16_neon.c
+++ b/fletcher16_neon.c
@@ -55,6 +55,13 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
     */
    uint32_t s1 = *sum1;
    uint32_t s2 = *sum2;
+
+    /*
+     * Process the data in blocks.
+     */
+    const unsigned BLOCK_SIZE = 1 << 5;
+    if(len >= BLOCK_SIZE)
+    {
        /*
         * Serially compute s1 & s2, until the data is 16-byte aligned.
         */
@@ -68,10 +75,7 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
            s1 %= FLETCHER16_MODULE;
            s2 %= FLETCHER16_MODULE;
        }
-    /*
-     * Process the data in blocks.
-     */
-    const unsigned BLOCK_SIZE = 1 << 5;
+
        uint32_t blocks = len / BLOCK_SIZE;
        len -= blocks * BLOCK_SIZE;
        while(blocks)
@@ -94,7 +98,8 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
            uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
            uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
            uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
-        do {
+            do
+            {
                /*
                 * Load 32 input bytes.
                 */
@@ -116,7 +121,8 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
                v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8(bytes2));
                v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
                data += BLOCK_SIZE;
-        } while(--n);
+            }
+            while(--n);
            v_s2                      = vshlq_n_u32(v_s2, 5);
            /*
             * Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
@@ -165,6 +171,8 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
            s1 %= FLETCHER16_MODULE;
            s2 %= FLETCHER16_MODULE;
        }
+    }
+
    /*
     * Handle leftover data.
     */
@@ -190,10 +198,12 @@ TARGET_WITH_NEON void fletcher16_neon(uint8_t* sum1, uint8_t* sum2, const uint8_
            s2 += (s1 += *data++);
            len -= 16;
        }
-        while(len--) { s2 += (s1 += *data++); }
+        while(len--)
+        { s2 += (s1 += *data++); }
        s1 %= FLETCHER16_MODULE;
        s2 %= FLETCHER16_MODULE;
    }
+
    /*
     * Return the recombined sums.
     */
--- a/fletcher16_ssse3.c
+++ b/fletcher16_ssse3.c
@@ -59,6 +59,8 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
     * Process the data in blocks.
     */
    const unsigned BLOCK_SIZE = 1 << 5;
+    if(len >= BLOCK_SIZE)
+    {
        long blocks = len / BLOCK_SIZE;
        len -= blocks * BLOCK_SIZE;
        while(blocks)
@@ -121,6 +123,8 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
            s1 %= FLETCHER16_MODULE;
            s2 %= FLETCHER16_MODULE;
        }
+    }
+
    /*
     * Handle leftover data.
     */
@@ -151,6 +155,7 @@ fletcher16_ssse3(uint8_t *sum1, uint8_t *sum2, const uint8_t *data, long len)
        s1 %= FLETCHER16_MODULE;
        s2 %= FLETCHER16_MODULE;
    }
+
    /*
     * Return the recombined sums.
     */
--- a/fletcher32_avx2.c
+++ b/fletcher32_avx2.c
@@ -52,6 +52,8 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
     * Process the data in blocks.
     */
    const unsigned BLOCK_SIZE = 1 << 5;
+    if(len >= BLOCK_SIZE)
+    {
        long blocks = len / BLOCK_SIZE;
        len -= blocks * BLOCK_SIZE;

@@ -148,6 +150,7 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
            s1 %= FLETCHER32_MODULE;
            s2 %= FLETCHER32_MODULE;
        }
+    }

    /*
     * Handle leftover data.
@@ -179,6 +182,7 @@ fletcher32_avx2(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
        if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
        s2 %= FLETCHER32_MODULE;
    }
+
    /*
     * Return the recombined sums.
     */
--- a/fletcher32_neon.c
+++ b/fletcher32_neon.c
@@ -55,6 +55,13 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
     */
    uint32_t s1 = *sum1;
    uint32_t s2 = *sum2;
+
+    /*
+     * Process the data in blocks.
+     */
+    const unsigned BLOCK_SIZE = 1 << 5;
+    if(len >= BLOCK_SIZE)
+    {
        /*
         * Serially compute s1 & s2, until the data is 16-byte aligned.
         */
@@ -68,10 +75,7 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
            if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
            s2 %= FLETCHER32_MODULE;
        }
-    /*
-     * Process the data in blocks.
-     */
-    const unsigned BLOCK_SIZE = 1 << 5;
+
        uint32_t blocks = len / BLOCK_SIZE;
        len -= blocks * BLOCK_SIZE;
        while(blocks)
@@ -167,6 +171,8 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
            s1 %= FLETCHER32_MODULE;
            s2 %= FLETCHER32_MODULE;
        }
+    }
+
    /*
     * Handle leftover data.
     */
@@ -197,6 +203,7 @@ TARGET_WITH_NEON void fletcher32_neon(uint16_t *sum1, uint16_t *sum2, const uint
        if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
        s2 %= FLETCHER32_MODULE;
    }
+
    /*
     * Return the recombined sums.
     */
--- a/fletcher32_ssse3.c
+++ b/fletcher32_ssse3.c
@@ -59,6 +59,8 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
     * Process the data in blocks.
     */
    const unsigned BLOCK_SIZE = 1 << 5;
+    if(len >= BLOCK_SIZE)
+    {
        long blocks = len / BLOCK_SIZE;
        len -= blocks * BLOCK_SIZE;
        while(blocks)
@@ -121,6 +123,8 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
            s1 %= FLETCHER32_MODULE;
            s2 %= FLETCHER32_MODULE;
        }
+    }
+
    /*
     * Handle leftover data.
     */
@@ -151,6 +155,7 @@ fletcher32_ssse3(uint16_t *sum1, uint16_t *sum2, const uint8_t *data, long len)
        if(s1 >= FLETCHER32_MODULE) s1 -= FLETCHER32_MODULE;
        s2 %= FLETCHER32_MODULE;
    }
+
    /*
     * Return the recombined sums.
     */