From ee09f7c57e9d23e615d7f8090b25fc4b0d1e83b1 Mon Sep 17 00:00:00 2001
From: Natalia Portillo
Date: Mon, 11 Oct 2021 22:51:11 +0100
Subject: [PATCH] Fix SIMD implementations for partial CRC blocks.

---
 CRC32/clmul.cs  | 295 +------------------------------------------------
 CRC32Context.cs |  17 ++-
 CRC64/clmul.cs  |  27 +----
 CRC64Context.cs |  19 +++-
 4 files changed, 34 insertions(+), 324 deletions(-)

diff --git a/CRC32/clmul.cs b/CRC32/clmul.cs
index b446d53..679450d 100644
--- a/CRC32/clmul.cs
+++ b/CRC32/clmul.cs
@@ -81,91 +81,6 @@ namespace Aaru.Checksums.CRC32
             Vector128.Create(0x0201008fu, 0x06050403, 0x0a090807, 0x0e0d0c0b) /* shl 1 (16 -15)/shr15*/
         };
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        static void Fold1(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
-                          ref Vector128<uint> xmmCRC3)
-        {
-            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
-
-            Vector128<uint> xTmp3 = xmmCRC3;
-
-            xmmCRC3 = xmmCRC0;
-            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
-            Vector128<float> psCRC0 = xmmCRC0.AsSingle();
-            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
-            Vector128<float> psRes  = Sse.Xor(psCRC0, psCRC3);
-
-            xmmCRC0 = xmmCRC1;
-            xmmCRC1 = xmmCRC2;
-            xmmCRC2 = xTmp3;
-            xmmCRC3 = psRes.AsUInt32();
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        static void Fold2(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
-                          ref Vector128<uint> xmmCRC3)
-        {
-            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
-
-            Vector128<uint> xTmp3 = xmmCRC3;
-            Vector128<uint> xTmp2 = xmmCRC2;
-
-            xmmCRC3 = xmmCRC1;
-            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
-            Vector128<float> psCRC3  = xmmCRC3.AsSingle();
-            Vector128<float> psCRC1  = xmmCRC1.AsSingle();
-            Vector128<float> psRes31 = Sse.Xor(psCRC3, psCRC1);
-
-            xmmCRC2 = xmmCRC0;
-            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
-            Vector128<float> psCRC0  = xmmCRC0.AsSingle();
-            Vector128<float> psCRC2  = xmmCRC2.AsSingle();
-            Vector128<float> psRes20 = Sse.Xor(psCRC0, psCRC2);
-
-            xmmCRC0 = xTmp2;
-            xmmCRC1 = xTmp3;
-            xmmCRC2 = psRes20.AsUInt32();
-            xmmCRC3 = psRes31.AsUInt32();
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        static void Fold3(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
-                          ref Vector128<uint> xmmCRC3)
-        {
-            Vector128<uint> xmmFold4 = Vector128.Create(0x54442bd4, 0x00000001, 0xc6e41596, 0x00000001);
-
-            Vector128<uint> xTmp3 = xmmCRC3;
-
-            xmmCRC3 = xmmCRC2;
-            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
-            Vector128<float> psCRC2  = xmmCRC2.AsSingle();
-            Vector128<float> psCRC3  = xmmCRC3.AsSingle();
-            Vector128<float> psRes32 = Sse.Xor(psCRC2, psCRC3);
-
-            xmmCRC2 = xmmCRC1;
-            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
-            Vector128<float> psCRC1 = xmmCRC1.AsSingle();
-            psCRC2 = xmmCRC2.AsSingle();
-            Vector128<float> psRes21 = Sse.Xor(psCRC1, psCRC2);
-
-            xmmCRC1 = xmmCRC0;
-            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
-            Vector128<float> psCRC0 = xmmCRC0.AsSingle();
-            psCRC1 = xmmCRC1.AsSingle();
-            Vector128<float> psRes10 = Sse.Xor(psCRC0, psCRC1);
-
-            xmmCRC0 = xTmp3;
-            xmmCRC1 = psRes10.AsUInt32();
-            xmmCRC2 = psRes21.AsUInt32();
-            xmmCRC3 = psRes32.AsUInt32();
-        }
-
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         static void Fold4(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
                           ref Vector128<uint> xmmCRC3)
@@ -207,51 +122,6 @@ namespace Aaru.Checksums.CRC32
             xmmCRC3 = psRes3.AsUInt32();
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        static void PartialFold(long len, ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1,
-                                ref Vector128<uint> xmmCRC2, ref Vector128<uint> xmmCRC3,
-                                ref Vector128<uint> xmmCRCPart)
-        {
-            Vector128<uint> xmmFold4 = Vector128.Create(0x54442bd4, 0x00000001, 0xc6e41596, 0x00000001);
-            Vector128<uint> xmmMask3 = Vector128.Create(0x80808080);
-
-            Vector128<uint> xmmShl = _pshufbShfTable[len - 1];
-            Vector128<uint> xmmShr = xmmShl;
-            xmmShr = Sse2.Xor(xmmShr, xmmMask3);
-
-            Vector128<uint> xmmA00 = Ssse3.Shuffle(xmmCRC0.AsByte(), xmmShl.AsByte()).AsUInt32();
-
-            xmmCRC0 = Ssse3.Shuffle(xmmCRC0.AsByte(), xmmShr.AsByte()).AsUInt32();
-            Vector128<uint> xmmTmp1 = Ssse3.Shuffle(xmmCRC1.AsByte(), xmmShl.AsByte()).AsUInt32();
-            xmmCRC0 = Sse2.Or(xmmCRC0, xmmTmp1);
-
-            xmmCRC1 = Ssse3.Shuffle(xmmCRC1.AsByte(), xmmShr.AsByte()).AsUInt32();
-            Vector128<uint> xmmTmp2 = Ssse3.Shuffle(xmmCRC2.AsByte(), xmmShl.AsByte()).AsUInt32();
-            xmmCRC1 = Sse2.Or(xmmCRC1, xmmTmp2);
-
-            xmmCRC2 = Ssse3.Shuffle(xmmCRC2.AsByte(), xmmShr.AsByte()).AsUInt32();
-            Vector128<uint> xmmTmp3 = Ssse3.Shuffle(xmmCRC3.AsByte(), xmmShl.AsByte()).AsUInt32();
-            xmmCRC2 = Sse2.Or(xmmCRC2, xmmTmp3);
-
-            xmmCRC3 = Ssse3.Shuffle(xmmCRC3.AsByte(), xmmShr.AsByte()).AsUInt32();
-            xmmCRCPart = Ssse3.Shuffle(xmmCRCPart.AsByte(), xmmShl.AsByte()).AsUInt32();
-            xmmCRC3 = Sse2.Or(xmmCRC3, xmmCRCPart);
-
-            Vector128<uint> xmmA01 = Pclmulqdq.CarrylessMultiply(xmmA00.AsUInt64(), xmmFold4.AsUInt64(), 0x10).
-                                               AsUInt32();
-
-            xmmA00 = Pclmulqdq.CarrylessMultiply(xmmA00.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-
-            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
-            Vector128<float> psa00  = xmmA00.AsSingle();
-            Vector128<float> psa01  = xmmA01.AsSingle();
-
-            Vector128<float> psRes = Sse.Xor(psCRC3, psa00);
-            psRes = Sse.Xor(psRes, psa01);
-
-            xmmCRC3 = psRes.AsUInt32();
-        }
-
         internal static uint Step(byte[] src, long len, uint initialCRC)
         {
             Vector128<uint> xmmT0, xmmT1, xmmT2;
@@ -260,8 +130,7 @@ namespace Aaru.Checksums.CRC32
             Vector128<uint> xmmCRC1 = Vector128<uint>.Zero;
             Vector128<uint> xmmCRC2 = Vector128<uint>.Zero;
             Vector128<uint> xmmCRC3 = Vector128<uint>.Zero;
-            Vector128<uint> xmmCRCPart;
-            int             bufPos = 0;
+            int bufPos = 0;
 
             bool first = true;
 
@@ -269,44 +138,6 @@ namespace Aaru.Checksums.CRC32
             Vector128<uint> xmmMask  = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
             Vector128<uint> xmmMask2 = Vector128.Create(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
 
-            uint crc;
-
-            if(len < 16)
-            {
-                switch(len)
-                {
-                    case 0: return initialCRC;
-                    case < 4:
-                        /*
-                         * no idea how to do this for <4 bytes, delegate to classic impl.
-                         */
-                        crc = ~initialCRC;
-
-                        switch(len)
-                        {
-                            case 3:
-                                crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos++]];
-                                goto case 2;
-                            case 2:
-                                crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos++]];
-                                goto case 1;
-                            case 1:
-                                crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos]];
-
-                                break;
-                        }
-
-                        return ~crc;
-                }
-
-                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, 0), BitConverter.ToUInt32(src, 4),
-                                              BitConverter.ToUInt32(src, 8), BitConverter.ToUInt32(src, 12));
-
-                xmmCRCPart = Sse2.Xor(xmmCRCPart, xmmInitial);
-
-                goto partial;
-            }
-
             while((len -= 64) >= 0)
             {
                 xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
@@ -348,126 +179,6 @@ namespace Aaru.Checksums.CRC32
                 xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT3);
             }
 
-            /*
-             * len = num bytes left - 64
-             */
-            if(len + 16 >= 0)
-            {
-                len += 16;
-
-                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
-                                         BitConverter.ToUInt32(src, bufPos + 8),
-                                         BitConverter.ToUInt32(src, bufPos + 12));
-
-                bufPos += 16;
-
-                xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
-                                         BitConverter.ToUInt32(src, bufPos + 8),
-                                         BitConverter.ToUInt32(src, bufPos + 12));
-
-                bufPos += 16;
-
-                xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
-                                         BitConverter.ToUInt32(src, bufPos + 8),
-                                         BitConverter.ToUInt32(src, bufPos + 12));
-
-                bufPos += 16;
-
-                if(first)
-                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
-
-                Fold3(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
-
-                xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT0);
-                xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT1);
-                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT2);
-
-                if(len == 0)
-                    goto done;
-
-                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
-                                              BitConverter.ToUInt32(src, bufPos + 4),
-                                              BitConverter.ToUInt32(src, bufPos + 8),
-                                              BitConverter.ToUInt32(src, bufPos + 12));
-            }
-            else if(len + 32 >= 0)
-            {
-                len += 32;
-
-                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
-                                         BitConverter.ToUInt32(src, bufPos + 8),
-                                         BitConverter.ToUInt32(src, bufPos + 12));
-
-                bufPos += 16;
-
-                xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
-                                         BitConverter.ToUInt32(src, bufPos + 8),
-                                         BitConverter.ToUInt32(src, bufPos + 12));
-
-                bufPos += 16;
-
-                if(first)
-                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
-
-                Fold2(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
-
-                xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT0);
-                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT1);
-
-                if(len == 0)
-                    goto done;
-
-                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
-                                              BitConverter.ToUInt32(src, bufPos + 4),
-                                              BitConverter.ToUInt32(src, bufPos + 8),
-                                              BitConverter.ToUInt32(src, bufPos + 12));
-            }
-            else if(len + 48 >= 0)
-            {
-                len += 48;
-
-                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
-                                         BitConverter.ToUInt32(src, bufPos + 8),
-                                         BitConverter.ToUInt32(src, bufPos + 12));
-
-                bufPos += 16;
-
-                if(first)
-                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
-
-                Fold1(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
-
-                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT0);
-
-                if(len == 0)
-                    goto done;
-
-                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
-                                              BitConverter.ToUInt32(src, bufPos + 4),
-                                              BitConverter.ToUInt32(src, bufPos + 8),
-                                              BitConverter.ToUInt32(src, bufPos + 12));
-            }
-            else
-            {
-                len += 64;
-
-                if(len == 0)
-                    goto done;
-
-                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
-                                              BitConverter.ToUInt32(src, bufPos + 4),
-                                              BitConverter.ToUInt32(src, bufPos + 8),
-                                              BitConverter.ToUInt32(src, bufPos + 12));
-
-                if(first)
-                    xmmCRCPart = Sse2.Xor(xmmCRCPart, xmmInitial);
-            }
-
-            partial:
-            PartialFold(len, ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3, ref xmmCRCPart);
-
-            done:
-
             /* fold 512 to 32 */
 
             /*
@@ -533,9 +244,7 @@ namespace Aaru.Checksums.CRC32
              * no real advantage - it's a tiny bit slower per call, while no additional CPUs
              * would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
              */
-            crc = Sse41.Extract(xmmCRC3, 2);
-
-            return ~crc;
+            return ~Sse41.Extract(xmmCRC3, 2);
         }
     }
 }
\ No newline at end of file
diff --git a/CRC32Context.cs b/CRC32Context.cs
index 949012c..132869b 100644
--- a/CRC32Context.cs
+++ b/CRC32Context.cs
@@ -412,6 +412,8 @@ namespace Aaru.Checksums
 
         static void Step(ref uint previousCrc, uint[][] table, byte[] data, uint len, bool useIso)
         {
+            int currentPos = 0;
+
             if(useIso)
             {
                 if(Pclmulqdq.IsSupported &&
@@ -419,9 +421,19 @@ namespace Aaru.Checksums
                    Ssse3.IsSupported &&
                    Sse2.IsSupported)
                 {
-                    previousCrc = ~Clmul.Step(data, len, ~previousCrc);
+                    // Only works in blocks of 64 bytes
+                    uint blocks = len / 64;
 
-                    return;
+                    if(blocks > 0)
+                    {
+                        previousCrc = ~Clmul.Step(data, blocks * 64, ~previousCrc);
+
+                        currentPos = (int)(blocks * 64);
+                        len -= blocks * 64;
+                    }
+
+                    if(len == 0)
+                        return;
                 }
 
                 if(Crc32.Arm64.IsSupported)
@@ -442,7 +454,6 @@ namespace Aaru.Checksums
             // Unroll according to Intel slicing by uint8_t
             // http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
             // http://sourceforge.net/projects/slicing-by-8/
-            int currentPos = 0;
             const int unroll = 4;
             const int bytesAtOnce = 8 * unroll;
             uint crc = previousCrc;
diff --git a/CRC64/clmul.cs b/CRC64/clmul.cs
index 04aa3fc..2bd9d39 100644
--- a/CRC64/clmul.cs
+++ b/CRC64/clmul.cs
@@ -81,9 +81,7 @@ namespace Aaru.Checksums.CRC64
             const ulong pol = 0x92d8af2baf0e1e85;
             Vector128<ulong> foldConstants1 = Vector128.Create(k1, k2);
             Vector128<ulong> foldConstants2 = Vector128.Create(mu, pol);
-            uint leadOutSize = length % 16;
             Vector128<ulong> initialCrc = Vector128.Create(~crc, 0);
-            Vector128<ulong> p;
             length -= 16;
 
             // Initial CRC can simply be added to data
@@ -103,28 +101,9 @@ namespace Aaru.Checksums.CRC64
                 bufPos += 16;
             }
 
-            if(length == 16)
-            {
-                p = Sse2.Xor(accumulator,
-                             Vector128.Create(BitConverter.ToUInt64(data, bufPos),
-                                              BitConverter.ToUInt64(data, bufPos + 8)));
-            }
-            else
-            {
-                Vector128<ulong> end0 = Sse2.Xor(accumulator,
-                                                 Vector128.Create(BitConverter.ToUInt64(data, bufPos),
-                                                                  BitConverter.ToUInt64(data, bufPos + 8)));
-
-                bufPos += 16;
-
-                Vector128<ulong> end1 =
-                    Vector128.Create(BitConverter.ToUInt64(data, bufPos), BitConverter.ToUInt64(data, bufPos + 8));
-
-                ShiftRight128(end0, leadOutSize, out Vector128<ulong> a, out Vector128<ulong> b);
-                ShiftRight128(end1, leadOutSize, out Vector128<ulong> c, out _);
-
-                p = Sse2.Xor(Fold(a, foldConstants1), Sse2.Or(b, c));
-            }
+            Vector128<ulong> p = Sse2.Xor(accumulator,
+                                          Vector128.Create(BitConverter.ToUInt64(data, bufPos),
+                                                           BitConverter.ToUInt64(data, bufPos + 8)));
 
             Vector128<ulong> r = Sse2.Xor(Pclmulqdq.CarrylessMultiply(p, foldConstants1, 0x10),
                                           Sse2.ShiftRightLogical128BitLane(p, 8));
diff --git a/CRC64Context.cs b/CRC64Context.cs
index 411baa5..f9c0855 100644
--- a/CRC64Context.cs
+++ b/CRC64Context.cs
@@ -352,23 +352,34 @@ namespace Aaru.Checksums
 
         static void Step(ref ulong previousCrc, ulong[][] table, byte[] data, uint len, bool useEcma)
        {
+            int dataOff = 0;
+
             if(useEcma &&
                Pclmulqdq.IsSupported &&
                Sse41.IsSupported &&
                Ssse3.IsSupported &&
                Sse2.IsSupported)
             {
-                previousCrc = ~Clmul.Step(~previousCrc, data, len);
+                // Only works in blocks of 32 bytes
+                uint blocks = len / 32;
 
-                return;
+                if(blocks > 0)
+                {
+                    previousCrc = ~Clmul.Step(~previousCrc, data, blocks * 32);
+
+                    dataOff = (int)(blocks * 32);
+                    len -= blocks * 32;
+                }
+
+                if(len == 0)
+                    return;
             }
 
             // Unroll according to Intel slicing by uint8_t
             // http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
             // http://sourceforge.net/projects/slicing-by-8/
-            ulong crc     = previousCrc;
-            int   dataOff = 0;
+            ulong crc = previousCrc;
 
             if(len > 4)
             {
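
The fix in all four files follows one pattern: the PCLMULQDQ kernels now only ever see whole SIMD blocks (64 bytes for CRC-32, 32 bytes for CRC-64), and the byte-granular tail that the deleted Fold1/Fold2/Fold3/PartialFold paths used to handle is delegated to the existing table-driven loop. Below is a minimal, self-contained sketch of that dispatch in C#, not part of the commit: ClmulKernel is a hypothetical stand-in for Clmul.Step (faked here with the same table loop, since the real kernel needs PCLMULQDQ), and Table stands in for Crc32Context._isoCrc32Table[0].

    using System;

    static class CrcBlockDispatchSketch
    {
        // Standard reflected CRC-32 (ISO/zlib) lookup table.
        static readonly uint[] Table = BuildTable(0xEDB88320);

        static uint[] BuildTable(uint poly)
        {
            var table = new uint[256];

            for(uint i = 0; i < 256; i++)
            {
                uint c = i;

                for(int k = 0; k < 8; k++)
                    c = (c & 1) != 0 ? (c >> 1) ^ poly : c >> 1;

                table[i] = c;
            }

            return table;
        }

        // Byte-at-a-time table loop; `crc` is kept pre-inverted, as in Step above.
        static uint TableStep(byte[] data, int pos, uint count, uint crc)
        {
            for(; count > 0; count--, pos++)
                crc = (crc >> 8) ^ Table[(crc ^ data[pos]) & 0xFF];

            return crc;
        }

        // Hypothetical stand-in for Clmul.Step: same contract (takes and returns a
        // non-inverted CRC), but computed with the table loop so the sketch runs
        // anywhere. The real kernel only accepts len % 64 == 0 after this patch.
        static uint ClmulKernel(byte[] data, uint len, uint initialCrc) =>
            ~TableStep(data, 0, len, ~initialCrc);

        static uint Crc32(byte[] data)
        {
            uint len = (uint)data.Length;
            uint crc = 0xFFFFFFFF; // working value, pre-inverted
            int  pos = 0;

            // Whole 64-byte blocks go to the SIMD kernel...
            uint blocks = len / 64;

            if(blocks > 0)
            {
                crc = ~ClmulKernel(data, blocks * 64, ~crc);
                pos = (int)(blocks * 64);
                len -= blocks * 64;
            }

            // ...and the 0..63-byte tail goes to the table loop.
            crc = TableStep(data, pos, len, crc);

            return ~crc;
        }

        static void Main() =>
            Console.WriteLine(Crc32(System.Text.Encoding.ASCII.GetBytes("123456789")).ToString("x8")); // cbf43926

Because the working value is kept pre-inverted, the kernel call is wrapped as ~Clmul.Step(..., ~crc) exactly as in the patched Step methods, so SIMD-sized and table-sized segments chain in either order without an extra finalization pass.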