diff --git a/Aaru.Checksums.csproj b/Aaru.Checksums.csproj
index cac8456..696e74a 100644
--- a/Aaru.Checksums.csproj
+++ b/Aaru.Checksums.csproj
@@ -55,28 +55,29 @@
 [hunk garbled during extraction: the XML markup of Aaru.Checksums.csproj was stripped, leaving bare +/- markers; the surviving element content is a "false" value and a ProjectReference whose metadata follows]
     {F8BDF57B-1571-4CD0-84B3-B422088D359A}
     Aaru.Helpers
@@ -95,15 +96,15 @@
 [hunk garbled during extraction: XML markup stripped, no element content survives]
diff --git a/CRC32/clmul.cs b/CRC32/clmul.cs
new file mode 100644
index 0000000..b446d53
--- /dev/null
+++ b/CRC32/clmul.cs
@@ -0,0 +1,541 @@
+// /***************************************************************************
+// Aaru Data Preservation Suite
+// ----------------------------------------------------------------------------
+//
+// Filename : clmul.cs
+// Author(s) : Natalia Portillo
+// Wajdi Feghali
+// Jim Guilford
+// Vinodh Gopal
+// Erdinc Ozturk
+// Jim Kukunas
+// Marian Beermann
+//
+// Component : Checksums.
+//
+// --[ Description ] ----------------------------------------------------------
+//
+// Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+// instruction.
+//
+// A white paper describing this algorithm can be found at:
+// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+// --[ License ] --------------------------------------------------------------
+//
+// This software is provided 'as-is', without any express or implied warranty.
+// In no event will the authors be held liable for any damages arising from
+// the use of this software.
+//
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+//
+// 1. The origin of this software must not be misrepresented; you must not
+// claim that you wrote the original software. If you use this software
+// in a product, an acknowledgment in the product documentation would be
+// appreciated but is not required.
+//
+// 2. Altered source versions must be plainly marked as such, and must not be
+// misrepresented as being the original software.
+//
+// 3. This notice may not be removed or altered from any source distribution.
+//
+// ----------------------------------------------------------------------------
+// Copyright © 2011-2021 Natalia Portillo
+// Copyright (c) 2016 Marian Beermann (add support for initial value, restructuring)
+// Copyright (C) 2013 Intel Corporation. All rights reserved.
+// ****************************************************************************/
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Aaru.Checksums.CRC32
+{
+ internal static class Clmul
+ {
+ static readonly uint[] _crcK =
+ {
+ 0xccaa009e, 0x00000000, /* rk1 */ 0x751997d0, 0x00000001, /* rk2 */ 0xccaa009e, 0x00000000, /* rk5 */
+ 0x63cd6124, 0x00000001, /* rk6 */ 0xf7011640, 0x00000001, /* rk7 */ 0xdb710640, 0x00000001 /* rk8 */
+ };
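+        // The rk pairs above are the folding constants from the Intel white paper cited in the
+        // file header: precomputed (x^N mod P(x)) remainders for the reflected ISO polynomial.
+        // rk1/rk2 drive the 128-bit folds, rk5/rk6 the 128 -> 64 bit reduction, and rk7/rk8 the
+        // final Barrett reduction down to 32 bits.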
+
+        static readonly Vector128<uint>[] _pshufbShfTable =
+ {
+ Vector128.Create(0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d), /* shl 15 (16 - 1)/shr1 */
+            Vector128.Create(0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e), /* shl 14 (16 - 2)/shr2 */
+            Vector128.Create(0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f), /* shl 13 (16 - 3)/shr3 */
+ Vector128.Create(0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100), /* shl 12 (16 - 4)/shr4 */
+ Vector128.Create(0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201), /* shl 11 (16 - 5)/shr5 */
+ Vector128.Create(0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302), /* shl 10 (16 - 6)/shr6 */
+ Vector128.Create(0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403), /* shl 9 (16 - 7)/shr7 */
+ Vector128.Create(0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504), /* shl 8 (16 - 8)/shr8 */
+ Vector128.Create(0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605), /* shl 7 (16 - 9)/shr9 */
+ Vector128.Create(0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706), /* shl 6 (16 -10)/shr10*/
+ Vector128.Create(0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807), /* shl 5 (16 -11)/shr11*/
+ Vector128.Create(0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908), /* shl 4 (16 -12)/shr12*/
+ Vector128.Create(0x008f8e8du, 0x04030201, 0x08070605, 0x0c0b0a09), /* shl 3 (16 -13)/shr13*/
+ Vector128.Create(0x01008f8eu, 0x05040302, 0x09080706, 0x0d0c0b0a), /* shl 2 (16 -14)/shr14*/
+ Vector128.Create(0x0201008fu, 0x06050403, 0x0a090807, 0x0e0d0c0b) /* shl 1 (16 -15)/shr15*/
+ };
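+        // Each entry [n - 1] above is a PSHUFB mask that realigns a tail of n bytes: mask bytes
+        // with the top bit set (0x8x) make Ssse3.Shuffle zero the destination lane, so a single
+        // shuffle performs both the byte shift and the zero fill used by PartialFold.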
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void Fold1(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
+                          ref Vector128<uint> xmmCRC3)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+
+            Vector128<uint> xTmp3 = xmmCRC3;
+
+            xmmCRC3 = xmmCRC0;
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC0 = xmmCRC0.AsSingle();
+            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
+            Vector128<float> psRes = Sse.Xor(psCRC0, psCRC3);
+
+ xmmCRC0 = xmmCRC1;
+ xmmCRC1 = xmmCRC2;
+ xmmCRC2 = xTmp3;
+ xmmCRC3 = psRes.AsUInt32();
+ }
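+
+        // Fold1 above and Fold2/Fold3 below are tail variants of the same operation: the register
+        // file is rotated and only the vacated accumulators are folded forward. Per the white
+        // paper, one fold performs a 64x64 carry-less multiply per accumulator half (immediates
+        // 0x01 and 0x10) against xmmFold4 and XORs the two products into the new remainder.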
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void Fold2(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
+                          ref Vector128<uint> xmmCRC3)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+
+            Vector128<uint> xTmp3 = xmmCRC3;
+            Vector128<uint> xTmp2 = xmmCRC2;
+
+            xmmCRC3 = xmmCRC1;
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
+            Vector128<float> psCRC1 = xmmCRC1.AsSingle();
+            Vector128<float> psRes31 = Sse.Xor(psCRC3, psCRC1);
+
+            xmmCRC2 = xmmCRC0;
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC0 = xmmCRC0.AsSingle();
+            Vector128<float> psCRC2 = xmmCRC2.AsSingle();
+            Vector128<float> psRes20 = Sse.Xor(psCRC0, psCRC2);
+
+ xmmCRC0 = xTmp2;
+ xmmCRC1 = xTmp3;
+ xmmCRC2 = psRes20.AsUInt32();
+ xmmCRC3 = psRes31.AsUInt32();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void Fold3(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
+                          ref Vector128<uint> xmmCRC3)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+
+            Vector128<uint> xTmp3 = xmmCRC3;
+
+            xmmCRC3 = xmmCRC2;
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC2 = xmmCRC2.AsSingle();
+            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
+            Vector128<float> psRes32 = Sse.Xor(psCRC2, psCRC3);
+
+            xmmCRC2 = xmmCRC1;
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC1 = xmmCRC1.AsSingle();
+            psCRC2 = xmmCRC2.AsSingle();
+            Vector128<float> psRes21 = Sse.Xor(psCRC1, psCRC2);
+
+            xmmCRC1 = xmmCRC0;
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC0 = xmmCRC0.AsSingle();
+            psCRC1 = xmmCRC1.AsSingle();
+            Vector128<float> psRes10 = Sse.Xor(psCRC0, psCRC1);
+
+ xmmCRC0 = xTmp3;
+ xmmCRC1 = psRes10.AsUInt32();
+ xmmCRC2 = psRes21.AsUInt32();
+ xmmCRC3 = psRes32.AsUInt32();
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void Fold4(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
+                          ref Vector128<uint> xmmCRC3)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+
+            Vector128<uint> xTmp0 = xmmCRC0;
+            Vector128<uint> xTmp1 = xmmCRC1;
+            Vector128<uint> xTmp2 = xmmCRC2;
+            Vector128<uint> xTmp3 = xmmCRC3;
+
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xTmp0 = Pclmulqdq.CarrylessMultiply(xTmp0.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC0 = xmmCRC0.AsSingle();
+            Vector128<float> psT0 = xTmp0.AsSingle();
+            Vector128<float> psRes0 = Sse.Xor(psCRC0, psT0);
+
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xTmp1 = Pclmulqdq.CarrylessMultiply(xTmp1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC1 = xmmCRC1.AsSingle();
+            Vector128<float> psT1 = xTmp1.AsSingle();
+            Vector128<float> psRes1 = Sse.Xor(psCRC1, psT1);
+
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xTmp2 = Pclmulqdq.CarrylessMultiply(xTmp2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC2 = xmmCRC2.AsSingle();
+            Vector128<float> psT2 = xTmp2.AsSingle();
+            Vector128<float> psRes2 = Sse.Xor(psCRC2, psT2);
+
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xTmp3 = Pclmulqdq.CarrylessMultiply(xTmp3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
+            Vector128<float> psT3 = xTmp3.AsSingle();
+            Vector128<float> psRes3 = Sse.Xor(psCRC3, psT3);
+
+ xmmCRC0 = psRes0.AsUInt32();
+ xmmCRC1 = psRes1.AsUInt32();
+ xmmCRC2 = psRes2.AsUInt32();
+ xmmCRC3 = psRes3.AsUInt32();
+ }
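+
+        // Fold4 is the steady-state step of the 64-byte main loop: all four 128-bit accumulators
+        // are folded forward by 512 bits at once, so the caller can XOR in 64 fresh input bytes.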
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void PartialFold(long len, ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1,
+                                ref Vector128<uint> xmmCRC2, ref Vector128<uint> xmmCRC3,
+                                ref Vector128<uint> xmmCRCPart)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+            Vector128<uint> xmmMask3 = Vector128.Create(0x80808080);
+
+            Vector128<uint> xmmShl = _pshufbShfTable[len - 1];
+            Vector128<uint> xmmShr = xmmShl;
+            xmmShr = Sse2.Xor(xmmShr, xmmMask3);
+
+            Vector128<uint> xmmA00 = Ssse3.Shuffle(xmmCRC0.AsByte(), xmmShl.AsByte()).AsUInt32();
+
+ xmmCRC0 = Ssse3.Shuffle(xmmCRC0.AsByte(), xmmShr.AsByte()).AsUInt32();
+            Vector128<uint> xmmTmp1 = Ssse3.Shuffle(xmmCRC1.AsByte(), xmmShl.AsByte()).AsUInt32();
+ xmmCRC0 = Sse2.Or(xmmCRC0, xmmTmp1);
+
+ xmmCRC1 = Ssse3.Shuffle(xmmCRC1.AsByte(), xmmShr.AsByte()).AsUInt32();
+            Vector128<uint> xmmTmp2 = Ssse3.Shuffle(xmmCRC2.AsByte(), xmmShl.AsByte()).AsUInt32();
+ xmmCRC1 = Sse2.Or(xmmCRC1, xmmTmp2);
+
+ xmmCRC2 = Ssse3.Shuffle(xmmCRC2.AsByte(), xmmShr.AsByte()).AsUInt32();
+            Vector128<uint> xmmTmp3 = Ssse3.Shuffle(xmmCRC3.AsByte(), xmmShl.AsByte()).AsUInt32();
+ xmmCRC2 = Sse2.Or(xmmCRC2, xmmTmp3);
+
+ xmmCRC3 = Ssse3.Shuffle(xmmCRC3.AsByte(), xmmShr.AsByte()).AsUInt32();
+ xmmCRCPart = Ssse3.Shuffle(xmmCRCPart.AsByte(), xmmShl.AsByte()).AsUInt32();
+ xmmCRC3 = Sse2.Or(xmmCRC3, xmmCRCPart);
+
+            Vector128<uint> xmmA01 = Pclmulqdq.CarrylessMultiply(xmmA00.AsUInt64(), xmmFold4.AsUInt64(), 0x10).
+ AsUInt32();
+
+ xmmA00 = Pclmulqdq.CarrylessMultiply(xmmA00.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+
+            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
+            Vector128<float> psa00 = xmmA00.AsSingle();
+            Vector128<float> psa01 = xmmA01.AsSingle();
+
+            Vector128<float> psRes = Sse.Xor(psCRC3, psa00);
+ psRes = Sse.Xor(psRes, psa01);
+
+ xmmCRC3 = psRes.AsUInt32();
+ }
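+
+        // PartialFold absorbs a 1..15 byte tail: the whole 512-bit state is byte-shifted through
+        // the _pshufbShfTable masks so the tail slots into xmmCRCPart's position, and the bytes
+        // shifted out of xmmCRC0 (xmmA00) are folded back into xmmCRC3.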
+
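+        /// <summary>
+        ///     Computes the CRC32 (ISO polynomial, reflected) of <paramref name="src" /> via PCLMULQDQ folding.
+        ///     Callers are expected to have checked for Sse2/Ssse3/Sse41/Pclmulqdq support beforehand.
+        /// </summary>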
+ internal static uint Step(byte[] src, long len, uint initialCRC)
+ {
+            Vector128<uint> xmmT0, xmmT1, xmmT2;
+            Vector128<uint> xmmInitial = Sse2.ConvertScalarToVector128UInt32(initialCRC);
+            Vector128<uint> xmmCRC0 = Sse2.ConvertScalarToVector128UInt32(0x9db42487);
+            Vector128<uint> xmmCRC1 = Vector128<uint>.Zero;
+            Vector128<uint> xmmCRC2 = Vector128<uint>.Zero;
+            Vector128<uint> xmmCRC3 = Vector128<uint>.Zero;
+            Vector128<uint> xmmCRCPart;
+ int bufPos = 0;
+
+ bool first = true;
+
+            /* fold 512 to 32 step variable declarations, kept hoisted as in the ISO-C90 original. */
+            Vector128<uint> xmmMask = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
+            Vector128<uint> xmmMask2 = Vector128.Create(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+
+ uint crc;
+
+ if(len < 16)
+ {
+ switch(len)
+ {
+ case 0: return initialCRC;
+ case < 4:
+ /*
+ * no idea how to do this for <4 bytes, delegate to classic impl.
+ */
+ crc = ~initialCRC;
+
+ switch(len)
+ {
+ case 3:
+ crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos++]];
+ goto case 2;
+ case 2:
+ crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos++]];
+ goto case 1;
+ case 1:
+ crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos]];
+
+ break;
+ }
+
+ return ~crc;
+ }
+
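+                // One full 16-byte load for a 4..15 byte tail, mirroring the unaligned SSE load in
+                // the C original; this assumes src is allocated to at least 16 bytes even when
+                // len < 16.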
+ xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, 0), BitConverter.ToUInt32(src, 4),
+ BitConverter.ToUInt32(src, 8), BitConverter.ToUInt32(src, 12));
+
+ xmmCRCPart = Sse2.Xor(xmmCRCPart, xmmInitial);
+
+ goto partial;
+ }
+
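+            // Main loop: each iteration folds the state forward by 512 bits (Fold4) and XORs in
+            // the next 64 input bytes, loaded 16 bytes at a time with BitConverter.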
+ while((len -= 64) >= 0)
+ {
+ xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+
+ bufPos += 16;
+
+ xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+
+ bufPos += 16;
+
+ xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+
+ bufPos += 16;
+
+                Vector128<uint> xmmT3 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+                                                         BitConverter.ToUInt32(src, bufPos + 4),
+                                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+ bufPos += 16;
+
+ if(first)
+ {
+ first = false;
+ xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
+ }
+
+ Fold4(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
+
+ xmmCRC0 = Sse2.Xor(xmmCRC0, xmmT0);
+ xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT1);
+ xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT2);
+ xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT3);
+ }
+
+ /*
+ * len = num bytes left - 64
+ */
+ if(len + 16 >= 0)
+ {
+ len += 16;
+
+ xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+
+ bufPos += 16;
+
+ xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+
+ bufPos += 16;
+
+ xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+
+ bufPos += 16;
+
+ if(first)
+ xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
+
+ Fold3(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
+
+ xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT0);
+ xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT1);
+ xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT2);
+
+ if(len == 0)
+ goto done;
+
+ xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+ BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+ }
+ else if(len + 32 >= 0)
+ {
+ len += 32;
+
+ xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+
+ bufPos += 16;
+
+ xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+
+ bufPos += 16;
+
+ if(first)
+ xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
+
+ Fold2(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
+
+ xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT0);
+ xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT1);
+
+ if(len == 0)
+ goto done;
+
+ xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+ BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+ }
+ else if(len + 48 >= 0)
+ {
+ len += 48;
+
+ xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+
+ bufPos += 16;
+
+ if(first)
+ xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
+
+ Fold1(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
+
+ xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT0);
+
+ if(len == 0)
+ goto done;
+
+ xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+ BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+ }
+ else
+ {
+ len += 64;
+
+ if(len == 0)
+ goto done;
+
+ xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+ BitConverter.ToUInt32(src, bufPos + 4),
+ BitConverter.ToUInt32(src, bufPos + 8),
+ BitConverter.ToUInt32(src, bufPos + 12));
+
+ if(first)
+ xmmCRCPart = Sse2.Xor(xmmCRCPart, xmmInitial);
+ }
+
+ partial:
+ PartialFold(len, ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3, ref xmmCRCPart);
+
+ done:
+
+ /* fold 512 to 32 */
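+
+            /*
+             * The four accumulators are folded into one with rk1/rk2, reduced to the low 64 and
+             * then 32 significant bits with rk5/rk6, and Barrett-reduced with rk7/rk8; the final
+             * (still complemented) CRC ends up in lane 2 of xmmCRC3.
+             */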
+
+ /*
+ * k1
+ */
+            Vector128<uint> crcFold = Vector128.Create(_crcK[0], _crcK[1], _crcK[2], _crcK[3]);
+
+            Vector128<uint> xTmp0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x10).
+ AsUInt32();
+
+ xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
+ xmmCRC1 = Sse2.Xor(xmmCRC1, xTmp0);
+ xmmCRC1 = Sse2.Xor(xmmCRC1, xmmCRC0);
+
+            Vector128<uint> xTmp1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x10).
+ AsUInt32();
+
+ xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
+ xmmCRC2 = Sse2.Xor(xmmCRC2, xTmp1);
+ xmmCRC2 = Sse2.Xor(xmmCRC2, xmmCRC1);
+
+            Vector128<uint> xTmp2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x10).
+ AsUInt32();
+
+ xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
+ xmmCRC3 = Sse2.Xor(xmmCRC3, xTmp2);
+ xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
+
+ /*
+ * k5
+ */
+ crcFold = Vector128.Create(_crcK[4], _crcK[5], _crcK[6], _crcK[7]);
+
+ xmmCRC0 = xmmCRC3;
+ xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
+ xmmCRC0 = Sse2.ShiftRightLogical128BitLane(xmmCRC0, 8);
+ xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);
+
+ xmmCRC0 = xmmCRC3;
+ xmmCRC3 = Sse2.ShiftLeftLogical128BitLane(xmmCRC3, 4);
+ xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
+ xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);
+ xmmCRC3 = Sse2.And(xmmCRC3, xmmMask2);
+
+ /*
+ * k7
+ */
+ xmmCRC1 = xmmCRC3;
+ xmmCRC2 = xmmCRC3;
+ crcFold = Vector128.Create(_crcK[8], _crcK[9], _crcK[10], _crcK[11]);
+
+ xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
+ xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
+ xmmCRC3 = Sse2.And(xmmCRC3, xmmMask);
+
+ xmmCRC2 = xmmCRC3;
+ xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
+ xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
+ xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC1);
+
+ /*
+ * could just as well write xmm_crc3[2], doing a movaps and truncating, but
+ * no real advantage - it's a tiny bit slower per call, while no additional CPUs
+ * would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
+ */
+ crc = Sse41.Extract(xmmCRC3, 2);
+
+ return ~crc;
+ }
+ }
+}
\ No newline at end of file
diff --git a/CRC32Context.cs b/CRC32Context.cs
index f313ee3..1db3101 100644
--- a/CRC32Context.cs
+++ b/CRC32Context.cs
@@ -32,7 +32,9 @@
using System;
using System.IO;
+using System.Runtime.Intrinsics.X86;
using System.Text;
+using Aaru.Checksums.CRC32;
using Aaru.CommonTypes.Interfaces;
using Aaru.Helpers;
@@ -45,7 +47,7 @@ namespace Aaru.Checksums
const uint CRC32_ISO_POLY = 0xEDB88320;
const uint CRC32_ISO_SEED = 0xFFFFFFFF;
- static readonly uint[][] _isoCrc32Table =
+ internal static readonly uint[][] _isoCrc32Table =
{
new uint[]
{
@@ -332,6 +334,7 @@ namespace Aaru.Checksums
readonly uint _finalSeed;
readonly uint[][] _table;
uint _hashInt;
+ readonly bool _useIso;
/// Initializes the CRC32 table and seed as CRC32-ISO
public Crc32Context()
@@ -339,6 +342,7 @@ namespace Aaru.Checksums
_hashInt = CRC32_ISO_SEED;
_finalSeed = CRC32_ISO_SEED;
_table = _isoCrc32Table;
+ _useIso = true;
}
/// Initializes the CRC32 table with a custom polynomial and seed
@@ -346,6 +350,7 @@ namespace Aaru.Checksums
{
_hashInt = seed;
_finalSeed = seed;
+ _useIso = polynomial == CRC32_ISO_POLY && seed == CRC32_ISO_SEED;
_table = GenerateTable(polynomial);
}
@@ -354,7 +359,7 @@ namespace Aaru.Checksums
/// Updates the hash with data.
/// Data buffer.
/// Length of buffer to hash.
- public void Update(byte[] data, uint len) => Step(ref _hashInt, _table, data, len);
+ public void Update(byte[] data, uint len) => Step(ref _hashInt, _table, data, len, _useIso);
///
/// Updates the hash with data.
@@ -404,40 +409,48 @@ namespace Aaru.Checksums
return table;
}
- static void Step(ref uint previousCrc, uint[][] table, byte[] data, uint len)
+ static void Step(ref uint previousCrc, uint[][] table, byte[] data, uint len, bool useIso)
{
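+            // Hardware fast path: for the ISO polynomial with the default seed, delegate to the
+            // PCLMULQDQ folding implementation when the required instruction sets are available.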
+ if(useIso &&
+ Pclmulqdq.IsSupported &&
+ Sse41.IsSupported &&
+ Ssse3.IsSupported &&
+ Sse2.IsSupported)
+ {
+ previousCrc = ~Clmul.Step(data, len, ~previousCrc);
+
+ return;
+ }
+
// Unroll according to Intel slicing by uint8_t
// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
// http://sourceforge.net/projects/slicing-by-8/
+ int currentPos = 0;
+ const int unroll = 4;
+ const int bytesAtOnce = 8 * unroll;
+ uint crc = previousCrc;
- uint crc;
- int current_pos = 0;
- const int unroll = 4;
- const int bytes_at_once = 8 * unroll;
-
- crc = previousCrc;
-
- while(len >= bytes_at_once)
+ while(len >= bytesAtOnce)
{
int unrolling;
for(unrolling = 0; unrolling < unroll; unrolling++)
{
- uint one = BitConverter.ToUInt32(data, current_pos) ^ crc;
- current_pos += 4;
- uint two = BitConverter.ToUInt32(data, current_pos);
- current_pos += 4;
+ uint one = BitConverter.ToUInt32(data, currentPos) ^ crc;
+ currentPos += 4;
+ uint two = BitConverter.ToUInt32(data, currentPos);
+ currentPos += 4;
crc = table[0][(two >> 24) & 0xFF] ^ table[1][(two >> 16) & 0xFF] ^ table[2][(two >> 8) & 0xFF] ^
table[3][two & 0xFF] ^ table[4][(one >> 24) & 0xFF] ^ table[5][(one >> 16) & 0xFF] ^
table[6][(one >> 8) & 0xFF] ^ table[7][one & 0xFF];
}
- len -= bytes_at_once;
+ len -= bytesAtOnce;
}
while(len-- != 0)
- crc = (crc >> 8) ^ table[0][(crc & 0xFF) ^ data[current_pos++]];
+ crc = (crc >> 8) ^ table[0][(crc & 0xFF) ^ data[currentPos++]];
previousCrc = crc;
}
@@ -475,7 +488,8 @@ namespace Aaru.Checksums
while(read > 0)
{
- Step(ref localHashInt, localTable, buffer, (uint)read);
+ Step(ref localHashInt, localTable, buffer, (uint)read,
+ polynomial == CRC32_ISO_POLY && seed == CRC32_ISO_SEED);
read = fileStream.Read(buffer, 0, 65536);
}
@@ -512,7 +526,7 @@ namespace Aaru.Checksums
uint[][] localTable = GenerateTable(polynomial);
- Step(ref localHashInt, localTable, data, len);
+ Step(ref localHashInt, localTable, data, len, polynomial == CRC32_ISO_POLY && seed == CRC32_ISO_SEED);
localHashInt ^= seed;
hash = BigEndianBitConverter.GetBytes(localHashInt);