From 7b8f4cd795c7b089f6273bf09386b62e45425185 Mon Sep 17 00:00:00 2001
From: Natalia Portillo
Date: Sun, 26 Sep 2021 23:13:43 +0100
Subject: [PATCH] Add PCLMUL implementation of CRC32.

---
 Aaru.Checksums.csproj |  45 ++--
 CRC32/clmul.cs        | 541 ++++++++++++++++++++++++++++++++++++++++++
 CRC32Context.cs       |  52 ++--
 3 files changed, 597 insertions(+), 41 deletions(-)
 create mode 100644 CRC32/clmul.cs

diff --git a/Aaru.Checksums.csproj b/Aaru.Checksums.csproj
index cac845667..696e74abd 100644
--- a/Aaru.Checksums.csproj
+++ b/Aaru.Checksums.csproj
@@ -55,28 +55,29 @@
 [Hunk body lost in extraction: the MSBuild XML tags were stripped, leaving only the +/-
 markers. The surviving fragments ("false", "{F8BDF57B-1571-4CD0-84B3-B422088D359A}",
 "Aaru.Helpers") show it rewrites the project's item group and the Aaru.Helpers project
 reference, but the exact elements are not recoverable.]
@@ -95,15 +96,15 @@
 [Hunk body likewise lost in extraction; the exact item changes are not recoverable.]

diff --git a/CRC32/clmul.cs b/CRC32/clmul.cs
new file mode 100644
index 000000000..b446d53b0
--- /dev/null
+++ b/CRC32/clmul.cs
@@ -0,0 +1,541 @@
+// /***************************************************************************
+// Aaru Data Preservation Suite
+// ----------------------------------------------------------------------------
+//
+// Filename       : clmul.cs
+// Author(s)      : Natalia Portillo
+//                  Wajdi Feghali
+//                  Jim Guilford
+//                  Vinodh Gopal
+//                  Erdinc Ozturk
+//                  Jim Kukunas
+//                  Marian Beermann
+//
+// Component      : Checksums.
+//
+// --[ Description ] ----------------------------------------------------------
+//
+//     Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+//     instruction.
+//
+//     A white paper describing this algorithm can be found at:
+//     http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+// --[ License ] --------------------------------------------------------------
+//
+//     This software is provided 'as-is', without any express or implied warranty.
+//     In no event will the authors be held liable for any damages arising from
+//     the use of this software.
+//
+//     Permission is granted to anyone to use this software for any purpose,
+//     including commercial applications, and to alter it and redistribute it
+//     freely, subject to the following restrictions:
+//
+//     1. The origin of this software must not be misrepresented; you must not
+//        claim that you wrote the original software. If you use this software
+//        in a product, an acknowledgment in the product documentation would be
+//        appreciated but is not required.
+//
+//     2. Altered source versions must be plainly marked as such, and must not be
+//        misrepresented as being the original software.
+//
+//     3. This notice may not be removed or altered from any source distribution.
+//
+// ----------------------------------------------------------------------------
+// Copyright © 2011-2021 Natalia Portillo
+// Copyright (c) 2016 Marian Beermann (add support for initial value, restructuring)
+// Copyright (C) 2013 Intel Corporation. All rights reserved.
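+//
+// --[ Implementation notes ] -------------------------------------------------
+//
+//     In outline: the input is consumed in 64-byte blocks spread over four
+//     128-bit accumulators. Each fold step multiplies the accumulators by a
+//     precomputed power of x modulo the CRC polynomial, using two PCLMULQDQ
+//     carry-less multiplies per accumulator, and XORs in the next block. A
+//     final pass folds 512 bits down to 128 and reduces to the 32-bit CRC
+//     with the rk5..rk8 constants below.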
+// ****************************************************************************/
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Aaru.Checksums.CRC32
+{
+    internal static class Clmul
+    {
+        static readonly uint[] _crcK =
+        {
+            0xccaa009e, 0x00000000, /* rk1 */ 0x751997d0, 0x00000001, /* rk2 */ 0xccaa009e, 0x00000000, /* rk5 */
+            0x63cd6124, 0x00000001, /* rk6 */ 0xf7011640, 0x00000001, /* rk7 */ 0xdb710640, 0x00000001 /* rk8 */
+        };
+
+        static readonly Vector128<uint>[] _pshufbShfTable =
+        {
+            Vector128.Create(0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d), /* shl 15 (16 - 1)/shr1 */
+            Vector128.Create(0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e), /* shl 14 (16 - 2)/shr2 */
+            Vector128.Create(0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f), /* shl 13 (16 - 3)/shr3 */
+            Vector128.Create(0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100), /* shl 12 (16 - 4)/shr4 */
+            Vector128.Create(0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201), /* shl 11 (16 - 5)/shr5 */
+            Vector128.Create(0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302), /* shl 10 (16 - 6)/shr6 */
+            Vector128.Create(0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403), /* shl 9  (16 - 7)/shr7 */
+            Vector128.Create(0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504), /* shl 8  (16 - 8)/shr8 */
+            Vector128.Create(0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605), /* shl 7  (16 - 9)/shr9 */
+            Vector128.Create(0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706), /* shl 6  (16 -10)/shr10*/
+            Vector128.Create(0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807), /* shl 5  (16 -11)/shr11*/
+            Vector128.Create(0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908), /* shl 4  (16 -12)/shr12*/
+            Vector128.Create(0x008f8e8du, 0x04030201, 0x08070605, 0x0c0b0a09), /* shl 3  (16 -13)/shr13*/
+            Vector128.Create(0x01008f8eu, 0x05040302, 0x09080706, 0x0d0c0b0a), /* shl 2  (16 -14)/shr14*/
+            Vector128.Create(0x0201008fu, 0x06050403, 0x0a090807, 0x0e0d0c0b)  /* shl 1  (16 -15)/shr15*/
+        };
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void Fold1(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
+                          ref Vector128<uint> xmmCRC3)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+
+            Vector128<uint> xTmp3 = xmmCRC3;
+
+            xmmCRC3 = xmmCRC0;
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC0 = xmmCRC0.AsSingle();
+            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
+            Vector128<float> psRes  = Sse.Xor(psCRC0, psCRC3);
+
+            xmmCRC0 = xmmCRC1;
+            xmmCRC1 = xmmCRC2;
+            xmmCRC2 = xTmp3;
+            xmmCRC3 = psRes.AsUInt32();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void Fold2(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
+                          ref Vector128<uint> xmmCRC3)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+
+            Vector128<uint> xTmp3 = xmmCRC3;
+            Vector128<uint> xTmp2 = xmmCRC2;
+
+            xmmCRC3 = xmmCRC1;
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC3  = xmmCRC3.AsSingle();
+            Vector128<float> psCRC1  = xmmCRC1.AsSingle();
+            Vector128<float> psRes31 = Sse.Xor(psCRC3, psCRC1);
+
+            xmmCRC2 = xmmCRC0;
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC0  = xmmCRC0.AsSingle();
+            Vector128<float> psCRC2  = xmmCRC2.AsSingle();
+            Vector128<float> psRes20 = Sse.Xor(psCRC0, psCRC2);
+
+            xmmCRC0 = xTmp2;
+            xmmCRC1 = xTmp3;
+            xmmCRC2 = psRes20.AsUInt32();
+            xmmCRC3 = psRes31.AsUInt32();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void Fold3(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
+                          ref Vector128<uint> xmmCRC3)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+
+            Vector128<uint> xTmp3 = xmmCRC3;
+
+            xmmCRC3 = xmmCRC2;
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC2  = xmmCRC2.AsSingle();
+            Vector128<float> psCRC3  = xmmCRC3.AsSingle();
+            Vector128<float> psRes32 = Sse.Xor(psCRC2, psCRC3);
+
+            xmmCRC2 = xmmCRC1;
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC1  = xmmCRC1.AsSingle();
+            psCRC2 = xmmCRC2.AsSingle();
+            Vector128<float> psRes21 = Sse.Xor(psCRC1, psCRC2);
+
+            xmmCRC1 = xmmCRC0;
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC0  = xmmCRC0.AsSingle();
+            psCRC1 = xmmCRC1.AsSingle();
+            Vector128<float> psRes10 = Sse.Xor(psCRC0, psCRC1);
+
+            xmmCRC0 = xTmp3;
+            xmmCRC1 = psRes10.AsUInt32();
+            xmmCRC2 = psRes21.AsUInt32();
+            xmmCRC3 = psRes32.AsUInt32();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void Fold4(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
+                          ref Vector128<uint> xmmCRC3)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+
+            Vector128<uint> xTmp0 = xmmCRC0;
+            Vector128<uint> xTmp1 = xmmCRC1;
+            Vector128<uint> xTmp2 = xmmCRC2;
+            Vector128<uint> xTmp3 = xmmCRC3;
+
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xTmp0   = Pclmulqdq.CarrylessMultiply(xTmp0.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC0 = xmmCRC0.AsSingle();
+            Vector128<float> psT0   = xTmp0.AsSingle();
+            Vector128<float> psRes0 = Sse.Xor(psCRC0, psT0);
+
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xTmp1   = Pclmulqdq.CarrylessMultiply(xTmp1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC1 = xmmCRC1.AsSingle();
+            Vector128<float> psT1   = xTmp1.AsSingle();
+            Vector128<float> psRes1 = Sse.Xor(psCRC1, psT1);
+
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xTmp2   = Pclmulqdq.CarrylessMultiply(xTmp2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC2 = xmmCRC2.AsSingle();
+            Vector128<float> psT2   = xTmp2.AsSingle();
+            Vector128<float> psRes2 = Sse.Xor(psCRC2, psT2);
+
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xTmp3   = Pclmulqdq.CarrylessMultiply(xTmp3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
+            Vector128<float> psT3   = xTmp3.AsSingle();
+            Vector128<float> psRes3 = Sse.Xor(psCRC3, psT3);
+
+            xmmCRC0 = psRes0.AsUInt32();
+            xmmCRC1 = psRes1.AsUInt32();
+            xmmCRC2 = psRes2.AsUInt32();
+            xmmCRC3 = psRes3.AsUInt32();
+        }
+
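+        /*
+         * Folds a trailing block of 1..15 bytes into the running state: the
+         * accumulators are shifted through _pshufbShfTable so the partial
+         * bytes line up, the bytes shifted out of the first accumulator are
+         * folded back in with two carry-less multiplies, and the partial data
+         * is OR-ed into the last accumulator.
+         */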
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void PartialFold(long len, ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1,
+                                ref Vector128<uint> xmmCRC2, ref Vector128<uint> xmmCRC3,
+                                ref Vector128<uint> xmmCRCPart)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+            Vector128<uint> xmmMask3 = Vector128.Create(0x80808080);
+
+            Vector128<uint> xmmShl = _pshufbShfTable[len - 1];
+            Vector128<uint> xmmShr = xmmShl;
+            xmmShr = Sse2.Xor(xmmShr, xmmMask3);
+
+            Vector128<uint> xmmA00 = Ssse3.Shuffle(xmmCRC0.AsByte(), xmmShl.AsByte()).AsUInt32();
+
+            xmmCRC0 = Ssse3.Shuffle(xmmCRC0.AsByte(), xmmShr.AsByte()).AsUInt32();
+            Vector128<uint> xmmTmp1 = Ssse3.Shuffle(xmmCRC1.AsByte(), xmmShl.AsByte()).AsUInt32();
+            xmmCRC0 = Sse2.Or(xmmCRC0, xmmTmp1);
+
+            xmmCRC1 = Ssse3.Shuffle(xmmCRC1.AsByte(), xmmShr.AsByte()).AsUInt32();
+            Vector128<uint> xmmTmp2 = Ssse3.Shuffle(xmmCRC2.AsByte(), xmmShl.AsByte()).AsUInt32();
+            xmmCRC1 = Sse2.Or(xmmCRC1, xmmTmp2);
+
+            xmmCRC2 = Ssse3.Shuffle(xmmCRC2.AsByte(), xmmShr.AsByte()).AsUInt32();
+            Vector128<uint> xmmTmp3 = Ssse3.Shuffle(xmmCRC3.AsByte(), xmmShl.AsByte()).AsUInt32();
+            xmmCRC2 = Sse2.Or(xmmCRC2, xmmTmp3);
+
+            xmmCRC3    = Ssse3.Shuffle(xmmCRC3.AsByte(), xmmShr.AsByte()).AsUInt32();
+            xmmCRCPart = Ssse3.Shuffle(xmmCRCPart.AsByte(), xmmShl.AsByte()).AsUInt32();
+            xmmCRC3    = Sse2.Or(xmmCRC3, xmmCRCPart);
+
+            Vector128<uint> xmmA01 = Pclmulqdq.CarrylessMultiply(xmmA00.AsUInt64(), xmmFold4.AsUInt64(), 0x10).
+                                               AsUInt32();
+
+            xmmA00 = Pclmulqdq.CarrylessMultiply(xmmA00.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+
+            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
+            Vector128<float> psa00  = xmmA00.AsSingle();
+            Vector128<float> psa01  = xmmA01.AsSingle();
+
+            Vector128<float> psRes = Sse.Xor(psCRC3, psa00);
+            psRes = Sse.Xor(psRes, psa01);
+
+            xmmCRC3 = psRes.AsUInt32();
+        }
+
+        internal static uint Step(byte[] src, long len, uint initialCRC)
+        {
+            Vector128<uint> xmmT0, xmmT1, xmmT2;
+            Vector128<uint> xmmInitial = Sse2.ConvertScalarToVector128UInt32(initialCRC);
+            Vector128<uint> xmmCRC0    = Sse2.ConvertScalarToVector128UInt32(0x9db42487);
+            Vector128<uint> xmmCRC1    = Vector128<uint>.Zero;
+            Vector128<uint> xmmCRC2    = Vector128<uint>.Zero;
+            Vector128<uint> xmmCRC3    = Vector128<uint>.Zero;
+            Vector128<uint> xmmCRCPart;
+            int             bufPos = 0;
+
+            bool first = true;
+
+            /* fold 512 to 32 step variable declarations for ISO-C90 compat. */
+            Vector128<uint> xmmMask  = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
+            Vector128<uint> xmmMask2 = Vector128.Create(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+
+            uint crc;
+
+            if(len < 16)
+            {
+                switch(len)
+                {
+                    case 0: return initialCRC;
+                    case < 4:
+                        /*
+                         * no idea how to do this for <4 bytes, delegate to classic impl.
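+                         *
+                         * The classic update consumes one byte at a time:
+                         *     crc = (crc >> 8) ^ table[0][(crc & 0xFF) ^ b];
+                         * which is what the switch below does for the
+                         * remaining 1..3 bytes.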
+                         */
+                        crc = ~initialCRC;
+
+                        switch(len)
+                        {
+                            case 3:
+                                crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos++]];
+
+                                goto case 2;
+                            case 2:
+                                crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos++]];
+
+                                goto case 1;
+                            case 1:
+                                crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos]];
+
+                                break;
+                        }
+
+                        return ~crc;
+                }
+
+                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, 0), BitConverter.ToUInt32(src, 4),
+                                              BitConverter.ToUInt32(src, 8), BitConverter.ToUInt32(src, 12));
+
+                xmmCRCPart = Sse2.Xor(xmmCRCPart, xmmInitial);
+
+                goto partial;
+            }
+
+            while((len -= 64) >= 0)
+            {
+                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                Vector128<uint> xmmT3 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+                                                         BitConverter.ToUInt32(src, bufPos + 4),
+                                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                if(first)
+                {
+                    first = false;
+                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
+                }
+
+                Fold4(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
+
+                xmmCRC0 = Sse2.Xor(xmmCRC0, xmmT0);
+                xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT1);
+                xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT2);
+                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT3);
+            }
+
+            /*
+             * len = num bytes left - 64
+             */
+            if(len + 16 >= 0)
+            {
+                len += 16;
+
+                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                if(first)
+                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
+
+                Fold3(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
+
+                xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT0);
+                xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT1);
+                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT2);
+
+                if(len == 0)
+                    goto done;
+
+                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+                                              BitConverter.ToUInt32(src, bufPos + 4),
+                                              BitConverter.ToUInt32(src, bufPos + 8),
+                                              BitConverter.ToUInt32(src, bufPos + 12));
+            }
+            else if(len + 32 >= 0)
+            {
+                len += 32;
+
+                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                if(first)
+                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
+
+                Fold2(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
+
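+                // the fold by two shifted the old state down; XOR the two
+                // trailing 16-byte blocks into the freshly folded accumulators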
+                xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT0);
+                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT1);
+
+                if(len == 0)
+                    goto done;
+
+                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+                                              BitConverter.ToUInt32(src, bufPos + 4),
+                                              BitConverter.ToUInt32(src, bufPos + 8),
+                                              BitConverter.ToUInt32(src, bufPos + 12));
+            }
+            else if(len + 48 >= 0)
+            {
+                len += 48;
+
+                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                if(first)
+                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
+
+                Fold1(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
+
+                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT0);
+
+                if(len == 0)
+                    goto done;
+
+                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+                                              BitConverter.ToUInt32(src, bufPos + 4),
+                                              BitConverter.ToUInt32(src, bufPos + 8),
+                                              BitConverter.ToUInt32(src, bufPos + 12));
+            }
+            else
+            {
+                len += 64;
+
+                if(len == 0)
+                    goto done;
+
+                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+                                              BitConverter.ToUInt32(src, bufPos + 4),
+                                              BitConverter.ToUInt32(src, bufPos + 8),
+                                              BitConverter.ToUInt32(src, bufPos + 12));
+
+                if(first)
+                    xmmCRCPart = Sse2.Xor(xmmCRCPart, xmmInitial);
+            }
+
+            partial:
+            PartialFold(len, ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3, ref xmmCRCPart);
+
+            done:
+
+            /* fold 512 to 32 */
+
+            /*
+             * k1
+             */
+            Vector128<uint> crcFold = Vector128.Create(_crcK[0], _crcK[1], _crcK[2], _crcK[3]);
+
+            Vector128<uint> xTmp0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x10).
+                                              AsUInt32();
+
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC1 = Sse2.Xor(xmmCRC1, xTmp0);
+            xmmCRC1 = Sse2.Xor(xmmCRC1, xmmCRC0);
+
+            Vector128<uint> xTmp1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x10).
+                                              AsUInt32();
+
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC2 = Sse2.Xor(xmmCRC2, xTmp1);
+            xmmCRC2 = Sse2.Xor(xmmCRC2, xmmCRC1);
+
+            Vector128<uint> xTmp2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x10).
+                                              AsUInt32();
+
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xTmp2);
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
+
+            /*
+             * k5
+             */
+            crcFold = Vector128.Create(_crcK[4], _crcK[5], _crcK[6], _crcK[7]);
+
+            xmmCRC0 = xmmCRC3;
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
+            xmmCRC0 = Sse2.ShiftRightLogical128BitLane(xmmCRC0, 8);
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);
+
+            xmmCRC0 = xmmCRC3;
+            xmmCRC3 = Sse2.ShiftLeftLogical128BitLane(xmmCRC3, 4);
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);
+            xmmCRC3 = Sse2.And(xmmCRC3, xmmMask2);
+
+            /*
+             * k7
+             */
+            xmmCRC1 = xmmCRC3;
+            xmmCRC2 = xmmCRC3;
+            crcFold = Vector128.Create(_crcK[8], _crcK[9], _crcK[10], _crcK[11]);
+
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
+            xmmCRC3 = Sse2.And(xmmCRC3, xmmMask);
+
+            xmmCRC2 = xmmCRC3;
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC1);
+
+            /*
+             * could just as well write xmm_crc3[2], doing a movaps and truncating, but
+             * no real advantage - it's a tiny bit slower per call, while no additional CPUs
+             * would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
+             */
+            crc = Sse41.Extract(xmmCRC3, 2);
+
+            return ~crc;
+        }
+    }
+}
\ No newline at end of file
diff --git a/CRC32Context.cs b/CRC32Context.cs
index f313ee3ef..1db3101eb 100644
--- a/CRC32Context.cs
+++ b/CRC32Context.cs
@@ -32,7 +32,9 @@
 
 using System;
 using System.IO;
+using System.Runtime.Intrinsics.X86;
 using System.Text;
+using Aaru.Checksums.CRC32;
 using Aaru.CommonTypes.Interfaces;
 using Aaru.Helpers;
 
@@ -45,7 +47,7 @@ namespace Aaru.Checksums
         const uint CRC32_ISO_POLY = 0xEDB88320;
         const uint CRC32_ISO_SEED = 0xFFFFFFFF;
 
-        static readonly uint[][] _isoCrc32Table =
+        internal static readonly uint[][] _isoCrc32Table =
         {
             new uint[]
             {
@@ -332,6 +334,7 @@ namespace Aaru.Checksums
         readonly uint     _finalSeed;
         readonly uint[][] _table;
         uint              _hashInt;
+        readonly bool     _useIso;
 
         /// <summary>Initializes the CRC32 table and seed as CRC32-ISO</summary>
         public Crc32Context()
@@ -339,6 +342,7 @@ namespace Aaru.Checksums
             _hashInt   = CRC32_ISO_SEED;
             _finalSeed = CRC32_ISO_SEED;
             _table     = _isoCrc32Table;
+            _useIso    = true;
         }
 
         /// <summary>Initializes the CRC32 table with a custom polynomial and seed</summary>
@@ -346,6 +350,7 @@ namespace Aaru.Checksums
         {
             _hashInt   = seed;
             _finalSeed = seed;
+            _useIso    = polynomial == CRC32_ISO_POLY && seed == CRC32_ISO_SEED;
             _table     = GenerateTable(polynomial);
         }
 
@@ -354,7 +359,7 @@ namespace Aaru.Checksums
         /// <inheritdoc />
         /// <summary>Updates the hash with data.</summary>
         /// <param name="data">Data buffer.</param>
         /// <param name="len">Length of buffer to hash.</param>
-        public void Update(byte[] data, uint len) => Step(ref _hashInt, _table, data, len);
+        public void Update(byte[] data, uint len) => Step(ref _hashInt, _table, data, len, _useIso);
 
         /// <inheritdoc />
         /// <summary>Updates the hash with data.</summary>
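The hunk below adds the dispatch that the _useIso flag feeds. Only the stock ISO configuration
is routed to the accelerated kernel, because Clmul.Step hardcodes folding constants derived
from the ISO polynomial; custom polynomials (and, conservatively, custom seeds) keep the
table-driven path. A minimal self-contained sketch of the same pattern, with illustrative
names that are not part of the patch (the fallback here is bit-at-a-time rather than the
patch's slicing-by-8):

    using Aaru.Checksums.CRC32;
    using System.Runtime.Intrinsics.X86;

    static class DispatchSketch
    {
        // crcReg is the running, already-inverted CRC register, exactly like
        // previousCrc in the patched Step below.
        internal static uint Step(byte[] data, uint len, uint crcReg)
        {
            // The hardware path needs every instruction set the kernel uses.
            if(Pclmulqdq.IsSupported && Sse41.IsSupported && Ssse3.IsSupported && Sse2.IsSupported)
                return ~Clmul.Step(data, len, ~crcReg);

            // classic fallback: bit-at-a-time with the reflected ISO polynomial
            for(uint i = 0; i < len; i++)
            {
                crcReg ^= data[i];

                for(int j = 0; j < 8; j++)
                    crcReg = (crcReg & 1) != 0 ? (crcReg >> 1) ^ 0xEDB88320 : crcReg >> 1;
            }

            return crcReg;
        }
    }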
@@ -404,40 +409,48 @@ namespace Aaru.Checksums return table; } - static void Step(ref uint previousCrc, uint[][] table, byte[] data, uint len) + static void Step(ref uint previousCrc, uint[][] table, byte[] data, uint len, bool useIso) { + if(useIso && + Pclmulqdq.IsSupported && + Sse41.IsSupported && + Ssse3.IsSupported && + Sse2.IsSupported) + { + previousCrc = ~Clmul.Step(data, len, ~previousCrc); + + return; + } + // Unroll according to Intel slicing by uint8_t // http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf // http://sourceforge.net/projects/slicing-by-8/ + int currentPos = 0; + const int unroll = 4; + const int bytesAtOnce = 8 * unroll; + uint crc = previousCrc; - uint crc; - int current_pos = 0; - const int unroll = 4; - const int bytes_at_once = 8 * unroll; - - crc = previousCrc; - - while(len >= bytes_at_once) + while(len >= bytesAtOnce) { int unrolling; for(unrolling = 0; unrolling < unroll; unrolling++) { - uint one = BitConverter.ToUInt32(data, current_pos) ^ crc; - current_pos += 4; - uint two = BitConverter.ToUInt32(data, current_pos); - current_pos += 4; + uint one = BitConverter.ToUInt32(data, currentPos) ^ crc; + currentPos += 4; + uint two = BitConverter.ToUInt32(data, currentPos); + currentPos += 4; crc = table[0][(two >> 24) & 0xFF] ^ table[1][(two >> 16) & 0xFF] ^ table[2][(two >> 8) & 0xFF] ^ table[3][two & 0xFF] ^ table[4][(one >> 24) & 0xFF] ^ table[5][(one >> 16) & 0xFF] ^ table[6][(one >> 8) & 0xFF] ^ table[7][one & 0xFF]; } - len -= bytes_at_once; + len -= bytesAtOnce; } while(len-- != 0) - crc = (crc >> 8) ^ table[0][(crc & 0xFF) ^ data[current_pos++]]; + crc = (crc >> 8) ^ table[0][(crc & 0xFF) ^ data[currentPos++]]; previousCrc = crc; } @@ -475,7 +488,8 @@ namespace Aaru.Checksums while(read > 0) { - Step(ref localHashInt, localTable, buffer, (uint)read); + Step(ref localHashInt, localTable, buffer, (uint)read, + polynomial == CRC32_ISO_POLY && seed == CRC32_ISO_SEED); read = fileStream.Read(buffer, 0, 65536); } @@ -512,7 +526,7 @@ namespace Aaru.Checksums uint[][] localTable = GenerateTable(polynomial); - Step(ref localHashInt, localTable, data, len); + Step(ref localHashInt, localTable, data, len, polynomial == CRC32_ISO_POLY && seed == CRC32_ISO_SEED); localHashInt ^= seed; hash = BigEndianBitConverter.GetBytes(localHashInt);
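
A quick known-answer check for the new path (a sketch only, runnable from inside the assembly
since Clmul is internal, e.g. in a unit test): Clmul.Step takes and returns the plain,
non-inverted CRC value and applies the 0xFFFFFFFF pre/post conditioning internally, so the
standard CRC32/ISO-HDLC check vector for "123456789" applies directly. Note that Step reads
whole 16-byte lanes even for short inputs, so the buffer is padded here; the pshufb masking in
PartialFold discards the padding bytes.

    using System.Text;
    using Aaru.Checksums.CRC32;

    byte[] buf = new byte[16];                             // padded: Step loads 16 bytes at a time
    Encoding.ASCII.GetBytes("123456789").CopyTo(buf, 0);

    uint crc = Clmul.Step(buf, 9, 0);                      // expected: 0xCBF43926, the standard
                                                           // CRC-32 check value for "123456789"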