From 7b8f4cd795c7b089f6273bf09386b62e45425185 Mon Sep 17 00:00:00 2001
From: Natalia Portillo
Date: Sun, 26 Sep 2021 23:13:43 +0100
Subject: [PATCH] Add PCLMUL implementation of CRC32.

---
 Aaru.Checksums.csproj |  45 ++--
 CRC32/clmul.cs        | 541 ++++++++++++++++++++++++++++++++++++++++++
 CRC32Context.cs       |  52 ++--
 3 files changed, 597 insertions(+), 41 deletions(-)
 create mode 100644 CRC32/clmul.cs

diff --git a/Aaru.Checksums.csproj b/Aaru.Checksums.csproj
index cac845667..696e74abd 100644
--- a/Aaru.Checksums.csproj
+++ b/Aaru.Checksums.csproj
@@ -55,28 +55,29 @@
 [Hunk body lost in extraction: the MSBuild XML tags were stripped, leaving only the +/-
 markers. The surviving fragments ("false", "{F8BDF57B-1571-4CD0-84B3-B422088D359A}",
 "Aaru.Helpers") show it rewrites the project's item group and the Aaru.Helpers project
 reference, but the exact elements are not recoverable.]
@@ -95,15 +96,15 @@
 [Hunk body likewise lost in extraction; the exact item changes are not recoverable.]

diff --git a/CRC32/clmul.cs b/CRC32/clmul.cs
new file mode 100644
index 000000000..b446d53b0
--- /dev/null
+++ b/CRC32/clmul.cs
@@ -0,0 +1,541 @@
+// /***************************************************************************
+// Aaru Data Preservation Suite
+// ----------------------------------------------------------------------------
+//
+// Filename       : clmul.cs
+// Author(s)      : Natalia Portillo
+//                  Wajdi Feghali
+//                  Jim Guilford
+//                  Vinodh Gopal
+//                  Erdinc Ozturk
+//                  Jim Kukunas
+//                  Marian Beermann
+//
+// Component      : Checksums.
+//
+// --[ Description ] ----------------------------------------------------------
+//
+//     Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+//     instruction.
+//
+//     A white paper describing this algorithm can be found at:
+//     http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+// --[ License ] --------------------------------------------------------------
+//
+//     This software is provided 'as-is', without any express or implied warranty.
+//     In no event will the authors be held liable for any damages arising from
+//     the use of this software.
+//
+//     Permission is granted to anyone to use this software for any purpose,
+//     including commercial applications, and to alter it and redistribute it
+//     freely, subject to the following restrictions:
+//
+//     1. The origin of this software must not be misrepresented; you must not
+//        claim that you wrote the original software. If you use this software
+//        in a product, an acknowledgment in the product documentation would be
+//        appreciated but is not required.
+//
+//     2. Altered source versions must be plainly marked as such, and must not be
+//        misrepresented as being the original software.
+//
+//     3. This notice may not be removed or altered from any source distribution.
+//
+// ----------------------------------------------------------------------------
+// Copyright © 2011-2021 Natalia Portillo
+// Copyright (c) 2016 Marian Beermann (add support for initial value, restructuring)
+// Copyright (C) 2013 Intel Corporation. All rights reserved.
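+//
+// --[ Implementation notes ] -------------------------------------------------
+//
+//     In outline: the input is consumed in 64-byte blocks spread over four
+//     128-bit accumulators. Each fold step multiplies the accumulators by a
+//     precomputed power of x modulo the CRC polynomial, using two PCLMULQDQ
+//     carry-less multiplies per accumulator, and XORs in the next block. A
+//     final pass folds 512 bits down to 128 and reduces to the 32-bit CRC
+//     with the rk5..rk8 constants below.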
+// ****************************************************************************/
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Aaru.Checksums.CRC32
+{
+    internal static class Clmul
+    {
+        static readonly uint[] _crcK =
+        {
+            0xccaa009e, 0x00000000, /* rk1 */ 0x751997d0, 0x00000001, /* rk2 */ 0xccaa009e, 0x00000000, /* rk5 */
+            0x63cd6124, 0x00000001, /* rk6 */ 0xf7011640, 0x00000001, /* rk7 */ 0xdb710640, 0x00000001 /* rk8 */
+        };
+
+        static readonly Vector128<uint>[] _pshufbShfTable =
+        {
+            Vector128.Create(0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d), /* shl 15 (16 - 1)/shr1 */
+            Vector128.Create(0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e), /* shl 14 (16 - 2)/shr2 */
+            Vector128.Create(0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f), /* shl 13 (16 - 3)/shr3 */
+            Vector128.Create(0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100), /* shl 12 (16 - 4)/shr4 */
+            Vector128.Create(0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201), /* shl 11 (16 - 5)/shr5 */
+            Vector128.Create(0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302), /* shl 10 (16 - 6)/shr6 */
+            Vector128.Create(0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403), /* shl 9  (16 - 7)/shr7 */
+            Vector128.Create(0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504), /* shl 8  (16 - 8)/shr8 */
+            Vector128.Create(0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605), /* shl 7  (16 - 9)/shr9 */
+            Vector128.Create(0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706), /* shl 6  (16 -10)/shr10*/
+            Vector128.Create(0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807), /* shl 5  (16 -11)/shr11*/
+            Vector128.Create(0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908), /* shl 4  (16 -12)/shr12*/
+            Vector128.Create(0x008f8e8du, 0x04030201, 0x08070605, 0x0c0b0a09), /* shl 3  (16 -13)/shr13*/
+            Vector128.Create(0x01008f8eu, 0x05040302, 0x09080706, 0x0d0c0b0a), /* shl 2  (16 -14)/shr14*/
+            Vector128.Create(0x0201008fu, 0x06050403, 0x0a090807, 0x0e0d0c0b)  /* shl 1  (16 -15)/shr15*/
+        };
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void Fold1(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
+                          ref Vector128<uint> xmmCRC3)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+
+            Vector128<uint> xTmp3 = xmmCRC3;
+
+            xmmCRC3 = xmmCRC0;
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC0 = xmmCRC0.AsSingle();
+            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
+            Vector128<float> psRes  = Sse.Xor(psCRC0, psCRC3);
+
+            xmmCRC0 = xmmCRC1;
+            xmmCRC1 = xmmCRC2;
+            xmmCRC2 = xTmp3;
+            xmmCRC3 = psRes.AsUInt32();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void Fold2(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
+                          ref Vector128<uint> xmmCRC3)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+
+            Vector128<uint> xTmp3 = xmmCRC3;
+            Vector128<uint> xTmp2 = xmmCRC2;
+
+            xmmCRC3 = xmmCRC1;
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC3  = xmmCRC3.AsSingle();
+            Vector128<float> psCRC1  = xmmCRC1.AsSingle();
+            Vector128<float> psRes31 = Sse.Xor(psCRC3, psCRC1);
+
+            xmmCRC2 = xmmCRC0;
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC0  = xmmCRC0.AsSingle();
+            Vector128<float> psCRC2  = xmmCRC2.AsSingle();
+            Vector128<float> psRes20 = Sse.Xor(psCRC0, psCRC2);
+
+            xmmCRC0 = xTmp2;
+            xmmCRC1 = xTmp3;
+            xmmCRC2 = psRes20.AsUInt32();
+            xmmCRC3 = psRes31.AsUInt32();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void Fold3(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
+                          ref Vector128<uint> xmmCRC3)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+
+            Vector128<uint> xTmp3 = xmmCRC3;
+
+            xmmCRC3 = xmmCRC2;
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC2  = xmmCRC2.AsSingle();
+            Vector128<float> psCRC3  = xmmCRC3.AsSingle();
+            Vector128<float> psRes32 = Sse.Xor(psCRC2, psCRC3);
+
+            xmmCRC2 = xmmCRC1;
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC1  = xmmCRC1.AsSingle();
+            psCRC2 = xmmCRC2.AsSingle();
+            Vector128<float> psRes21 = Sse.Xor(psCRC1, psCRC2);
+
+            xmmCRC1 = xmmCRC0;
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC0  = xmmCRC0.AsSingle();
+            psCRC1 = xmmCRC1.AsSingle();
+            Vector128<float> psRes10 = Sse.Xor(psCRC0, psCRC1);
+
+            xmmCRC0 = xTmp3;
+            xmmCRC1 = psRes10.AsUInt32();
+            xmmCRC2 = psRes21.AsUInt32();
+            xmmCRC3 = psRes32.AsUInt32();
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void Fold4(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
+                          ref Vector128<uint> xmmCRC3)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+
+            Vector128<uint> xTmp0 = xmmCRC0;
+            Vector128<uint> xTmp1 = xmmCRC1;
+            Vector128<uint> xTmp2 = xmmCRC2;
+            Vector128<uint> xTmp3 = xmmCRC3;
+
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xTmp0   = Pclmulqdq.CarrylessMultiply(xTmp0.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC0 = xmmCRC0.AsSingle();
+            Vector128<float> psT0   = xTmp0.AsSingle();
+            Vector128<float> psRes0 = Sse.Xor(psCRC0, psT0);
+
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xTmp1   = Pclmulqdq.CarrylessMultiply(xTmp1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC1 = xmmCRC1.AsSingle();
+            Vector128<float> psT1   = xTmp1.AsSingle();
+            Vector128<float> psRes1 = Sse.Xor(psCRC1, psT1);
+
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xTmp2   = Pclmulqdq.CarrylessMultiply(xTmp2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC2 = xmmCRC2.AsSingle();
+            Vector128<float> psT2   = xTmp2.AsSingle();
+            Vector128<float> psRes2 = Sse.Xor(psCRC2, psT2);
+
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+            xTmp3   = Pclmulqdq.CarrylessMultiply(xTmp3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
+            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
+            Vector128<float> psT3   = xTmp3.AsSingle();
+            Vector128<float> psRes3 = Sse.Xor(psCRC3, psT3);
+
+            xmmCRC0 = psRes0.AsUInt32();
+            xmmCRC1 = psRes1.AsUInt32();
+            xmmCRC2 = psRes2.AsUInt32();
+            xmmCRC3 = psRes3.AsUInt32();
+        }
+
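+        /*
+         * Folds a trailing block of 1..15 bytes into the running state: the
+         * accumulators are shifted through _pshufbShfTable so the partial
+         * bytes line up, the bytes shifted out of the first accumulator are
+         * folded back in with two carry-less multiplies, and the partial data
+         * is OR-ed into the last accumulator.
+         */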
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        static void PartialFold(long len, ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1,
+                                ref Vector128<uint> xmmCRC2, ref Vector128<uint> xmmCRC3,
+                                ref Vector128<uint> xmmCRCPart)
+        {
+            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
+            Vector128<uint> xmmMask3 = Vector128.Create(0x80808080);
+
+            Vector128<uint> xmmShl = _pshufbShfTable[len - 1];
+            Vector128<uint> xmmShr = xmmShl;
+            xmmShr = Sse2.Xor(xmmShr, xmmMask3);
+
+            Vector128<uint> xmmA00 = Ssse3.Shuffle(xmmCRC0.AsByte(), xmmShl.AsByte()).AsUInt32();
+
+            xmmCRC0 = Ssse3.Shuffle(xmmCRC0.AsByte(), xmmShr.AsByte()).AsUInt32();
+            Vector128<uint> xmmTmp1 = Ssse3.Shuffle(xmmCRC1.AsByte(), xmmShl.AsByte()).AsUInt32();
+            xmmCRC0 = Sse2.Or(xmmCRC0, xmmTmp1);
+
+            xmmCRC1 = Ssse3.Shuffle(xmmCRC1.AsByte(), xmmShr.AsByte()).AsUInt32();
+            Vector128<uint> xmmTmp2 = Ssse3.Shuffle(xmmCRC2.AsByte(), xmmShl.AsByte()).AsUInt32();
+            xmmCRC1 = Sse2.Or(xmmCRC1, xmmTmp2);
+
+            xmmCRC2 = Ssse3.Shuffle(xmmCRC2.AsByte(), xmmShr.AsByte()).AsUInt32();
+            Vector128<uint> xmmTmp3 = Ssse3.Shuffle(xmmCRC3.AsByte(), xmmShl.AsByte()).AsUInt32();
+            xmmCRC2 = Sse2.Or(xmmCRC2, xmmTmp3);
+
+            xmmCRC3    = Ssse3.Shuffle(xmmCRC3.AsByte(), xmmShr.AsByte()).AsUInt32();
+            xmmCRCPart = Ssse3.Shuffle(xmmCRCPart.AsByte(), xmmShl.AsByte()).AsUInt32();
+            xmmCRC3    = Sse2.Or(xmmCRC3, xmmCRCPart);
+
+            Vector128<uint> xmmA01 = Pclmulqdq.CarrylessMultiply(xmmA00.AsUInt64(), xmmFold4.AsUInt64(), 0x10).
+                                               AsUInt32();
+
+            xmmA00 = Pclmulqdq.CarrylessMultiply(xmmA00.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
+
+            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
+            Vector128<float> psa00  = xmmA00.AsSingle();
+            Vector128<float> psa01  = xmmA01.AsSingle();
+
+            Vector128<float> psRes = Sse.Xor(psCRC3, psa00);
+            psRes = Sse.Xor(psRes, psa01);
+
+            xmmCRC3 = psRes.AsUInt32();
+        }
+
+        internal static uint Step(byte[] src, long len, uint initialCRC)
+        {
+            Vector128<uint> xmmT0, xmmT1, xmmT2;
+            Vector128<uint> xmmInitial = Sse2.ConvertScalarToVector128UInt32(initialCRC);
+            Vector128<uint> xmmCRC0    = Sse2.ConvertScalarToVector128UInt32(0x9db42487);
+            Vector128<uint> xmmCRC1    = Vector128<uint>.Zero;
+            Vector128<uint> xmmCRC2    = Vector128<uint>.Zero;
+            Vector128<uint> xmmCRC3    = Vector128<uint>.Zero;
+            Vector128<uint> xmmCRCPart;
+            int             bufPos = 0;
+
+            bool first = true;
+
+            /* fold 512 to 32 step variable declarations for ISO-C90 compat. */
+            Vector128<uint> xmmMask  = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
+            Vector128<uint> xmmMask2 = Vector128.Create(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+
+            uint crc;
+
+            if(len < 16)
+            {
+                switch(len)
+                {
+                    case 0: return initialCRC;
+                    case < 4:
+                        /*
+                         * no idea how to do this for <4 bytes, delegate to classic impl.
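+                         *
+                         * The classic update consumes one byte at a time:
+                         *     crc = (crc >> 8) ^ table[0][(crc & 0xFF) ^ b];
+                         * which is what the switch below does for the
+                         * remaining 1..3 bytes.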
+                         */
+                        crc = ~initialCRC;
+
+                        switch(len)
+                        {
+                            case 3:
+                                crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos++]];
+
+                                goto case 2;
+                            case 2:
+                                crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos++]];
+
+                                goto case 1;
+                            case 1:
+                                crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos]];
+
+                                break;
+                        }
+
+                        return ~crc;
+                }
+
+                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, 0), BitConverter.ToUInt32(src, 4),
+                                              BitConverter.ToUInt32(src, 8), BitConverter.ToUInt32(src, 12));
+
+                xmmCRCPart = Sse2.Xor(xmmCRCPart, xmmInitial);
+
+                goto partial;
+            }
+
+            while((len -= 64) >= 0)
+            {
+                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                Vector128<uint> xmmT3 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+                                                         BitConverter.ToUInt32(src, bufPos + 4),
+                                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                if(first)
+                {
+                    first = false;
+                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
+                }
+
+                Fold4(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
+
+                xmmCRC0 = Sse2.Xor(xmmCRC0, xmmT0);
+                xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT1);
+                xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT2);
+                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT3);
+            }
+
+            /*
+             * len = num bytes left - 64
+             */
+            if(len + 16 >= 0)
+            {
+                len += 16;
+
+                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                if(first)
+                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
+
+                Fold3(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
+
+                xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT0);
+                xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT1);
+                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT2);
+
+                if(len == 0)
+                    goto done;
+
+                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+                                              BitConverter.ToUInt32(src, bufPos + 4),
+                                              BitConverter.ToUInt32(src, bufPos + 8),
+                                              BitConverter.ToUInt32(src, bufPos + 12));
+            }
+            else if(len + 32 >= 0)
+            {
+                len += 32;
+
+                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                if(first)
+                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
+
+                Fold2(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
+
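+                // the fold by two shifted the old state down; XOR the two
+                // trailing 16-byte blocks into the freshly folded accumulators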
+                xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT0);
+                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT1);
+
+                if(len == 0)
+                    goto done;
+
+                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+                                              BitConverter.ToUInt32(src, bufPos + 4),
+                                              BitConverter.ToUInt32(src, bufPos + 8),
+                                              BitConverter.ToUInt32(src, bufPos + 12));
+            }
+            else if(len + 48 >= 0)
+            {
+                len += 48;
+
+                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
+                                         BitConverter.ToUInt32(src, bufPos + 8),
+                                         BitConverter.ToUInt32(src, bufPos + 12));
+
+                bufPos += 16;
+
+                if(first)
+                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
+
+                Fold1(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
+
+                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT0);
+
+                if(len == 0)
+                    goto done;
+
+                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+                                              BitConverter.ToUInt32(src, bufPos + 4),
+                                              BitConverter.ToUInt32(src, bufPos + 8),
+                                              BitConverter.ToUInt32(src, bufPos + 12));
+            }
+            else
+            {
+                len += 64;
+
+                if(len == 0)
+                    goto done;
+
+                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
+                                              BitConverter.ToUInt32(src, bufPos + 4),
+                                              BitConverter.ToUInt32(src, bufPos + 8),
+                                              BitConverter.ToUInt32(src, bufPos + 12));
+
+                if(first)
+                    xmmCRCPart = Sse2.Xor(xmmCRCPart, xmmInitial);
+            }
+
+            partial:
+            PartialFold(len, ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3, ref xmmCRCPart);
+
+            done:
+
+            /* fold 512 to 32 */
+
+            /*
+             * k1
+             */
+            Vector128<uint> crcFold = Vector128.Create(_crcK[0], _crcK[1], _crcK[2], _crcK[3]);
+
+            Vector128<uint> xTmp0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x10).
+                                              AsUInt32();
+
+            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC1 = Sse2.Xor(xmmCRC1, xTmp0);
+            xmmCRC1 = Sse2.Xor(xmmCRC1, xmmCRC0);
+
+            Vector128<uint> xTmp1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x10).
+                                              AsUInt32();
+
+            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC2 = Sse2.Xor(xmmCRC2, xTmp1);
+            xmmCRC2 = Sse2.Xor(xmmCRC2, xmmCRC1);
+
+            Vector128<uint> xTmp2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x10).
+                                              AsUInt32();
+
+            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xTmp2);
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
+
+            /*
+             * k5
+             */
+            crcFold = Vector128.Create(_crcK[4], _crcK[5], _crcK[6], _crcK[7]);
+
+            xmmCRC0 = xmmCRC3;
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
+            xmmCRC0 = Sse2.ShiftRightLogical128BitLane(xmmCRC0, 8);
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);
+
+            xmmCRC0 = xmmCRC3;
+            xmmCRC3 = Sse2.ShiftLeftLogical128BitLane(xmmCRC3, 4);
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);
+            xmmCRC3 = Sse2.And(xmmCRC3, xmmMask2);
+
+            /*
+             * k7
+             */
+            xmmCRC1 = xmmCRC3;
+            xmmCRC2 = xmmCRC3;
+            crcFold = Vector128.Create(_crcK[8], _crcK[9], _crcK[10], _crcK[11]);
+
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
+            xmmCRC3 = Sse2.And(xmmCRC3, xmmMask);
+
+            xmmCRC2 = xmmCRC3;
+            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
+            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC1);
+
+            /*
+             * could just as well write xmm_crc3[2], doing a movaps and truncating, but
+             * no real advantage - it's a tiny bit slower per call, while no additional CPUs
+             * would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
+             */
+            crc = Sse41.Extract(xmmCRC3, 2);
+
+            return ~crc;
+        }
+    }
+}
\ No newline at end of file
diff --git a/CRC32Context.cs b/CRC32Context.cs
index f313ee3ef..1db3101eb 100644
--- a/CRC32Context.cs
+++ b/CRC32Context.cs
@@ -32,7 +32,9 @@
 
 using System;
 using System.IO;
+using System.Runtime.Intrinsics.X86;
 using System.Text;
+using Aaru.Checksums.CRC32;
 using Aaru.CommonTypes.Interfaces;
 using Aaru.Helpers;
 
@@ -45,7 +47,7 @@ namespace Aaru.Checksums
         const uint CRC32_ISO_POLY = 0xEDB88320;
         const uint CRC32_ISO_SEED = 0xFFFFFFFF;
 
-        static readonly uint[][] _isoCrc32Table =
+        internal static readonly uint[][] _isoCrc32Table =
         {
             new uint[]
             {
@@ -332,6 +334,7 @@ namespace Aaru.Checksums
         readonly uint     _finalSeed;
         readonly uint[][] _table;
         uint              _hashInt;
+        readonly bool     _useIso;
 
         /// <summary>Initializes the CRC32 table and seed as CRC32-ISO</summary>
         public Crc32Context()
@@ -339,6 +342,7 @@ namespace Aaru.Checksums
             _hashInt   = CRC32_ISO_SEED;
             _finalSeed = CRC32_ISO_SEED;
             _table     = _isoCrc32Table;
+            _useIso    = true;
         }
 
         /// <summary>Initializes the CRC32 table with a custom polynomial and seed</summary>
@@ -346,6 +350,7 @@ namespace Aaru.Checksums
         {
             _hashInt   = seed;
             _finalSeed = seed;
+            _useIso    = polynomial == CRC32_ISO_POLY && seed == CRC32_ISO_SEED;
             _table     = GenerateTable(polynomial);
         }
 
@@ -354,7 +359,7 @@ namespace Aaru.Checksums
         /// <inheritdoc />
         /// <summary>Updates the hash with data.</summary>
         /// <param name="data">Data buffer.</param>
         /// <param name="len">Length of buffer to hash.</param>
-        public void Update(byte[] data, uint len) => Step(ref _hashInt, _table, data, len);
+        public void Update(byte[] data, uint len) => Step(ref _hashInt, _table, data, len, _useIso);
 
         /// <inheritdoc />
         /// <summary>Updates the hash with data.</summary>
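The hunk below adds the dispatch that the _useIso flag feeds. Only the stock ISO configuration
is routed to the accelerated kernel, because Clmul.Step hardcodes folding constants derived
from the ISO polynomial; custom polynomials (and, conservatively, custom seeds) keep the
table-driven path. A minimal self-contained sketch of the same pattern, with illustrative
names that are not part of the patch (the fallback here is bit-at-a-time rather than the
patch's slicing-by-8):

    using Aaru.Checksums.CRC32;
    using System.Runtime.Intrinsics.X86;

    static class DispatchSketch
    {
        // crcReg is the running, already-inverted CRC register, exactly like
        // previousCrc in the patched Step below.
        internal static uint Step(byte[] data, uint len, uint crcReg)
        {
            // The hardware path needs every instruction set the kernel uses.
            if(Pclmulqdq.IsSupported && Sse41.IsSupported && Ssse3.IsSupported && Sse2.IsSupported)
                return ~Clmul.Step(data, len, ~crcReg);

            // classic fallback: bit-at-a-time with the reflected ISO polynomial
            for(uint i = 0; i < len; i++)
            {
                crcReg ^= data[i];

                for(int j = 0; j < 8; j++)
                    crcReg = (crcReg & 1) != 0 ? (crcReg >> 1) ^ 0xEDB88320 : crcReg >> 1;
            }

            return crcReg;
        }
    }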
@@ -404,40 +409,48 @@ namespace Aaru.Checksums return table; } - static void Step(ref uint previousCrc, uint[][] table, byte[] data, uint len) + static void Step(ref uint previousCrc, uint[][] table, byte[] data, uint len, bool useIso) { + if(useIso && + Pclmulqdq.IsSupported && + Sse41.IsSupported && + Ssse3.IsSupported && + Sse2.IsSupported) + { + previousCrc = ~Clmul.Step(data, len, ~previousCrc); + + return; + } + // Unroll according to Intel slicing by uint8_t // http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf // http://sourceforge.net/projects/slicing-by-8/ + int currentPos = 0; + const int unroll = 4; + const int bytesAtOnce = 8 * unroll; + uint crc = previousCrc; - uint crc; - int current_pos = 0; - const int unroll = 4; - const int bytes_at_once = 8 * unroll; - - crc = previousCrc; - - while(len >= bytes_at_once) + while(len >= bytesAtOnce) { int unrolling; for(unrolling = 0; unrolling < unroll; unrolling++) { - uint one = BitConverter.ToUInt32(data, current_pos) ^ crc; - current_pos += 4; - uint two = BitConverter.ToUInt32(data, current_pos); - current_pos += 4; + uint one = BitConverter.ToUInt32(data, currentPos) ^ crc; + currentPos += 4; + uint two = BitConverter.ToUInt32(data, currentPos); + currentPos += 4; crc = table[0][(two >> 24) & 0xFF] ^ table[1][(two >> 16) & 0xFF] ^ table[2][(two >> 8) & 0xFF] ^ table[3][two & 0xFF] ^ table[4][(one >> 24) & 0xFF] ^ table[5][(one >> 16) & 0xFF] ^ table[6][(one >> 8) & 0xFF] ^ table[7][one & 0xFF]; } - len -= bytes_at_once; + len -= bytesAtOnce; } while(len-- != 0) - crc = (crc >> 8) ^ table[0][(crc & 0xFF) ^ data[current_pos++]]; + crc = (crc >> 8) ^ table[0][(crc & 0xFF) ^ data[currentPos++]]; previousCrc = crc; } @@ -475,7 +488,8 @@ namespace Aaru.Checksums while(read > 0) { - Step(ref localHashInt, localTable, buffer, (uint)read); + Step(ref localHashInt, localTable, buffer, (uint)read, + polynomial == CRC32_ISO_POLY && seed == CRC32_ISO_SEED); read = fileStream.Read(buffer, 0, 65536); } @@ -512,7 +526,7 @@ namespace Aaru.Checksums uint[][] localTable = GenerateTable(polynomial); - Step(ref localHashInt, localTable, data, len); + Step(ref localHashInt, localTable, data, len, polynomial == CRC32_ISO_POLY && seed == CRC32_ISO_SEED); localHashInt ^= seed; hash = BigEndianBitConverter.GetBytes(localHashInt);
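
A quick known-answer check for the new path (a sketch only, runnable from inside the assembly
since Clmul is internal, e.g. in a unit test): Clmul.Step takes and returns the plain,
non-inverted CRC value and applies the 0xFFFFFFFF pre/post conditioning internally, so the
standard CRC32/ISO-HDLC check vector for "123456789" applies directly. Note that Step reads
whole 16-byte lanes even for short inputs, so the buffer is padded here; the pshufb masking in
PartialFold discards the padding bytes.

    using System.Text;
    using Aaru.Checksums.CRC32;

    byte[] buf = new byte[16];                             // padded: Step loads 16 bytes at a time
    Encoding.ASCII.GetBytes("123456789").CopyTo(buf, 0);

    uint crc = Clmul.Step(buf, 9, 0);                      // expected: 0xCBF43926, the standard
                                                           // CRC-32 check value for "123456789"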