From ee09f7c57e9d23e615d7f8090b25fc4b0d1e83b1 Mon Sep 17 00:00:00 2001
From: Natalia Portillo
Date: Mon, 11 Oct 2021 22:51:11 +0100
Subject: [PATCH] Fix SIMD implementations for partial CRC blocks.

---
 CRC32/clmul.cs  | 295 +------------------------------------------------
 CRC32Context.cs |  17 ++-
 CRC64/clmul.cs  |  27 +----
 CRC64Context.cs |  19 +++-
 4 files changed, 34 insertions(+), 324 deletions(-)

diff --git a/CRC32/clmul.cs b/CRC32/clmul.cs
index b446d53..679450d 100644
--- a/CRC32/clmul.cs
+++ b/CRC32/clmul.cs
@@ -81,91 +81,6 @@ namespace Aaru.Checksums.CRC32
             Vector128.Create(0x0201008fu, 0x06050403, 0x0a090807, 0x0e0d0c0b) /* shl 1 (16 -15)/shr15*/
         };
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        static void Fold1(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
-                          ref Vector128<uint> xmmCRC3)
-        {
-            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
-
-            Vector128<uint> xTmp3 = xmmCRC3;
-
-            xmmCRC3 = xmmCRC0;
-            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
-            Vector128<float> psCRC0 = xmmCRC0.AsSingle();
-            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
-            Vector128<float> psRes  = Sse.Xor(psCRC0, psCRC3);
-
-            xmmCRC0 = xmmCRC1;
-            xmmCRC1 = xmmCRC2;
-            xmmCRC2 = xTmp3;
-            xmmCRC3 = psRes.AsUInt32();
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        static void Fold2(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
-                          ref Vector128<uint> xmmCRC3)
-        {
-            Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
-
-            Vector128<uint> xTmp3 = xmmCRC3;
-            Vector128<uint> xTmp2 = xmmCRC2;
-
-            xmmCRC3 = xmmCRC1;
-            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
-            Vector128<float> psCRC3  = xmmCRC3.AsSingle();
-            Vector128<float> psCRC1  = xmmCRC1.AsSingle();
-            Vector128<float> psRes31 = Sse.Xor(psCRC3, psCRC1);
-
-            xmmCRC2 = xmmCRC0;
-            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
-            Vector128<float> psCRC0  = xmmCRC0.AsSingle();
-            Vector128<float> psCRC2  = xmmCRC2.AsSingle();
-            Vector128<float> psRes20 = Sse.Xor(psCRC0, psCRC2);
-
-            xmmCRC0 = xTmp2;
-            xmmCRC1 = xTmp3;
-            xmmCRC2 = psRes20.AsUInt32();
-            xmmCRC3 = psRes31.AsUInt32();
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        static void Fold3(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
-                          ref Vector128<uint> xmmCRC3)
-        {
-            Vector128<uint> xmmFold4 = Vector128.Create(0x54442bd4, 0x00000001, 0xc6e41596, 0x00000001);
-
-            Vector128<uint> xTmp3 = xmmCRC3;
-
-            xmmCRC3 = xmmCRC2;
-            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
-            Vector128<float> psCRC2  = xmmCRC2.AsSingle();
-            Vector128<float> psCRC3  = xmmCRC3.AsSingle();
-            Vector128<float> psRes32 = Sse.Xor(psCRC2, psCRC3);
-
-            xmmCRC2 = xmmCRC1;
-            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
-            Vector128<float> psCRC1 = xmmCRC1.AsSingle();
-            psCRC2 = xmmCRC2.AsSingle();
-            Vector128<float> psRes21 = Sse.Xor(psCRC1, psCRC2);
-
-            xmmCRC1 = xmmCRC0;
-            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
-            Vector128<float> psCRC0 = xmmCRC0.AsSingle();
-            psCRC1 = xmmCRC1.AsSingle();
-            Vector128<float> psRes10 = Sse.Xor(psCRC0, psCRC1);
-
-            xmmCRC0 = xTmp3;
-            xmmCRC1 = psRes10.AsUInt32();
-            xmmCRC2 = psRes21.AsUInt32();
-            xmmCRC3 = psRes32.AsUInt32();
-        }
-
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         static void Fold4(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
                           ref Vector128<uint> xmmCRC3)
@@ -207,51 +122,6 @@ namespace Aaru.Checksums.CRC32
             xmmCRC3 = psRes3.AsUInt32();
         }
 
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        static void PartialFold(long len, ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1,
-                                ref Vector128<uint> xmmCRC2, ref Vector128<uint> xmmCRC3,
-                                ref Vector128<uint> xmmCRCPart)
-        {
-            Vector128<uint> xmmFold4 = Vector128.Create(0x54442bd4, 0x00000001, 0xc6e41596, 0x00000001);
-            Vector128<uint> xmmMask3 = Vector128.Create(0x80808080);
-
-            Vector128<uint> xmmShl = _pshufbShfTable[len - 1];
-            Vector128<uint> xmmShr = xmmShl;
-            xmmShr = Sse2.Xor(xmmShr, xmmMask3);
-
-            Vector128<uint> xmmA00 = Ssse3.Shuffle(xmmCRC0.AsByte(), xmmShl.AsByte()).AsUInt32();
-
-            xmmCRC0 = Ssse3.Shuffle(xmmCRC0.AsByte(), xmmShr.AsByte()).AsUInt32();
-            Vector128<uint> xmmTmp1 = Ssse3.Shuffle(xmmCRC1.AsByte(), xmmShl.AsByte()).AsUInt32();
-            xmmCRC0 = Sse2.Or(xmmCRC0, xmmTmp1);
-
-            xmmCRC1 = Ssse3.Shuffle(xmmCRC1.AsByte(), xmmShr.AsByte()).AsUInt32();
-            Vector128<uint> xmmTmp2 = Ssse3.Shuffle(xmmCRC2.AsByte(), xmmShl.AsByte()).AsUInt32();
-            xmmCRC1 = Sse2.Or(xmmCRC1, xmmTmp2);
-
-            xmmCRC2 = Ssse3.Shuffle(xmmCRC2.AsByte(), xmmShr.AsByte()).AsUInt32();
-            Vector128<uint> xmmTmp3 = Ssse3.Shuffle(xmmCRC3.AsByte(), xmmShl.AsByte()).AsUInt32();
-            xmmCRC2 = Sse2.Or(xmmCRC2, xmmTmp3);
-
-            xmmCRC3 = Ssse3.Shuffle(xmmCRC3.AsByte(), xmmShr.AsByte()).AsUInt32();
-            xmmCRCPart = Ssse3.Shuffle(xmmCRCPart.AsByte(), xmmShl.AsByte()).AsUInt32();
-            xmmCRC3 = Sse2.Or(xmmCRC3, xmmCRCPart);
-
-            Vector128<uint> xmmA01 = Pclmulqdq.CarrylessMultiply(xmmA00.AsUInt64(), xmmFold4.AsUInt64(), 0x10).
-                                               AsUInt32();
-
-            xmmA00 = Pclmulqdq.CarrylessMultiply(xmmA00.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-
-            Vector128<float> psCRC3 = xmmCRC3.AsSingle();
-            Vector128<float> psa00  = xmmA00.AsSingle();
-            Vector128<float> psa01  = xmmA01.AsSingle();
-
-            Vector128<float> psRes = Sse.Xor(psCRC3, psa00);
-            psRes = Sse.Xor(psRes, psa01);
-
-            xmmCRC3 = psRes.AsUInt32();
-        }
-
         internal static uint Step(byte[] src, long len, uint initialCRC)
         {
             Vector128<uint> xmmT0, xmmT1, xmmT2;
@@ -260,8 +130,7 @@ namespace Aaru.Checksums.CRC32
             Vector128<uint> xmmCRC1 = Vector128<uint>.Zero;
             Vector128<uint> xmmCRC2 = Vector128<uint>.Zero;
             Vector128<uint> xmmCRC3 = Vector128<uint>.Zero;
-            Vector128<uint> xmmCRCPart;
-            int             bufPos = 0;
+            int bufPos = 0;
 
             bool first = true;
 
@@ -269,44 +138,6 @@ namespace Aaru.Checksums.CRC32
             Vector128<uint> xmmMask  = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
             Vector128<uint> xmmMask2 = Vector128.Create(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
 
-            uint crc;
-
-            if(len < 16)
-            {
-                switch(len)
-                {
-                    case 0: return initialCRC;
-                    case < 4:
-                        /*
-                         * no idea how to do this for <4 bytes, delegate to classic impl.
-                         */
-                        crc = ~initialCRC;
-
-                        switch(len)
-                        {
-                            case 3:
-                                crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos++]];
-                                goto case 2;
-                            case 2:
-                                crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos++]];
-                                goto case 1;
-                            case 1:
-                                crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos]];
-
-                                break;
-                        }
-
-                        return ~crc;
-                }
-
-                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, 0), BitConverter.ToUInt32(src, 4),
-                                              BitConverter.ToUInt32(src, 8), BitConverter.ToUInt32(src, 12));
-
-                xmmCRCPart = Sse2.Xor(xmmCRCPart, xmmInitial);
-
-                goto partial;
-            }
-
             while((len -= 64) >= 0)
             {
                 xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
@@ -348,126 +179,6 @@ namespace Aaru.Checksums.CRC32
                 xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT3);
             }
 
-            /*
-             * len = num bytes left - 64
-             */
-            if(len + 16 >= 0)
-            {
-                len += 16;
-
-                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
-                                         BitConverter.ToUInt32(src, bufPos + 8),
-                                         BitConverter.ToUInt32(src, bufPos + 12));
-
-                bufPos += 16;
-
-                xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
-                                         BitConverter.ToUInt32(src, bufPos + 8),
-                                         BitConverter.ToUInt32(src, bufPos + 12));
-
-                bufPos += 16;
-
-                xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
-                                         BitConverter.ToUInt32(src, bufPos + 8),
-                                         BitConverter.ToUInt32(src, bufPos + 12));
-
-                bufPos += 16;
-
-                if(first)
-                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
-
-                Fold3(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
-
-                xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT0);
-                xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT1);
-                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT2);
-
-                if(len == 0)
-                    goto done;
-
-                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
-                                              BitConverter.ToUInt32(src, bufPos + 4),
-                                              BitConverter.ToUInt32(src, bufPos + 8),
-                                              BitConverter.ToUInt32(src, bufPos + 12));
-            }
-            else if(len + 32 >= 0)
-            {
-                len += 32;
-
-                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
-                                         BitConverter.ToUInt32(src, bufPos + 8),
-                                         BitConverter.ToUInt32(src, bufPos + 12));
-
-                bufPos += 16;
-
-                xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
-                                         BitConverter.ToUInt32(src, bufPos + 8),
-                                         BitConverter.ToUInt32(src, bufPos + 12));
-
-                bufPos += 16;
-
-                if(first)
-                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
-
-                Fold2(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
-
-                xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT0);
-                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT1);
-
-                if(len == 0)
-                    goto done;
-
-                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
-                                              BitConverter.ToUInt32(src, bufPos + 4),
-                                              BitConverter.ToUInt32(src, bufPos + 8),
-                                              BitConverter.ToUInt32(src, bufPos + 12));
-            }
-            else if(len + 48 >= 0)
-            {
-                len += 48;
-
-                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
-                                         BitConverter.ToUInt32(src, bufPos + 8),
-                                         BitConverter.ToUInt32(src, bufPos + 12));
-
-                bufPos += 16;
-
-                if(first)
-                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
-
-                Fold1(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
-
-                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT0);
-
-                if(len == 0)
-                    goto done;
-
-                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
-                                              BitConverter.ToUInt32(src, bufPos + 4),
-                                              BitConverter.ToUInt32(src, bufPos + 8),
-                                              BitConverter.ToUInt32(src, bufPos + 12));
-            }
-            else
-            {
-                len += 64;
-
-                if(len == 0)
-                    goto done;
-
-                xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
-                                              BitConverter.ToUInt32(src, bufPos + 4),
-                                              BitConverter.ToUInt32(src, bufPos + 8),
-                                              BitConverter.ToUInt32(src, bufPos + 12));
-
-                if(first)
-                    xmmCRCPart = Sse2.Xor(xmmCRCPart, xmmInitial);
-            }
-
-            partial:
-            PartialFold(len, ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3, ref xmmCRCPart);
-
-            done:
-
             /* fold 512 to 32 */
 
             /*
@@ -533,9 +244,7 @@ namespace Aaru.Checksums.CRC32
              * no real advantage - it's a tiny bit slower per call, while no additional CPUs
              * would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
              */
-            crc = Sse41.Extract(xmmCRC3, 2);
-
-            return ~crc;
+            return ~Sse41.Extract(xmmCRC3, 2);
         }
     }
 }
\ No newline at end of file
diff --git a/CRC32Context.cs b/CRC32Context.cs
index 949012c..132869b 100644
--- a/CRC32Context.cs
+++ b/CRC32Context.cs
@@ -412,6 +412,8 @@ namespace Aaru.Checksums
 
         static void Step(ref uint previousCrc, uint[][] table, byte[] data, uint len, bool useIso)
         {
+            int currentPos = 0;
+
             if(useIso)
             {
                 if(Pclmulqdq.IsSupported &&
@@ -419,9 +421,19 @@ namespace Aaru.Checksums
                    Ssse3.IsSupported &&
                    Sse2.IsSupported)
                 {
-                    previousCrc = ~Clmul.Step(data, len, ~previousCrc);
+                    // Only works in blocks of 64 bytes
+                    uint blocks = len / 64;
 
-                    return;
+                    if(blocks > 0)
+                    {
+                        previousCrc = ~Clmul.Step(data, blocks * 64, ~previousCrc);
+
+                        currentPos = (int)(blocks * 64);
+                        len -= blocks * 64;
+                    }
+
+                    if(len == 0)
+                        return;
                 }
 
                 if(Crc32.Arm64.IsSupported)
@@ -442,7 +454,6 @@ namespace Aaru.Checksums
             // Unroll according to Intel slicing by uint8_t
             // http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
             // http://sourceforge.net/projects/slicing-by-8/
-            int currentPos = 0;
             const int unroll = 4;
             const int bytesAtOnce = 8 * unroll;
             uint crc = previousCrc;
diff --git a/CRC64/clmul.cs b/CRC64/clmul.cs
index 04aa3fc..2bd9d39 100644
--- a/CRC64/clmul.cs
+++ b/CRC64/clmul.cs
@@ -81,9 +81,7 @@ namespace Aaru.Checksums.CRC64
             const ulong pol = 0x92d8af2baf0e1e85;
             Vector128<ulong> foldConstants1 = Vector128.Create(k1, k2);
             Vector128<ulong> foldConstants2 = Vector128.Create(mu, pol);
-            uint leadOutSize = length % 16;
             Vector128<ulong> initialCrc = Vector128.Create(~crc, 0);
-            Vector128<ulong> p;
             length -= 16;
 
             // Initial CRC can simply be added to data
@@ -103,28 +101,9 @@ namespace Aaru.Checksums.CRC64
                 bufPos += 16;
             }
 
-            if(length == 16)
-            {
-                p = Sse2.Xor(accumulator,
-                             Vector128.Create(BitConverter.ToUInt64(data, bufPos),
-                                              BitConverter.ToUInt64(data, bufPos + 8)));
-            }
-            else
-            {
-                Vector128<ulong> end0 = Sse2.Xor(accumulator,
-                                                 Vector128.Create(BitConverter.ToUInt64(data, bufPos),
-                                                                  BitConverter.ToUInt64(data, bufPos + 8)));
-
-                bufPos += 16;
-
-                Vector128<ulong> end1 =
-                    Vector128.Create(BitConverter.ToUInt64(data, bufPos), BitConverter.ToUInt64(data, bufPos + 8));
-
-                ShiftRight128(end0, leadOutSize, out Vector128<ulong> a, out Vector128<ulong> b);
-                ShiftRight128(end1, leadOutSize, out Vector128<ulong> c, out _);
-
-                p = Sse2.Xor(Fold(a, foldConstants1), Sse2.Or(b, c));
-            }
+            Vector128<ulong> p = Sse2.Xor(accumulator,
+                                          Vector128.Create(BitConverter.ToUInt64(data, bufPos),
+                                                           BitConverter.ToUInt64(data, bufPos + 8)));
 
             Vector128<ulong> r = Sse2.Xor(Pclmulqdq.CarrylessMultiply(p, foldConstants1, 0x10),
                                           Sse2.ShiftRightLogical128BitLane(p, 8));
diff --git a/CRC64Context.cs b/CRC64Context.cs
index 411baa5..f9c0855 100644
--- a/CRC64Context.cs
+++ b/CRC64Context.cs
@@ -352,23 +352,34 @@ namespace Aaru.Checksums
 
         static void Step(ref ulong previousCrc, ulong[][] table, byte[] data, uint len, bool useEcma)
        {
+            int dataOff = 0;
+
             if(useEcma &&
                Pclmulqdq.IsSupported &&
                Sse41.IsSupported &&
                Ssse3.IsSupported &&
                Sse2.IsSupported)
             {
-                previousCrc = ~Clmul.Step(~previousCrc, data, len);
+                // Only works in blocks of 32 bytes
+                uint blocks = len / 32;
 
-                return;
+                if(blocks > 0)
+                {
+                    previousCrc = ~Clmul.Step(~previousCrc, data, blocks * 32);
+
+                    dataOff = (int)(blocks * 32);
+                    len -= blocks * 32;
+                }
+
+                if(len == 0)
+                    return;
             }
 
             // Unroll according to Intel slicing by uint8_t
             // http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
             // http://sourceforge.net/projects/slicing-by-8/
-            ulong crc     = previousCrc;
-            int   dataOff = 0;
+            ulong crc = previousCrc;
 
             if(len > 4)
             {
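
The fix in all four files follows one pattern: the PCLMULQDQ kernels now only ever see whole SIMD blocks (64 bytes for CRC-32, 32 bytes for CRC-64), and the byte-granular tail that the deleted Fold1/Fold2/Fold3/PartialFold paths used to handle is delegated to the existing table-driven loop. Below is a minimal, self-contained sketch of that dispatch in C#, not part of the commit: ClmulKernel is a hypothetical stand-in for Clmul.Step (faked here with the same table loop, since the real kernel needs PCLMULQDQ), and Table stands in for Crc32Context._isoCrc32Table[0].

    using System;

    static class CrcBlockDispatchSketch
    {
        // Standard reflected CRC-32 (ISO/zlib) lookup table.
        static readonly uint[] Table = BuildTable(0xEDB88320);

        static uint[] BuildTable(uint poly)
        {
            var table = new uint[256];

            for(uint i = 0; i < 256; i++)
            {
                uint c = i;

                for(int k = 0; k < 8; k++)
                    c = (c & 1) != 0 ? (c >> 1) ^ poly : c >> 1;

                table[i] = c;
            }

            return table;
        }

        // Byte-at-a-time table loop; `crc` is kept pre-inverted, as in Step above.
        static uint TableStep(byte[] data, int pos, uint count, uint crc)
        {
            for(; count > 0; count--, pos++)
                crc = (crc >> 8) ^ Table[(crc ^ data[pos]) & 0xFF];

            return crc;
        }

        // Hypothetical stand-in for Clmul.Step: same contract (takes and returns a
        // non-inverted CRC), but computed with the table loop so the sketch runs
        // anywhere. The real kernel only accepts len % 64 == 0 after this patch.
        static uint ClmulKernel(byte[] data, uint len, uint initialCrc) =>
            ~TableStep(data, 0, len, ~initialCrc);

        static uint Crc32(byte[] data)
        {
            uint len = (uint)data.Length;
            uint crc = 0xFFFFFFFF; // working value, pre-inverted
            int  pos = 0;

            // Whole 64-byte blocks go to the SIMD kernel...
            uint blocks = len / 64;

            if(blocks > 0)
            {
                crc = ~ClmulKernel(data, blocks * 64, ~crc);
                pos = (int)(blocks * 64);
                len -= blocks * 64;
            }

            // ...and the 0..63-byte tail goes to the table loop.
            crc = TableStep(data, pos, len, crc);

            return ~crc;
        }

        static void Main() =>
            Console.WriteLine(Crc32(System.Text.Encoding.ASCII.GetBytes("123456789")).ToString("x8")); // cbf43926

Because the working value is kept pre-inverted, the kernel call is wrapped as ~Clmul.Step(..., ~crc) exactly as in the patched Step methods, so SIMD-sized and table-sized segments chain in either order without an extra finalization pass.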