diff --git a/Aaru6.Checksums/Aaru6.Checksums.csproj b/Aaru6.Checksums/Aaru6.Checksums.csproj
index b9dba7f..d9ac662 100644
--- a/Aaru6.Checksums/Aaru6.Checksums.csproj
+++ b/Aaru6.Checksums/Aaru6.Checksums.csproj
@@ -50,22 +50,23 @@
true
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -73,16 +74,16 @@
-
-
+
+
-
-
-
+
+
+
diff --git a/Aaru6.Checksums/CRC32/clmul.cs b/Aaru6.Checksums/CRC32/clmul.cs
index 15298c9..c8bb7f4 100644
--- a/Aaru6.Checksums/CRC32/clmul.cs
+++ b/Aaru6.Checksums/CRC32/clmul.cs
@@ -81,91 +81,6 @@ namespace Aaru6.Checksums.CRC32
Vector128.Create(0x0201008fu, 0x06050403, 0x0a090807, 0x0e0d0c0b) /* shl 1 (16 -15)/shr15*/
};
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- static void Fold1(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
- ref Vector128<uint> xmmCRC3)
- {
- Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
-
- Vector128<uint> xTmp3 = xmmCRC3;
-
- xmmCRC3 = xmmCRC0;
- xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
- xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
- Vector128<float> psCRC0 = xmmCRC0.AsSingle();
- Vector128<float> psCRC3 = xmmCRC3.AsSingle();
- Vector128<float> psRes = Sse.Xor(psCRC0, psCRC3);
-
- xmmCRC0 = xmmCRC1;
- xmmCRC1 = xmmCRC2;
- xmmCRC2 = xTmp3;
- xmmCRC3 = psRes.AsUInt32();
- }
-
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- static void Fold2(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
- ref Vector128<uint> xmmCRC3)
- {
- Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);
-
- Vector128<uint> xTmp3 = xmmCRC3;
- Vector128<uint> xTmp2 = xmmCRC2;
-
- xmmCRC3 = xmmCRC1;
- xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
- xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
- Vector128<float> psCRC3 = xmmCRC3.AsSingle();
- Vector128<float> psCRC1 = xmmCRC1.AsSingle();
- Vector128<float> psRes31 = Sse.Xor(psCRC3, psCRC1);
-
- xmmCRC2 = xmmCRC0;
- xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
- xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
- Vector128<float> psCRC0 = xmmCRC0.AsSingle();
- Vector128<float> psCRC2 = xmmCRC2.AsSingle();
- Vector128<float> psRes20 = Sse.Xor(psCRC0, psCRC2);
-
- xmmCRC0 = xTmp2;
- xmmCRC1 = xTmp3;
- xmmCRC2 = psRes20.AsUInt32();
- xmmCRC3 = psRes31.AsUInt32();
- }
-
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- static void Fold3(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
- ref Vector128<uint> xmmCRC3)
- {
- Vector128<uint> xmmFold4 = Vector128.Create(0x54442bd4, 0x00000001, 0xc6e41596, 0x00000001);
-
- Vector128<uint> xTmp3 = xmmCRC3;
-
- xmmCRC3 = xmmCRC2;
- xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
- xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
- Vector128<float> psCRC2 = xmmCRC2.AsSingle();
- Vector128<float> psCRC3 = xmmCRC3.AsSingle();
- Vector128<float> psRes32 = Sse.Xor(psCRC2, psCRC3);
-
- xmmCRC2 = xmmCRC1;
- xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
- xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
- Vector128<float> psCRC1 = xmmCRC1.AsSingle();
- psCRC2 = xmmCRC2.AsSingle();
- Vector128<float> psRes21 = Sse.Xor(psCRC1, psCRC2);
-
- xmmCRC1 = xmmCRC0;
- xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
- xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
- Vector128<float> psCRC0 = xmmCRC0.AsSingle();
- psCRC1 = xmmCRC1.AsSingle();
- Vector128<float> psRes10 = Sse.Xor(psCRC0, psCRC1);
-
- xmmCRC0 = xTmp3;
- xmmCRC1 = psRes10.AsUInt32();
- xmmCRC2 = psRes21.AsUInt32();
- xmmCRC3 = psRes32.AsUInt32();
- }
-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void Fold4(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
ref Vector128<uint> xmmCRC3)
@@ -207,51 +122,6 @@ namespace Aaru6.Checksums.CRC32
xmmCRC3 = psRes3.AsUInt32();
}
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- static void PartialFold(long len, ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1,
- ref Vector128<uint> xmmCRC2, ref Vector128<uint> xmmCRC3,
- ref Vector128<uint> xmmCRCPart)
- {
- Vector128<uint> xmmFold4 = Vector128.Create(0x54442bd4, 0x00000001, 0xc6e41596, 0x00000001);
- Vector128<uint> xmmMask3 = Vector128.Create(0x80808080);
-
- Vector128<uint> xmmShl = _pshufbShfTable[len - 1];
- Vector128<uint> xmmShr = xmmShl;
- xmmShr = Sse2.Xor(xmmShr, xmmMask3);
-
- Vector128<uint> xmmA00 = Ssse3.Shuffle(xmmCRC0.AsByte(), xmmShl.AsByte()).AsUInt32();
-
- xmmCRC0 = Ssse3.Shuffle(xmmCRC0.AsByte(), xmmShr.AsByte()).AsUInt32();
- Vector128<uint> xmmTmp1 = Ssse3.Shuffle(xmmCRC1.AsByte(), xmmShl.AsByte()).AsUInt32();
- xmmCRC0 = Sse2.Or(xmmCRC0, xmmTmp1);
-
- xmmCRC1 = Ssse3.Shuffle(xmmCRC1.AsByte(), xmmShr.AsByte()).AsUInt32();
- Vector128<uint> xmmTmp2 = Ssse3.Shuffle(xmmCRC2.AsByte(), xmmShl.AsByte()).AsUInt32();
- xmmCRC1 = Sse2.Or(xmmCRC1, xmmTmp2);
-
- xmmCRC2 = Ssse3.Shuffle(xmmCRC2.AsByte(), xmmShr.AsByte()).AsUInt32();
- Vector128<uint> xmmTmp3 = Ssse3.Shuffle(xmmCRC3.AsByte(), xmmShl.AsByte()).AsUInt32();
- xmmCRC2 = Sse2.Or(xmmCRC2, xmmTmp3);
-
- xmmCRC3 = Ssse3.Shuffle(xmmCRC3.AsByte(), xmmShr.AsByte()).AsUInt32();
- xmmCRCPart = Ssse3.Shuffle(xmmCRCPart.AsByte(), xmmShl.AsByte()).AsUInt32();
- xmmCRC3 = Sse2.Or(xmmCRC3, xmmCRCPart);
-
- Vector128<uint> xmmA01 = Pclmulqdq.CarrylessMultiply(xmmA00.AsUInt64(), xmmFold4.AsUInt64(), 0x10).
- AsUInt32();
-
- xmmA00 = Pclmulqdq.CarrylessMultiply(xmmA00.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
-
- Vector128<float> psCRC3 = xmmCRC3.AsSingle();
- Vector128<float> psa00 = xmmA00.AsSingle();
- Vector128<float> psa01 = xmmA01.AsSingle();
-
- Vector128<float> psRes = Sse.Xor(psCRC3, psa00);
- psRes = Sse.Xor(psRes, psa01);
-
- xmmCRC3 = psRes.AsUInt32();
- }
-
internal static uint Step(byte[] src, long len, uint initialCRC)
{
Vector128<uint> xmmT0, xmmT1, xmmT2;
@@ -260,8 +130,7 @@ namespace Aaru6.Checksums.CRC32
Vector128<uint> xmmCRC1 = Vector128<uint>.Zero;
Vector128<uint> xmmCRC2 = Vector128<uint>.Zero;
Vector128<uint> xmmCRC3 = Vector128<uint>.Zero;
- Vector128<uint> xmmCRCPart;
- int bufPos = 0;
+ int bufPos = 0;
bool first = true;
@@ -269,44 +138,6 @@ namespace Aaru6.Checksums.CRC32
Vector128<uint> xmmMask = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
Vector128<uint> xmmMask2 = Vector128.Create(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
- uint crc;
-
- if(len < 16)
- {
- switch(len)
- {
- case 0: return initialCRC;
- case < 4:
- /*
- * no idea how to do this for <4 bytes, delegate to classic impl.
- */
- crc = ~initialCRC;
-
- switch(len)
- {
- case 3:
- crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos++]];
- goto case 2;
- case 2:
- crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos++]];
- goto case 1;
- case 1:
- crc = (crc >> 8) ^ Crc32Context._isoCrc32Table[0][(crc & 0xFF) ^ src[bufPos]];
-
- break;
- }
-
- return ~crc;
- }
-
- xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, 0), BitConverter.ToUInt32(src, 4),
- BitConverter.ToUInt32(src, 8), BitConverter.ToUInt32(src, 12));
-
- xmmCRCPart = Sse2.Xor(xmmCRCPart, xmmInitial);
-
- goto partial;
- }
-
while((len -= 64) >= 0)
{
xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
@@ -348,126 +179,6 @@ namespace Aaru6.Checksums.CRC32
xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT3);
}
- /*
- * len = num bytes left - 64
- */
- if(len + 16 >= 0)
- {
- len += 16;
-
- xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
- BitConverter.ToUInt32(src, bufPos + 8),
- BitConverter.ToUInt32(src, bufPos + 12));
-
- bufPos += 16;
-
- xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
- BitConverter.ToUInt32(src, bufPos + 8),
- BitConverter.ToUInt32(src, bufPos + 12));
-
- bufPos += 16;
-
- xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
- BitConverter.ToUInt32(src, bufPos + 8),
- BitConverter.ToUInt32(src, bufPos + 12));
-
- bufPos += 16;
-
- if(first)
- xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
-
- Fold3(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
-
- xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT0);
- xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT1);
- xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT2);
-
- if(len == 0)
- goto done;
-
- xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
- BitConverter.ToUInt32(src, bufPos + 4),
- BitConverter.ToUInt32(src, bufPos + 8),
- BitConverter.ToUInt32(src, bufPos + 12));
- }
- else if(len + 32 >= 0)
- {
- len += 32;
-
- xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
- BitConverter.ToUInt32(src, bufPos + 8),
- BitConverter.ToUInt32(src, bufPos + 12));
-
- bufPos += 16;
-
- xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
- BitConverter.ToUInt32(src, bufPos + 8),
- BitConverter.ToUInt32(src, bufPos + 12));
-
- bufPos += 16;
-
- if(first)
- xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
-
- Fold2(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
-
- xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT0);
- xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT1);
-
- if(len == 0)
- goto done;
-
- xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
- BitConverter.ToUInt32(src, bufPos + 4),
- BitConverter.ToUInt32(src, bufPos + 8),
- BitConverter.ToUInt32(src, bufPos + 12));
- }
- else if(len + 48 >= 0)
- {
- len += 48;
-
- xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
- BitConverter.ToUInt32(src, bufPos + 8),
- BitConverter.ToUInt32(src, bufPos + 12));
-
- bufPos += 16;
-
- if(first)
- xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
-
- Fold1(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);
-
- xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT0);
-
- if(len == 0)
- goto done;
-
- xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
- BitConverter.ToUInt32(src, bufPos + 4),
- BitConverter.ToUInt32(src, bufPos + 8),
- BitConverter.ToUInt32(src, bufPos + 12));
- }
- else
- {
- len += 64;
-
- if(len == 0)
- goto done;
-
- xmmCRCPart = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
- BitConverter.ToUInt32(src, bufPos + 4),
- BitConverter.ToUInt32(src, bufPos + 8),
- BitConverter.ToUInt32(src, bufPos + 12));
-
- if(first)
- xmmCRCPart = Sse2.Xor(xmmCRCPart, xmmInitial);
- }
-
- partial:
- PartialFold(len, ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3, ref xmmCRCPart);
-
- done:
-
/* fold 512 to 32 */
/*
@@ -533,9 +244,7 @@ namespace Aaru6.Checksums.CRC32
* no real advantage - it's a tiny bit slower per call, while no additional CPUs
* would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
*/
- crc = Sse41.Extract(xmmCRC3, 2);
-
- return ~crc;
+ return ~Sse41.Extract(xmmCRC3, 2);
}
}
}
\ No newline at end of file
diff --git a/Aaru6.Checksums/CRC32Context.cs b/Aaru6.Checksums/CRC32Context.cs
index 56a6e5a..765b1d6 100644
--- a/Aaru6.Checksums/CRC32Context.cs
+++ b/Aaru6.Checksums/CRC32Context.cs
@@ -35,9 +35,9 @@ using System.IO;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using System.Text;
+using Aaru6.Checksums.CRC32;
using Aaru.CommonTypes.Interfaces;
using Aaru.Helpers;
-using Aaru6.Checksums.CRC32;
namespace Aaru6.Checksums
{
@@ -412,6 +412,8 @@ namespace Aaru6.Checksums
static void Step(ref uint previousCrc, uint[][] table, byte[] data, uint len, bool useIso)
{
+ int currentPos = 0;
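+ // (updated by the CLMUL fast path below; the table-based loop resumes from it)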
+
if(useIso)
{
if(Pclmulqdq.IsSupported &&
@@ -419,9 +421,19 @@ namespace Aaru6.Checksums
Ssse3.IsSupported &&
Sse2.IsSupported)
{
- previousCrc = ~Clmul.Step(data, len, ~previousCrc);
+ // Only works in blocks of 64 bytes
+ uint blocks = len / 64;
- return;
+ if(blocks > 0)
+ {
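+ // Clmul.Step works on the pre-inverted CRC, hence the negation on entry and exit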
+ previousCrc = ~Clmul.Step(data, blocks * 64, ~previousCrc);
+
+ currentPos = (int)(blocks * 64);
+ len -= blocks * 64;
+ }
+
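+ // Any tail shorter than 64 bytes is left to the implementations below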
+ if(len == 0)
+ return;
}
if(Crc32.Arm64.IsSupported)
@@ -442,7 +454,6 @@ namespace Aaru6.Checksums
// Unroll according to Intel slicing by uint8_t
// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
// http://sourceforge.net/projects/slicing-by-8/
- int currentPos = 0;
const int unroll = 4;
const int bytesAtOnce = 8 * unroll;
uint crc = previousCrc;
diff --git a/Aaru6.Checksums/CRC64/clmul.cs b/Aaru6.Checksums/CRC64/clmul.cs
index e7a3e82..e32e0d5 100644
--- a/Aaru6.Checksums/CRC64/clmul.cs
+++ b/Aaru6.Checksums/CRC64/clmul.cs
@@ -81,9 +81,7 @@ namespace Aaru6.Checksums.CRC64
const ulong pol = 0x92d8af2baf0e1e85;
Vector128<ulong> foldConstants1 = Vector128.Create(k1, k2);
Vector128<ulong> foldConstants2 = Vector128.Create(mu, pol);
- uint leadOutSize = length % 16;
Vector128<ulong> initialCrc = Vector128.Create(~crc, 0);
- Vector128<ulong> p;
length -= 16;
// Initial CRC can simply be added to data
@@ -103,28 +101,9 @@ namespace Aaru6.Checksums.CRC64
bufPos += 16;
}
- if(length == 16)
- {
- p = Sse2.Xor(accumulator,
- Vector128.Create(BitConverter.ToUInt64(data, bufPos),
- BitConverter.ToUInt64(data, bufPos + 8)));
- }
- else
- {
- Vector128<ulong> end0 = Sse2.Xor(accumulator,
- Vector128.Create(BitConverter.ToUInt64(data, bufPos),
- BitConverter.ToUInt64(data, bufPos + 8)));
-
- bufPos += 16;
-
- Vector128<ulong> end1 =
- Vector128.Create(BitConverter.ToUInt64(data, bufPos), BitConverter.ToUInt64(data, bufPos + 8));
-
- ShiftRight128(end0, leadOutSize, out Vector128<ulong> a, out Vector128<ulong> b);
- ShiftRight128(end1, leadOutSize, out Vector128<ulong> c, out _);
-
- p = Sse2.Xor(Fold(a, foldConstants1), Sse2.Or(b, c));
- }
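+ // The caller now guarantees a multiple of 32 bytes, so the final 16-byte block needs no lead-out shifting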
+ Vector128<ulong> p = Sse2.Xor(accumulator,
+ Vector128.Create(BitConverter.ToUInt64(data, bufPos),
+ BitConverter.ToUInt64(data, bufPos + 8)));
Vector128<ulong> r = Sse2.Xor(Pclmulqdq.CarrylessMultiply(p, foldConstants1, 0x10),
Sse2.ShiftRightLogical128BitLane(p, 8));
diff --git a/Aaru6.Checksums/CRC64Context.cs b/Aaru6.Checksums/CRC64Context.cs
index 4806d64..30c463e 100644
--- a/Aaru6.Checksums/CRC64Context.cs
+++ b/Aaru6.Checksums/CRC64Context.cs
@@ -34,9 +34,9 @@ using System;
using System.IO;
using System.Runtime.Intrinsics.X86;
using System.Text;
+using Aaru6.Checksums.CRC64;
using Aaru.CommonTypes.Interfaces;
using Aaru.Helpers;
-using Aaru6.Checksums.CRC64;
namespace Aaru6.Checksums
{
@@ -352,23 +352,34 @@ namespace Aaru6.Checksums
static void Step(ref ulong previousCrc, ulong[][] table, byte[] data, uint len, bool useEcma)
{
+ int dataOff = 0;
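+ // (updated by the CLMUL fast path below; the slicing-by-8 loop resumes from it)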
+
if(useEcma &&
Pclmulqdq.IsSupported &&
Sse41.IsSupported &&
Ssse3.IsSupported &&
Sse2.IsSupported)
{
- previousCrc = ~Clmul.Step(~previousCrc, data, len);
+ // Only works in blocks of 32 bytes
+ uint blocks = len / 32;
- return;
+ if(blocks > 0)
+ {
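+ // Clmul.Step works on the pre-inverted CRC, hence the negation on entry and exit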
+ previousCrc = ~Clmul.Step(~previousCrc, data, blocks * 32);
+
+ dataOff = (int)(blocks * 32);
+ len -= blocks * 32;
+ }
+
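+ // Any tail shorter than 32 bytes is left to the slicing-by-8 code below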
+ if(len == 0)
+ return;
}
// Unroll according to Intel slicing by uint8_t
// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
// http://sourceforge.net/projects/slicing-by-8/
- ulong crc = previousCrc;
- int dataOff = 0;
+ ulong crc = previousCrc;
if(len > 4)
{