// /***************************************************************************
// Aaru Data Preservation Suite
// ----------------------------------------------------------------------------
//
// Filename       : clmul.cs
// Author(s)      : Natalia Portillo
//                  Wajdi Feghali
//                  Jim Guilford
//                  Vinodh Gopal
//                  Erdinc Ozturk
//                  Jim Kukunas
//                  Marian Beermann
//
// Component      : Checksums.
//
// --[ Description ] ----------------------------------------------------------
//
//     Compute the CRC32 using a parallelized folding approach with the
//     PCLMULQDQ instruction.
//
//     A white paper describing this algorithm can be found at:
//     http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
// --[ License ] --------------------------------------------------------------
//
//     This software is provided 'as-is', without any express or implied
//     warranty. In no event will the authors be held liable for any damages
//     arising from the use of this software.
//
//     Permission is granted to anyone to use this software for any purpose,
//     including commercial applications, and to alter it and redistribute it
//     freely, subject to the following restrictions:
//
//     1. The origin of this software must not be misrepresented; you must not
//        claim that you wrote the original software. If you use this software
//        in a product, an acknowledgment in the product documentation would
//        be appreciated but is not required.
//
//     2. Altered source versions must be plainly marked as such, and must not
//        be misrepresented as being the original software.
//
//     3. This notice may not be removed or altered from any source
//        distribution.
//
// ----------------------------------------------------------------------------
// Copyright © 2011-2025 Natalia Portillo
// Copyright (c) 2016 Marian Beermann (add support for initial value, restructuring)
// Copyright (C) 2013 Intel Corporation. All rights reserved.
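//
// --[ Algorithm notes ] ------------------------------------------------------
//
//     (Summary added for orientation; see the white paper above for the full
//     derivation.) Input is consumed 64 bytes per iteration into four 128-bit
//     accumulators. Each iteration carry-less-multiplies the accumulators by
//     precomputed constants, which is equivalent to multiplying the running
//     remainder by x^512 mod P(x), then XORs in the next 64 bytes of input.
//     After the loop the four accumulators are folded into a single 128-bit
//     value (rk1/rk2), reduced to 64 bits (rk5/rk6), and finally reduced to
//     the 32-bit CRC with a Barrett reduction (rk7/rk8).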
// ****************************************************************************/

using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace RomRepoMgr.Core.Checksums.CRC32;

static class Clmul
{
    // Folding constants, stored as little-endian 64-bit pairs: rk1/rk2 fold
    // the four accumulators down to one 128-bit value, rk5/rk6 reduce it to
    // 64 bits, and rk7/rk8 perform the final Barrett reduction to 32 bits.
    static readonly uint[] _crcK =
    [
        0xccaa009e, 0x00000000, /* rk1 */
        0x751997d0, 0x00000001, /* rk2 */
        0xccaa009e, 0x00000000, /* rk5 */
        0x63cd6124, 0x00000001, /* rk6 */
        0xf7011640, 0x00000001, /* rk7 */
        0xdb710640, 0x00000001  /* rk8 */
    ];

    // Folds the four 128-bit accumulators forward by 512 bits: each lane is
    // carry-less-multiplied by the fold-by-512 constants and the two partial
    // products are XORed together.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    static void Fold4(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2,
                      ref Vector128<uint> xmmCRC3)
    {
        var xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);

        Vector128<uint> xTmp0 = xmmCRC0;
        Vector128<uint> xTmp1 = xmmCRC1;
        Vector128<uint> xTmp2 = xmmCRC2;
        Vector128<uint> xTmp3 = xmmCRC3;

        xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
        xTmp0   = Pclmulqdq.CarrylessMultiply(xTmp0.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
        Vector128<float> psCRC0 = xmmCRC0.AsSingle();
        Vector128<float> psT0   = xTmp0.AsSingle();
        Vector128<float> psRes0 = Sse.Xor(psCRC0, psT0);

        xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
        xTmp1   = Pclmulqdq.CarrylessMultiply(xTmp1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
        Vector128<float> psCRC1 = xmmCRC1.AsSingle();
        Vector128<float> psT1   = xTmp1.AsSingle();
        Vector128<float> psRes1 = Sse.Xor(psCRC1, psT1);

        xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
        xTmp2   = Pclmulqdq.CarrylessMultiply(xTmp2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
        Vector128<float> psCRC2 = xmmCRC2.AsSingle();
        Vector128<float> psT2   = xTmp2.AsSingle();
        Vector128<float> psRes2 = Sse.Xor(psCRC2, psT2);

        xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
        xTmp3   = Pclmulqdq.CarrylessMultiply(xTmp3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
        Vector128<float> psCRC3 = xmmCRC3.AsSingle();
        Vector128<float> psT3   = xTmp3.AsSingle();
        Vector128<float> psRes3 = Sse.Xor(psCRC3, psT3);

        xmmCRC0 = psRes0.AsUInt32();
        xmmCRC1 = psRes1.AsUInt32();
        xmmCRC2 = psRes2.AsUInt32();
        xmmCRC3 = psRes3.AsUInt32();
    }

    // Processes len bytes of src in whole 64-byte blocks, folding them into
    // the running CRC; returns the bit-complemented 32-bit result.
    internal static uint Step(byte[] src, long len, uint initialCRC)
    {
        Vector128<uint> xmmInitial = Sse2.ConvertScalarToVector128UInt32(initialCRC);
        Vector128<uint> xmmCRC0    = Sse2.ConvertScalarToVector128UInt32(0x9db42487);
        Vector128<uint> xmmCRC1    = Vector128<uint>.Zero;
        Vector128<uint> xmmCRC2    = Vector128<uint>.Zero;
        Vector128<uint> xmmCRC3    = Vector128<uint>.Zero;

        var bufPos = 0;
        var first  = true;

        /* fold 512 to 32 step variable declarations for ISO-C90 compat. */
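        /*
         * (Added note.) xmmMask keeps only the low 64 bits of a vector and
         * xmmMask2 clears only the lowest 32 bits; both are used by the
         * reduction steps after the main loop below.
         */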
        var xmmMask  = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
        var xmmMask2 = Vector128.Create(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

        while((len -= 64) >= 0)
        {
            // Load the next 64 bytes of input as four little-endian vectors.
            var xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
                                         BitConverter.ToUInt32(src, bufPos + 4),
                                         BitConverter.ToUInt32(src, bufPos + 8),
                                         BitConverter.ToUInt32(src, bufPos + 12));

            bufPos += 16;

            var xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
                                         BitConverter.ToUInt32(src, bufPos + 4),
                                         BitConverter.ToUInt32(src, bufPos + 8),
                                         BitConverter.ToUInt32(src, bufPos + 12));

            bufPos += 16;

            var xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
                                         BitConverter.ToUInt32(src, bufPos + 4),
                                         BitConverter.ToUInt32(src, bufPos + 8),
                                         BitConverter.ToUInt32(src, bufPos + 12));

            bufPos += 16;

            var xmmT3 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
                                         BitConverter.ToUInt32(src, bufPos + 4),
                                         BitConverter.ToUInt32(src, bufPos + 8),
                                         BitConverter.ToUInt32(src, bufPos + 12));

            bufPos += 16;

            // Mix the initial CRC into the low 32 bits of the first vector.
            if(first)
            {
                first = false;
                xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
            }

            Fold4(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);

            xmmCRC0 = Sse2.Xor(xmmCRC0, xmmT0);
            xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT1);
            xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT2);
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT3);
        }

        /* fold 512 to 32 */

        /*
         * k1
         */
        var crcFold = Vector128.Create(_crcK[0], _crcK[1], _crcK[2], _crcK[3]);

        Vector128<uint> xTmp0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
        xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
        xmmCRC1 = Sse2.Xor(xmmCRC1, xTmp0);
        xmmCRC1 = Sse2.Xor(xmmCRC1, xmmCRC0);

        Vector128<uint> xTmp1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
        xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
        xmmCRC2 = Sse2.Xor(xmmCRC2, xTmp1);
        xmmCRC2 = Sse2.Xor(xmmCRC2, xmmCRC1);

        Vector128<uint> xTmp2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
        xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
        xmmCRC3 = Sse2.Xor(xmmCRC3, xTmp2);
        xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);

        /*
         * k5
         */
        crcFold = Vector128.Create(_crcK[4], _crcK[5], _crcK[6], _crcK[7]);

        xmmCRC0 = xmmCRC3;
        xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
        xmmCRC0 = Sse2.ShiftRightLogical128BitLane(xmmCRC0, 8);
        xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);

        xmmCRC0 = xmmCRC3;
        xmmCRC3 = Sse2.ShiftLeftLogical128BitLane(xmmCRC3, 4);
        xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
        xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);
        xmmCRC3 = Sse2.And(xmmCRC3, xmmMask2);

        /*
         * k7
         */
        xmmCRC1 = xmmCRC3;
        xmmCRC2 = xmmCRC3;
        crcFold = Vector128.Create(_crcK[8], _crcK[9], _crcK[10], _crcK[11]);

        xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
        xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
        xmmCRC3 = Sse2.And(xmmCRC3, xmmMask);

        xmmCRC2 = xmmCRC3;
        xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
        xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
        xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC1);

        /*
         * could just as well write xmm_crc3[2], doing a movaps and truncating, but
         * no real advantage - it's a tiny bit slower per call, while no additional CPUs
         * would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
         */
        return ~Sse41.Extract(xmmCRC3, 2);
    }
}
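
// A minimal usage sketch (hypothetical caller, not part of the original file).
// Step uses the SSE, SSE2, SSE4.1 and PCLMULQDQ intrinsics, so hardware
// support must be verified first, and it only consumes whole 64-byte blocks;
// any tail of fewer than 64 bytes is left for the caller to fold in with a
// scalar or table-driven CRC-32. The exact seeding/finalization contract
// (`previousCrc` below is a hypothetical running value) depends on the
// surrounding CRC32 context:
//
//     if(Pclmulqdq.IsSupported && Sse41.IsSupported && data.Length >= 64)
//         crc = Clmul.Step(data, data.Length, previousCrc);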