// /***************************************************************************
// Aaru Data Preservation Suite
// ----------------------------------------------------------------------------
//
// Filename       : clmul.cs
// Author(s)      : Natalia Portillo <claunia@claunia.com>
//                  Wajdi Feghali <wajdi.k.feghali@intel.com>
//                  Jim Guilford <james.guilford@intel.com>
//                  Vinodh Gopal <vinodh.gopal@intel.com>
//                  Erdinc Ozturk <erdinc.ozturk@intel.com>
//                  Jim Kukunas <james.t.kukunas@linux.intel.com>
//                  Marian Beermann
//
// Component      : Checksums.
//
// --[ Description ] ----------------------------------------------------------
//
//     Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
//     instruction.
//
//     A white paper describing this algorithm can be found at:
//     http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
// --[ License ] --------------------------------------------------------------
//
//     This software is provided 'as-is', without any express or implied warranty.
//     In no event will the authors be held liable for any damages arising from
//     the use of this software.
//
//     Permission is granted to anyone to use this software for any purpose,
//     including commercial applications, and to alter it and redistribute it
//     freely, subject to the following restrictions:
//
//     1. The origin of this software must not be misrepresented; you must not
//        claim that you wrote the original software. If you use this software
//        in a product, an acknowledgment in the product documentation would be
//        appreciated but is not required.
//
//     2. Altered source versions must be plainly marked as such, and must not be
//        misrepresented as being the original software.
//
//     3. This notice may not be removed or altered from any source distribution.
//
// ----------------------------------------------------------------------------
//
// Copyright © 2011-2024 Natalia Portillo
// Copyright (c) 2016 Marian Beermann (add support for initial value, restructuring)
// Copyright (C) 2013 Intel Corporation. All rights reserved.
// ****************************************************************************/
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;

namespace Aaru6.Checksums.CRC32;

static class Vmull
{
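    // Folding constants from the Intel white paper, as labelled by the inline rk* comments:
    // rk1/rk2 fold the four 128-bit lanes down to one, rk5/rk6 reduce the 128-bit remainder
    // towards 64 bits, and rk7/rk8 are the Barrett reduction constants for the final 32-bit value.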
    static readonly uint[] _crcK =
    {
        0xccaa009e, 0x00000000, /* rk1 */ 0x751997d0, 0x00000001, /* rk2 */ 0xccaa009e, 0x00000000, /* rk5 */
        0x63cd6124, 0x00000001, /* rk6 */ 0xf7011640, 0x00000001, /* rk7 */ 0xdb710640, 0x00000001  /* rk8 */
    };

    static readonly Vector128<uint>[] _pshufbShfTable =
    {
        Vector128.Create(0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d),  /* shl 15 (16 - 1)/shr1  */
        Vector128.Create(0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e),  /* shl 14 (16 - 2)/shr2  */
        Vector128.Create(0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f),  /* shl 13 (16 - 3)/shr3  */
        Vector128.Create(0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100),  /* shl 12 (16 - 4)/shr4  */
        Vector128.Create(0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201),  /* shl 11 (16 - 5)/shr5  */
        Vector128.Create(0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302),  /* shl 10 (16 - 6)/shr6  */
        Vector128.Create(0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403),  /* shl  9 (16 - 7)/shr7  */
        Vector128.Create(0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504),  /* shl  8 (16 - 8)/shr8  */
        Vector128.Create(0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605),  /* shl  7 (16 - 9)/shr9  */
        Vector128.Create(0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706),  /* shl  6 (16 -10)/shr10 */
        Vector128.Create(0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807),  /* shl  5 (16 -11)/shr11 */
        Vector128.Create(0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908),  /* shl  4 (16 -12)/shr12 */
        Vector128.Create(0x008f8e8du, 0x04030201, 0x08070605, 0x0c0b0a09), /* shl  3 (16 -13)/shr13 */
        Vector128.Create(0x01008f8eu, 0x05040302, 0x09080706, 0x0d0c0b0a), /* shl  2 (16 -14)/shr14 */
        Vector128.Create(0x0201008fu, 0x06050403, 0x0a090807, 0x0e0d0c0b)  /* shl  1 (16 -15)/shr15 */
    };
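
    // Each entry above is a PSHUFB-style shuffle control for shifting a 128-bit value left by
    // 15 down to 1 bytes (equivalently right by 1..15 bytes); index bytes with the high bit set
    // select zero in a table lookup, as emulated by mm_shuffle_epi8 below.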

    static Vector128<ulong> vmull_p64(Vector64<ulong> a, Vector64<ulong> b)
    {
        if(Aes.IsSupported) return Aes.PolynomialMultiplyWideningLower(a, b);
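
        // No 64-bit PMULL (Crypto extension) available: build the 64x64 -> 128-bit carry-less
        // product below from 8x8 -> 16-bit polynomial multiplies, combining the partial
        // products with vector extracts and XORs.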

        // Masks
        Vector128<byte> k4832 = Vector128.Create(Vector64.Create(0x0000fffffffffffful),
                                                 Vector64.Create(0x00000000fffffffful)).AsByte();

        Vector128<byte> k1600 = Vector128.Create(Vector64.Create(0x000000000000fffful),
                                                 Vector64.Create(0x0000000000000000ul)).AsByte();

        // Do the multiplies, rotating with vext to get all combinations
        Vector128<byte> d = AdvSimd.PolynomialMultiplyWideningLower(a.AsByte(), b.AsByte()).AsByte(); // D = A0 * B0

        Vector128<byte> e = AdvSimd.PolynomialMultiplyWideningLower(a.AsByte(),
                                                                    AdvSimd.ExtractVector64(b.AsByte(), b.AsByte(), 1))
                                   .AsByte(); // E = A0 * B1

        Vector128<byte> f = AdvSimd.PolynomialMultiplyWideningLower(AdvSimd.ExtractVector64(a.AsByte(), a.AsByte(), 1),
                                                                    b.AsByte())
                                   .AsByte(); // F = A1 * B0

        Vector128<byte> g = AdvSimd.PolynomialMultiplyWideningLower(a.AsByte(),
                                                                    AdvSimd.ExtractVector64(b.AsByte(), b.AsByte(), 2))
                                   .AsByte(); // G = A0 * B2

        Vector128<byte> h = AdvSimd.PolynomialMultiplyWideningLower(AdvSimd.ExtractVector64(a.AsByte(), a.AsByte(), 2),
                                                                    b.AsByte())
                                   .AsByte(); // H = A2 * B0

        Vector128<byte> i = AdvSimd.PolynomialMultiplyWideningLower(a.AsByte(),
                                                                    AdvSimd.ExtractVector64(b.AsByte(), b.AsByte(), 3))
                                   .AsByte(); // I = A0 * B3

        Vector128<byte> j = AdvSimd.PolynomialMultiplyWideningLower(AdvSimd.ExtractVector64(a.AsByte(), a.AsByte(), 3),
                                                                    b.AsByte())
                                   .AsByte(); // J = A3 * B0

        Vector128<byte> k = AdvSimd.PolynomialMultiplyWideningLower(a.AsByte(),
                                                                    AdvSimd.ExtractVector64(b.AsByte(), b.AsByte(), 4))
                                   .AsByte(); // K = A0 * B4

        // Add cross products
        Vector128<byte> l = AdvSimd.Xor(e, f); // L = E + F
        Vector128<byte> m = AdvSimd.Xor(g, h); // M = G + H
        Vector128<byte> n = AdvSimd.Xor(i, j); // N = I + J

        Vector128<byte> lmP0;
        Vector128<byte> lmP1;
        Vector128<byte> nkP0;
        Vector128<byte> nkP1;

        // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL instructions.
        if(AdvSimd.Arm64.IsSupported)
        {
            lmP0 = AdvSimd.Arm64.ZipLow(l.AsUInt64(), m.AsUInt64()).AsByte();
            lmP1 = AdvSimd.Arm64.ZipHigh(l.AsUInt64(), m.AsUInt64()).AsByte();
            nkP0 = AdvSimd.Arm64.ZipLow(n.AsUInt64(), k.AsUInt64()).AsByte();
            nkP1 = AdvSimd.Arm64.ZipHigh(n.AsUInt64(), k.AsUInt64()).AsByte();
        }
        else
        {
            lmP0 = Vector128.Create(l.GetLower(), m.GetLower());
            lmP1 = Vector128.Create(l.GetUpper(), m.GetUpper());
            nkP0 = Vector128.Create(n.GetLower(), k.GetLower());
            nkP1 = Vector128.Create(n.GetUpper(), k.GetUpper());
        }

        // t0 = (L) (P0 + P1) << 8
        // t1 = (M) (P2 + P3) << 16
        Vector128<byte> t0T1Tmp = AdvSimd.Xor(lmP0, lmP1);
        Vector128<byte> t0T1H   = AdvSimd.And(lmP1, k4832);
        Vector128<byte> t0T1L   = AdvSimd.Xor(t0T1Tmp, t0T1H);

        // t2 = (N) (P4 + P5) << 24
        // t3 = (K) (P6 + P7) << 32
        Vector128<byte> t2T3Tmp = AdvSimd.Xor(nkP0, nkP1);
        Vector128<byte> t2T3H   = AdvSimd.And(nkP1, k1600);
        Vector128<byte> t2T3L   = AdvSimd.Xor(t2T3Tmp, t2T3H);

        Vector128<byte> t1;
        Vector128<byte> t0;
        Vector128<byte> t3;
        Vector128<byte> t2;

        // De-interleave
        if(AdvSimd.Arm64.IsSupported)
        {
            t0 = AdvSimd.Arm64.UnzipEven(t0T1L.AsUInt64(), t0T1H.AsUInt64()).AsByte();
            t1 = AdvSimd.Arm64.UnzipOdd(t0T1L.AsUInt64(), t0T1H.AsUInt64()).AsByte();
            t2 = AdvSimd.Arm64.UnzipEven(t2T3L.AsUInt64(), t2T3H.AsUInt64()).AsByte();
            t3 = AdvSimd.Arm64.UnzipOdd(t2T3L.AsUInt64(), t2T3H.AsUInt64()).AsByte();
        }
        else
        {
            t1 = Vector128.Create(t0T1L.GetUpper(), t0T1H.GetUpper());
            t0 = Vector128.Create(t0T1L.GetLower(), t0T1H.GetLower());
            t3 = Vector128.Create(t2T3L.GetUpper(), t2T3H.GetUpper());
            t2 = Vector128.Create(t2T3L.GetLower(), t2T3H.GetLower());
        }

        // Shift the cross products
        Vector128<byte> t0Shift = AdvSimd.ExtractVector128(t0, t0, 15); // t0 << 8
        Vector128<byte> t1Shift = AdvSimd.ExtractVector128(t1, t1, 14); // t1 << 16
        Vector128<byte> t2Shift = AdvSimd.ExtractVector128(t2, t2, 13); // t2 << 24
        Vector128<byte> t3Shift = AdvSimd.ExtractVector128(t3, t3, 12); // t3 << 32

        // Accumulate the products
        Vector128<byte> cross1 = AdvSimd.Xor(t0Shift, t1Shift);
        Vector128<byte> cross2 = AdvSimd.Xor(t2Shift, t3Shift);
        Vector128<byte> mix    = AdvSimd.Xor(d, cross1);
        Vector128<byte> r      = AdvSimd.Xor(mix, cross2);

        return r.AsUInt64();
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    static Vector128<ulong> mm_shuffle_epi8(Vector128<ulong> a, Vector128<ulong> b)
    {
        Vector128<byte> tbl = a.AsByte(); // input a
        Vector128<byte> idx = b.AsByte(); // input b

        Vector128<byte> idxMasked = AdvSimd.And(idx, AdvSimd.DuplicateToVector128((byte)0x8F)); // avoid using meaningless bits
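
        // TBL returns zero for out-of-range indices, so keeping bit 7 (values 0x80-0x8F fall
        // outside the 16-byte table) reproduces PSHUFB's zeroing behaviour for indices with
        // the high bit set.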

        return AdvSimd.Arm64.VectorTableLookup(tbl, idxMasked).AsUInt64();
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    static void Fold4(ref Vector128<ulong> qCRC0, ref Vector128<ulong> qCRC1, ref Vector128<ulong> qCRC2,
                      ref Vector128<ulong> qCRC3)
    {
        Vector128<ulong> qFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001).AsUInt64();
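
        // Fold-by-512 constants (low half 0x1c6e41596, high half 0x154442bd4): multiplying the
        // upper and lower halves of each accumulator by these and XORing the two products
        // advances every lane by one 64-byte block.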

        Vector128<ulong> xTmp0 = qCRC0;
        Vector128<ulong> xTmp1 = qCRC1;
        Vector128<ulong> xTmp2 = qCRC2;
        Vector128<ulong> xTmp3 = qCRC3;

        qCRC0 = vmull_p64(qCRC0.GetUpper(), qFold4.GetLower());
        xTmp0 = vmull_p64(xTmp0.GetLower(), qFold4.GetUpper());
        Vector128<uint> psCRC0 = qCRC0.AsUInt32();
        Vector128<uint> psT0   = xTmp0.AsUInt32();
        Vector128<uint> psRes0 = AdvSimd.Xor(psCRC0, psT0);

        qCRC1 = vmull_p64(qCRC1.GetUpper(), qFold4.GetLower());
        xTmp1 = vmull_p64(xTmp1.GetLower(), qFold4.GetUpper());
        Vector128<uint> psCRC1 = qCRC1.AsUInt32();
        Vector128<uint> psT1   = xTmp1.AsUInt32();
        Vector128<uint> psRes1 = AdvSimd.Xor(psCRC1, psT1);

        qCRC2 = vmull_p64(qCRC2.GetUpper(), qFold4.GetLower());
        xTmp2 = vmull_p64(xTmp2.GetLower(), qFold4.GetUpper());
        Vector128<uint> psCRC2 = qCRC2.AsUInt32();
        Vector128<uint> psT2   = xTmp2.AsUInt32();
        Vector128<uint> psRes2 = AdvSimd.Xor(psCRC2, psT2);

        qCRC3 = vmull_p64(qCRC3.GetUpper(), qFold4.GetLower());
        xTmp3 = vmull_p64(xTmp3.GetLower(), qFold4.GetUpper());
        Vector128<uint> psCRC3 = qCRC3.AsUInt32();
        Vector128<uint> psT3   = xTmp3.AsUInt32();
        Vector128<uint> psRes3 = AdvSimd.Xor(psCRC3, psT3);

        qCRC0 = psRes0.AsUInt64();
        qCRC1 = psRes1.AsUInt64();
        qCRC2 = psRes2.AsUInt64();
        qCRC3 = psRes3.AsUInt64();
    }

    internal static uint Step(byte[] src, long len, uint initialCRC)
    {
        Vector128<ulong> qT0;
        Vector128<ulong> qT1;
        Vector128<ulong> qT2;
        Vector128<ulong> qT3;
        Vector128<ulong> qInitial = AdvSimd.Insert(Vector128<uint>.Zero, 0, initialCRC).AsUInt64();
        Vector128<ulong> qCRC0    = AdvSimd.Insert(Vector128<uint>.Zero, 0, 0x9db42487).AsUInt64();
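
        // 0x9db42487 is the same fold-domain seed value used by zlib's SSE crc_fold_init;
        // the caller's initialCRC is XORed into the first 16 bytes of data below.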
        Vector128<ulong> qCRC1 = Vector128<ulong>.Zero;
        Vector128<ulong> qCRC2 = Vector128<ulong>.Zero;
        Vector128<ulong> qCRC3 = Vector128<ulong>.Zero;
        var bufPos = 0;

        var first = true;

        /* fold 512 to 32 step variable declarations for ISO-C90 compat. */
        var qMask  = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
        var qMask2 = Vector128.Create(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
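
        // qMask keeps only the low 64 bits of a vector; qMask2 clears only the low 32 bits.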

        uint crc;
        Vector128<ulong> xTmp0;
        Vector128<ulong> xTmp1;
        Vector128<ulong> xTmp2;
        Vector128<ulong> crcFold;

        while((len -= 64) >= 0)
        {
            qT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
                                   BitConverter.ToUInt32(src, bufPos + 4),
                                   BitConverter.ToUInt32(src, bufPos + 8),
                                   BitConverter.ToUInt32(src, bufPos + 12)).AsUInt64();

            bufPos += 16;

            qT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
                                   BitConverter.ToUInt32(src, bufPos + 4),
                                   BitConverter.ToUInt32(src, bufPos + 8),
                                   BitConverter.ToUInt32(src, bufPos + 12)).AsUInt64();

            bufPos += 16;

            qT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
                                   BitConverter.ToUInt32(src, bufPos + 4),
                                   BitConverter.ToUInt32(src, bufPos + 8),
                                   BitConverter.ToUInt32(src, bufPos + 12)).AsUInt64();

            bufPos += 16;

            qT3 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
                                   BitConverter.ToUInt32(src, bufPos + 4),
                                   BitConverter.ToUInt32(src, bufPos + 8),
                                   BitConverter.ToUInt32(src, bufPos + 12)).AsUInt64();

            bufPos += 16;

            if(first)
            {
                first = false;

                qT0 = AdvSimd.Xor(qT0.AsUInt32(), qInitial.AsUInt32()).AsUInt64();
            }

            Fold4(ref qCRC0, ref qCRC1, ref qCRC2, ref qCRC3);

            qCRC0 = AdvSimd.Xor(qCRC0.AsUInt32(), qT0.AsUInt32()).AsUInt64();
            qCRC1 = AdvSimd.Xor(qCRC1.AsUInt32(), qT1.AsUInt32()).AsUInt64();
            qCRC2 = AdvSimd.Xor(qCRC2.AsUInt32(), qT2.AsUInt32()).AsUInt64();
            qCRC3 = AdvSimd.Xor(qCRC3.AsUInt32(), qT3.AsUInt32()).AsUInt64();
        }

        /* fold 512 to 32 */

        /*
         * k1
         */
        crcFold = Vector128.Create(_crcK[0], _crcK[1], _crcK[2], _crcK[3]).AsUInt64();
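
        // Fold qCRC0 into qCRC1, qCRC1 into qCRC2 and qCRC2 into qCRC3, leaving the combined
        // 128-bit remainder in qCRC3.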

        xTmp0 = vmull_p64(qCRC0.GetLower(), crcFold.GetUpper());
        qCRC0 = vmull_p64(qCRC0.GetUpper(), crcFold.GetLower());
        qCRC1 = AdvSimd.Xor(qCRC1.AsUInt32(), xTmp0.AsUInt32()).AsUInt64();
        qCRC1 = AdvSimd.Xor(qCRC1.AsUInt32(), qCRC0.AsUInt32()).AsUInt64();

        xTmp1 = vmull_p64(qCRC1.GetLower(), crcFold.GetUpper());
        qCRC1 = vmull_p64(qCRC1.GetUpper(), crcFold.GetLower());
        qCRC2 = AdvSimd.Xor(qCRC2.AsUInt32(), xTmp1.AsUInt32()).AsUInt64();
        qCRC2 = AdvSimd.Xor(qCRC2.AsUInt32(), qCRC1.AsUInt32()).AsUInt64();

        xTmp2 = vmull_p64(qCRC2.GetLower(), crcFold.GetUpper());
        qCRC2 = vmull_p64(qCRC2.GetUpper(), crcFold.GetLower());
        qCRC3 = AdvSimd.Xor(qCRC3.AsUInt32(), xTmp2.AsUInt32()).AsUInt64();
        qCRC3 = AdvSimd.Xor(qCRC3.AsUInt32(), qCRC2.AsUInt32()).AsUInt64();

        /*
         * k5
         */
        crcFold = Vector128.Create(_crcK[4], _crcK[5], _crcK[6], _crcK[7]).AsUInt64();
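
        // Reduce the folded 128-bit remainder in qCRC3 towards 64 bits using the rk5/rk6 constants.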

        qCRC0 = qCRC3;
        qCRC3 = vmull_p64(qCRC3.GetLower(), crcFold.GetLower());

        Vector128<byte> qCRC0B = qCRC0.AsByte();

        qCRC0 = Vector128.Create(AdvSimd.Extract(qCRC0B, 8),  AdvSimd.Extract(qCRC0B, 9),
                                 AdvSimd.Extract(qCRC0B, 10), AdvSimd.Extract(qCRC0B, 11),
                                 AdvSimd.Extract(qCRC0B, 12), AdvSimd.Extract(qCRC0B, 13),
                                 AdvSimd.Extract(qCRC0B, 14), AdvSimd.Extract(qCRC0B, 15),
                                 0, 0, 0, 0, 0, 0, 0, 0).AsUInt64();

        qCRC3 = AdvSimd.Xor(qCRC3.AsUInt32(), qCRC0.AsUInt32()).AsUInt64();

        qCRC0 = qCRC3;

        Vector128<byte> qCRC3B = qCRC3.AsByte();

        qCRC3 = Vector128.Create(0, 0, 0, 0,
                                 AdvSimd.Extract(qCRC3B, 0),  AdvSimd.Extract(qCRC3B, 1),
                                 AdvSimd.Extract(qCRC3B, 2),  AdvSimd.Extract(qCRC3B, 3),
                                 AdvSimd.Extract(qCRC3B, 4),  AdvSimd.Extract(qCRC3B, 5),
                                 AdvSimd.Extract(qCRC3B, 6),  AdvSimd.Extract(qCRC3B, 7),
                                 AdvSimd.Extract(qCRC3B, 8),  AdvSimd.Extract(qCRC3B, 9),
                                 AdvSimd.Extract(qCRC3B, 10), AdvSimd.Extract(qCRC3B, 11)).AsUInt64();

        qCRC3 = vmull_p64(qCRC3.GetLower(), crcFold.GetUpper());
        qCRC3 = AdvSimd.Xor(qCRC3.AsUInt32(), qCRC0.AsUInt32()).AsUInt64();
        qCRC3 = AdvSimd.And(qCRC3.AsUInt32(), qMask2.AsUInt32()).AsUInt64();

        /*
         * k7
         */
        qCRC1 = qCRC3;
        qCRC2 = qCRC3;
        crcFold = Vector128.Create(_crcK[8], _crcK[9], _crcK[10], _crcK[11]).AsUInt64();
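
        // Barrett reduction of the remaining bits to the final 32-bit CRC using rk7/rk8.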

        qCRC3 = vmull_p64(qCRC3.GetLower(), crcFold.GetLower());
        qCRC3 = AdvSimd.Xor(qCRC3.AsUInt32(), qCRC2.AsUInt32()).AsUInt64();
        qCRC3 = AdvSimd.And(qCRC3.AsUInt32(), qMask.AsUInt32()).AsUInt64();

        qCRC2 = qCRC3;
        qCRC3 = vmull_p64(qCRC3.GetLower(), crcFold.GetUpper());
        qCRC3 = AdvSimd.Xor(qCRC3.AsUInt32(), qCRC2.AsUInt32()).AsUInt64();
        qCRC3 = AdvSimd.Xor(qCRC3.AsUInt32(), qCRC1.AsUInt32()).AsUInt64();

        /*
         * could just as well write q_crc3[2], doing a movaps and truncating, but
         * no real advantage - it's a tiny bit slower per call, while no additional CPUs
         * would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
         */
        return ~AdvSimd.Extract(qCRC3.AsUInt32(), 2);
    }
}
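
// Usage sketch (illustrative, not part of the original source): a hypothetical caller that has
// already confirmed AdvSimd support and that handles any tail shorter than 64 bytes with a
// scalar/table CRC routine might drive this as:
//
//     uint crc = Vmull.Step(buffer, buffer.Length, runningCrc);
//
// Step folds complete 64-byte blocks only and returns the bitwise complement of the reduced
// 32-bit remainder; `buffer` and `runningCrc` are assumed names used only for illustration.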