/*
 * This file is part of the Aaru Data Preservation Suite.
 * Copyright (c) 2019-2023 Natalia Portillo.
 *
 * This file is under the public domain:
 * https://github.com/rawrunprotected/crc
 */

#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

#include "library.h"
#include "arm_vmull.h"
#include "crc64.h"

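/*
 * Byte-shuffle masks used by shiftRight128 below: the first 16 entries are
 * identity indices, the second 16 have the high bit set, which makes
 * mm_shuffle_epi8 write zero for that lane. Indexing into this table at a
 * run-time offset yields a mask that keeps some bytes and zeroes the rest.
 */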
static const uint8_t shuffleMasks[] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
};

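/*
 * Splits the 128-bit value `in` in two using the shuffle table above:
 * *outLeft receives `in` moved towards the higher byte lanes by n bytes with
 * zeros shifted in, and *outRight receives the n bytes that overflow, so the
 * pair behaves like a single 256-bit byte shift with a run-time shift count.
 */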
TARGET_WITH_SIMD FORCE_INLINE void shiftRight128(uint64x2_t in, size_t n, uint64x2_t* outLeft, uint64x2_t* outRight)
{
    const uint64x2_t maskA =
        vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(const uint64x2_t*)(shuffleMasks + (16 - n))));
    uint64x2_t b = vreinterpretq_u64_u8(vceqq_u8(vreinterpretq_u8_u64(vreinterpretq_u64_u32(vdupq_n_u32(0))),
                                                 vreinterpretq_u8_u64(vreinterpretq_u64_u32(vdupq_n_u32(0)))));
    const uint64x2_t maskB = vreinterpretq_u64_u32(veorq_u32(vreinterpretq_u32_u64(maskA), vreinterpretq_u32_u64(b)));

    *outLeft  = mm_shuffle_epi8(in, maskB);
    *outRight = mm_shuffle_epi8(in, maskA);
}

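/*
 * One carry-less-multiply folding step: multiplies the low and high 64-bit
 * halves of `in` by the corresponding halves of the folding constants and
 * XORs the two 128-bit products together.
 */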
TARGET_WITH_SIMD FORCE_INLINE uint64x2_t fold(uint64x2_t in, uint64x2_t foldConstants)
{
    return veorq_u64(sse2neon_vmull_p64(vget_low_u64(in), vget_low_u64(foldConstants)),
                     sse2neon_vmull_p64(vget_high_u64(in), vget_high_u64(foldConstants)));
}

/**
 * @brief Calculates the CRC-64 checksum using the vmull instruction.
 *
 * This function calculates the CRC-64 checksum of the given data using the
 * vmull instruction for optimized performance. It takes the previous CRC value,
 * the data buffer, and the length of the data as parameters, and returns the
 * resulting CRC-64 checksum.
 *
 * @param previous_crc The previous CRC value.
 * @param data The data buffer.
 * @param len The length of the data buffer.
 *
 * @return The CRC-64 checksum of the given data.
 */
AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t previous_crc, const uint8_t* data, long len)
{
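    /*
     * Folding and reduction constants (derivations in the trailing comments
     * below). The value of p corresponds to the reflected ECMA-182 polynomial
     * 0x42f0e1eba9ea3693: k1/k2 fold a full 128-bit block forward by 128 bits
     * (the low half through x^192 mod P, the high half through x^128 mod P),
     * while mu and p drive the final Barrett reduction.
     */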
    const uint64_t k1 = 0xe05dd497ca393ae4; // bitReflect(expMod65(128 + 64, poly, 1)) << 1;
    const uint64_t k2 = 0xdabe95afc7875f40; // bitReflect(expMod65(128, poly, 1)) << 1;
    const uint64_t mu = 0x9c3e466c172963d5; // (bitReflect(div129by65(poly)) << 1) | 1;
    const uint64_t p  = 0x92d8af2baf0e1e85; // (bitReflect(poly) << 1) | 1;

    const uint64x2_t foldConstants1 = vcombine_u64(vcreate_u64(k1), vcreate_u64(k2));
    const uint64x2_t foldConstants2 = vcombine_u64(vcreate_u64(mu), vcreate_u64(p));

    const uint8_t* end = data + len;

    // Align pointers
    const uint64x2_t* alignedData = (const uint64x2_t*)((uintptr_t)data & ~(uintptr_t)15);
    const uint64x2_t* alignedEnd  = (const uint64x2_t*)(((uintptr_t)end + 15) & ~(uintptr_t)15);

    const size_t leadInSize  = data - (const uint8_t*)alignedData;
    const size_t leadOutSize = (const uint8_t*)alignedEnd - end;

    const size_t alignedLength = alignedEnd - alignedData;

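    /*
     * Load the first aligned 16-byte block and zero the leadInSize bytes that
     * precede `data`, so unaligned buffers can still be processed with full
     * vector loads.
     */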
    const uint64x2_t leadInMask =
        vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(const uint64x2_t*)(shuffleMasks + (16 - leadInSize))));
    uint64x2_t a = vreinterpretq_u64_u32(vdupq_n_u32(0));
    uint64x2_t b = vreinterpretq_u64_u32(
        vld1q_u32((const uint32_t*)alignedData)); // Use a signed shift right to create a mask with the sign bit
    const uint64x2_t data0 =
        vreinterpretq_u64_u8(vbslq_u8(vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u64(leadInMask), 7)),
                                      vreinterpretq_u8_u64(b),
                                      vreinterpretq_u8_u64(a)));

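    // The inverted previous CRC seeds the low 64 bits of the working state; the result is inverted again on return.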
    const uint64x2_t initialCrc = vsetq_lane_u64(~previous_crc, vdupq_n_u64(0), 0);

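    /*
     * Three cases follow, depending on how many aligned 16-byte blocks cover
     * the buffer: exactly one, exactly two, or three and more (the general
     * folding loop).
     */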
    uint64x2_t R;
    if(alignedLength == 1)
    {
        // Single data block, initial CRC possibly bleeds into zero padding
        uint64x2_t crc0, crc1;
        shiftRight128(initialCrc, 16 - len, &crc0, &crc1);

        uint64x2_t A, B;
        shiftRight128(data0, leadOutSize, &A, &B);

        const uint64x2_t P = veorq_u64(A, crc0);
        R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)),
                      veorq_u64(mm_srli_si128(P, 8), mm_slli_si128(crc1, 8)));
    }
    else if(alignedLength == 2)
    {
        const uint64x2_t data1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(alignedData + 1)));

        if(len < 8)
        {
            // Initial CRC bleeds into the zero padding
            uint64x2_t crc0, crc1;
            shiftRight128(initialCrc, 16 - len, &crc0, &crc1);

            uint64x2_t A, B, C, D;
            shiftRight128(data0, leadOutSize, &A, &B);
            shiftRight128(data1, leadOutSize, &C, &D);

            const uint64x2_t P = veorq_u64(veorq_u64(B, C), crc0);
            R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)),
                          veorq_u64(mm_srli_si128(P, 8), mm_slli_si128(crc1, 8)));
        }
        else
        {
            // We can fit the initial CRC into the data without bleeding into the zero padding
            uint64x2_t crc0, crc1;
            shiftRight128(initialCrc, leadInSize, &crc0, &crc1);

            uint64x2_t A, B, C, D;
            shiftRight128(veorq_u64(data0, crc0), leadOutSize, &A, &B);
            shiftRight128(veorq_u64(data1, crc1), leadOutSize, &C, &D);

            const uint64x2_t P = veorq_u64(fold(A, foldConstants1), veorq_u64(B, C));
            R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)), mm_srli_si128(P, 8));
        }
    }
    else
    {
        alignedData++;
        len -= 16 - leadInSize;

        // Initial CRC can simply be added to data
        uint64x2_t crc0, crc1;
        shiftRight128(initialCrc, leadInSize, &crc0, &crc1);

        uint64x2_t accumulator = veorq_u64(fold(veorq_u64(crc0, data0), foldConstants1), crc1);

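        // Main folding loop: fold one aligned 16-byte block into the
        // accumulator per iteration until at most two blocks remain.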
        while(len >= 32)
        {
            accumulator = fold(veorq_u64(vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)), accumulator),
                               foldConstants1);

            len -= 16;
            alignedData++;
        }

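        // Fold in the last one or two aligned blocks, discarding the lead-out
        // bytes that lie past `end`.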
        uint64x2_t P;
        if(len == 16) P = veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)));
        else
        {
            const uint64x2_t end0 =
                veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)));
            const uint64x2_t end1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(alignedData + 1)));

            uint64x2_t A, B, C, D;
            shiftRight128(end0, leadOutSize, &A, &B);
            shiftRight128(end1, leadOutSize, &C, &D);

            P = veorq_u64(fold(A, foldConstants1),
                          vreinterpretq_u64_u32(vorrq_u32(vreinterpretq_u32_u64(B), vreinterpretq_u32_u64(C))));
        }

        R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)), mm_srli_si128(P, 8));
    }

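    /*
     * Reduce the 128-bit folded remainder R to 64 bits: T1 = clmul(R.lo, mu)
     * estimates the quotient, T2 folds clmul(T1.lo, p) and (T1 << 64) back
     * into R, and the high 64 bits of T2 hold the (still inverted) CRC.
     */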
    // Final Barrett reduction
    const uint64x2_t T1 = sse2neon_vmull_p64(vget_low_u64(R), vget_low_u64(foldConstants2));
    const uint64x2_t T2 = veorq_u64(
        veorq_u64(sse2neon_vmull_p64(vget_low_u64(T1), vget_high_u64(foldConstants2)), mm_slli_si128(T1, 8)), R);

    return ~vgetq_lane_u64(T2, 1);
}
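
/*
 * Illustrative usage sketch (not part of this file; the buffer names below
 * are hypothetical). Seeding with 0 makes the internal inversion start from
 * the all-ones state of the common reflected CRC-64 convention; the seed the
 * rest of the library expects may differ, so check crc64.h before relying on
 * this. Chaining simply feeds the previous return value back in:
 *
 *     uint64_t crc = crc64_vmull(0, buffer, (long)buffer_len);
 *     crc          = crc64_vmull(crc, next_chunk, (long)next_chunk_len);
 */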

#endif