Add ARM SIMD VMULL implementation of CRC64.

2021-10-12 01:45:37 +01:00
parent ee776e95f8
commit d3bb34dc58
8 changed files with 416 additions and 116 deletions


@@ -26,6 +26,6 @@ if("${CMAKE_BUILD_TYPE}" MATCHES "Release")
endif()
endif()
add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher32.h fletcher32.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c crc32_vmull.c crc32_simd.h)
add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher32.h fletcher32.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c crc32_vmull.c crc32_simd.h arm_vmull.c arm_vmull.h crc64_vmull.c)
add_subdirectory(tests)

arm_vmull.c (new file, 140 lines)

@@ -0,0 +1,140 @@
//
// Created by claunia on 12/10/21.
//
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
#include <arm_neon.h>
#include "library.h"
#include "arm_vmull.h"
#include "simd.h"
TARGET_WITH_CRYPTO static uint64x2_t sse2neon_vmull_p64_crypto(uint64x1_t _a, uint64x1_t _b)
{
poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
return vreinterpretq_u64_p128(vmull_p64(a, b));
}
TARGET_WITH_SIMD uint64x2_t sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
// Wraps vmull_p64
if(have_arm_crypto()) return sse2neon_vmull_p64_crypto(_a, _b);
// ARMv7 polyfill
// ARMv7 and some AArch64 cores lack vmull_p64, but they do have vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
poly8x8_t a = vreinterpret_p8_u64(_a);
poly8x8_t b = vreinterpret_p8_u64(_b);
// Masks
uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), vcreate_u8(0x00000000ffffffff));
uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), vcreate_u8(0x0000000000000000));
// Do the multiplies, rotating with vext to get all combinations
uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
uint8x16_t e = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
uint8x16_t f = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
uint8x16_t g = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
uint8x16_t h = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
uint8x16_t i = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
uint8x16_t j = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
uint8x16_t k = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
// Add cross products
uint8x16_t l = veorq_u8(e, f); // L = E + F
uint8x16_t m = veorq_u8(g, h); // M = G + H
uint8x16_t n = veorq_u8(i, j); // N = I + J
// Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
// instructions.
#if defined(__aarch64__)
uint8x16_t lm_p0 = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
uint8x16_t lm_p1 = vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
uint8x16_t nk_p0 = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
uint8x16_t nk_p1 = vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
// t0 = (L) (P0 + P1) << 8
// t1 = (M) (P2 + P3) << 16
uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
// t2 = (N) (P4 + P5) << 24
// t3 = (K) (P6 + P7) << 32
uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
// De-interleave
#if defined(__aarch64__)
uint8x16_t t0 = vreinterpretq_u8_u64(vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
uint8x16_t t1 = vreinterpretq_u8_u64(vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
uint8x16_t t2 = vreinterpretq_u8_u64(vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
uint8x16_t t3 = vreinterpretq_u8_u64(vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
// Shift the cross products
uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
// Accumulate the products
uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
uint8x16_t mix = veorq_u8(d, cross1);
uint8x16_t r = veorq_u8(mix, cross2);
return vreinterpretq_u64_u8(r);
}
TARGET_WITH_SIMD uint64x2_t mm_shuffle_epi8(uint64x2_t a, uint64x2_t b)
{
uint8x16_t tbl = vreinterpretq_u8_u64(a); // input a
uint8x16_t idx = vreinterpretq_u8_u64(b); // input b
uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F)); // keep the zeroing bit (0x80) and the low four index bits, as _mm_shuffle_epi8 does
#if defined(__aarch64__)
return vreinterpretq_u64_u8(vqtbl1q_u8(tbl, idx_masked));
#else
// ARMv7: vqtbl1q_u8 is not available, so split the table in two halves and index it with vtbl2_u8
uint8x8x2_t a_split = {vget_low_u8(tbl), vget_high_u8(tbl)};
return vreinterpretq_u64_u8(
vcombine_u8(vtbl2_u8(a_split, vget_low_u8(idx_masked)), vtbl2_u8(a_split, vget_high_u8(idx_masked))));
#endif
}
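// The two helpers below emulate the SSE byte shifts _mm_srli_si128 / _mm_slli_si128, but with a
// runtime byte count: the vector is stored next to a zeroed 16-byte block and reloaded at an
// offset, so imm must stay in the range 0..16.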
TARGET_WITH_SIMD uint64x2_t mm_srli_si128(uint64x2_t a, int imm)
{
uint8x16_t tmp[2] = {vreinterpretq_u8_u64(a), vdupq_n_u8(0)};
return vreinterpretq_u64_u8(vld1q_u8(((uint8_t const*)tmp) + imm));
}
TARGET_WITH_SIMD uint64x2_t mm_slli_si128(uint64x2_t a, int imm)
{
uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_u64(a)};
return vreinterpretq_u64_u8(vld1q_u8(((uint8_t const*)tmp) + (16 - imm)));
}
#endif
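As a sanity check for the polyfill above, the following scalar reference computes the same 64x64 -> 128-bit carry-less product as vmull_p64. It is an illustrative sketch only (clmul128 and clmul64_ref are hypothetical names, not part of this commit); lane 0 of the vector returned by sse2neon_vmull_p64 corresponds to lo and lane 1 to hi.

#include <stdint.h>

typedef struct { uint64_t lo, hi; } clmul128; /* hypothetical helper type, not in the commit */

/* Plain-C carry-less (polynomial) 64x64 -> 128-bit multiply, usable to cross-check
   sse2neon_vmull_p64 on any host. */
static clmul128 clmul64_ref(uint64_t a, uint64_t b)
{
    clmul128 r = {0, 0};
    for(int i = 0; i < 64; i++)
    {
        if((b >> i) & 1)
        {
            r.lo ^= a << i;                   /* contribution to the low 64 bits */
            if(i != 0) r.hi ^= a >> (64 - i); /* bits that spill past bit 63 */
        }
    }
    return r;
}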

arm_vmull.h (new file, 18 lines)

@@ -0,0 +1,18 @@
//
// Created by claunia on 12/10/21.
//
#ifndef AARU_CHECKSUMS_NATIVE__ARM_VMULL_H_
#define AARU_CHECKSUMS_NATIVE__ARM_VMULL_H_
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
#include <arm_neon.h>
TARGET_WITH_SIMD uint64x2_t sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b);
TARGET_WITH_SIMD uint64x2_t mm_shuffle_epi8(uint64x2_t a, uint64x2_t b);
TARGET_WITH_SIMD uint64x2_t mm_srli_si128(uint64x2_t a, int imm);
TARGET_WITH_SIMD uint64x2_t mm_slli_si128(uint64x2_t a, int imm);
#endif
#endif // AARU_CHECKSUMS_NATIVE__ARM_VMULL_H_


@@ -11,121 +11,7 @@
#include "library.h"
#include "crc32.h"
#include "crc32_simd.h"
[removed: the sse2neon_vmull_p64_crypto, sse2neon_vmull_p64 and mm_shuffle_epi8 helpers that previously lived in this file; they are identical to the copies shown above in the new arm_vmull.c and are now pulled in through the arm_vmull.h include added below]
#include "arm_vmull.h"
/*
* somewhat surprisingly the "naive" way of doing this, ie. with a flag and a cond. branch,


@@ -47,6 +47,14 @@ AARU_EXPORT int AARU_CALL crc64_update(crc64_ctx* ctx, const uint8_t* data, uint
}
#endif
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
if(have_neon())
{
ctx->crc = ~crc64_vmull(~ctx->crc, data, len);
return 0;
}
#endif
// Unroll according to Intel slicing-by-8
// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
// http://sourceforge.net/projects/slicing-by-8/


@@ -244,3 +244,7 @@ AARU_EXPORT void AARU_CALL crc64_slicing(uint64_t* crc, const uint8_t* dat
defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
AARU_EXPORT uint64_t AARU_CALL crc64_clmul(uint64_t crc, const uint8_t* data, long length);
#endif
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const uint8_t* data, long length);
#endif
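Call-side usage mirrors the unit tests further below: the same pre/post inversion convention as crc64_update applies, and callers are expected to check have_neon() before dispatching. A minimal sketch (buffer and length stand in for any input; CRC64_ECMA_SEED is the seed constant the tests use):

if(have_neon())
{
    uint64_t crc = CRC64_ECMA_SEED;
    crc = ~crc64_vmull(~crc, buffer, (long)length);
    crc ^= CRC64_ECMA_SEED; // final CRC-64/ECMA value
}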

crc64_vmull.c (new file, 164 lines)

@@ -0,0 +1,164 @@
//
// Created by claunia on 12/10/21.
//
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>
#include "library.h"
#include "arm_vmull.h"
#include "crc64.h"
static const uint8_t shuffleMasks[] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
};
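// shiftRight128 below uses this table to split a 16-byte block across a boundary: loading 16 bytes
// at offset (16 - n) yields a shuffle mask whose entries either index into the block or have the
// high bit set, which mm_shuffle_epi8 turns into zero. *outLeft receives the block moved up by n
// byte lanes (low n lanes zeroed); *outRight receives the n bytes shifted out, in the low lanes.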
TARGET_WITH_SIMD FORCE_INLINE void shiftRight128(uint64x2_t in, size_t n, uint64x2_t* outLeft, uint64x2_t* outRight)
{
const uint64x2_t maskA =
vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(const uint64x2_t*)(shuffleMasks + (16 - n))));
uint64x2_t b = vreinterpretq_u64_u8(vceqq_u8(vreinterpretq_u8_u64(vreinterpretq_u64_u32(vdupq_n_u32(0))),
vreinterpretq_u8_u64(vreinterpretq_u64_u32(vdupq_n_u32(0)))));
const uint64x2_t maskB = vreinterpretq_u64_u32(veorq_u32(vreinterpretq_u32_u64(maskA), vreinterpretq_u32_u64(b)));
*outLeft = mm_shuffle_epi8(in, maskB);
*outRight = mm_shuffle_epi8(in, maskA);
}
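// Standard CLMUL folding step: carry-less-multiplies the low half of `in` by k1 and the high half
// by k2, then XORs the two 128-bit products, folding 128 message bits into the next block.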
TARGET_WITH_SIMD FORCE_INLINE uint64x2_t fold(uint64x2_t in, uint64x2_t foldConstants)
{
return veorq_u64(sse2neon_vmull_p64(vget_low_u64(in), vget_low_u64(foldConstants)),
sse2neon_vmull_p64(vget_high_u64(in), vget_high_u64(foldConstants)));
}
AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL crc64_vmull(uint64_t crc, const uint8_t* data, long length)
{
const uint64_t k1 = 0xe05dd497ca393ae4; // bitReflect(expMod65(128 + 64, poly, 1)) << 1;
const uint64_t k2 = 0xdabe95afc7875f40; // bitReflect(expMod65(128, poly, 1)) << 1;
const uint64_t mu = 0x9c3e466c172963d5; // (bitReflect(div129by65(poly)) << 1) | 1;
const uint64_t p = 0x92d8af2baf0e1e85; // (bitReflect(poly) << 1) | 1;
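// Folding constants for the bit-reflected ECMA-182 polynomial 0x42f0e1eba9ea3693
// (reflected form 0xc96c5795d7870f42), which is what p above encodes as (bitReflect(poly) << 1) | 1.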
const uint64x2_t foldConstants1 = vcombine_u64(vcreate_u64(k1), vcreate_u64(k2));
const uint64x2_t foldConstants2 = vcombine_u64(vcreate_u64(mu), vcreate_u64(p));
const uint8_t* end = data + length;
// Align pointers
const uint64x2_t* alignedData = (const uint64x2_t*)((uintptr_t)data & ~(uintptr_t)15);
const uint64x2_t* alignedEnd = (const uint64x2_t*)(((uintptr_t)end + 15) & ~(uintptr_t)15);
const size_t leadInSize = data - (const uint8_t*)alignedData;
const size_t leadOutSize = (const uint8_t*)alignedEnd - end;
const size_t alignedLength = alignedEnd - alignedData;
const uint64x2_t leadInMask =
vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(const uint64x2_t*)(shuffleMasks + (16 - leadInSize))));
uint64x2_t a = vreinterpretq_u64_u32(vdupq_n_u32(0));
uint64x2_t b = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData));
// Use a signed shift right to replicate each lead-in mask byte's sign bit, then select:
// lead-in lanes take zero (a), the remaining lanes take the first aligned block (b)
const uint64x2_t data0 =
vreinterpretq_u64_u8(vbslq_u8(vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u64(leadInMask), 7)),
vreinterpretq_u8_u64(b),
vreinterpretq_u8_u64(a)));
const uint64x2_t initialCrc = vsetq_lane_u64(~crc, vdupq_n_u64(0), 0);
uint64x2_t R;
if(alignedLength == 1)
{
// Single data block, initial CRC possibly bleeds into zero padding
uint64x2_t crc0, crc1;
shiftRight128(initialCrc, 16 - length, &crc0, &crc1);
uint64x2_t A, B;
shiftRight128(data0, leadOutSize, &A, &B);
const uint64x2_t P = veorq_u64(A, crc0);
R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)),
veorq_u64(mm_srli_si128(P, 8), mm_slli_si128(crc1, 8)));
}
else if(alignedLength == 2)
{
const uint64x2_t data1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(alignedData + 1)));
if(length < 8)
{
// Initial CRC bleeds into the zero padding
uint64x2_t crc0, crc1;
shiftRight128(initialCrc, 16 - length, &crc0, &crc1);
uint64x2_t A, B, C, D;
shiftRight128(data0, leadOutSize, &A, &B);
shiftRight128(data1, leadOutSize, &C, &D);
const uint64x2_t P = veorq_u64(veorq_u64(B, C), crc0);
R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)),
veorq_u64(mm_srli_si128(P, 8), mm_slli_si128(crc1, 8)));
}
else
{
// We can fit the initial CRC into the data without bleeding into the zero padding
uint64x2_t crc0, crc1;
shiftRight128(initialCrc, leadInSize, &crc0, &crc1);
uint64x2_t A, B, C, D;
shiftRight128(veorq_u64(data0, crc0), leadOutSize, &A, &B);
shiftRight128(veorq_u64(data1, crc1), leadOutSize, &C, &D);
const uint64x2_t P = veorq_u64(fold(A, foldConstants1), veorq_u64(B, C));
R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)), mm_srli_si128(P, 8));
}
}
else
{
alignedData++;
length -= 16 - leadInSize;
// Initial CRC can simply be added to data
uint64x2_t crc0, crc1;
shiftRight128(initialCrc, leadInSize, &crc0, &crc1);
uint64x2_t accumulator = veorq_u64(fold(veorq_u64(crc0, data0), foldConstants1), crc1);
while(length >= 32)
{
accumulator = fold(veorq_u64(vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)), accumulator),
foldConstants1);
length -= 16;
alignedData++;
}
uint64x2_t P;
if(length == 16) P = veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)));
else
{
const uint64x2_t end0 =
veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)alignedData)));
const uint64x2_t end1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t*)(alignedData + 1)));
uint64x2_t A, B, C, D;
shiftRight128(end0, leadOutSize, &A, &B);
shiftRight128(end1, leadOutSize, &C, &D);
P = veorq_u64(fold(A, foldConstants1),
vreinterpretq_u64_u32(vorrq_u32(vreinterpretq_u32_u64(B), vreinterpretq_u32_u64(C))));
}
R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)), mm_srli_si128(P, 8));
}
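// R now carries the message folded down to a single 128-bit value; the Barrett step below uses mu
// and p to reduce it to the final 64-bit remainder, which ends up in the upper lane of T2.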
// Final Barrett reduction
const uint64x2_t T1 = sse2neon_vmull_p64(vget_low_u64(R), vget_low_u64(foldConstants2));
const uint64x2_t T2 = veorq_u64(
veorq_u64(sse2neon_vmull_p64(vget_low_u64(T1), vget_high_u64(foldConstants2)), mm_slli_si128(T1, 8)), R);
return ~vgetq_lane_u64(T2, 1);
}
#endif


@@ -283,3 +283,83 @@ TEST_F(crc64Fixture, crc64_clmul_2352bytes)
EXPECT_EQ(crc, EXPECTED_CRC64_2352BYTES);
}
#endif
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
TEST_F(crc64Fixture, crc64_vmull)
{
if(!have_neon()) return;
uint64_t crc = CRC64_ECMA_SEED;
crc = ~crc64_vmull(~crc, buffer, 1048576);
crc ^= CRC64_ECMA_SEED;
EXPECT_EQ(crc, EXPECTED_CRC64);
}
TEST_F(crc64Fixture, crc64_vmull_misaligned)
{
if(!have_neon()) return;
uint64_t crc = CRC64_ECMA_SEED;
crc = ~crc64_vmull(~crc, buffer_misaligned+1, 1048576);
crc ^= CRC64_ECMA_SEED;
EXPECT_EQ(crc, EXPECTED_CRC64);
}
TEST_F(crc64Fixture, crc64_vmull_15bytes)
{
if(!have_neon()) return;
uint64_t crc = CRC64_ECMA_SEED;
crc = ~crc64_vmull(~crc, buffer, 15);
crc ^= CRC64_ECMA_SEED;
EXPECT_EQ(crc, EXPECTED_CRC64_15BYTES);
}
TEST_F(crc64Fixture, crc64_vmull_31bytes)
{
if(!have_neon()) return;
uint64_t crc = CRC64_ECMA_SEED;
crc = ~crc64_vmull(~crc, buffer, 31);
crc ^= CRC64_ECMA_SEED;
EXPECT_EQ(crc, EXPECTED_CRC64_31BYTES);
}
TEST_F(crc64Fixture, crc64_vmull_63bytes)
{
if(!have_neon()) return;
uint64_t crc = CRC64_ECMA_SEED;
crc = ~crc64_vmull(~crc, buffer, 63);
crc ^= CRC64_ECMA_SEED;
EXPECT_EQ(crc, EXPECTED_CRC64_63BYTES);
}
TEST_F(crc64Fixture, crc64_vmull_2352bytes)
{
if(!have_neon()) return;
uint64_t crc = CRC64_ECMA_SEED;
crc = ~crc64_vmull(~crc, buffer, 2352);
crc ^= CRC64_ECMA_SEED;
EXPECT_EQ(crc, EXPECTED_CRC64_2352BYTES);
}
#endif