#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)

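/*
 * Byte-shuffle mask table used by shiftRight128(): loading 16 bytes starting at
 * offset (16 - n) yields a mask whose first n entries select the top bytes of
 * the input and whose remaining entries have bit 7 set, which mm_shuffle_epi8()
 * turns into zero bytes.
 */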
static const uint8_t shuffleMasks[] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
};

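/*
 * Splits `in` across a two-register window moved up by n bytes:
 * *outLeft receives the equivalent of mm_slli_si128(in, n) (the bytes that stay
 * in the current 16-byte block) and *outRight the equivalent of
 * mm_srli_si128(in, 16 - n) (the bytes that carry into the next block).
 */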
TARGET_WITH_SIMD
FORCE_INLINE void shiftRight128(uint64x2_t in, size_t n, uint64x2_t *outLeft, uint64x2_t *outRight)
{
    const uint64x2_t maskA =
        vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(const uint64x2_t *)(shuffleMasks + (16 - n))));
    uint64x2_t b = vreinterpretq_u64_u8(vceqq_u8(vreinterpretq_u8_u64(vreinterpretq_u64_u32(vdupq_n_u32(0))),
                                                 vreinterpretq_u8_u64(vreinterpretq_u64_u32(vdupq_n_u32(0)))));
    const uint64x2_t maskB = vreinterpretq_u64_u32(veorq_u32(vreinterpretq_u32_u64(maskA), vreinterpretq_u32_u64(b)));

    *outLeft  = mm_shuffle_epi8(in, maskB);
    *outRight = mm_shuffle_epi8(in, maskA);
}

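/*
 * One 128-bit fold step: carry-less multiplies the low and high 64-bit halves
 * of `in` by the corresponding halves of `foldConstants` and XORs the two
 * 128-bit products together.
 */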
TARGET_WITH_SIMD
FORCE_INLINE uint64x2_t fold(uint64x2_t in, uint64x2_t foldConstants)
{
    return veorq_u64(sse2neon_vmull_p64(vget_low_u64(in), vget_low_u64(foldConstants)),
                     sse2neon_vmull_p64(vget_high_u64(in), vget_high_u64(foldConstants)));
}

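/*
 * CRC-64 over `len` bytes at `data`, continuing from `previous_crc`, using
 * NEON carry-less multiplication (PMULL) for the folding and the final
 * Barrett reduction.
 */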
AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL aaruf_crc64_vmull(uint64_t previous_crc, const uint8_t *data, long len)
{
    TRACE("Entering aaruf_crc64_vmull(%llu, %p, %ld)", previous_crc, data, len);

    const uint64_t k1 = 0xe05dd497ca393ae4;
    const uint64_t k2 = 0xdabe95afc7875f40;
    const uint64_t mu = 0x9c3e466c172963d5;
    const uint64_t p  = 0x92d8af2baf0e1e85;

    const uint64x2_t foldConstants1 = vcombine_u64(vcreate_u64(k1), vcreate_u64(k2));
    const uint64x2_t foldConstants2 = vcombine_u64(vcreate_u64(mu), vcreate_u64(p));

    const uint8_t *end = data + len;

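    // Round `data` down and `end` up to 16-byte boundaries; leadInSize and
    // leadOutSize are the padding bytes before and after the message inside
    // that aligned window.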
    const uint64x2_t *alignedData = (const uint64x2_t *)((uintptr_t)data & ~(uintptr_t)15);
    const uint64x2_t *alignedEnd  = (const uint64x2_t *)(((uintptr_t)end + 15) & ~(uintptr_t)15);

    const size_t leadInSize  = data - (const uint8_t *)alignedData;
    const size_t leadOutSize = (const uint8_t *)alignedEnd - end;

    const size_t alignedLength = alignedEnd - alignedData;

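    // First aligned block with the lead-in bytes forced to zero: the sign bit
    // of each leadInMask byte selects between the loaded block (b) and zero (a).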
    const uint64x2_t leadInMask =
        vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(const uint64x2_t *)(shuffleMasks + (16 - leadInSize))));
    uint64x2_t a = vreinterpretq_u64_u32(vdupq_n_u32(0));
    uint64x2_t b = vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData));
    const uint64x2_t data0 =
        vreinterpretq_u64_u8(vbslq_u8(vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u64(leadInMask), 7)),
                                      vreinterpretq_u8_u64(b), vreinterpretq_u8_u64(a)));

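    // The CRC is inverted on entry and on exit; place the inverted running CRC
    // in the low 64 bits of a vector so it can be XORed onto the data.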
    const uint64x2_t initialCrc = vsetq_lane_u64(~previous_crc, vdupq_n_u64(0), 0);

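    // Depending on how many aligned 16-byte blocks the message touches, the
    // reduction to a single 128-bit value R is done directly (one or two
    // blocks) or by folding block by block (three or more).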
    uint64x2_t R;

    if(alignedLength == 1)
    {
        uint64x2_t crc0, crc1;
        shiftRight128(initialCrc, 16 - len, &crc0, &crc1);

        uint64x2_t A, B;
        shiftRight128(data0, leadOutSize, &A, &B);

        const uint64x2_t P = veorq_u64(A, crc0);
        R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)),
                      veorq_u64(mm_srli_si128(P, 8), mm_slli_si128(crc1, 8)));
    }
    else if(alignedLength == 2)
    {
        const uint64x2_t data1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(alignedData + 1)));

        if(len < 8) /* reconstructed guard: with fewer than 8 message bytes the initial CRC spills past the end */
        {
            uint64x2_t crc0, crc1;
            shiftRight128(initialCrc, 16 - len, &crc0, &crc1);

            uint64x2_t A, B, C, D;
            shiftRight128(data0, leadOutSize, &A, &B);
            shiftRight128(data1, leadOutSize, &C, &D);

            const uint64x2_t P = veorq_u64(veorq_u64(B, C), crc0);
            R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)),
                          veorq_u64(mm_srli_si128(P, 8), mm_slli_si128(crc1, 8)));
        }
        else
        {
            uint64x2_t crc0, crc1;
            shiftRight128(initialCrc, leadInSize, &crc0, &crc1);

            uint64x2_t A, B, C, D;
            shiftRight128(veorq_u64(data0, crc0), leadOutSize, &A, &B);
            shiftRight128(veorq_u64(data1, crc1), leadOutSize, &C, &D);

            const uint64x2_t P = veorq_u64(fold(A, foldConstants1), veorq_u64(B, C));
            R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)), mm_srli_si128(P, 8));
        }
    }
    else
    {
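        // Three or more aligned blocks: fold the shifted initial CRC into the
        // first block, then accumulate the rest one block at a time.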
        len -= 16 - leadInSize;

        uint64x2_t crc0, crc1;
        shiftRight128(initialCrc, leadInSize, &crc0, &crc1);

        uint64x2_t accumulator = veorq_u64(fold(veorq_u64(crc0, data0), foldConstants1), crc1);

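        // Main folding loop: consume one 16-byte block per iteration until at
        // most two blocks remain (the 32-byte bound and the pointer stepping
        // are assumptions; only the fold statement survives in this excerpt).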
        ++alignedData;
        while(len >= 32)
        {
            accumulator =
                fold(veorq_u64(vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)), accumulator),
                     foldConstants1);
            len -= 16;
            ++alignedData;
        }

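        // Tail: either exactly one full block remains (the message ends on a
        // block boundary), or a full block plus a partial one that still needs
        // the lead-out shift. The len == 16 guard below is an assumption.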
        uint64x2_t P;

        if(len == 16)
        {
            P = veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)));
        }
        else
        {
            const uint64x2_t end0 =
                veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)));
            const uint64x2_t end1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(alignedData + 1)));

            uint64x2_t A, B, C, D;
            shiftRight128(end0, leadOutSize, &A, &B);
            shiftRight128(end1, leadOutSize, &C, &D);

            P = veorq_u64(fold(A, foldConstants1),
                          vreinterpretq_u64_u32(vorrq_u32(vreinterpretq_u32_u64(B), vreinterpretq_u32_u64(C))));
        }

        R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)), mm_srli_si128(P, 8));
    }

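    // Final Barrett reduction: multiply the low half of R by mu, reduce by the
    // polynomial p, and take the high 64 bits as the (still inverted) CRC.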
    const uint64x2_t T1 = sse2neon_vmull_p64(vget_low_u64(R), vget_low_u64(foldConstants2));
    const uint64x2_t T2 = veorq_u64(
        veorq_u64(sse2neon_vmull_p64(vget_low_u64(T1), vget_high_u64(foldConstants2)), mm_slli_si128(T1, 8)), R);

    TRACE("Exiting aaruf_crc64_vmull()");

    return ~vgetq_lane_u64(T2, 1);