libaaruformat 1.0
Aaru Data Preservation Suite - Format Library
Loading...
Searching...
No Matches
crc64_vmull.c
Go to the documentation of this file.
1/*
2 * This file is part of the Aaru Data Preservation Suite.
3 * Copyright (c) 2019-2026 Natalia Portillo.
4 *
5 * This library is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2.1 of the
8 * License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18
19#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
20
#include <arm_neon.h>
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <aaruformat.h>

#include "arm_vmull.h"
#include "log.h"
30
// Byte-select control table for shiftRight128.  A 16-byte window taken at
// offset (16 - n) acts as a pshufb-style shuffle mask: bytes from the first
// row are identity lane indices (high bit clear -> copy that input byte),
// bytes from the second row have the high bit set (-> produce zero).  The
// second row's low nibbles descend (0x0f..0x00) so that the bitwise
// complement of any window yields valid ascending lane indices for the
// complementary mask (see shiftRight128).
static const uint8_t shuffleMasks[] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
};
35
/*
 * Splits the 128-bit value `in` across a 16-byte boundary for a byte shift
 * of n (0 <= n <= 16): *outLeft receives the bytes of `in` moved up by n
 * positions (low n bytes zeroed), *outRight receives the n bytes pushed out
 * of the top, placed at the bottom lanes.
 */
TARGET_WITH_SIMD FORCE_INLINE void shiftRight128(uint64x2_t in, size_t n, uint64x2_t *outLeft, uint64x2_t *outRight)
{
    /* 16-byte window into the shuffle table selecting the split point for n. */
    const uint64x2_t maskA =
        vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(const uint64x2_t *)(shuffleMasks + (16 - n))));

    /* maskB is the bitwise complement of maskA: vmvnq_u8 produces the same
       result as XORing with an all-ones vector built from vceq(0,0). */
    const uint64x2_t maskB = vreinterpretq_u64_u8(vmvnq_u8(vreinterpretq_u8_u64(maskA)));

    *outLeft  = mm_shuffle_epi8(in, maskB);
    *outRight = mm_shuffle_epi8(in, maskA);
}
47
/*
 * One CLMUL folding step: carry-less multiplies the low 64 bits of `in` by
 * the low fold constant and the high 64 bits by the high fold constant, then
 * XORs the two 128-bit products together.
 */
TARGET_WITH_SIMD FORCE_INLINE uint64x2_t fold(uint64x2_t in, uint64x2_t foldConstants)
{
    const uint64x2_t lowProduct  = sse2neon_vmull_p64(vget_low_u64(in), vget_low_u64(foldConstants));
    const uint64x2_t highProduct = sse2neon_vmull_p64(vget_high_u64(in), vget_high_u64(foldConstants));

    /* XOR is commutative, so operand order is irrelevant. */
    return veorq_u64(highProduct, lowProduct);
}
53
/**
 * Computes a CRC-64 over a byte buffer using ARM carry-less multiplication
 * (VMULL.P64/PMULL) via 128-bit CLMUL folding and a final Barrett reduction.
 *
 * @param previous_crc CRC value returned by a previous call (or the initial
 *                     seed); it is bit-inverted internally, as is usual for
 *                     reflected CRCs, and un-inverted again on return.
 * @param data         Pointer to the bytes to process.
 * @param len          Number of bytes at @p data.
 * @return The updated CRC-64 value.
 */
AARU_EXPORT TARGET_WITH_SIMD uint64_t AARU_CALL aaruf_crc64_vmull(uint64_t previous_crc, const uint8_t *data, long len)
{
    // "%" PRIu64 instead of "%llu": uint64_t is unsigned long on LP64
    // targets, so "%llu" was formally a format/argument mismatch (UB).
    TRACE("Entering aaruf_crc64_vmull(%" PRIu64 ", %p, %ld)", previous_crc, data, len);

    // Folding and Barrett-reduction constants for the reflected polynomial.
    const uint64_t k1 = 0xe05dd497ca393ae4; // bitReflect(expMod65(128 + 64, poly, 1)) << 1;
    const uint64_t k2 = 0xdabe95afc7875f40; // bitReflect(expMod65(128, poly, 1)) << 1;
    const uint64_t mu = 0x9c3e466c172963d5; // (bitReflect(div129by65(poly)) << 1) | 1;
    const uint64_t p  = 0x92d8af2baf0e1e85; // (bitReflect(poly) << 1) | 1;

    const uint64x2_t foldConstants1 = vcombine_u64(vcreate_u64(k1), vcreate_u64(k2));
    const uint64x2_t foldConstants2 = vcombine_u64(vcreate_u64(mu), vcreate_u64(p));

    const uint8_t *end = data + len;

    // Align pointers: the buffer is processed as whole 16-byte blocks, with
    // the partial head/tail bytes masked off.
    const uint64x2_t *alignedData = (const uint64x2_t *)((uintptr_t)data & ~(uintptr_t)15);
    const uint64x2_t *alignedEnd  = (const uint64x2_t *)(((uintptr_t)end + 15) & ~(uintptr_t)15);

    const size_t leadInSize  = data - (const uint8_t *)alignedData; // bytes before data in the first block
    const size_t leadOutSize = (const uint8_t *)alignedEnd - end;   // bytes past end in the last block

    const size_t alignedLength = alignedEnd - alignedData; // number of 16-byte blocks spanned

    const uint64x2_t leadInMask =
        vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(const uint64x2_t *)(shuffleMasks + (16 - leadInSize))));
    uint64x2_t a = vreinterpretq_u64_u32(vdupq_n_u32(0));
    // NOTE(review): this aligned load can touch up to 15 bytes before `data`
    // (never crossing a 16-byte line); the extra bytes are masked off below,
    // but sanitizers may still report the over-read — confirm acceptable.
    uint64x2_t b = vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData));
    // Use a signed shift right to create a byte-select mask from the sign bit
    // of each leadInMask byte: keep buffer bytes (b) where the mask is set,
    // zeros (a) elsewhere.
    const uint64x2_t data0 =
        vreinterpretq_u64_u8(vbslq_u8(vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u64(leadInMask), 7)),
                                      vreinterpretq_u8_u64(b), vreinterpretq_u8_u64(a)));

    // CRC state is kept bit-inverted while folding (reflected-CRC convention).
    const uint64x2_t initialCrc = vsetq_lane_u64(~previous_crc, vdupq_n_u64(0), 0);

    uint64x2_t R;
    if(alignedLength == 1)
    {
        // Single data block, initial CRC possibly bleeds into zero padding
        uint64x2_t crc0, crc1;
        shiftRight128(initialCrc, 16 - len, &crc0, &crc1);

        uint64x2_t A, B;
        shiftRight128(data0, leadOutSize, &A, &B);

        const uint64x2_t P = veorq_u64(A, crc0);
        R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)),
                      veorq_u64(mm_srli_si128(P, 8), mm_slli_si128(crc1, 8)));
    }
    else if(alignedLength == 2)
    {
        const uint64x2_t data1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)(alignedData + 1)));

        if(len < 8)
        {
            // Initial CRC bleeds into the zero padding
            uint64x2_t crc0, crc1;
            shiftRight128(initialCrc, 16 - len, &crc0, &crc1);

            uint64x2_t A, B, C, D;
            shiftRight128(data0, leadOutSize, &A, &B);
            shiftRight128(data1, leadOutSize, &C, &D);

            const uint64x2_t P = veorq_u64(veorq_u64(B, C), crc0);
            R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)),
                          veorq_u64(mm_srli_si128(P, 8), mm_slli_si128(crc1, 8)));
        }
        else
        {
            // We can fit the initial CRC into the data without bleeding into the zero padding
            uint64x2_t crc0, crc1;
            shiftRight128(initialCrc, leadInSize, &crc0, &crc1);

            uint64x2_t A, B, C, D;
            shiftRight128(veorq_u64(data0, crc0), leadOutSize, &A, &B);
            shiftRight128(veorq_u64(data1, crc1), leadOutSize, &C, &D);

            const uint64x2_t P = veorq_u64(fold(A, foldConstants1), veorq_u64(B, C));
            R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)), mm_srli_si128(P, 8));
        }
    }
    else
    {
        // Three or more blocks: consume the (possibly partial) first block,
        // then fold one full 16-byte block per loop iteration.
        alignedData++;
        len -= 16 - leadInSize;

        // Initial CRC can simply be added to data
        uint64x2_t crc0, crc1;
        shiftRight128(initialCrc, leadInSize, &crc0, &crc1);

        uint64x2_t accumulator = veorq_u64(fold(veorq_u64(crc0, data0), foldConstants1), crc1);

        while(len >= 32)
        {
            accumulator = fold(veorq_u64(vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)), accumulator),
                               foldConstants1);

            len -= 16;
            alignedData++;
        }

        uint64x2_t P;
        if(len == 16)
            P = veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)));
        else
        {
            // When len is between 16 and 32, we need both blocks but must be careful not to read past buffer end
            const uint64x2_t end0 =
                veorq_u64(accumulator, vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)alignedData)));

            // For the second block, always use safe copy to avoid buffer overflow
            // The algorithm expects to read up to alignedEnd, but ASan prevents over-reading
            uint8_t temp[16] __attribute__((aligned(16))) = {0};
            const uint8_t *nextBlockAddr = (const uint8_t *)(alignedData + 1);

            // Only copy bytes that are actually within the original buffer
            if(nextBlockAddr < end)
            {
                size_t available = (size_t)(end - nextBlockAddr);
                if(available > 16) available = 16;
                memcpy(temp, nextBlockAddr, available);
            }

            const uint64x2_t end1 = vreinterpretq_u64_u32(vld1q_u32((const uint32_t *)temp));

            uint64x2_t A, B, C, D;
            shiftRight128(end0, leadOutSize, &A, &B);
            shiftRight128(end1, leadOutSize, &C, &D);

            P = veorq_u64(fold(A, foldConstants1),
                          vreinterpretq_u64_u32(vorrq_u32(vreinterpretq_u32_u64(B), vreinterpretq_u32_u64(C))));
        }

        R = veorq_u64(sse2neon_vmull_p64(vget_low_u64(P), vget_high_u64(foldConstants1)), mm_srli_si128(P, 8));
    }

    // Final Barrett reduction: collapse the 128-bit remainder R to 64 bits.
    const uint64x2_t T1 = sse2neon_vmull_p64(vget_low_u64(R), vget_low_u64(foldConstants2));
    const uint64x2_t T2 = veorq_u64(
        veorq_u64(sse2neon_vmull_p64(vget_low_u64(T1), vget_high_u64(foldConstants2)), mm_slli_si128(T1, 8)), R);

    TRACE("Exiting aaruf_crc64_vmull()");

    // Un-invert the internally inverted CRC state before returning.
    return ~vgetq_lane_u64(T2, 1);
}
206
207#endif
#define AARU_CALL
Definition decls.h:46
#define AARU_EXPORT
Definition decls.h:55
#define FORCE_INLINE
Definition decls.h:64
#define TRACE(fmt,...)
Definition log.h:25
static __attribute__((always_inline))
Definition lru.c:76