libaaruformat 1.0
Aaru Data Preservation Suite - Format Library
Loading...
Searching...
No Matches
crc64_clmul.c
Go to the documentation of this file.
1/*
2 * This file is part of the Aaru Data Preservation Suite.
3 * Copyright (c) 2019-2026 Natalia Portillo.
4 *
5 * This library is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2.1 of the
8 * License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 */
18
19#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
20 defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
21
22#include <inttypes.h>
23#include <smmintrin.h>
24#include <string.h>
25#include <wmmintrin.h>
26
27#include "log.h"
28
29#ifdef _MSC_VER
30#include <intrin.h>
31#endif
32
33#include <aaruformat.h>
34
// Reverses the bit order of a 64-bit value (bit 0 <-> bit 63, bit 1 <-> bit 62, ...).
static uint64_t bitReflect(uint64_t v)
{
    // Swap progressively larger groups: single bits, pairs, nibbles,
    // bytes, then 16-bit halves; mask width doubles each round.
    static const uint64_t group_masks[] = {
        0x5555555555555555,
        0x3333333333333333,
        0x0F0F0F0F0F0F0F0F,
        0x00FF00FF00FF00FF,
        0x0000FFFF0000FFFF,
    };

    for(unsigned round = 0; round < 5; ++round)
    {
        const uint64_t mask  = group_masks[round];
        const unsigned width = 1u << round;

        v = ((v >> width) & mask) | ((v & mask) << width);
    }

    // Final step: exchange the two 32-bit words.
    return (v >> 32) | (v << 32);
}
46
// Computes r*x^n mod p(x) over GF(2), where p(x) is a degree-64 polynomial
// with an implicit x^64 term.
//
// n: number of doublings (multiplications by x) to apply.
// p: low 64 coefficients of the polynomial (x^64 term implicit).
// r: starting residue.
// Returns the reduced residue after n steps.
//
// Rewritten iteratively (the call sites recorded in the constant comments
// below use n up to 192, so no recursion depth is needed) and without the
// original `(int64_t)r >> 63` arithmetic shift, whose result on negative
// values is implementation-defined in C; the unsigned mask below is fully
// portable and bit-identical on all mainstream targets.
static uint64_t expMod65(uint32_t n, uint64_t p, uint64_t r)
{
    while(n > 0)
    {
        // If the top bit is set, the shift carries into the implicit x^64
        // term, so reduce by xoring in the polynomial.
        const uint64_t carry_mask = (uint64_t)0 - (r >> 63); // all-ones iff bit 63 set

        r = (r << 1) ^ (p & carry_mask);
        n--;
    }

    return r;
}
52
// Computes x^129 / p(x) over GF(2); the result has an implicit 65th bit.
//
// poly: low 64 coefficients of the degree-64 polynomial (x^64 implicit).
// Returns the low 64 quotient bits of the long division.
//
// The original reduction step used `(int64_t)h >> 63`, an arithmetic right
// shift of a negative value, which is implementation-defined in C; it is
// replaced here with a portable unsigned mask that yields the same value.
static uint64_t div129by65(uint64_t poly)
{
    uint64_t q = 0;
    uint64_t h = poly;

    for(uint32_t i = 0; i < 64; ++i)
    {
        // Current quotient bit is the top bit of the running remainder.
        q |= (h & (1ull << 63)) >> i;

        // Shift the remainder up; if a bit carries into the implicit x^64
        // term, reduce by xoring in the polynomial.
        const uint64_t carry_mask = (uint64_t)0 - (h >> 63); // all-ones iff bit 63 set

        h = (h << 1) ^ (poly & carry_mask);
    }

    return q;
}
65
// Shuffle-control table for _mm_shuffle_epi8 (PSHUFB), loaded 16 bytes at a
// time starting at offset (16 - n) by shiftRight128 and by the lead-in
// masking in aaruf_crc64_clmul.
// First 16 bytes: identity lane indices 0x00..0x0f (sign bit clear).
// Last 16 bytes: 0x8f..0x80 — the sign bit is set, which makes PSHUFB write
// zero to that lane; the same sign bit drives per-lane selection when this
// table is used as a _mm_blendv_epi8 mask.
static const uint8_t shuffleMasks[] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
};
70
71CLMUL static void shiftRight128(__m128i in, size_t n, __m128i *out_left, __m128i *out_right)
72{
73 const __m128i mask_a = _mm_loadu_si128((const __m128i *)(shuffleMasks + (16 - n)));
74 const __m128i mask_b = _mm_xor_si128(mask_a, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()));
75
76 *out_left = _mm_shuffle_epi8(in, mask_b);
77 *out_right = _mm_shuffle_epi8(in, mask_a);
78}
79
80CLMUL static __m128i fold(__m128i in, __m128i fold_constants)
81{
82 return _mm_xor_si128(_mm_clmulepi64_si128(in, fold_constants, 0x00),
83 _mm_clmulepi64_si128(in, fold_constants, 0x11));
84}
85
94AARU_EXPORT CLMUL uint64_t AARU_CALL aaruf_crc64_clmul(const uint64_t crc, const uint8_t *data, long length)
95{
96 TRACE("Entering aaruf_crc64_clmul(%" PRIu64 ", %p, %ld)", crc, data, length);
97
98 const uint64_t k1 = 0xe05dd497ca393ae4; // bitReflect(expMod65(128 + 64, poly, 1)) << 1;
99 const uint64_t k2 = 0xdabe95afc7875f40; // bitReflect(expMod65(128, poly, 1)) << 1;
100 const uint64_t mu = 0x9c3e466c172963d5; // (bitReflect(div129by65(poly)) << 1) | 1;
101 const uint64_t p = 0x92d8af2baf0e1e85; // (bitReflect(poly) << 1) | 1;
102
103 const __m128i fold_constants_1 = _mm_set_epi64x(k2, k1);
104 const __m128i fold_constants_2 = _mm_set_epi64x(p, mu);
105
106 const uint8_t *end = data + length;
107
108 // Align pointers
109 const __m128i *aligned_data = (const __m128i *)((uintptr_t)data & ~(uintptr_t)15);
110 const __m128i *aligned_end = (const __m128i *)((uintptr_t)end + 15 & ~(uintptr_t)15);
111
112 const size_t lead_in_size = data - (const uint8_t *)aligned_data;
113 const size_t lead_out_size = (const uint8_t *)aligned_end - end;
114
115 const size_t aligned_length = aligned_end - aligned_data;
116
117 const __m128i lead_in_mask = _mm_loadu_si128((const __m128i *)(shuffleMasks + (16 - lead_in_size)));
118 const __m128i data0 = _mm_blendv_epi8(_mm_setzero_si128(), _mm_load_si128(aligned_data), lead_in_mask);
119
120#if defined(_WIN64)
121 const __m128i initial_crc = _mm_cvtsi64x_si128(~crc);
122#else
123 const __m128i initial_crc = _mm_set_epi64x(0, ~crc);
124#endif
125
126 __m128i r_reg;
127 if(aligned_length == 1)
128 {
129 // Single data block, initial CRC possibly bleeds into zero padding
130 __m128i crc0, crc1;
131 shiftRight128(initial_crc, 16 - length, &crc0, &crc1);
132
133 __m128i a_reg, b_reg;
134 shiftRight128(data0, lead_out_size, &a_reg, &b_reg);
135
136 const __m128i p_reg = _mm_xor_si128(a_reg, crc0);
137 r_reg = _mm_xor_si128(_mm_clmulepi64_si128(p_reg, fold_constants_1, 0x10),
138 _mm_xor_si128(_mm_srli_si128(p_reg, 8), _mm_slli_si128(crc1, 8)));
139 }
140 else if(aligned_length == 2)
141 {
142 const __m128i data1 = _mm_load_si128(aligned_data + 1);
143
144 if(length < 8)
145 {
146 // Initial CRC bleeds into the zero padding
147 __m128i crc0, crc1;
148 shiftRight128(initial_crc, 16 - length, &crc0, &crc1);
149
150 __m128i a_reg, b_reg, c_reg, d_reg;
151 shiftRight128(data0, lead_out_size, &a_reg, &b_reg);
152 shiftRight128(data1, lead_out_size, &c_reg, &d_reg);
153
154 const __m128i p_reg = _mm_xor_si128(_mm_xor_si128(b_reg, c_reg), crc0);
155 r_reg = _mm_xor_si128(_mm_clmulepi64_si128(p_reg, fold_constants_1, 0x10),
156 _mm_xor_si128(_mm_srli_si128(p_reg, 8), _mm_slli_si128(crc1, 8)));
157 }
158 else
159 {
160 // We can fit the initial CRC into the data without bleeding into the zero padding
161 __m128i crc0, crc1;
162 shiftRight128(initial_crc, lead_in_size, &crc0, &crc1);
163
164 __m128i a_reg, b_reg, c_reg, d_reg;
165 shiftRight128(_mm_xor_si128(data0, crc0), lead_out_size, &a_reg, &b_reg);
166 shiftRight128(_mm_xor_si128(data1, crc1), lead_out_size, &c_reg, &d_reg);
167
168 const __m128i p_reg = _mm_xor_si128(fold(a_reg, fold_constants_1), _mm_xor_si128(b_reg, c_reg));
169 r_reg = _mm_xor_si128(_mm_clmulepi64_si128(p_reg, fold_constants_1, 0x10), _mm_srli_si128(p_reg, 8));
170 }
171 }
172 else
173 {
174 aligned_data++;
175 length -= 16 - lead_in_size;
176
177 // Initial CRC can simply be added to data
178 __m128i crc0, crc1;
179 shiftRight128(initial_crc, lead_in_size, &crc0, &crc1);
180
181 __m128i accumulator = _mm_xor_si128(fold(_mm_xor_si128(crc0, data0), fold_constants_1), crc1);
182
183 while(length >= 32)
184 {
185 accumulator = fold(_mm_xor_si128(_mm_load_si128(aligned_data), accumulator), fold_constants_1);
186
187 length -= 16;
188 aligned_data++;
189 }
190
191 __m128i p_reg;
192 if(length == 16) { p_reg = _mm_xor_si128(accumulator, _mm_load_si128(aligned_data)); }
193 else
194 {
195 const __m128i end0 = _mm_xor_si128(accumulator, _mm_load_si128(aligned_data));
196
197 // For the second block, safely handle the case where it extends past the actual data
198 // Always use safe copy approach to avoid ASan buffer overflow detection
199 uint8_t temp[16] __attribute__((aligned(16))) = {0};
200 const uint8_t *next_block_addr = (const uint8_t *)(aligned_data + 1);
201
202 // Only copy bytes that are actually within the original buffer
203 if(next_block_addr < end)
204 {
205 size_t available = (size_t)(end - next_block_addr);
206 if(available > 16) available = 16;
207 memcpy(temp, next_block_addr, available);
208 }
209
210 const __m128i end1 = _mm_load_si128((const __m128i *)temp);
211
212 __m128i a_reg, b_reg, c_reg, d_reg;
213 shiftRight128(end0, lead_out_size, &a_reg, &b_reg);
214 shiftRight128(end1, lead_out_size, &c_reg, &d_reg);
215
216 p_reg = _mm_xor_si128(fold(a_reg, fold_constants_1), _mm_or_si128(b_reg, c_reg));
217 }
218
219 r_reg = _mm_xor_si128(_mm_clmulepi64_si128(p_reg, fold_constants_1, 0x10), _mm_srli_si128(p_reg, 8));
220 }
221
222 // Final Barrett reduction
223 const __m128i t1_reg = _mm_clmulepi64_si128(r_reg, fold_constants_2, 0x00);
224 const __m128i t2_reg = _mm_xor_si128(
225 _mm_xor_si128(_mm_clmulepi64_si128(t1_reg, fold_constants_2, 0x10), _mm_slli_si128(t1_reg, 8)), r_reg);
226
227 TRACE("Exiting aaruf_crc64_clmul()");
228
229#if defined(_WIN64)
230 return ~_mm_extract_epi64(t2_reg, 1);
231#else
232 return ~((uint64_t)(uint32_t)_mm_extract_epi32(t2_reg, 3) << 32 | (uint64_t)(uint32_t)_mm_extract_epi32(t2_reg, 2));
233#endif
234}
235
236#endif
#define AARU_CALL
Definition decls.h:46
#define AARU_EXPORT
Definition decls.h:55
#define TRACE(fmt,...)
Definition log.h:25
static __attribute__((always_inline))
Definition lru.c:76