libaaruformat 1.0
Aaru Data Preservation Suite - Format Library
crc64_clmul.c
/*
 * This file is part of the Aaru Data Preservation Suite.
 * Copyright (c) 2019-2025 Natalia Portillo.
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
    defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)

#include <inttypes.h>
#include <smmintrin.h>
#include <wmmintrin.h>

#include "log.h"

#ifdef _MSC_VER
#include <intrin.h>
#endif

#include <aaruformat.h>

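// The helpers below document how the folding and Barrett constants hard-coded in aaruf_crc64_clmul() are
// derived; see the commented expressions next to k1, k2, mu and p.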
// Reverses the bit order of a 64-bit value by swapping progressively larger groups: bits, pairs, nibbles, bytes, 16-bit words and 32-bit halves
static uint64_t bitReflect(uint64_t v)
{
    v = v >> 1 & 0x5555555555555555 | (v & 0x5555555555555555) << 1;
    v = v >> 2 & 0x3333333333333333 | (v & 0x3333333333333333) << 2;
    v = v >> 4 & 0x0F0F0F0F0F0F0F0F | (v & 0x0F0F0F0F0F0F0F0F) << 4;
    v = v >> 8 & 0x00FF00FF00FF00FF | (v & 0x00FF00FF00FF00FF) << 8;
    v = v >> 16 & 0x0000FFFF0000FFFF | (v & 0x0000FFFF0000FFFF) << 16;
    v = v >> 32 | v << 32;
    return v;
}

// Computes r*x^N mod p(x); p holds the low 64 coefficients of a degree-64 polynomial (the x^64 term is implicit)
static uint64_t expMod65(uint32_t n, uint64_t p, uint64_t r)
{
    return n == 0 ? r : expMod65(n - 1, p, r << 1 ^ p & (int64_t)r >> 63);
}

// Computes x^129 / p(x); the result has an implicit 65th bit.
static uint64_t div129by65(uint64_t poly)
{
    uint64_t q = 0;
    uint64_t h = poly;
    for(uint32_t i = 0; i < 64; ++i)
    {
        q |= (h & 1ull << 63) >> i;
        h = h << 1 ^ poly & (int64_t)h >> 63;
    }
    return q;
}

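// Shuffle masks for _mm_shuffle_epi8: the first 16 entries are identity indices, the second 16 have bit 7
// set, which makes PSHUFB write zero for that lane. shiftRight128() loads a sliding 16-byte window of this
// table to build its masks.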
static const uint8_t shuffleMasks[] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
};

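// Moves the 16 bytes of `in` by n positions towards the higher byte lanes: *out_left receives the shifted
// block (its low n lanes zeroed) and *out_right receives the n bytes pushed out of the top. Viewing the
// register as a little-endian byte buffer, this shifts the data n bytes to the right in stream order.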
CLMUL static void shiftRight128(__m128i in, size_t n, __m128i *out_left, __m128i *out_right)
{
    const __m128i mask_a = _mm_loadu_si128((const __m128i *)(shuffleMasks + (16 - n)));
    const __m128i mask_b = _mm_xor_si128(mask_a, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()));

    *out_left = _mm_shuffle_epi8(in, mask_b);
    *out_right = _mm_shuffle_epi8(in, mask_a);
}

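// One folding step: carry-less multiplies the low 64 bits of `in` by k1 and the high 64 bits by k2 (the two
// halves of fold_constants) and XORs the two 128-bit products together.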
CLMUL static __m128i fold(__m128i in, __m128i fold_constants)
{
    return _mm_xor_si128(_mm_clmulepi64_si128(in, fold_constants, 0x00),
                         _mm_clmulepi64_si128(in, fold_constants, 0x11));
}

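// Streams `length` bytes from `data` into a reflected CRC-64, continuing from `crc`, by folding 16-byte
// blocks with PCLMULQDQ and finishing with a Barrett reduction. Uses SSSE3/SSE4.1/PCLMULQDQ intrinsics.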
AARU_EXPORT CLMUL uint64_t AARU_CALL aaruf_crc64_clmul(const uint64_t crc, const uint8_t *data, long length)
{
    TRACE("Entering aaruf_crc64_clmul(%" PRIu64 ", %p, %ld)", crc, data, length);

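    // Folding and Barrett constants for the bit-reflected ECMA-182 polynomial 0x42f0e1eba9ea3693; the
    // commented expressions show how each constant is obtained from the helpers above.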
    const uint64_t k1 = 0xe05dd497ca393ae4; // bitReflect(expMod65(128 + 64, poly, 1)) << 1;
    const uint64_t k2 = 0xdabe95afc7875f40; // bitReflect(expMod65(128, poly, 1)) << 1;
    const uint64_t mu = 0x9c3e466c172963d5; // (bitReflect(div129by65(poly)) << 1) | 1;
    const uint64_t p = 0x92d8af2baf0e1e85; // (bitReflect(poly) << 1) | 1;

    const __m128i fold_constants_1 = _mm_set_epi64x(k2, k1);
    const __m128i fold_constants_2 = _mm_set_epi64x(p, mu);

    const uint8_t *end = data + length;

    // Align pointers
    const __m128i *aligned_data = (const __m128i *)((uintptr_t)data & ~(uintptr_t)15);
    const __m128i *aligned_end = (const __m128i *)((uintptr_t)end + 15 & ~(uintptr_t)15);

    const size_t lead_in_size = data - (const uint8_t *)aligned_data;
    const size_t lead_out_size = (const uint8_t *)aligned_end - end;

    const size_t aligned_length = aligned_end - aligned_data;

    const __m128i lead_in_mask = _mm_loadu_si128((const __m128i *)(shuffleMasks + (16 - lead_in_size)));
    const __m128i data0 = _mm_blendv_epi8(_mm_setzero_si128(), _mm_load_si128(aligned_data), lead_in_mask);

#if defined(_WIN64)
    const __m128i initial_crc = _mm_cvtsi64x_si128(~crc);
#else
    const __m128i initial_crc = _mm_set_epi64x(0, ~crc);
#endif

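    // r_reg receives the folded 128-bit remainder; the three cases below depend on whether the aligned,
    // zero-padded buffer spans one, two, or more 16-byte blocks.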
    __m128i r_reg;
    if(aligned_length == 1)
    {
        // Single data block, initial CRC possibly bleeds into zero padding
        __m128i crc0, crc1;
        shiftRight128(initial_crc, 16 - length, &crc0, &crc1);

        __m128i a_reg, b_reg;
        shiftRight128(data0, lead_out_size, &a_reg, &b_reg);

        const __m128i p_reg = _mm_xor_si128(a_reg, crc0);
        r_reg = _mm_xor_si128(_mm_clmulepi64_si128(p_reg, fold_constants_1, 0x10),
                              _mm_xor_si128(_mm_srli_si128(p_reg, 8), _mm_slli_si128(crc1, 8)));
    }
    else if(aligned_length == 2)
    {
        const __m128i data1 = _mm_load_si128(aligned_data + 1);

        if(length < 8)
        {
            // Initial CRC bleeds into the zero padding
            __m128i crc0, crc1;
            shiftRight128(initial_crc, 16 - length, &crc0, &crc1);

            __m128i a_reg, b_reg, c_reg, d_reg;
            shiftRight128(data0, lead_out_size, &a_reg, &b_reg);
            shiftRight128(data1, lead_out_size, &c_reg, &d_reg);

            const __m128i p_reg = _mm_xor_si128(_mm_xor_si128(b_reg, c_reg), crc0);
            r_reg = _mm_xor_si128(_mm_clmulepi64_si128(p_reg, fold_constants_1, 0x10),
                                  _mm_xor_si128(_mm_srli_si128(p_reg, 8), _mm_slli_si128(crc1, 8)));
        }
        else
        {
            // We can fit the initial CRC into the data without bleeding into the zero padding
            __m128i crc0, crc1;
            shiftRight128(initial_crc, lead_in_size, &crc0, &crc1);

            __m128i a_reg, b_reg, c_reg, d_reg;
            shiftRight128(_mm_xor_si128(data0, crc0), lead_out_size, &a_reg, &b_reg);
            shiftRight128(_mm_xor_si128(data1, crc1), lead_out_size, &c_reg, &d_reg);

            const __m128i p_reg = _mm_xor_si128(fold(a_reg, fold_constants_1), _mm_xor_si128(b_reg, c_reg));
            r_reg = _mm_xor_si128(_mm_clmulepi64_si128(p_reg, fold_constants_1, 0x10), _mm_srli_si128(p_reg, 8));
        }
    }
    else
    {
        aligned_data++;
        length -= 16 - lead_in_size;

        // Initial CRC can simply be added to data
        __m128i crc0, crc1;
        shiftRight128(initial_crc, lead_in_size, &crc0, &crc1);

        __m128i accumulator = _mm_xor_si128(fold(_mm_xor_si128(crc0, data0), fold_constants_1), crc1);

        while(length >= 32)
        {
            accumulator = fold(_mm_xor_si128(_mm_load_si128(aligned_data), accumulator), fold_constants_1);

            length -= 16;
            aligned_data++;
        }

        __m128i p_reg;
        if(length == 16) { p_reg = _mm_xor_si128(accumulator, _mm_load_si128(aligned_data)); }
        else
        {
            const __m128i end0 = _mm_xor_si128(accumulator, _mm_load_si128(aligned_data));
            const __m128i end1 = _mm_load_si128(aligned_data + 1);

            __m128i a_reg, b_reg, c_reg, d_reg;
            shiftRight128(end0, lead_out_size, &a_reg, &b_reg);
            shiftRight128(end1, lead_out_size, &c_reg, &d_reg);

            p_reg = _mm_xor_si128(fold(a_reg, fold_constants_1), _mm_or_si128(b_reg, c_reg));
        }

        r_reg = _mm_xor_si128(_mm_clmulepi64_si128(p_reg, fold_constants_1, 0x10), _mm_srli_si128(p_reg, 8));
    }

    // Final Barrett reduction: multiply the low half of the remainder by mu to estimate the quotient,
    // multiply that estimate by p, and XOR both products back into r_reg; the reduced CRC ends up in the
    // upper 64 bits of t2_reg and is complemented on return.
    const __m128i t1_reg = _mm_clmulepi64_si128(r_reg, fold_constants_2, 0x00);
    const __m128i t2_reg = _mm_xor_si128(
        _mm_xor_si128(_mm_clmulepi64_si128(t1_reg, fold_constants_2, 0x10), _mm_slli_si128(t1_reg, 8)), r_reg);

    TRACE("Exiting aaruf_crc64_clmul()");

#if defined(_WIN64)
    return ~_mm_extract_epi64(t2_reg, 1);
#else
    return ~((uint64_t)(uint32_t)_mm_extract_epi32(t2_reg, 3) << 32 | (uint64_t)(uint32_t)_mm_extract_epi32(t2_reg, 2));
#endif
}

#endif
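A minimal calling sketch, not part of the file above. It assumes the convention visible in the code: the caller keeps the running CRC in non-inverted form and seeds it with 0, since aaruf_crc64_clmul() complements the value on entry and on return; it also assumes the CPU supports the SSSE3/SSE4.1/PCLMULQDQ instructions this path uses. If this path implements CRC-64/XZ (reflected ECMA-182), the value printed for "123456789" would be 0x995dc9bbdf1939fa.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <aaruformat.h>

int main(void)
{
    const char *message = "123456789";

    /* One shot over the whole buffer; seed with 0, the function handles the initial/final complement. */
    uint64_t crc = aaruf_crc64_clmul(0, (const uint8_t *)message, (long)strlen(message));

    /* Chained over pieces, feeding the previous return value back in as the running CRC. */
    uint64_t chained = 0;
    chained = aaruf_crc64_clmul(chained, (const uint8_t *)message, 4);
    chained = aaruf_crc64_clmul(chained, (const uint8_t *)message + 4, (long)strlen(message) - 4);

    printf("crc64 = 0x%016" PRIx64 ", chained = 0x%016" PRIx64 "\n", crc, chained);
    return 0;
}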