#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
    defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
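/* Reverses the bit order of a 64-bit value by swapping ever-larger groups:
   single bits, bit pairs, nibbles, bytes, 16-bit words and finally the two
   32-bit halves. Helper for moving between the normal and bit-reflected
   polynomial representations used by the constants below. */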
static uint64_t bitReflect(uint64_t v)
{
    v = v >> 1 & 0x5555555555555555 | (v & 0x5555555555555555) << 1;
    v = v >> 2 & 0x3333333333333333 | (v & 0x3333333333333333) << 2;
    v = v >> 4 & 0x0F0F0F0F0F0F0F0F | (v & 0x0F0F0F0F0F0F0F0F) << 4;
    v = v >> 8 & 0x00FF00FF00FF00FF | (v & 0x00FF00FF00FF00FF) << 8;
    v = v >> 16 & 0x0000FFFF0000FFFF | (v & 0x0000FFFF0000FFFF) << 16;
    v = v >> 32 | v << 32;

    return v;
}
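/* Multiplies r by x^n modulo a 65-bit polynomial whose low 64 bits are p (the
   x^64 coefficient is implicit): each step shifts r up one bit and reduces by
   p whenever a set bit falls off the top. Powers of x such as the folding
   constants k1 and k2 below can be derived with it. */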
static uint64_t expMod65(uint32_t n, uint64_t p, uint64_t r)
{
    return n == 0 ? r : expMod65(n - 1, p, r << 1 ^ (p & ((int64_t)r >> 63)));
}
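/* Long division of x^128 by the 65-bit CRC polynomial, returning the low
   64 bits of the quotient (its x^64 bit is an implicit 1). The quotient is
   the Barrett constant mu used in the final reduction. */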
static uint64_t div129by65(uint64_t poly)
{
    uint64_t q = 0;
    uint64_t h = poly;

    for(uint32_t i = 0; i < 64; ++i)
    {
        q |= (h & 1ull << 63) >> i;
        h = h << 1 ^ (poly & ((int64_t)h >> 63));
    }

    return q;
}
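/* 32-byte sliding window for _mm_shuffle_epi8: loading 16 bytes at offset
   (16 - n) yields a byte-shift mask; entries with the high bit set make
   pshufb emit zero. shiftRight128() uses the mask and its complement to
   obtain both halves of a shifted vector. */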
static const uint8_t shuffleMasks[] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
};
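/* Places `in` at byte offset n (0 <= n <= 16) of a 32-byte window: *out_left
   receives the low half (in moved up by n bytes) and *out_right the n bytes
   that spill over into the following block. */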
CLMUL
static void shiftRight128(__m128i in, size_t n, __m128i *out_left, __m128i *out_right)
{
    const __m128i mask_a = _mm_loadu_si128((const __m128i *)(shuffleMasks + (16 - n)));
    const __m128i mask_b = _mm_xor_si128(mask_a, _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()));

    *out_left  = _mm_shuffle_epi8(in, mask_b);
    *out_right = _mm_shuffle_epi8(in, mask_a);
}
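/* One folding step: carry-less multiplies the low 64 bits of `in` by the low
   half of the constants (selector 0x00) and the high 64 bits by the high half
   (0x11), XORing the two products. With k1/k2 in the constants this folds a
   128-bit remainder across the next 128 bits of data. */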
CLMUL
static __m128i fold(__m128i in, __m128i fold_constants)
{
    return _mm_xor_si128(_mm_clmulepi64_si128(in, fold_constants, 0x00),
                         _mm_clmulepi64_si128(in, fold_constants, 0x11));
}
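/* CRC-64 via PCLMULQDQ folding: the buffer is widened to whole 16-byte
   aligned blocks (lead-in and lead-out bytes are masked to zero), folded down
   to a single 128-bit remainder with k1/k2, then reduced to 64 bits by
   Barrett reduction with mu and p. Requires SSSE3 (_mm_shuffle_epi8), SSE4.1
   (_mm_blendv_epi8) and PCLMULQDQ. */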
AARU_EXPORT CLMUL uint64_t AARU_CALL aaruf_crc64_clmul(const uint64_t crc, const uint8_t *data, long length)
{
    TRACE("Entering aaruf_crc64_clmul(%" PRIu64 ", %p, %ld)", crc, data, length);
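    /* Bit-reflected constants for this CRC-64 polynomial: k1/k2 fold one
       128-bit block, mu is the Barrett quotient constant (see div129by65)
       and p encodes the polynomial itself. */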
    const uint64_t k1 = 0xe05dd497ca393ae4;
    const uint64_t k2 = 0xdabe95afc7875f40;
    const uint64_t mu = 0x9c3e466c172963d5;
    const uint64_t p  = 0x92d8af2baf0e1e85;

    const __m128i fold_constants_1 = _mm_set_epi64x(k2, k1);
    const __m128i fold_constants_2 = _mm_set_epi64x(p, mu);
    const uint8_t *end = data + length;

    /* 16-byte-aligned window covering the whole buffer. */
    const __m128i *aligned_data = (const __m128i *)((uintptr_t)data & ~(uintptr_t)15);
    const __m128i *aligned_end  = (const __m128i *)(((uintptr_t)end + 15) & ~(uintptr_t)15);

    /* Bytes before `data` in the first block and after `end` in the last. */
    const size_t lead_in_size  = data - (const uint8_t *)aligned_data;
    const size_t lead_out_size = (const uint8_t *)aligned_end - end;

    /* Number of 16-byte blocks in the window. */
    const size_t aligned_length = aligned_end - aligned_data;
    /* First block with the lead-in bytes masked to zero. */
    const __m128i lead_in_mask = _mm_loadu_si128((const __m128i *)(shuffleMasks + (16 - lead_in_size)));
    const __m128i data0        = _mm_blendv_epi8(_mm_setzero_si128(), _mm_load_si128(aligned_data), lead_in_mask);

    /* The running CRC is kept inverted, matching the ~ on return. The guard is
       reconstructed: _mm_cvtsi64x_si128 only exists on 64-bit targets. */
#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64)
    const __m128i initial_crc = _mm_cvtsi64x_si128(~crc);
#else
    const __m128i initial_crc = _mm_set_epi64x(0, ~crc);
#endif
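    /* Three paths, chosen by how many aligned blocks the window spans:
       exactly one, exactly two, or three and more (the folding loop). */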
    __m128i r_reg;

    if(aligned_length == 1)
    {
        /* Whole buffer inside a single block. */
        __m128i crc0, crc1;
        shiftRight128(initial_crc, 16 - length, &crc0, &crc1);

        __m128i a_reg, b_reg;
        shiftRight128(data0, lead_out_size, &a_reg, &b_reg);

        const __m128i p_reg = _mm_xor_si128(a_reg, crc0);
        r_reg = _mm_xor_si128(_mm_clmulepi64_si128(p_reg, fold_constants_1, 0x10),
                              _mm_xor_si128(_mm_srli_si128(p_reg, 8), _mm_slli_si128(crc1, 8)));
    }
    else if(aligned_length == 2)
    {
        const __m128i data1 = _mm_load_si128(aligned_data + 1);

        /* Branch condition reconstructed: short data straddling two blocks. */
        if(length <= 16)
        {
            __m128i crc0, crc1;
            shiftRight128(initial_crc, 16 - length, &crc0, &crc1);

            __m128i a_reg, b_reg, c_reg, d_reg;
            shiftRight128(data0, lead_out_size, &a_reg, &b_reg);
            shiftRight128(data1, lead_out_size, &c_reg, &d_reg);

            const __m128i p_reg = _mm_xor_si128(_mm_xor_si128(b_reg, c_reg), crc0);
            r_reg = _mm_xor_si128(_mm_clmulepi64_si128(p_reg, fold_constants_1, 0x10),
                                  _mm_xor_si128(_mm_srli_si128(p_reg, 8), _mm_slli_si128(crc1, 8)));
        }
        else
        {
            __m128i crc0, crc1;
            shiftRight128(initial_crc, lead_in_size, &crc0, &crc1);

            __m128i a_reg, b_reg, c_reg, d_reg;
            shiftRight128(_mm_xor_si128(data0, crc0), lead_out_size, &a_reg, &b_reg);
            shiftRight128(_mm_xor_si128(data1, crc1), lead_out_size, &c_reg, &d_reg);

            const __m128i p_reg = _mm_xor_si128(fold(a_reg, fold_constants_1), _mm_xor_si128(b_reg, c_reg));
            r_reg = _mm_xor_si128(_mm_clmulepi64_si128(p_reg, fold_constants_1, 0x10), _mm_srli_si128(p_reg, 8));
        }
    }
    else
    {
        /* Three or more blocks: fold one 16-byte block at a time. */
        length -= 16 - lead_in_size;

        __m128i crc0, crc1;
        shiftRight128(initial_crc, lead_in_size, &crc0, &crc1);

        __m128i accumulator = _mm_xor_si128(fold(_mm_xor_si128(crc0, data0), fold_constants_1), crc1);

        /* Loop structure reconstructed: fold while at least two blocks remain. */
        while(length >= 32)
        {
            ++aligned_data;
            length -= 16;
            accumulator = fold(_mm_xor_si128(_mm_load_si128(aligned_data), accumulator), fold_constants_1);
        }

        ++aligned_data;
        __m128i p_reg;

        if(length == 16) { p_reg = _mm_xor_si128(accumulator, _mm_load_si128(aligned_data)); }
        else
        {
            const __m128i end0 = _mm_xor_si128(accumulator, _mm_load_si128(aligned_data));
            const __m128i end1 = _mm_load_si128(aligned_data + 1);

            __m128i a_reg, b_reg, c_reg, d_reg;
            shiftRight128(end0, lead_out_size, &a_reg, &b_reg);
            shiftRight128(end1, lead_out_size, &c_reg, &d_reg);

            p_reg = _mm_xor_si128(fold(a_reg, fold_constants_1), _mm_xor_si128(b_reg, c_reg));
        }

        r_reg = _mm_xor_si128(_mm_clmulepi64_si128(p_reg, fold_constants_1, 0x10), _mm_srli_si128(p_reg, 8));
    }
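    /* Barrett reduction of the 128-bit remainder r_reg to 64 bits: t1_reg
       estimates the quotient via mu, t2_reg multiplies it back by p; the
       final CRC lands in the upper 64 bits of t2_reg. */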
    const __m128i t1_reg = _mm_clmulepi64_si128(r_reg, fold_constants_2, 0x00);
    const __m128i t2_reg =
        _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(t1_reg, fold_constants_2, 0x10), _mm_slli_si128(t1_reg, 8)), r_reg);
    TRACE("Exiting aaruf_crc64_clmul()");

    /* Guard reconstructed: _mm_extract_epi64 is unavailable on 32-bit targets,
       where the result is assembled from two 32-bit extracts instead. */
#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64)
    return ~_mm_extract_epi64(t2_reg, 1);
#else
    return ~((uint64_t)(uint32_t)_mm_extract_epi32(t2_reg, 3) << 32 | (uint64_t)(uint32_t)_mm_extract_epi32(t2_reg, 2));
#endif
}

#endif
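/* Usage sketch (hypothetical caller; the zero seed and chaining convention are
 * inferred from the ~crc handling above, not stated in this file):
 *
 *     uint64_t crc = 0;
 *     crc = aaruf_crc64_clmul(crc, buffer, (long)buffer_len);
 *
 * Seed and result inversion happen inside the function, so the return value of
 * one call can be passed directly as `crc` for the next chunk. */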