mirror of
https://github.com/aaru-dps/Aaru.Checksums.Native.git
synced 2025-12-16 19:24:29 +00:00
Refactor and reformat.
This commit is contained in:
155
crc32_clmul.c
155
crc32_clmul.c
@@ -23,7 +23,7 @@
|
||||
* 3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
|
||||
#if defined(__x86_64__) || defined(__amd64) || defined(_M_AMD64) || defined(_M_X64) || defined(__I386__) || \
|
||||
defined(__i386__) || defined(__THW_INTEL) || defined(_M_IX86)
|
||||
|
||||
#include <inttypes.h>
|
||||
@@ -34,7 +34,7 @@
|
||||
#include "crc32.h"
|
||||
#include "crc32_simd.h"
|
||||
|
||||
CLMUL static void fold_1(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2, __m128i* xmm_crc3)
|
||||
TARGET_WITH_CLMUL static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3)
|
||||
{
|
||||
const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
|
||||
|
||||
@@ -46,9 +46,9 @@ CLMUL static void fold_1(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2
|
||||
*xmm_crc3 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
|
||||
|
||||
*xmm_crc0 = *xmm_crc1;
|
||||
*xmm_crc1 = *xmm_crc2;
|
||||
@@ -56,7 +56,7 @@ CLMUL static void fold_1(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res);
|
||||
}
|
||||
|
||||
CLMUL static void fold_2(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2, __m128i* xmm_crc3)
|
||||
TARGET_WITH_CLMUL static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3)
|
||||
{
|
||||
const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
|
||||
|
||||
@@ -69,16 +69,16 @@ CLMUL static void fold_2(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2
|
||||
*xmm_crc3 = *xmm_crc1;
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
|
||||
|
||||
*xmm_crc2 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
|
||||
|
||||
*xmm_crc0 = x_tmp2;
|
||||
*xmm_crc1 = x_tmp3;
|
||||
@@ -86,7 +86,7 @@ CLMUL static void fold_2(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res31);
|
||||
}
|
||||
|
||||
CLMUL static void fold_3(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2, __m128i* xmm_crc3)
|
||||
TARGET_WITH_CLMUL static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3)
|
||||
{
|
||||
const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
|
||||
|
||||
@@ -98,23 +98,23 @@ CLMUL static void fold_3(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2
|
||||
*xmm_crc3 = *xmm_crc2;
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
|
||||
|
||||
*xmm_crc2 = *xmm_crc1;
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
|
||||
|
||||
*xmm_crc1 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
|
||||
|
||||
*xmm_crc0 = x_tmp3;
|
||||
*xmm_crc1 = _mm_castps_si128(ps_res10);
|
||||
@@ -122,7 +122,7 @@ CLMUL static void fold_3(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res32);
|
||||
}
|
||||
|
||||
CLMUL static void fold_4(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2, __m128i* xmm_crc3)
|
||||
TARGET_WITH_CLMUL static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3)
|
||||
{
|
||||
const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
|
||||
|
||||
@@ -137,28 +137,28 @@ CLMUL static void fold_4(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_t0 = _mm_castsi128_ps(x_tmp0);
|
||||
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
|
||||
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_t0 = _mm_castsi128_ps(x_tmp0);
|
||||
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
|
||||
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_t1 = _mm_castsi128_ps(x_tmp1);
|
||||
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
|
||||
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_t1 = _mm_castsi128_ps(x_tmp1);
|
||||
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
|
||||
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
|
||||
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_t2 = _mm_castsi128_ps(x_tmp2);
|
||||
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
|
||||
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_t2 = _mm_castsi128_ps(x_tmp2);
|
||||
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
|
||||
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
|
||||
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_t3 = _mm_castsi128_ps(x_tmp3);
|
||||
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
|
||||
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_t3 = _mm_castsi128_ps(x_tmp3);
|
||||
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
|
||||
|
||||
*xmm_crc0 = _mm_castps_si128(ps_res0);
|
||||
*xmm_crc1 = _mm_castps_si128(ps_res1);
|
||||
@@ -166,12 +166,12 @@ CLMUL static void fold_4(__m128i* xmm_crc0, __m128i* xmm_crc1, __m128i* xmm_crc2
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res3);
|
||||
}
|
||||
|
||||
CLMUL static void partial_fold(const size_t len,
|
||||
__m128i* xmm_crc0,
|
||||
__m128i* xmm_crc1,
|
||||
__m128i* xmm_crc2,
|
||||
__m128i* xmm_crc3,
|
||||
__m128i* xmm_crc_part)
|
||||
TARGET_WITH_CLMUL static void partial_fold(const size_t len,
|
||||
__m128i *xmm_crc0,
|
||||
__m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2,
|
||||
__m128i *xmm_crc3,
|
||||
__m128i *xmm_crc_part)
|
||||
{
|
||||
const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
|
||||
const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
|
||||
@@ -180,22 +180,22 @@ CLMUL static void partial_fold(const size_t len,
|
||||
__m128i xmm_a0_0, xmm_a0_1;
|
||||
__m128 ps_crc3, psa0_0, psa0_1, ps_res;
|
||||
|
||||
xmm_shl = _mm_load_si128((__m128i*)pshufb_shf_table + (len - 1));
|
||||
xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
|
||||
xmm_shr = xmm_shl;
|
||||
xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
|
||||
|
||||
xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
|
||||
|
||||
*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
|
||||
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
|
||||
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
|
||||
*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
|
||||
|
||||
*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
|
||||
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
|
||||
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
|
||||
*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
|
||||
|
||||
*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
|
||||
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
|
||||
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
|
||||
*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
|
||||
|
||||
*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
|
||||
@@ -224,7 +224,7 @@ CLMUL static void partial_fold(const size_t len,
|
||||
#define XOR_INITIAL(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
|
||||
|
||||
/**
|
||||
* @brief Calculate the CRC32 checksum using CLMUL instruction extension.
|
||||
* @brief Calculate the CRC32 checksum using the CLMUL instruction extension.
|
||||
*
|
||||
* @param previous_crc The previously calculated CRC32 checksum.
|
||||
* @param data Pointer to the input data buffer.
|
||||
@@ -232,7 +232,7 @@ CLMUL static void partial_fold(const size_t len,
|
||||
*
|
||||
* @return The calculated CRC32 checksum.
|
||||
*/
|
||||
AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const uint8_t* data, long len)
|
||||
AARU_EXPORT TARGET_WITH_CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const uint8_t *data, long len)
|
||||
{
|
||||
unsigned long algn_diff;
|
||||
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
|
||||
@@ -246,8 +246,8 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const ui
|
||||
int first = 1;
|
||||
|
||||
/* fold 512 to 32 step variable declarations for ISO-C90 compat. */
|
||||
const __m128i xmm_mask = _mm_load_si128((__m128i*)crc_mask);
|
||||
const __m128i xmm_mask2 = _mm_load_si128((__m128i*)crc_mask2);
|
||||
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
|
||||
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
|
||||
|
||||
uint32_t crc;
|
||||
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
|
||||
@@ -263,13 +263,16 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const ui
|
||||
uint32_t crc = ~previous_crc;
|
||||
switch(len)
|
||||
{
|
||||
case 3: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
|
||||
case 2: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
|
||||
case 1: crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
|
||||
case 3:
|
||||
crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
|
||||
case 2:
|
||||
crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
|
||||
case 1:
|
||||
crc = (crc >> 8) ^ crc32_table[0][(crc & 0xFF) ^ *data++];
|
||||
}
|
||||
return ~crc;
|
||||
}
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i*)data);
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i *)data);
|
||||
XOR_INITIAL(xmm_crc_part);
|
||||
goto partial;
|
||||
}
|
||||
@@ -278,7 +281,7 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const ui
|
||||
algn_diff = (0 - (uintptr_t)data) & 0xF;
|
||||
if(algn_diff)
|
||||
{
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i*)data);
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i *)data);
|
||||
XOR_INITIAL(xmm_crc_part);
|
||||
|
||||
data += algn_diff;
|
||||
@@ -289,10 +292,10 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const ui
|
||||
|
||||
while((len -= 64) >= 0)
|
||||
{
|
||||
xmm_t0 = _mm_load_si128((__m128i*)data);
|
||||
xmm_t1 = _mm_load_si128((__m128i*)data + 1);
|
||||
xmm_t2 = _mm_load_si128((__m128i*)data + 2);
|
||||
xmm_t3 = _mm_load_si128((__m128i*)data + 3);
|
||||
xmm_t0 = _mm_load_si128((__m128i *)data);
|
||||
xmm_t1 = _mm_load_si128((__m128i *)data + 1);
|
||||
xmm_t2 = _mm_load_si128((__m128i *)data + 2);
|
||||
xmm_t3 = _mm_load_si128((__m128i *)data + 3);
|
||||
|
||||
XOR_INITIAL(xmm_t0);
|
||||
|
||||
@@ -313,9 +316,9 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const ui
|
||||
{
|
||||
len += 16;
|
||||
|
||||
xmm_t0 = _mm_load_si128((__m128i*)data);
|
||||
xmm_t1 = _mm_load_si128((__m128i*)data + 1);
|
||||
xmm_t2 = _mm_load_si128((__m128i*)data + 2);
|
||||
xmm_t0 = _mm_load_si128((__m128i *)data);
|
||||
xmm_t1 = _mm_load_si128((__m128i *)data + 1);
|
||||
xmm_t2 = _mm_load_si128((__m128i *)data + 2);
|
||||
|
||||
XOR_INITIAL(xmm_t0);
|
||||
|
||||
@@ -327,14 +330,14 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const ui
|
||||
|
||||
if(len == 0) goto done;
|
||||
|
||||
xmm_crc_part = _mm_load_si128((__m128i*)data + 3);
|
||||
xmm_crc_part = _mm_load_si128((__m128i *)data + 3);
|
||||
}
|
||||
else if(len + 32 >= 0)
|
||||
{
|
||||
len += 32;
|
||||
|
||||
xmm_t0 = _mm_load_si128((__m128i*)data);
|
||||
xmm_t1 = _mm_load_si128((__m128i*)data + 1);
|
||||
xmm_t0 = _mm_load_si128((__m128i *)data);
|
||||
xmm_t1 = _mm_load_si128((__m128i *)data + 1);
|
||||
|
||||
XOR_INITIAL(xmm_t0);
|
||||
|
||||
@@ -345,13 +348,13 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const ui
|
||||
|
||||
if(len == 0) goto done;
|
||||
|
||||
xmm_crc_part = _mm_load_si128((__m128i*)data + 2);
|
||||
xmm_crc_part = _mm_load_si128((__m128i *)data + 2);
|
||||
}
|
||||
else if(len + 48 >= 0)
|
||||
{
|
||||
len += 48;
|
||||
|
||||
xmm_t0 = _mm_load_si128((__m128i*)data);
|
||||
xmm_t0 = _mm_load_si128((__m128i *)data);
|
||||
|
||||
XOR_INITIAL(xmm_t0);
|
||||
|
||||
@@ -361,13 +364,13 @@ AARU_EXPORT CLMUL uint32_t AARU_CALL crc32_clmul(uint32_t previous_crc, const ui
|
||||
|
||||
if(len == 0) goto done;
|
||||
|
||||
xmm_crc_part = _mm_load_si128((__m128i*)data + 1);
|
||||
xmm_crc_part = _mm_load_si128((__m128i *)data + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
len += 64;
|
||||
if(len == 0) goto done;
|
||||
xmm_crc_part = _mm_load_si128((__m128i*)data);
|
||||
xmm_crc_part = _mm_load_si128((__m128i *)data);
|
||||
XOR_INITIAL(xmm_crc_part);
|
||||
}
|
||||
|
||||
@@ -382,7 +385,7 @@ done:
|
||||
/*
|
||||
* k1
|
||||
*/
|
||||
crc_fold = _mm_load_si128((__m128i*)crc_k);
|
||||
crc_fold = _mm_load_si128((__m128i *)crc_k);
|
||||
|
||||
x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
|
||||
xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
|
||||
@@ -402,7 +405,7 @@ done:
|
||||
/*
|
||||
* k5
|
||||
*/
|
||||
crc_fold = _mm_load_si128((__m128i*)crc_k + 1);
|
||||
crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
|
||||
|
||||
xmm_crc0 = xmm_crc3;
|
||||
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
|
||||
@@ -420,7 +423,7 @@ done:
|
||||
*/
|
||||
xmm_crc1 = xmm_crc3;
|
||||
xmm_crc2 = xmm_crc3;
|
||||
crc_fold = _mm_load_si128((__m128i*)crc_k + 2);
|
||||
crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
|
||||
|
||||
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
|
||||
@@ -434,7 +437,7 @@ done:
|
||||
/*
|
||||
* could just as well write xmm_crc3[2], doing a movaps and truncating, but
|
||||
* no real advantage - it's a tiny bit slower per call, while no additional CPUs
|
||||
* would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
|
||||
* would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
|
||||
*/
|
||||
crc = _mm_extract_epi32(xmm_crc3, 2);
|
||||
return ~crc;
|
||||
|
||||
Reference in New Issue
Block a user