#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)

#if !defined(__MINGW32__) && (!defined(__ANDROID__) || !defined(__arm__))
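/*
 * Fast path: when the Arm Crypto extension is available, a single PMULL
 * (vmull_p64) performs the full 64x64 -> 128-bit carry-less multiply.
 * This helper is compiled with the crypto target attribute and is intended
 * to be called only after a successful runtime have_arm_crypto() check.
 */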
TARGET_WITH_CRYPTO
static uint64x2_t sse2neon_vmull_p64_crypto(uint64x1_t _a, uint64x1_t _b)
{
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
    return vreinterpretq_u64_p128(vmull_p64(a, b));
}
#endif
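
/*
 * Generic entry point: dispatches to the PMULL fast path at runtime when
 * possible, otherwise falls back to a portable NEON implementation built
 * from 8x8 -> 16-bit polynomial multiplies (vmull_p8).
 */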
TARGET_WITH_SIMD uint64x2_t sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
#if !defined(__MINGW32__) && (!defined(__ANDROID__) || !defined(__arm__))
    if (have_arm_crypto())
        return sse2neon_vmull_p64_crypto(_a, _b);
#endif
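
    /*
     * ARMv7 / no-crypto fallback. The 64x64 -> 128-bit carry-less product is
     * assembled from 8x8 -> 16-bit polynomial multiplies: each vmull_p8 below
     * produces one diagonal of byte-wise partial products, which are then
     * XOR-combined and shifted into place. This follows the widely used
     * sse2neon polyfill for vmull_p64.
     */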
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);
    /* Masks used to fold the high parts of the partial-product pairs. */
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));
    /* Byte-wise polynomial multiplies; vext rotates one operand so that each
     * product covers a different diagonal of the schoolbook multiplication. */
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));                 /* D = A * B        */
    uint8x16_t e = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  /* E = A * rot(B,1) */
    uint8x16_t f = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  /* F = rot(A,1) * B */
    uint8x16_t g = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  /* G = A * rot(B,2) */
    uint8x16_t h = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  /* H = rot(A,2) * B */
    uint8x16_t i = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  /* I = A * rot(B,3) */
    uint8x16_t j = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  /* J = rot(A,3) * B */
    uint8x16_t k = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  /* K = A * rot(B,4) */
    /* XOR the pairs of diagonals that land at the same byte offset. */
    uint8x16_t l = veorq_u8(e, f);
    uint8x16_t m = veorq_u8(g, h);
    uint8x16_t n = veorq_u8(i, j);
    /* Interleave the 64-bit halves of the combined products. */
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif

    /* Fold L and M into the t0/t1 partial sums. */
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
    /* Fold N and K into the t2/t3 partial sums. */
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
    /* De-interleave back into the individual t0..t3 terms. */
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif

    /* Shift each term to its byte position within the 128-bit result. */
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15); /* t0 << 8  */
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14); /* t1 << 16 */
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13); /* t2 << 24 */
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12); /* t3 << 32 */
    /* XOR all shifted terms with the base product to form the final result. */
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
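
/*
 * Emulation of the SSSE3 PSHUFB (_mm_shuffle_epi8) semantics: each output
 * byte is tbl[idx & 0x0F], or zero when bit 7 of the index byte is set.
 */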
TARGET_WITH_SIMD uint64x2_t mm_shuffle_epi8(uint64x2_t a, uint64x2_t b)
{
    uint8x16_t tbl = vreinterpretq_u8_u64(a);
    uint8x16_t idx = vreinterpretq_u8_u64(b);
    /* Keep only the zeroing flag (bit 7) and the 4 index bits. */
    uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F));
#if defined(__aarch64__)
    /* vqtbl1q_u8 returns zero for out-of-range indices, matching PSHUFB. */
    return vreinterpretq_u64_u8(vqtbl1q_u8(tbl, idx_masked));
#else
    uint8x8x2_t a_split = {vget_low_u8(tbl), vget_high_u8(tbl)};
    return vreinterpretq_u64_u8(
        vcombine_u8(vtbl2_u8(a_split, vget_low_u8(idx_masked)),
                    vtbl2_u8(a_split, vget_high_u8(idx_masked))));
#endif
}
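
/*
 * Emulation of _mm_srli_si128: shift the 128-bit value right by imm bytes,
 * shifting in zeros. Implemented by reading at a byte offset from a
 * zero-padded 32-byte buffer; imm must be in [0, 16].
 */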
TARGET_WITH_SIMD uint64x2_t mm_srli_si128(uint64x2_t a, int imm)
{
    uint8x16_t tmp[2] = {vreinterpretq_u8_u64(a), vdupq_n_u8(0)};
    return vreinterpretq_u64_u8(vld1q_u8(((uint8_t const *) tmp) + imm));
}
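
/*
 * Emulation of _mm_slli_si128: shift the 128-bit value left by imm bytes,
 * shifting in zeros. Reads at offset (16 - imm) from a zero-prefixed
 * 32-byte buffer; imm must be in [0, 16].
 */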
TARGET_WITH_SIMD uint64x2_t mm_slli_si128(uint64x2_t a, int imm)
{
    uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_u64(a)};
    return vreinterpretq_u64_u8(vld1q_u8(((uint8_t const *) tmp) + (16 - imm)));
}
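
/*
 * Illustrative sketch (not part of the original code): how sse2neon_vmull_p64
 * can be combined with lane selection to emulate _mm_clmulepi64_si128, in the
 * style of sse2neon. imm8 bit 0 selects the 64-bit lane of a, bit 4 the lane
 * of b. The name mm_clmulepi64_si128 is assumed here for illustration only.
 */
TARGET_WITH_SIMD static uint64x2_t mm_clmulepi64_si128(uint64x2_t a,
                                                       uint64x2_t b,
                                                       const int imm8)
{
    uint64x1_t a_lane = (imm8 & 0x01) ? vget_high_u64(a) : vget_low_u64(a);
    uint64x1_t b_lane = (imm8 & 0x10) ? vget_high_u64(b) : vget_low_u64(b);
    return sse2neon_vmull_p64(a_lane, b_lane);
}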