Fix compilation on ARM and ARM64 using MSVC.

This commit is contained in:
2021-10-13 00:41:58 +01:00
parent 6c10f3e58d
commit 8d77838be2
9 changed files with 84 additions and 14 deletions

View File

@@ -1,9 +1,21 @@
cmake_minimum_required(VERSION 3.15)

# On macOS, default to a universal (arm64 + x86_64) build, but only when the
# user has not already chosen architectures on the command line.
if(APPLE)
    if("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "")
        set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "Build architectures for Mac OS X" FORCE)
    endif()
endif()

project("Aaru.Checksums.Native" C)

# MSVC targeting ARMv7 is built as C11; everything else stays at C90 for
# maximum portability. NOTE(review): presumably the MSVC/ARMv7 NEON paths need
# post-C90 features (e.g. the designated initializers used in the NEON code) —
# confirm against the ARM sources before changing.
if("${CMAKE_C_COMPILER_ID}" MATCHES "MSVC" AND "${CMAKE_C_COMPILER_ARCHITECTURE_ID}" MATCHES "ARMV7")
    set(CMAKE_C_STANDARD 11)
else()
    set(CMAKE_C_STANDARD 90)
endif()

# Configure-time diagnostics for the toolchain actually selected.
message(STATUS "Detected system processor: ${CMAKE_SYSTEM_PROCESSOR}")
message(STATUS "Detected vs platform name: ${CMAKE_C_COMPILER_ARCHITECTURE_ID}")
message(STATUS "Detected compiler: ${CMAKE_C_COMPILER_ID}")
message(STATUS "Detected build type: ${CMAKE_BUILD_TYPE}")

View File

@@ -48,7 +48,7 @@ AARU_EXPORT adler32_ctx* AARU_CALL adler32_init()
AARU_EXPORT int AARU_CALL adler32_update(adler32_ctx* ctx, const uint8_t* data, uint32_t len)
{
if(!ctx || !data) return -1;
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
#if defined(__aarch64__) || defined(_M_ARM64) || ((defined(__arm__) || defined(_M_ARM)) && !defined(_WIN32))
if(have_neon())
{
adler32_neon(&ctx->sum1, &ctx->sum2, data, len);

View File

@@ -2,7 +2,7 @@
// Created by claunia on 28/9/21.
//
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
#if defined(__aarch64__) || defined(_M_ARM64) || ((defined(__arm__) || defined(_M_ARM)) && !defined(_WIN32))
#include <arm_neon.h>
@@ -45,8 +45,13 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
* Process n blocks of data. At most NMAX data bytes can be
* processed before s2 must be reduced modulo ADLER_MODULE.
*/
#ifdef _WIN32
uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}};
uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}};
#else
uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n};
uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0};
#endif
uint16x8_t v_column_sum_1 = vdupq_n_u16(0);
uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
@@ -78,6 +83,16 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
/*
* Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
*/
#ifdef _WIN32
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]) {32, 31, 30, 29}));
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){28, 27, 26, 25}));
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]) {24, 23, 22, 21}));
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]) {20, 19, 18, 17}));
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]) {16, 15, 14, 13}));
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]) {12, 11, 10, 9}));
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]) {8, 7, 6, 5}));
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]) {4, 3, 2, 1}));
#else
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), (uint16x4_t){32, 31, 30, 29});
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), (uint16x4_t){28, 27, 26, 25});
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), (uint16x4_t){24, 23, 22, 21});
@@ -86,6 +101,7 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), (uint16x4_t){12, 11, 10, 9});
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), (uint16x4_t){8, 7, 6, 5});
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), (uint16x4_t){4, 3, 2, 1});
#endif
/*
* Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
*/

View File

@@ -10,17 +10,21 @@
#include "arm_vmull.h"
#include "simd.h"
#if !defined(_WIN32)
/*
 * Carry-less (polynomial) 64x64 -> 128 bit multiply using the ARMv8 Crypto
 * extension's vmull_p64 instruction.
 *
 * Reinterprets both u64 lanes as poly64 and multiplies them; the 128-bit
 * product is returned reinterpreted as a uint64x2_t.
 *
 * Excluded on _WIN32: presumably MSVC's <arm_neon.h> lacks the poly64/p128
 * types or the TARGET_WITH_CRYPTO attribute used here — confirm against the
 * MSVC headers. Callers must check have_arm_crypto() before taking this path.
 */
TARGET_WITH_CRYPTO static uint64x2_t sse2neon_vmull_p64_crypto(uint64x1_t _a, uint64x1_t _b)
{
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
    return vreinterpretq_u64_p128(vmull_p64(a, b));
}
#endif
TARGET_WITH_SIMD uint64x2_t sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
#if !defined(_WIN32)
// Wraps vmull_p64
if(have_arm_crypto()) return sse2neon_vmull_p64_crypto(_a, _b);
#endif
// ARMv7 polyfill
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.

View File

@@ -5,7 +5,6 @@
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
#include <arm_neon.h>
#include <glob.h>
#include <stdint.h>
#include "library.h"

View File

@@ -5,7 +5,6 @@
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
#include <arm_neon.h>
#include <glob.h>
#include <stdint.h>
#include "library.h"

54
simd.c
View File

@@ -92,8 +92,13 @@ int have_avx2(void)
#endif
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
#if defined(_WIN32)
#include <windows.h>
#include <processthreadsapi.h>
#else
#include <sys/auxv.h>
#endif
#endif
#if defined(__aarch64__) || defined(_M_ARM64)
int have_neon(void)
@@ -101,15 +106,50 @@ int have_neon(void)
return 1; // ARMv8-A made it mandatory
}
int have_arm_crc32(void) { return getauxval(AT_HWCAP) & HWCAP_CRC32; }
/*
 * Reports whether the CPU exposes the ARMv8 CRC32 instructions (AArch64
 * build). Non-zero when available, zero otherwise.
 */
int have_arm_crc32(void)
{
#ifndef _WIN32
    /* Linux: query the kernel's hardware-capability vector. */
    return getauxval(AT_HWCAP) & HWCAP_CRC32;
#else
    /* Windows on ARM64: ask the OS directly. */
    return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != 0;
#endif
}
int have_arm_crypto(void) { return getauxval(AT_HWCAP) & HWCAP_AES; }
/*
 * Reports whether the CPU exposes the ARMv8 Crypto extension (AES/PMULL;
 * AArch64 build). Non-zero when available, zero otherwise.
 */
int have_arm_crypto(void)
{
#ifndef _WIN32
    /* Linux: query the kernel's hardware-capability vector. */
    return getauxval(AT_HWCAP) & HWCAP_AES;
#else
    /* Windows on ARM64: ask the OS directly. */
    return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0;
#endif
}
#endif
#if defined(__arm__) || defined(_M_ARM)
int have_neon(void) { return getauxval(AT_HWCAP) & HWCAP_NEON; }
int have_arm_crc32(void) { return getauxval(AT_HWCAP2) & HWCAP2_CRC32; }
int have_arm_crypto(void) { return getauxval(AT_HWCAP2) & HWCAP2_AES; }
/*
 * Reports whether the CPU supports NEON (Advanced SIMD). 32-bit ARM build:
 * unlike ARMv8-A, NEON is optional on ARMv7, so it must be probed at runtime.
 * Non-zero when available, zero otherwise.
 */
int have_neon(void)
{
#if defined(_WIN32)
    /* Use the dedicated NEON probe. PF_ARM_VFP_32_REGISTERS_AVAILABLE (used
     * previously) only attests a 32-register VFP bank, which does not
     * guarantee the NEON instruction set is present. */
    return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0;
#else
    /* Linux: query the kernel's hardware-capability vector. */
    return getauxval(AT_HWCAP) & HWCAP_NEON;
#endif
}
/*
 * Reports whether the CPU exposes the ARMv8 CRC32 instructions (32-bit ARM
 * build). Non-zero when available, zero otherwise.
 */
int have_arm_crc32(void)
{
#ifndef _WIN32
    /* Linux 32-bit ARM reports ARMv8-derived features via AT_HWCAP2. */
    return getauxval(AT_HWCAP2) & HWCAP2_CRC32;
#else
    /* Windows on ARM: ask the OS directly. */
    return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != 0;
#endif
}
/*
 * Reports whether the CPU exposes the ARMv8 Crypto extension (AES/PMULL;
 * 32-bit ARM build). Non-zero when available, zero otherwise.
 */
int have_arm_crypto(void)
{
#ifndef _WIN32
    /* Linux 32-bit ARM reports ARMv8-derived features via AT_HWCAP2. */
    return getauxval(AT_HWCAP2) & HWCAP2_AES;
#else
    /* Windows on ARM: ask the OS directly. */
    return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0;
#endif
}
#endif

4
simd.h
View File

@@ -22,13 +22,13 @@ AARU_EXPORT int have_ssse3(void);
AARU_EXPORT int have_avx2(void);
#endif
#if defined(__arm__) || defined(_M_ARM)
/*
 * getauxval() capability bits for 32-bit ARM Linux. Defined here so the build
 * does not depend on <asm/hwcap.h>; only needed on Linux — Windows uses
 * IsProcessorFeaturePresent() instead, hence the !_WIN32 guard.
 * NOTE(review): values should mirror the kernel's uapi hwcap definitions —
 * verify against <asm/hwcap.h> when bumping.
 */
#if(defined(__arm__) || defined(_M_ARM)) && !defined(_WIN32)
#define HWCAP_NEON (1 << 12)   /* AT_HWCAP: NEON (Advanced SIMD) */
#define HWCAP2_AES (1 << 0)    /* AT_HWCAP2: ARMv8 AES instructions */
#define HWCAP2_CRC32 (1 << 4)  /* AT_HWCAP2: ARMv8 CRC32 instructions */
#endif
#if defined(__aarch64__) || defined(_M_ARM64)
/*
 * getauxval(AT_HWCAP) capability bits for AArch64 Linux. Defined here so the
 * build does not depend on <asm/hwcap.h>; only needed on Linux — Windows uses
 * IsProcessorFeaturePresent() instead, hence the !_WIN32 guard.
 * NOTE(review): values should mirror the kernel's uapi hwcap definitions —
 * verify against <asm/hwcap.h> when bumping.
 */
#if(defined(__aarch64__) || defined(_M_ARM64)) && !defined(_WIN32)
#define HWCAP_NEON (1 << 1)   /* Advanced SIMD (ASIMD) */
#define HWCAP_AES (1 << 3)    /* ARMv8 AES instructions */
#define HWCAP_CRC32 (1 << 7)  /* ARMv8 CRC32 instructions */

View File

@@ -233,7 +233,7 @@ TEST_F(adler32Fixture, adler32_slicing_2352bytes)
EXPECT_EQ(adler32, EXPECTED_ADLER32_2352BYTES);
}
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
#if defined(__aarch64__) || defined(_M_ARM64) || ((defined(__arm__) || defined(_M_ARM)) && !defined(_WIN32))
TEST_F(adler32Fixture, adler32_neon)
{
if(!have_neon()) return;