mirror of
https://github.com/aaru-dps/Aaru.Checksums.Native.git
synced 2025-12-16 11:14:29 +00:00
Fix compilation on ARM and ARM64 using MSVC.
This commit is contained in:
@@ -1,9 +1,21 @@
|
||||
cmake_minimum_required(VERSION 3.15)
|
||||
|
||||
IF(APPLE)
|
||||
IF("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "")
|
||||
SET(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "Build architectures for Mac OS X" FORCE)
|
||||
ENDIF()
|
||||
ENDIF(APPLE)
|
||||
|
||||
project("Aaru.Checksums.Native" C)
|
||||
|
||||
set(CMAKE_C_STANDARD 90)
|
||||
if("${CMAKE_C_COMPILER_ID}" MATCHES "MSVC" AND "${CMAKE_C_COMPILER_ARCHITECTURE_ID}" MATCHES "ARMV7")
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
else()
|
||||
set(CMAKE_C_STANDARD 90)
|
||||
endif()
|
||||
|
||||
message("Detected system processor: ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
message("Detected vs platform name: ${CMAKE_C_COMPILER_ARCHITECTURE_ID}")
|
||||
message("Detected compiler: ${CMAKE_C_COMPILER_ID}")
|
||||
message("Detected build type: ${CMAKE_BUILD_TYPE}")
|
||||
|
||||
@@ -28,4 +40,4 @@ endif()
|
||||
|
||||
add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher32.h fletcher32.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c crc32_vmull.c crc32_simd.h arm_vmull.c arm_vmull.h crc64_vmull.c)
|
||||
|
||||
add_subdirectory(tests)
|
||||
add_subdirectory(tests)
|
||||
|
||||
@@ -48,7 +48,7 @@ AARU_EXPORT adler32_ctx* AARU_CALL adler32_init()
|
||||
AARU_EXPORT int AARU_CALL adler32_update(adler32_ctx* ctx, const uint8_t* data, uint32_t len)
|
||||
{
|
||||
if(!ctx || !data) return -1;
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || ((defined(__arm__) || defined(_M_ARM)) && !defined(_WIN32))
|
||||
if(have_neon())
|
||||
{
|
||||
adler32_neon(&ctx->sum1, &ctx->sum2, data, len);
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
// Created by claunia on 28/9/21.
|
||||
//
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || ((defined(__arm__) || defined(_M_ARM)) && !defined(_WIN32))
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
@@ -45,8 +45,13 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
|
||||
* Process n blocks of data. At most NMAX data bytes can be
|
||||
* processed before s2 must be reduced modulo ADLER_MODULE.
|
||||
*/
|
||||
#ifdef _WIN32
|
||||
uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}};
|
||||
uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}};
|
||||
#else
|
||||
uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n};
|
||||
uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0};
|
||||
#endif
|
||||
uint16x8_t v_column_sum_1 = vdupq_n_u16(0);
|
||||
uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
|
||||
uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
|
||||
@@ -78,6 +83,16 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
|
||||
/*
|
||||
* Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
|
||||
*/
|
||||
#ifdef _WIN32
|
||||
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]) {32, 31, 30, 29}));
|
||||
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){28, 27, 26, 25}));
|
||||
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]) {24, 23, 22, 21}));
|
||||
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]) {20, 19, 18, 17}));
|
||||
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]) {16, 15, 14, 13}));
|
||||
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]) {12, 11, 10, 9}));
|
||||
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]) {8, 7, 6, 5}));
|
||||
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]) {4, 3, 2, 1}));
|
||||
#else
|
||||
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), (uint16x4_t){32, 31, 30, 29});
|
||||
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), (uint16x4_t){28, 27, 26, 25});
|
||||
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), (uint16x4_t){24, 23, 22, 21});
|
||||
@@ -86,6 +101,7 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
|
||||
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), (uint16x4_t){12, 11, 10, 9});
|
||||
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), (uint16x4_t){8, 7, 6, 5});
|
||||
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), (uint16x4_t){4, 3, 2, 1});
|
||||
#endif
|
||||
/*
|
||||
* Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
|
||||
*/
|
||||
|
||||
@@ -10,17 +10,21 @@
|
||||
#include "arm_vmull.h"
|
||||
#include "simd.h"
|
||||
|
||||
#if !defined(_WIN32)
|
||||
TARGET_WITH_CRYPTO static uint64x2_t sse2neon_vmull_p64_crypto(uint64x1_t _a, uint64x1_t _b)
|
||||
{
|
||||
poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
|
||||
poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
|
||||
return vreinterpretq_u64_p128(vmull_p64(a, b));
|
||||
}
|
||||
#endif
|
||||
|
||||
TARGET_WITH_SIMD uint64x2_t sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
|
||||
{
|
||||
#if !defined(_WIN32)
|
||||
// Wraps vmull_p64
|
||||
if(have_arm_crypto()) return sse2neon_vmull_p64_crypto(_a, _b);
|
||||
#endif
|
||||
|
||||
// ARMv7 polyfill
|
||||
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <glob.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "library.h"
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <glob.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "library.h"
|
||||
|
||||
50
simd.c
50
simd.c
@@ -92,8 +92,13 @@ int have_avx2(void)
|
||||
#endif
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
|
||||
#if defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#include <processthreadsapi.h>
|
||||
#else
|
||||
#include <sys/auxv.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
int have_neon(void)
|
||||
@@ -101,15 +106,50 @@ int have_neon(void)
|
||||
return 1; // ARMv8-A made it mandatory
|
||||
}
|
||||
|
||||
int have_arm_crc32(void) { return getauxval(AT_HWCAP) & HWCAP_CRC32; }
|
||||
int have_arm_crc32(void)
|
||||
{
|
||||
#if defined(_WIN32)
|
||||
return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != 0;
|
||||
#else
|
||||
return getauxval(AT_HWCAP) & HWCAP_CRC32;
|
||||
#endif
|
||||
}
|
||||
|
||||
int have_arm_crypto(void) { return getauxval(AT_HWCAP) & HWCAP_AES; }
|
||||
int have_arm_crypto(void)
|
||||
{
|
||||
#if defined(_WIN32)
|
||||
return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0;
|
||||
#else
|
||||
return getauxval(AT_HWCAP) & HWCAP_AES;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__arm__) || defined(_M_ARM)
|
||||
int have_neon(void) { return getauxval(AT_HWCAP) & HWCAP_NEON; }
|
||||
int have_neon(void)
|
||||
{
|
||||
#if defined(_WIN32)
|
||||
return IsProcessorFeaturePresent(PF_ARM_VFP_32_REGISTERS_AVAILABLE) != 0;
|
||||
#else
|
||||
return getauxval(AT_HWCAP) & HWCAP_NEON;
|
||||
#endif
|
||||
}
|
||||
|
||||
int have_arm_crc32(void) { return getauxval(AT_HWCAP2) & HWCAP2_CRC32; }
|
||||
int have_arm_crc32(void)
|
||||
{
|
||||
#if defined(_WIN32)
|
||||
return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != 0;
|
||||
#else
|
||||
return getauxval(AT_HWCAP2) & HWCAP2_CRC32;
|
||||
#endif
|
||||
}
|
||||
|
||||
int have_arm_crypto(void) { return getauxval(AT_HWCAP2) & HWCAP2_AES; }
|
||||
int have_arm_crypto(void)
|
||||
{
|
||||
#if defined(_WIN32)
|
||||
return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0;
|
||||
#else
|
||||
return getauxval(AT_HWCAP2) & HWCAP2_AES;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
4
simd.h
4
simd.h
@@ -22,13 +22,13 @@ AARU_EXPORT int have_ssse3(void);
|
||||
AARU_EXPORT int have_avx2(void);
|
||||
#endif
|
||||
|
||||
#if defined(__arm__) || defined(_M_ARM)
|
||||
#if(defined(__arm__) || defined(_M_ARM)) && !defined(_WIN32)
|
||||
#define HWCAP_NEON (1 << 12)
|
||||
#define HWCAP2_AES (1 << 0)
|
||||
#define HWCAP2_CRC32 (1 << 4)
|
||||
#endif
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)
|
||||
#if(defined(__aarch64__) || defined(_M_ARM64)) && !defined(_WIN32)
|
||||
#define HWCAP_NEON (1 << 1)
|
||||
#define HWCAP_AES (1 << 3)
|
||||
#define HWCAP_CRC32 (1 << 7)
|
||||
|
||||
@@ -233,7 +233,7 @@ TEST_F(adler32Fixture, adler32_slicing_2352bytes)
|
||||
EXPECT_EQ(adler32, EXPECTED_ADLER32_2352BYTES);
|
||||
}
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || ((defined(__arm__) || defined(_M_ARM)) && !defined(_WIN32))
|
||||
TEST_F(adler32Fixture, adler32_neon)
|
||||
{
|
||||
if(!have_neon()) return;
|
||||
|
||||
Reference in New Issue
Block a user