diff --git a/CMakeLists.txt b/CMakeLists.txt
index b3caa98..af6c71b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,9 +1,21 @@
 cmake_minimum_required(VERSION 3.15)
+
+IF(APPLE)
+    IF("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "")
+        SET(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "Build architectures for Mac OS X" FORCE)
+    ENDIF()
+ENDIF(APPLE)
+
 project("Aaru.Checksums.Native" C)
 
-set(CMAKE_C_STANDARD 90)
+if("${CMAKE_C_COMPILER_ID}" MATCHES "MSVC" AND "${CMAKE_C_COMPILER_ARCHITECTURE_ID}" MATCHES "ARMV7")
+    set(CMAKE_C_STANDARD 11)
+else()
+    set(CMAKE_C_STANDARD 90)
+endif()
 
 message("Detected system processor: ${CMAKE_SYSTEM_PROCESSOR}")
+message("Detected vs platform name: ${CMAKE_C_COMPILER_ARCHITECTURE_ID}")
 message("Detected compiler: ${CMAKE_C_COMPILER_ID}")
 message("Detected build type: ${CMAKE_BUILD_TYPE}")
 
@@ -28,4 +40,4 @@ endif()
 
 add_library("Aaru.Checksums.Native" SHARED adler32.h adler32.c crc16.h crc16.c crc16_ccitt.h crc16_ccitt.c crc32.c crc32.h crc64.c crc64.h fletcher16.h fletcher16.c fletcher32.h fletcher32.c library.h spamsum.c spamsum.h crc32_clmul.c crc64_clmul.c simd.c simd.h adler32_ssse3.c adler32_avx2.c adler32_neon.c crc32_arm_simd.c crc32_vmull.c crc32_simd.h arm_vmull.c arm_vmull.h crc64_vmull.c)
 
-add_subdirectory(tests)
\ No newline at end of file
+add_subdirectory(tests)
diff --git a/adler32.c b/adler32.c
index 5c6dbbd..f83f7d9 100644
--- a/adler32.c
+++ b/adler32.c
@@ -48,7 +48,7 @@ AARU_EXPORT adler32_ctx* AARU_CALL adler32_init()
 AARU_EXPORT int AARU_CALL adler32_update(adler32_ctx* ctx, const uint8_t* data, uint32_t len)
 {
     if(!ctx || !data) return -1;
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+#if defined(__aarch64__) || defined(_M_ARM64) || ((defined(__arm__) || defined(_M_ARM)) && !defined(_WIN32))
     if(have_neon())
     {
         adler32_neon(&ctx->sum1, &ctx->sum2, data, len);
diff --git a/adler32_neon.c b/adler32_neon.c
index 62159cd..c401db2 100644
--- a/adler32_neon.c
+++ b/adler32_neon.c
@@ -2,7 +2,7 @@
 // Created by claunia on 28/9/21.
 //
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+#if defined(__aarch64__) || defined(_M_ARM64) || ((defined(__arm__) || defined(_M_ARM)) && !defined(_WIN32))
 
 #include <arm_neon.h>
 
@@ -45,8 +45,13 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
     /*
      * Process n blocks of data. At most NMAX data bytes can be
      * processed before s2 must be reduced modulo ADLER_MODULE.
      */
+#ifdef _WIN32
+    uint32x4_t v_s2 = {.n128_u32 = {0, 0, 0, s1 * n}};
+    uint32x4_t v_s1 = {.n128_u32 = {0, 0, 0, 0}};
+#else
     uint32x4_t v_s2 = (uint32x4_t){0, 0, 0, s1 * n};
     uint32x4_t v_s1 = (uint32x4_t){0, 0, 0, 0};
+#endif
     uint16x8_t v_column_sum_1 = vdupq_n_u16(0);
     uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
     uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
@@ -78,6 +83,16 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
     /*
      * Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
      */
+#ifdef _WIN32
+    v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){32, 31, 30, 29}));
+    v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), neon_ld1m_16((uint16_t[]){28, 27, 26, 25}));
+    v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){24, 23, 22, 21}));
+    v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), neon_ld1m_16((uint16_t[]){20, 19, 18, 17}));
+    v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){16, 15, 14, 13}));
+    v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), neon_ld1m_16((uint16_t[]){12, 11, 10, 9}));
+    v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){8, 7, 6, 5}));
+    v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), neon_ld1m_16((uint16_t[]){4, 3, 2, 1}));
+#else
     v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_1), (uint16x4_t){32, 31, 30, 29});
     v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), (uint16x4_t){28, 27, 26, 25});
     v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_2), (uint16x4_t){24, 23, 22, 21});
@@ -86,6 +101,7 @@ TARGET_WITH_SIMD void adler32_neon(uint16_t* sum1, uint16_t* sum2, const unsigne
     v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), (uint16x4_t){12, 11, 10, 9});
     v_s2 = vmlal_u16(v_s2, vget_low_u16(v_column_sum_4), (uint16x4_t){8, 7, 6, 5});
     v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), (uint16x4_t){4, 3, 2, 1});
+#endif
     /*
      * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
      */
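Background on the _WIN32 branches above: MSVC's ARM NEON headers define vector types such as uint32x4_t and uint16x4_t as unions without GNU-style compound-literal support, so the patch switches to a C11 designated initializer (matching the CMAKE_C_STANDARD bump) and routes the uint16x4_t constants through a neon_ld1m_16 load helper. That helper's definition is not shown in this diff; below is a minimal sketch assuming it simply wraps vld1_u16 (the name neon_ld1m_16_sketch is hypothetical, not the library's code):

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical stand-in for the neon_ld1m_16 helper used in the patch:
 * builds a uint16x4_t by loading four u16 lanes from memory, avoiding
 * the vector literals that MSVC's ARM headers do not accept. */
static inline uint16x4_t neon_ld1m_16_sketch(const uint16_t p[4])
{
    return vld1_u16(p); /* one 64-bit NEON load of four u16 lanes */
}

Loading through vld1_u16 sidesteps vector literals entirely, so the same expression compiles under GCC, Clang, and MSVC.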
diff --git a/arm_vmull.c b/arm_vmull.c
index a175198..9e7c8d6 100644
--- a/arm_vmull.c
+++ b/arm_vmull.c
@@ -10,17 +10,21 @@
 #include "arm_vmull.h"
 #include "simd.h"
 
+#if !defined(_WIN32)
 TARGET_WITH_CRYPTO static uint64x2_t sse2neon_vmull_p64_crypto(uint64x1_t _a, uint64x1_t _b)
 {
     poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
     poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
     return vreinterpretq_u64_p128(vmull_p64(a, b));
 }
+#endif
 
 TARGET_WITH_SIMD uint64x2_t sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
 {
+#if !defined(_WIN32)
     // Wraps vmull_p64
     if(have_arm_crypto()) return sse2neon_vmull_p64_crypto(_a, _b);
+#endif
 
     // ARMv7 polyfill
     // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
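The path fenced off above is the hardware carry-less multiply: vmull_p64 computes a 64x64 -> 128-bit polynomial product over GF(2), so Windows builds always fall through to the vmull_p8 polyfill mentioned in the trailing comments. A scalar reference for what that product means, purely illustrative (clmul64 is a hypothetical name, neither the intrinsic nor the library's polyfill):

#include <stdint.h>

/* Scalar reference for a 64x64 -> 128-bit carry-less multiply, i.e. what
 * vmull_p64 computes. Returns the low 64 bits and writes the high 64 bits
 * through *hi. */
static uint64_t clmul64(uint64_t a, uint64_t b, uint64_t* hi)
{
    uint64_t lo = 0, h = 0;
    int      i;
    for(i = 0; i < 64; i++)
        if((b >> i) & 1)
        {
            lo ^= a << i;             /* XOR replaces addition in GF(2) */
            if(i) h ^= a >> (64 - i); /* bits shifted past 64 land in hi */
        }
    *hi = h;
    return lo;
}

Each set bit i of b contributes a << i, combined with XOR instead of addition; the vmull_p8 polyfill reassembles the same 128-bit result from 8-bit partial products.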
diff --git a/crc32_vmull.c b/crc32_vmull.c
index 6a43e41..fd88cd6 100644
--- a/crc32_vmull.c
+++ b/crc32_vmull.c
@@ -5,7 +5,6 @@
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
 
 #include <arm_neon.h>
-#include <sys/auxv.h>
 #include <stdint.h>
 
 #include "library.h"
diff --git a/crc64_vmull.c b/crc64_vmull.c
index 9df958e..c89acdf 100644
--- a/crc64_vmull.c
+++ b/crc64_vmull.c
@@ -5,7 +5,6 @@
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
 
 #include <arm_neon.h>
-#include <sys/auxv.h>
 #include <stdint.h>
 
 #include "library.h"
diff --git a/simd.c b/simd.c
index c32b74c..7387489 100644
--- a/simd.c
+++ b/simd.c
@@ -92,8 +92,13 @@ int have_avx2(void)
 #endif
 
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+#if defined(_WIN32)
+#include <windows.h>
+#include <processthreadsapi.h>
+#else
 #include <sys/auxv.h>
 #endif
+#endif
 
 #if defined(__aarch64__) || defined(_M_ARM64)
 int have_neon(void)
@@ -101,15 +106,50 @@ int have_neon(void)
     return 1; // ARMv8-A made it mandatory
 }
 
-int have_arm_crc32(void) { return getauxval(AT_HWCAP) & HWCAP_CRC32; }
+int have_arm_crc32(void)
+{
+#if defined(_WIN32)
+    return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != 0;
+#else
+    return getauxval(AT_HWCAP) & HWCAP_CRC32;
+#endif
+}
 
-int have_arm_crypto(void) { return getauxval(AT_HWCAP) & HWCAP_AES; }
+int have_arm_crypto(void)
+{
+#if defined(_WIN32)
+    return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0;
+#else
+    return getauxval(AT_HWCAP) & HWCAP_AES;
+#endif
+}
 #endif
 
 #if defined(__arm__) || defined(_M_ARM)
-int have_neon(void) { return getauxval(AT_HWCAP) & HWCAP_NEON; }
+int have_neon(void)
+{
+#if defined(_WIN32)
+    return IsProcessorFeaturePresent(PF_ARM_VFP_32_REGISTERS_AVAILABLE) != 0;
+#else
+    return getauxval(AT_HWCAP) & HWCAP_NEON;
+#endif
+}
 
-int have_arm_crc32(void) { return getauxval(AT_HWCAP2) & HWCAP2_CRC32; }
+int have_arm_crc32(void)
+{
+#if defined(_WIN32)
+    return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != 0;
+#else
+    return getauxval(AT_HWCAP2) & HWCAP2_CRC32;
+#endif
+}
 
-int have_arm_crypto(void) { return getauxval(AT_HWCAP2) & HWCAP2_AES; }
+int have_arm_crypto(void)
+{
+#if defined(_WIN32)
+    return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0;
+#else
+    return getauxval(AT_HWCAP2) & HWCAP2_AES;
+#endif
+}
 #endif
\ No newline at end of file
diff --git a/simd.h b/simd.h
index 7520ced..edd0cc8 100644
--- a/simd.h
+++ b/simd.h
@@ -22,13 +22,13 @@ AARU_EXPORT int have_ssse3(void);
 AARU_EXPORT int have_avx2(void);
 #endif
 
-#if defined(__arm__) || defined(_M_ARM)
+#if(defined(__arm__) || defined(_M_ARM)) && !defined(_WIN32)
 #define HWCAP_NEON (1 << 12)
 #define HWCAP2_AES (1 << 0)
 #define HWCAP2_CRC32 (1 << 4)
 #endif
 
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if(defined(__aarch64__) || defined(_M_ARM64)) && !defined(_WIN32)
 #define HWCAP_NEON (1 << 1)
 #define HWCAP_AES (1 << 3)
 #define HWCAP_CRC32 (1 << 7)
diff --git a/tests/adler32.cpp b/tests/adler32.cpp
index 7065f82..ee4d453 100644
--- a/tests/adler32.cpp
+++ b/tests/adler32.cpp
@@ -233,7 +233,7 @@ TEST_F(adler32Fixture, adler32_slicing_2352bytes)
     EXPECT_EQ(adler32, EXPECTED_ADLER32_2352BYTES);
 }
 
-#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
+#if defined(__aarch64__) || defined(_M_ARM64) || ((defined(__arm__) || defined(_M_ARM)) && !defined(_WIN32))
 TEST_F(adler32Fixture, adler32_neon)
 {
     if(!have_neon()) return;
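The simd.c changes swap Linux's getauxval(AT_HWCAP*) probes for the Win32 IsProcessorFeaturePresent API. A self-contained probe for sanity-checking a Windows-on-ARM machine, assuming a Windows SDK that defines the PF_ARM_V8_* constants (this sample program is illustrative, not part of the library):

#ifdef _WIN32
#include <windows.h>
#include <stdio.h>

int main(void)
{
    /* Mirrors the feature gates used by the patched have_arm_crc32()
     * and have_arm_crypto(). */
    printf("ARMv8 CRC32 instructions:  %s\n",
           IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? "yes" : "no");
    printf("ARMv8 crypto instructions: %s\n",
           IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? "yes" : "no");
    return 0;
}
#endif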