diff --git a/include/share/compat.h b/include/share/compat.h index a17cc634..6f04f441 100644 --- a/include/share/compat.h +++ b/include/share/compat.h @@ -199,12 +199,4 @@ int flac_snprintf(char *str, size_t size, const char *fmt, ...); }; #endif -/* SSSE3, SSE4 support: MSVS 2008, GCC 4.3 -- currently disabled, Intel Compiler 10.0 */ -#if ( defined _MSC_VER && _MSC_VER >= 1500 ) \ - || ( 0 && defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) ) \ - || ( defined __INTEL_COMPILER && __INTEL_COMPILER >= 1000 ) -#define FLAC__SSSE3_SUPPORTED 1 -#define FLAC__SSE4_SUPPORTED 1 -#endif - #endif /* FLAC__SHARE__COMPAT_H */ diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h index 1bd8bcab..07d7fb07 100644 --- a/src/libFLAC/include/private/cpu.h +++ b/src/libFLAC/include/private/cpu.h @@ -39,6 +39,47 @@ #include #endif +/* SSE intrinsics support by ICC/MSVC/GCC */ +#if defined __INTEL_COMPILER + #define FLAC__SSE_TARGET(x) + #define FLAC__SSE_SUPPORTED 1 + #define FLAC__SSE2_SUPPORTED 1 + #if (__INTEL_COMPILER >= 1000) /* Intel C++ Compiler 10.0 */ + #define FLAC__SSSE3_SUPPORTED 1 + #define FLAC__SSE4_1_SUPPORTED 1 + #endif +#elif defined _MSC_VER + #define FLAC__SSE_TARGET(x) + #define FLAC__SSE_SUPPORTED 1 + #define FLAC__SSE2_SUPPORTED 1 + #if (_MSC_VER >= 1500) /* MS Visual Studio 2008 */ + #define FLAC__SSSE3_SUPPORTED 1 + #define FLAC__SSE4_1_SUPPORTED 1 + #endif +#elif defined __GNUC__ + #if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* since GCC 4.9 -msse.. compiler options aren't necessary */ + #define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x))) + #define FLAC__SSE_SUPPORTED 1 + #define FLAC__SSE2_SUPPORTED 1 + #define FLAC__SSSE3_SUPPORTED 1 + #define FLAC__SSE4_1_SUPPORTED 1 + #else /* for GCC older than 4.9 */ + #define FLAC__SSE_TARGET(x) + #ifdef __SSE__ + #define FLAC__SSE_SUPPORTED 1 + #endif + #ifdef __SSE2__ + #define FLAC__SSE2_SUPPORTED 1 + #endif + #ifdef __SSSE3__ + #define FLAC__SSSE3_SUPPORTED 1 + #endif + #ifdef __SSE4_1__ + #define FLAC__SSE4_1_SUPPORTED 1 + #endif + #endif /* GCC version */ +#endif /* compiler version */ + typedef enum { FLAC__CPUINFO_TYPE_IA32, FLAC__CPUINFO_TYPE_X86_64, diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h index 27760b48..d0872c3c 100644 --- a/src/libFLAC/include/private/lpc.h +++ b/src/libFLAC/include/private/lpc.h @@ -37,6 +37,7 @@ #include #endif +#include "private/cpu.h" #include "private/float.h" #include "FLAC/format.h" @@ -80,10 +81,12 @@ void FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow(const FLAC__real data[], u # endif # endif # if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN +# ifdef FLAC__SSE_SUPPORTED void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +# endif # endif #endif @@ -156,9 +159,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32(const FLAC__ # endif # endif # if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN +# ifdef FLAC__SSE2_SUPPORTED void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]); void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]); -# ifdef FLAC__SSE4_SUPPORTED +# endif +# ifdef FLAC__SSE4_1_SUPPORTED void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]); # endif # endif @@ -195,9 +200,9 @@ void FLAC__lpc_restore_signal_asm_ppc_altivec_16(const FLAC__int32 residual[], u void FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]); # endif /* FLAC__CPU_IA32 || FLAC__CPU_PPC */ # if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN -# ifdef FLAC__SSE4_SUPPORTED +# ifdef FLAC__SSE4_1_SUPPORTED void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]); -# endif +# endif # endif #endif /* FLAC__NO_ASM */ diff --git a/src/libFLAC/include/private/stream_encoder.h b/src/libFLAC/include/private/stream_encoder.h index ee7d9788..d26039ac 100644 --- a/src/libFLAC/include/private/stream_encoder.h +++ b/src/libFLAC/include/private/stream_encoder.h @@ -38,11 +38,13 @@ #endif #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN -#include "share/compat.h" +#include "private/cpu.h" #include "FLAC/format.h" +#ifdef FLAC__SSE2_SUPPORTED extern void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps); +#endif #ifdef FLAC__SSSE3_SUPPORTED extern void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], diff --git a/src/libFLAC/lpc_intrin_sse.c b/src/libFLAC/lpc_intrin_sse.c index e8f9f578..23299cc1 100644 --- a/src/libFLAC/lpc_intrin_sse.c +++ b/src/libFLAC/lpc_intrin_sse.c @@ -37,13 +37,15 @@ #ifndef FLAC__INTEGER_ONLY_LIBRARY #ifndef FLAC__NO_ASM #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN +#include "private/lpc.h" +#ifdef FLAC__SSE_SUPPORTED #include "FLAC/assert.h" #include "FLAC/format.h" -#include "private/lpc.h" #include /* SSE */ +FLAC__SSE_TARGET("sse") void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) { __m128 xmm0, xmm2, xmm5; @@ -80,6 +82,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], _mm_storeu_ps(autoc, xmm5); } +FLAC__SSE_TARGET("sse") void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) { __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6; @@ -125,6 +128,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], _mm_storeu_ps(autoc+4, xmm6); } +FLAC__SSE_TARGET("sse") void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) { __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; @@ -178,6 +182,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[] _mm_storeu_ps(autoc+8, xmm7); } +FLAC__SSE_TARGET("sse") void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) { __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9; @@ -241,6 +246,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[] _mm_storeu_ps(autoc+12,xmm9); } +#endif /* FLAC__SSE_SUPPORTED */ #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */ #endif /* FLAC__NO_ASM */ #endif /* FLAC__INTEGER_ONLY_LIBRARY */ diff --git a/src/libFLAC/lpc_intrin_sse2.c b/src/libFLAC/lpc_intrin_sse2.c index 9311151d..98d51bd5 100644 --- a/src/libFLAC/lpc_intrin_sse2.c +++ b/src/libFLAC/lpc_intrin_sse2.c @@ -37,13 +37,15 @@ #ifndef FLAC__INTEGER_ONLY_LIBRARY #ifndef FLAC__NO_ASM #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN +#include "private/lpc.h" +#ifdef FLAC__SSE2_SUPPORTED #include "FLAC/assert.h" #include "FLAC/format.h" -#include "private/lpc.h" #include /* SSE2 */ +FLAC__SSE_TARGET("sse2") void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) { int i; @@ -787,6 +789,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ #define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization); +FLAC__SSE_TARGET("sse2") void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) { int i; @@ -1313,6 +1316,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in } } +#endif /* FLAC__SSE2_SUPPORTED */ #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */ #endif /* FLAC__NO_ASM */ #endif /* FLAC__INTEGER_ONLY_LIBRARY */ diff --git a/src/libFLAC/lpc_intrin_sse41.c b/src/libFLAC/lpc_intrin_sse41.c index ea8eb371..97ee9eae 100644 --- a/src/libFLAC/lpc_intrin_sse41.c +++ b/src/libFLAC/lpc_intrin_sse41.c @@ -34,16 +34,14 @@ # include #endif -#include "share/compat.h" - #ifndef FLAC__INTEGER_ONLY_LIBRARY #ifndef FLAC__NO_ASM #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN -#ifdef FLAC__SSE4_SUPPORTED +#include "private/lpc.h" +#ifdef FLAC__SSE4_1_SUPPORTED #include "FLAC/assert.h" #include "FLAC/format.h" -#include "private/lpc.h" #include /* SSE4.1 */ @@ -68,6 +66,7 @@ #define DATA_RESULT(xmmN) data[i] = residual[i] + (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization); #endif +FLAC__SSE_TARGET("sse4.1") void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) { int i; @@ -594,6 +593,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL } } +FLAC__SSE_TARGET("sse4.1") void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) { int i; @@ -1120,7 +1120,7 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], un } } -#endif /* FLAC__SSE4_SUPPORTED */ +#endif /* FLAC__SSE4_1_SUPPORTED */ #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */ #endif /* FLAC__NO_ASM */ #endif /* FLAC__INTEGER_ONLY_LIBRARY */ diff --git a/src/libFLAC/stream_decoder.c b/src/libFLAC/stream_decoder.c index d8cd7142..633dcdc3 100644 --- a/src/libFLAC/stream_decoder.c +++ b/src/libFLAC/stream_decoder.c @@ -417,7 +417,7 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_( } #endif #ifdef FLAC__HAS_X86INTRIN -# if defined FLAC__SSE4_SUPPORTED && 0 /* now we have FLAC__lpc_restore_signal_wide_asm_ia32() which is slightly faster */ +# if defined FLAC__SSE4_1_SUPPORTED && 0 /* now we have FLAC__lpc_restore_signal_wide_asm_ia32() which is slightly faster */ if(decoder->private_->cpuinfo.ia32.sse41) decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41; # endif diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index cbf28156..1cd123fc 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -920,11 +920,13 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov; # endif /* FLAC__HAS_NASM */ # ifdef FLAC__HAS_X86INTRIN +# ifdef FLAC__SSE2_SUPPORTED if(encoder->private_->cpuinfo.ia32.sse2) { encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2; encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2; } -# ifdef FLAC__SSE4_SUPPORTED +# endif +# ifdef FLAC__SSE4_1_SUPPORTED if(encoder->private_->cpuinfo.ia32.sse41) encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41; # endif @@ -932,6 +934,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( # elif defined FLAC__CPU_X86_64 FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64); # ifdef FLAC__HAS_X86INTRIN +# ifdef FLAC__SSE_SUPPORTED if(encoder->protected_->max_lpc_order < 4) encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4; else if(encoder->protected_->max_lpc_order < 8) @@ -940,9 +943,11 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12; else if(encoder->protected_->max_lpc_order < 16) encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16; - +# endif +# ifdef FLAC__SSE2_SUPPORTED encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2; encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2; +# endif # endif /* FLAC__HAS_X86INTRIN */ # endif /* FLAC__CPU_... */ } @@ -956,15 +961,19 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_ssse3; else # endif +# ifdef FLAC__SSE2_SUPPORTED if(encoder->private_->cpuinfo.ia32.sse2) encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_sse2; +# endif # elif defined FLAC__CPU_X86_64 # ifdef FLAC__SSSE3_SUPPORTED if(encoder->private_->cpuinfo.x86_64.ssse3) encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_ssse3; else # endif +# ifdef FLAC__SSE2_SUPPORTED encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_sse2; +# endif # endif /* FLAC__CPU_... */ } #endif /* !FLAC__NO_ASM && FLAC__HAS_X86INTRIN */ diff --git a/src/libFLAC/stream_encoder_intrin_sse2.c b/src/libFLAC/stream_encoder_intrin_sse2.c index ac2e69f7..9852175a 100644 --- a/src/libFLAC/stream_encoder_intrin_sse2.c +++ b/src/libFLAC/stream_encoder_intrin_sse2.c @@ -36,12 +36,14 @@ #ifndef FLAC__NO_ASM #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN +#include "private/stream_encoder.h" +#ifdef FLAC__SSE2_SUPPORTED #include /* for abs() */ #include /* SSE2 */ #include "FLAC/assert.h" -#include "private/stream_encoder.h" +FLAC__SSE_TARGET("sse2") void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps) { @@ -157,5 +159,6 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual } } +#endif /* FLAC__SSE2_SUPPORTED */ #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */ #endif /* FLAC__NO_ASM */ diff --git a/src/libFLAC/stream_encoder_intrin_ssse3.c b/src/libFLAC/stream_encoder_intrin_ssse3.c index 35f3b23a..7294c554 100644 --- a/src/libFLAC/stream_encoder_intrin_ssse3.c +++ b/src/libFLAC/stream_encoder_intrin_ssse3.c @@ -34,17 +34,16 @@ # include #endif -#include "share/compat.h" - #ifndef FLAC__NO_ASM #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN +#include "private/stream_encoder.h" #ifdef FLAC__SSSE3_SUPPORTED #include /* for abs() */ #include /* SSSE3 */ #include "FLAC/assert.h" -#include "private/stream_encoder.h" +FLAC__SSE_TARGET("ssse3") void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps) {