From 93f6109c9016ea7c39be4a525e93745e753e8425 Mon Sep 17 00:00:00 2001 From: Erik de Castro Lopo Date: Fri, 11 Apr 2014 06:21:09 +1000 Subject: [PATCH] Add intrinsics version of two lpc functions. Functions: - FLAC__fixed_compute_best_predictor - FLAC__fixed_compute_best_predictor_wide Patch-from: lvqcl --- src/libFLAC/Makefile.am | 2 ++ src/libFLAC/Makefile.lite | 2 ++ src/libFLAC/include/private/fixed.h | 17 +++++++++++++---- src/libFLAC/libFLAC_dynamic.vcproj | 8 ++++++++ src/libFLAC/libFLAC_static.vcproj | 8 ++++++++ src/libFLAC/stream_encoder.c | 25 ++++++++++++++++++++++++- 6 files changed, 57 insertions(+), 5 deletions(-) diff --git a/src/libFLAC/Makefile.am b/src/libFLAC/Makefile.am index 258de40d..b6e1f471 100644 --- a/src/libFLAC/Makefile.am +++ b/src/libFLAC/Makefile.am @@ -121,6 +121,8 @@ libFLAC_sources = \ cpu.c \ crc.c \ fixed.c \ + fixed_intrin_sse2.c \ + fixed_intrin_ssse3.c \ float.c \ format.c \ lpc.c \ diff --git a/src/libFLAC/Makefile.lite b/src/libFLAC/Makefile.lite index 4533138a..6127cf37 100644 --- a/src/libFLAC/Makefile.lite +++ b/src/libFLAC/Makefile.lite @@ -85,6 +85,8 @@ SRCS_C = \ cpu.c \ crc.c \ fixed.c \ + fixed_intrin_sse2.c \ + fixed_intrin_ssse3.c \ float.c \ format.c \ lpc.c \ diff --git a/src/libFLAC/include/private/fixed.h b/src/libFLAC/include/private/fixed.h index f786f7de..d1b8a869 100644 --- a/src/libFLAC/include/private/fixed.h +++ b/src/libFLAC/include/private/fixed.h @@ -37,6 +37,7 @@ #include #endif +#include "private/cpu.h" #include "private/float.h" #include "FLAC/format.h" @@ -54,14 +55,22 @@ */ #ifndef FLAC__INTEGER_ONLY_LIBRARY unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); +unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); # ifndef FLAC__NO_ASM -# ifdef FLAC__CPU_IA32 -# ifdef FLAC__HAS_NASM -unsigned FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); +# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN +# ifdef FLAC__SSE2_SUPPORTED +unsigned FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]); +unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]); +# endif +# ifdef FLAC__SSSE3_SUPPORTED +unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); +unsigned FLAC__fixed_compute_best_predictor_wide_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]); # endif # endif +# if defined FLAC__CPU_IA32 && defined FLAC__HAS_NASM +unsigned FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); +# endif # endif -unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); #else unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); diff --git a/src/libFLAC/libFLAC_dynamic.vcproj b/src/libFLAC/libFLAC_dynamic.vcproj index 693a10c8..f3bb464a 100644 --- a/src/libFLAC/libFLAC_dynamic.vcproj +++ b/src/libFLAC/libFLAC_dynamic.vcproj @@ -310,6 +310,14 @@ RelativePath=".\fixed.c" > + + + + diff --git a/src/libFLAC/libFLAC_static.vcproj b/src/libFLAC/libFLAC_static.vcproj index 4851755a..fdc9f73c 100644 --- a/src/libFLAC/libFLAC_static.vcproj +++ b/src/libFLAC/libFLAC_static.vcproj @@ -323,6 +323,14 @@ RelativePath=".\fixed.c" > + + + + diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index 2b1765cf..3fe0a746 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -348,8 +348,10 @@ typedef struct FLAC__StreamEncoderPrivate { void (*local_precompute_partition_info_sums)(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps); #ifndef FLAC__INTEGER_ONLY_LIBRARY unsigned (*local_fixed_compute_best_predictor)(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); + unsigned (*local_fixed_compute_best_predictor_wide)(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); #else unsigned (*local_fixed_compute_best_predictor)(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); + unsigned (*local_fixed_compute_best_predictor_wide)(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); #endif #ifndef FLAC__INTEGER_ONLY_LIBRARY void (*local_lpc_compute_autocorrelation)(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); @@ -879,6 +881,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( #endif encoder->private_->local_precompute_partition_info_sums = precompute_partition_info_sums_; encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor; + encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide; #ifndef FLAC__INTEGER_ONLY_LIBRARY encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients; encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide; @@ -937,6 +940,17 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2; encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2; } +# ifdef FLAC__SSSE3_SUPPORTED + if (encoder->private_->cpuinfo.ia32.ssse3) { + encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_ssse3; + encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_ssse3; + } + else +# endif + if (encoder->private_->cpuinfo.ia32.sse2) { + encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2; + encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2; + } # endif # ifdef FLAC__SSE4_1_SUPPORTED if(encoder->private_->cpuinfo.ia32.sse41) @@ -959,6 +973,15 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( # ifdef FLAC__SSE2_SUPPORTED /* encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2; // OPT: not faster than C; TODO: more tests on different CPUs */ encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2; +# ifdef FLAC__SSSE3_SUPPORTED + if (encoder->private_->cpuinfo.x86_64.ssse3) { + encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_ssse3; + encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_ssse3; + } + else +# endif + encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2; + encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2; # endif # endif /* FLAC__HAS_X86INTRIN */ # endif /* FLAC__CPU_... */ @@ -991,7 +1014,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( #endif /* !FLAC__NO_ASM && FLAC__HAS_X86INTRIN */ /* finally override based on wide-ness if necessary */ if(encoder->private_->use_wide_by_block) { - encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_wide; + encoder->private_->local_fixed_compute_best_predictor = encoder->private_->local_fixed_compute_best_predictor_wide; } /* set state to OK; from here on, errors are fatal and we'll override the state then */