Adds SSE-accelerated lpc functions.

New functions are:
    FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4()
    FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8()
    FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12()
    FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16()
    FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2()

Patch-from: lvqcl <lvqcl.mail@gmail.com>
This commit is contained in:
Erik de Castro Lopo
2013-09-15 09:51:28 +10:00
parent 84c3e3d52c
commit 5e5ee2720c
7 changed files with 606 additions and 9 deletions

View File

@@ -915,7 +915,21 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
if(encoder->private_->cpuinfo.data.ia32.mmx && encoder->private_->cpuinfo.data.ia32.cmov)
encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov;
# endif /* FLAC__HAS_NASM */
# endif /* FLAC__CPU_IA32 */
# elif defined FLAC__CPU_X86_64
FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
# ifdef FLAC__HAS_X86INTRIN
if(encoder->protected_->max_lpc_order < 4)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4;
else if(encoder->protected_->max_lpc_order < 8)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8;
else if(encoder->protected_->max_lpc_order < 12)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12;
else if(encoder->protected_->max_lpc_order < 16)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16;
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
# endif /* FLAC__HAS_X86INTRIN */
# endif /* FLAC__CPU_... */
}
# endif /* !FLAC__NO_ASM */
#endif /* !FLAC__INTEGER_ONLY_LIBRARY */
@@ -2238,8 +2252,8 @@ FLAC__bool resize_buffers_(FLAC__StreamEncoder *encoder, unsigned new_blocksize)
ok = true;
/* WATCHOUT: FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx()
* requires that the input arrays (in our case the integer signals)
/* WATCHOUT: FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx() and ..._intrin_sse2()
* require that the input arrays (in our case the integer signals)
* have a buffer of up to 3 zeroes in front (at negative indices) for
* alignment purposes; we use 4 in front to keep the data well-aligned.
*/
@@ -3167,7 +3181,7 @@ FLAC__bool process_subframe_(
#endif
#ifndef FLAC__INTEGER_ONLY_LIBRARY
FLAC__double lpc_residual_bits_per_sample;
FLAC__real autoc[FLAC__MAX_LPC_ORDER+1]; /* WATCHOUT: the size is important even though encoder->protected_->max_lpc_order might be less; some asm routines need all the space */
FLAC__real autoc[FLAC__MAX_LPC_ORDER+1]; /* WATCHOUT: the size is important even though encoder->protected_->max_lpc_order might be less; some asm and x86 intrinsic routines need all the space */
FLAC__double lpc_error[FLAC__MAX_LPC_ORDER];
unsigned min_lpc_order, max_lpc_order, lpc_order;
unsigned min_qlp_coeff_precision, max_qlp_coeff_precision, qlp_coeff_precision;
@@ -3558,7 +3572,7 @@ unsigned evaluate_lpc_subframe_(
FLAC__EntropyCodingMethod_PartitionedRiceContents *partitioned_rice_contents
)
{
FLAC__int32 qlp_coeff[FLAC__MAX_LPC_ORDER];
FLAC__int32 qlp_coeff[FLAC__MAX_LPC_ORDER]; /* WATCHOUT: the size is important; x86 intrinsic routines need more than 'order' elements */
unsigned i, residual_bits, estimate;
int quantization, ret;
const unsigned residual_samples = blocksize - order;