diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c index bb095067..40c79509 100644 --- a/src/libFLAC/cpu.c +++ b/src/libFLAC/cpu.c @@ -164,6 +164,8 @@ void FLAC__cpu_info(FLAC__CPUInfo *info) /* http://www.sandpile.org/x86/cpuid.htm */ #ifdef FLAC__HAS_X86INTRIN FLAC__uint32 flags_eax, flags_ebx, flags_ecx, flags_edx; + FLAC__cpu_info_x86(0, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx); + info->ia32.intel = (flags_ebx == 0x756E6547 && flags_edx == 0x49656E69 && flags_ecx == 0x6C65746E)? true : false; /* CPUID leaf 0 vendor string "GenuineIntel": EBX="Genu", EDX="ineI", ECX="ntel" */ FLAC__cpu_info_x86(1, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx); #else FLAC__uint32 flags_ecx, flags_edx; @@ -347,6 +349,8 @@ void FLAC__cpu_info(FLAC__CPUInfo *info) { /* http://www.sandpile.org/x86/cpuid.htm */ FLAC__uint32 flags_eax, flags_ebx, flags_ecx, flags_edx; + FLAC__cpu_info_x86(0, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx); + info->x86.intel = (flags_ebx == 0x756E6547 && flags_edx == 0x49656E69 && flags_ecx == 0x6C65746E)? true : false; /* CPUID leaf 0 vendor string "GenuineIntel": EBX="Genu", EDX="ineI", ECX="ntel" */ FLAC__cpu_info_x86(1, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx); info->x86.sse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE3 )? true : false; info->x86.ssse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSSE3)? 
true : false; diff --git a/src/libFLAC/ia32/lpc_asm.nasm b/src/libFLAC/ia32/lpc_asm.nasm index bf650323..e98250ac 100644 --- a/src/libFLAC/ia32/lpc_asm.nasm +++ b/src/libFLAC/ia32/lpc_asm.nasm @@ -36,10 +36,10 @@ data_section cglobal FLAC__lpc_compute_autocorrelation_asm_ia32 -cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4 -cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 -cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 -cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16 +cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old +cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old +cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old +cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 @@ -443,7 +443,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32 ret ALIGN 16 -cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4 +cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old ;[esp + 16] == autoc[] ;[esp + 12] == lag ;[esp + 8] == data_len @@ -490,7 +490,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4 ret ALIGN 16 -cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 +cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old ;[esp + 16] == autoc[] ;[esp + 12] == lag ;[esp + 8] == data_len @@ -549,7 +549,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 ret ALIGN 16 -cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 +cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old ;[esp + 16] == autoc[] ;[esp + 12] == lag ;[esp + 8] == data_len @@ -623,7 +623,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 ret ALIGN 16 -cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16 
+cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old ;[ebp + 20] == autoc[] ;[ebp + 16] == lag ;[ebp + 12] == data_len diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h index 8927897a..380f4f07 100644 --- a/src/libFLAC/include/private/cpu.h +++ b/src/libFLAC/include/private/cpu.h @@ -116,6 +116,8 @@ typedef enum { #if defined FLAC__CPU_IA32 typedef struct { + FLAC__bool intel; + FLAC__bool cmov; FLAC__bool mmx; FLAC__bool sse; @@ -131,6 +133,8 @@ typedef struct { } FLAC__CPUInfo_IA32; #elif defined FLAC__CPU_X86_64 typedef struct { + FLAC__bool intel; + FLAC__bool sse3; FLAC__bool ssse3; FLAC__bool sse41; diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h index d36b30be..c4ed085c 100644 --- a/src/libFLAC/include/private/lpc.h +++ b/src/libFLAC/include/private/lpc.h @@ -73,18 +73,22 @@ void FLAC__lpc_compute_autocorrelation(const FLAC__real data[], unsigned data_le # ifdef FLAC__CPU_IA32 # ifdef FLAC__HAS_NASM void FLAC__lpc_compute_autocorrelation_asm_ia32(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); -void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); -void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); -void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); -void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void 
FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); # endif # endif # if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN # ifdef FLAC__SSE_SUPPORTED -void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); -void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); -void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); -void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); +void 
FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]); # endif # endif #endif diff --git a/src/libFLAC/lpc_intrin_sse.c b/src/libFLAC/lpc_intrin_sse.c index 81bf586e..9106e14f 100644 --- a/src/libFLAC/lpc_intrin_sse.c +++ b/src/libFLAC/lpc_intrin_sse.c @@ -45,11 +45,15 @@ #include /* SSE */ -#if 1 -/* Faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */ +/* new routines: more unaligned loads, fewer shuffles + * old routines: fewer unaligned loads, more shuffles + * these *_old routines are equivalent to the ASM routines in ia32/lpc_asm.nasm + */ + +/* new routines: faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */ FLAC__SSE_TARGET("sse") -void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) { int i; int limit = data_len - 4; @@ -85,7 +89,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], } FLAC__SSE_TARGET("sse") -void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) { int i; int limit = data_len - 8; @@ -129,7 +133,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], } FLAC__SSE_TARGET("sse") -void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) { int i; int limit = data_len - 12; @@ -181,7 +185,7 @@ void 
FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[] } FLAC__SSE_TARGET("sse") -void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) { int i; int limit = data_len - 16; @@ -240,11 +244,10 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[] _mm_storeu_ps(autoc+12,sum3); } -#else -/* Faster on older Intel CPUs (up to Core 2) */ +/* old routines: faster on older Intel CPUs (up to Core 2) */ FLAC__SSE_TARGET("sse") -void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) { __m128 xmm0, xmm2, xmm5; @@ -281,7 +284,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], } FLAC__SSE_TARGET("sse") -void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) { __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6; @@ -327,7 +330,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], } FLAC__SSE_TARGET("sse") -void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) { __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; @@ -381,7 +384,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[] 
} FLAC__SSE_TARGET("sse") -void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) +void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) { __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9; @@ -443,7 +446,6 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[] _mm_storeu_ps(autoc+8, xmm8); _mm_storeu_ps(autoc+12,xmm9); } -#endif #endif /* FLAC__SSE_SUPPORTED */ #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */ diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index 45bdb252..3aa90fb5 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -898,13 +898,13 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( # ifdef FLAC__HAS_NASM if(encoder->private_->cpuinfo.ia32.sse) { if(encoder->protected_->max_lpc_order < 4) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4; + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old; else if(encoder->protected_->max_lpc_order < 8) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8; + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old; else if(encoder->protected_->max_lpc_order < 12) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12; + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old; else if(encoder->protected_->max_lpc_order < 16) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16; + 
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old; else encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32; } @@ -927,16 +927,30 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( # ifdef FLAC__HAS_X86INTRIN # if defined FLAC__SSE_SUPPORTED if(encoder->private_->cpuinfo.ia32.sse) { - if(encoder->protected_->max_lpc_order < 4) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4; - else if(encoder->protected_->max_lpc_order < 8) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8; - else if(encoder->protected_->max_lpc_order < 12) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12; - else if(encoder->protected_->max_lpc_order < 16) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16; - else - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation; + if(encoder->private_->cpuinfo.ia32.sse42 || !encoder->private_->cpuinfo.ia32.intel) { /* use new autocorrelation functions */ + if(encoder->protected_->max_lpc_order < 4) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new; + else if(encoder->protected_->max_lpc_order < 8) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new; + else if(encoder->protected_->max_lpc_order < 12) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new; + else if(encoder->protected_->max_lpc_order < 16) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new; + else + encoder->private_->local_lpc_compute_autocorrelation = 
FLAC__lpc_compute_autocorrelation; + } + else { /* use old autocorrelation functions */ + if(encoder->protected_->max_lpc_order < 4) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old; + else if(encoder->protected_->max_lpc_order < 8) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old; + else if(encoder->protected_->max_lpc_order < 12) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old; + else if(encoder->protected_->max_lpc_order < 16) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old; + else + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation; + } } # endif @@ -977,14 +991,26 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64); # ifdef FLAC__HAS_X86INTRIN # ifdef FLAC__SSE_SUPPORTED - if(encoder->protected_->max_lpc_order < 4) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4; - else if(encoder->protected_->max_lpc_order < 8) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8; - else if(encoder->protected_->max_lpc_order < 12) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12; - else if(encoder->protected_->max_lpc_order < 16) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16; + if(encoder->private_->cpuinfo.x86.sse42 || !encoder->private_->cpuinfo.x86.intel) { /* use new autocorrelation functions */ + if(encoder->protected_->max_lpc_order < 4) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new; + else 
if(encoder->protected_->max_lpc_order < 8) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new; + else if(encoder->protected_->max_lpc_order < 12) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new; + else if(encoder->protected_->max_lpc_order < 16) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new; + } + else { + if(encoder->protected_->max_lpc_order < 4) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old; + else if(encoder->protected_->max_lpc_order < 8) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old; + else if(encoder->protected_->max_lpc_order < 12) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old; + else if(encoder->protected_->max_lpc_order < 16) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old; + } # endif # ifdef FLAC__SSE2_SUPPORTED