mirror of
https://github.com/claunia/flac.git
synced 2025-12-16 18:54:26 +00:00
add 3DNOW stuff from Miroslav
This commit is contained in:
@@ -27,6 +27,10 @@ const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR = 0x01000000;
|
||||
const unsigned FLAC__CPUINFO_IA32_CPUID_SSE = 0x02000000;
|
||||
const unsigned FLAC__CPUINFO_IA32_CPUID_SSE2 = 0x04000000;
|
||||
|
||||
/*
 * Bit masks applied by FLAC__cpu_info() to the value returned by
 * FLAC__cpu_info_extended_amd_asm_ia32() (EDX of CPUID function 0x80000001,
 * or 0 when that function is unavailable):
 *   3DNOW    -> bit 31, sets info->data.ia32._3dnow
 *   EXT3DNOW -> bit 30, sets info->data.ia32.ext3dnow
 *   EXTMMX   -> bit 22, sets info->data.ia32.extmmx
 */
const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_3DNOW = 0x80000000;
|
||||
const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXT3DNOW = 0x40000000;
|
||||
const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXTMMX = 0x00400000;
|
||||
|
||||
|
||||
void FLAC__cpu_info(FLAC__CPUInfo *info)
|
||||
{
|
||||
@@ -41,6 +45,11 @@ void FLAC__cpu_info(FLAC__CPUInfo *info)
|
||||
info->data.ia32.fxsr = (cpuid & FLAC__CPUINFO_IA32_CPUID_FXSR)? true : false;
|
||||
info->data.ia32.sse = (cpuid & FLAC__CPUINFO_IA32_CPUID_SSE)? true : false; /* @@@ also need to check for operating system support */
|
||||
info->data.ia32.sse2 = (cpuid & FLAC__CPUINFO_IA32_CPUID_SSE2)? true : false; /* @@@ also need to check for operating system support */
|
||||
|
||||
cpuid = FLAC__cpu_info_extended_amd_asm_ia32();
|
||||
info->data.ia32._3dnow = (cpuid & FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_3DNOW)? true : false;
|
||||
info->data.ia32.ext3dnow = (cpuid & FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXT3DNOW)? true : false;
|
||||
info->data.ia32.extmmx = (cpuid & FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXTMMX)? true : false;
|
||||
}
|
||||
#else
|
||||
info->use_asm = false;
|
||||
|
||||
@@ -21,20 +21,18 @@
|
||||
data_section
|
||||
|
||||
cglobal FLAC__cpu_info_asm_ia32
|
||||
cglobal FLAC__cpu_info_extended_amd_asm_ia32
|
||||
|
||||
code_section
|
||||
|
||||
; **********************************************************************
|
||||
;
|
||||
ALIGN 16
|
||||
cident FLAC__cpu_info_asm_ia32
|
||||
|
||||
push ebx
|
||||
|
||||
have_cpuid:
|
||||
pushfd
|
||||
pop eax
|
||||
mov edx, eax
|
||||
xor eax, 00200000h
|
||||
xor eax, 0x00200000
|
||||
push eax
|
||||
popfd
|
||||
pushfd
|
||||
@@ -42,12 +40,43 @@ cident FLAC__cpu_info_asm_ia32
|
||||
cmp eax, edx
|
||||
jz .no_cpuid
|
||||
mov eax, 1
|
||||
jmp .end
|
||||
.no_cpuid:
|
||||
xor eax, eax
|
||||
.end:
|
||||
ret
|
||||
|
||||
cident FLAC__cpu_info_asm_ia32
|
||||
push ebx
|
||||
call have_cpuid
|
||||
test eax, eax
|
||||
jz .no_cpuid
|
||||
mov eax, 1
|
||||
cpuid
|
||||
mov eax, edx
|
||||
jmp short .end
|
||||
jmp .end
|
||||
.no_cpuid:
|
||||
xor eax, eax ; return 0
|
||||
.end:
|
||||
xor eax, eax
|
||||
.end
|
||||
pop ebx
|
||||
ret
|
||||
|
||||
; **********************************************************************
;
; unsigned FLAC__cpu_info_extended_amd_asm_ia32()
;
; Returns in eax the EDX feature word of CPUID function 0x80000001, or 0
; when either
;   - have_cpuid returns 0, or
;   - the highest extended function reported by CPUID 0x80000000 is
;     below 0x80000001.
;
; Clobbers: eax, ecx, edx, flags (ebx is saved and restored because
; CPUID overwrites it).
;
cident FLAC__cpu_info_extended_amd_asm_ia32
	push	ebx			; CPUID writes ebx; keep the caller's value
	call	have_cpuid
	test	eax, eax
	jz	.no_cpuid
	mov	eax, 0x80000000		; query highest supported extended function
	cpuid
	cmp	eax, 0x80000001
	jb	.no_cpuid		; unsigned compare: 0x80000001 not available
	mov	eax, 0x80000001		; extended feature flags
	cpuid
	mov	eax, edx		; return the EDX feature word
	jmp	.end
.no_cpuid:
	xor	eax, eax		; no extended features to report
.end:
	pop	ebx
	ret
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
|
||||
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
|
||||
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
|
||||
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
|
||||
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
|
||||
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
|
||||
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
|
||||
cglobal FLAC__lpc_restore_signal_asm_ia32
|
||||
@@ -592,6 +593,124 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
|
||||
.end:
|
||||
ret
|
||||
|
||||
align 16
|
||||
; **********************************************************************
;
; void FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
;
; 3DNow! autocorrelation.  For every l in [0, lag) this accumulates
;
;     autoc[l] = sum over i of data[i] * data[i + l]
;
; into a zero-initialised scratch array on the stack and finally copies
; the first `lag` scratch entries to autoc[].  Elements are processed as
; 4-byte values with pfmul/pfadd.
;
; The main loop (.loop1) handles four samples and two lags per step, so
; it walks the lag index in steps of two.  The lag is therefore rounded
; up to an even value for the scratch buffer and the loop bounds; with an
; odd lag the pair loop would otherwise run down to index -1, reading
; data[i - 1] and storing 4 bytes below esp.  Only the caller's original
; `lag` entries are copied out at the end.
;
; NOTE(review): .loop0, .loop2 and .loop3 are bottom-tested, so this
; routine appears to assume lag > 0 and data_len > 0 -- confirm callers
; guarantee that.
;
; Register roles:
;   esi  data pointer (later: cursor for the tail loop)
;   edi  data_len (later: autoc pointer)
;   edx  lag rounded up to even (reloaded with the real lag for copy-out)
;   ecx  loop end pointer
;   ebp  frame base; arguments live at [ebp + 20 .. 32]
;
cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
	;[ebp + 32] autoc
	;[ebp + 28] lag
	;[ebp + 24] data_len
	;[ebp + 20] data

	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	ebp, esp

	mov	esi, [ebp + 20]
	mov	edi, [ebp + 24]
	mov	edx, [ebp + 28]
	inc	edx
	and	edx, byte -2			; edx = lag rounded up to an even count

	; reserve and clear edx scratch accumulators below an 8-byte aligned esp
	mov	eax, edx
	neg	eax
	and	esp, byte -8
	lea	esp, [esp + 4 * eax]
	mov	ecx, edx
	xor	eax, eax
.loop0:
	dec	ecx
	mov	[esp + 4 * ecx], eax
	jnz	short .loop0

	; ecx = data + 4 * (data_len - edx) - 12: the main loop runs only while
	; all four samples of a step still have edx following samples available
	; (edx is even here, so no extra parity adjustment is needed)
	mov	eax, edi
	sub	eax, edx
	lea	ecx, [esi + 4 * eax - 12]
	cmp	esi, ecx
	mov	eax, esi			; eax = cursor (mov leaves the flags intact)
	ja	short .loop2_pre		; too few samples for the main loop
	align 16
.loop1_i:
	; broadcast data[i .. i+3] into both halves of mm0, mm2, mm4, mm6
	movd	mm0, [eax]
	movd	mm2, [eax + 4]
	movd	mm4, [eax + 8]
	movd	mm6, [eax + 12]
	mov	ebx, edx			; ebx = lag index, counts down by 2
	punpckldq	mm0, mm0
	punpckldq	mm2, mm2
	punpckldq	mm4, mm4
	punpckldq	mm6, mm6
	align 16
.loop1_j:
	sub	ebx, byte 2			; flags from here feed the jg below;
						; the MMX/3DNow! ops in between leave them alone
	movd	mm1, [eax + 4 * ebx]
	movd	mm3, [eax + 4 * ebx + 4]
	movd	mm5, [eax + 4 * ebx + 8]
	movd	mm7, [eax + 4 * ebx + 12]
	punpckldq	mm1, mm3		; mm1 = data[i+j],   data[i+j+1]
	punpckldq	mm3, mm5		; mm3 = data[i+j+1], data[i+j+2]
	pfmul	mm1, mm0
	punpckldq	mm5, mm7		; mm5 = data[i+j+2], data[i+j+3]
	pfmul	mm3, mm2
	punpckldq	mm7, [eax + 4 * ebx + 16]	; mm7 = data[i+j+3], data[i+j+4]
	pfmul	mm5, mm4
	pfmul	mm7, mm6
	pfadd	mm1, mm3
	movq	mm3, [esp + 4 * ebx]		; current accumulators for lags j, j+1
	pfadd	mm5, mm7
	pfadd	mm1, mm5
	pfadd	mm3, mm1
	movq	[esp + 4 * ebx], mm3
	jg	short .loop1_j			; continue while the lag index is > 0

	add	eax, byte 16			; next group of four samples
	cmp	eax, ecx
	jb	short .loop1_i

.loop2_pre:
	; tail: one sample per step, starting where the main loop stopped
	mov	ebx, eax
	sub	eax, esi
	shr	eax, 2				; eax = index of the first tail sample
	lea	ecx, [esi + 4 * edi]		; ecx = one past the last sample
	mov	esi, ebx			; esi = cursor
.loop2_i:
	movd	mm0, [esi]
	mov	ebx, edi
	sub	ebx, eax			; samples remaining from the cursor
	cmp	ebx, edx
	jbe	short .loop2_j
	mov	ebx, edx			; ebx = min(remaining, edx)
.loop2_j:
	dec	ebx
	movd	mm1, [esi + 4 * ebx]
	pfmul	mm1, mm0
	movd	mm2, [esp + 4 * ebx]
	pfadd	mm1, mm2
	movd	[esp + 4 * ebx], mm1

	jnz	short .loop2_j			; flags still those of dec ebx

	add	esi, byte 4
	inc	eax
	cmp	esi, ecx
	jnz	short .loop2_i

	; copy exactly the caller's `lag` results; the rounding slot is dropped
	mov	edi, [ebp + 32]
	mov	edx, [ebp + 28]
.loop3:
	dec	edx
	mov	eax, [esp + 4 * edx]
	mov	[edi + 4 * edx], eax
	jnz	short .loop3

	femms					; leave MMX/3DNow! state before returning

	mov	esp, ebp
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
|
||||
|
||||
;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
|
||||
;
|
||||
; for(i = 0; i < data_len; i++) {
|
||||
|
||||
@@ -33,13 +33,21 @@ typedef struct {
|
||||
FLAC__bool fxsr;
|
||||
FLAC__bool sse;
|
||||
FLAC__bool sse2;
|
||||
FLAC__bool _3dnow;
|
||||
FLAC__bool ext3dnow;
|
||||
FLAC__bool extmmx;
|
||||
} FLAC__CPUInfo_IA32;
|
||||
|
||||
extern const unsigned FLAC__CPUINFO_IA32_CPUID_CMOV;
|
||||
extern const unsigned FLAC__CPUINFO_IA32_CPUID_MMX;
|
||||
extern const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR;
|
||||
extern const unsigned FLAC__CPUINFO_IA32_CPUID_SSE;
|
||||
extern const unsigned FLAC__CPUINFO_IA32_CPUID_SSE2;
|
||||
|
||||
extern const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_3DNOW;
|
||||
extern const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXT3DNOW;
|
||||
extern const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXTMMX;
|
||||
|
||||
typedef struct {
|
||||
FLAC__bool use_asm;
|
||||
FLAC__CPUInfo_Type type;
|
||||
@@ -54,6 +62,7 @@ void FLAC__cpu_info(FLAC__CPUInfo *info);
|
||||
#ifdef FLAC__CPU_IA32
|
||||
#ifdef FLAC__HAS_NASM
|
||||
unsigned FLAC__cpu_info_asm_ia32();
|
||||
unsigned FLAC__cpu_info_extended_amd_asm_ia32();
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -44,6 +44,7 @@ void FLAC__lpc_compute_autocorrelation_asm_ia32(const FLAC__real data[], unsigne
|
||||
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -375,7 +375,7 @@ FLAC__StreamEncoderState FLAC__stream_encoder_init(FLAC__StreamEncoder *encoder)
|
||||
#ifdef FLAC__CPU_IA32
|
||||
FLAC__ASSERT(encoder->private->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
|
||||
#ifdef FLAC__HAS_NASM
|
||||
if(0 && encoder->private->cpuinfo.data.ia32.sse) { /* SSE version lacks necessary resolution, plus SSE flag doesn't check for OS support */
|
||||
if(0 && encoder->private->cpuinfo.data.ia32.sse) { /*@@@ SSE version lacks necessary resolution, plus SSE flag doesn't check for OS support */
|
||||
if(encoder->protected->max_lpc_order < 4)
|
||||
encoder->private->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4;
|
||||
else if(encoder->protected->max_lpc_order < 8)
|
||||
@@ -385,6 +385,8 @@ FLAC__StreamEncoderState FLAC__stream_encoder_init(FLAC__StreamEncoder *encoder)
|
||||
else
|
||||
encoder->private->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32;
|
||||
}
|
||||
else if(0 && encoder->private->cpuinfo.data.ia32._3dnow) /*@@@ turn back on in first beta after 1.0 */
|
||||
encoder->private->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow;
|
||||
else
|
||||
encoder->private->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32;
|
||||
if(encoder->private->cpuinfo.data.ia32.mmx && encoder->private->cpuinfo.data.ia32.cmov)
|
||||
|
||||
Reference in New Issue
Block a user