diff --git a/src/libFLAC/i386/lpc_asm.s b/src/libFLAC/i386/lpc_asm.s index 0bf46172..53c94603 100644 --- a/src/libFLAC/i386/lpc_asm.s +++ b/src/libFLAC/i386/lpc_asm.s @@ -57,10 +57,10 @@ cglobal FLAC__lpc_restore_signal_asm_i386_mmx ALIGN 16 cident FLAC__lpc_compute_autocorrelation_asm_i386 - ; esp + 32 == autoc[] - ; esp + 28 == lag - ; esp + 24 == data_len - ; esp + 20 == data[] + ;[esp + 32] == autoc[] + ;[esp + 28] == lag + ;[esp + 24] == data_len + ;[esp + 20] == data[] push ebp push ebx @@ -208,10 +208,10 @@ cident FLAC__lpc_compute_autocorrelation_asm_i386 ALIGN 16 cident FLAC__lpc_compute_autocorrelation_asm_i386_sse - ; esp + 16 == autoc[] - ; esp + 12 == lag - ; esp + 8 == data_len - ; esp + 4 == data[] + ;[esp + 16] == autoc[] + ;[esp + 12] == lag + ;[esp + 8] == data_len + ;[esp + 4] == data[] ; for(coeff = 0; coeff < lag; coeff++) ; autoc[coeff] = 0.0; @@ -227,45 +227,38 @@ cident FLAC__lpc_compute_autocorrelation_asm_i386_sse shufps xmm0, xmm0, 0 ; xmm0 = data[0],data[0],data[0],data[0] movaps xmm1, xmm0 ; xmm1 = data[0],data[0],data[0],data[0] xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 -.warmup: ; xmm3:xmm2 = data[sample-[7..0]] - movaps xmm4, xmm0 - movaps xmm5, xmm1 ; xmm5:xmm4 = xmm1:xmm0 = data[sample]*8 - mulps xmm4, xmm2 - mulps xmm5, xmm3 ; xmm5:xmm4 = xmm1:xmm0 * xmm3:xmm2 - addps xmm6, xmm4 - addps xmm7, xmm5 ; xmm7:xmm6 += xmm1:xmm0 * xmm3:xmm2 +.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample] + mulps xmm0, xmm2 + mulps xmm1, xmm3 ; xmm5:xmm4 = xmm1:xmm0 * xmm3:xmm2 + addps xmm6, xmm0 + addps xmm7, xmm1 ; xmm7:xmm6 += xmm1:xmm0 * xmm3:xmm2 dec edx ;* there's no need to even check for this because we know that lag == 8 ;* and data_len >= lag, so our 1-sample warmup cannot finish the loop ; jz .loop_end ALIGN 16 .loop_8: - ; read the next sample + ; start by reading the next sample movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] - add eax, 4 - shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] - movaps xmm1, xmm0 ; xmm1 = data[sample],data[sample],data[sample],data[sample] - ; now shift the lagged samples - movaps xmm4, xmm2 - movaps xmm5, xmm3 - shufps xmm2, xmm4, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float - shufps xmm3, xmm5, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float - movss xmm3, xmm2 - movss xmm2, xmm0 - - movaps xmm4, xmm0 - movaps xmm5, xmm1 ; xmm5:xmm4 = xmm1:xmm0 = data[sample]*8 - mulps xmm4, xmm2 - mulps xmm5, xmm3 ; xmm5:xmm4 = xmm1:xmm0 * xmm3:xmm2 - addps xmm6, xmm4 - addps xmm7, xmm5 ; xmm7:xmm6 += xmm1:xmm0 * xmm3:xmm2 + ; here we reorder the instructions; see the (#) indexes for a logical order + shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float + add eax, 4 ; (0) + shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float + shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample] + movss xmm3, xmm2 ; (5) + movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample] + movss xmm2, xmm0 ; (6) + mulps xmm1, xmm3 ; (8) + mulps xmm0, xmm2 ; (7) xmm5:xmm4 = xmm1:xmm0 * xmm3:xmm2 + addps xmm7, xmm1 ; (10) + addps xmm6, xmm0 ; (9) xmm7:xmm6 += xmm1:xmm0 * xmm3:xmm2 dec edx jnz .loop_8 .loop_end: ; store autoc mov edx, [esp + 16] ; edx == autoc - movups xmm6, [edx] - movups xmm7, [edx + 4] + movups [edx], xmm6 + movups [edx + 4], xmm7 .end: ret