add miroslav's version of FLAC__lpc_restore_signal, add SSE version of FLAC__lpc_compute_autocorrelation

Josh Coalson
2001-05-18 18:46:40 +00:00
parent 7e40e6dad9
commit 675effa90e


@@ -21,6 +21,7 @@
data_section
cglobal FLAC__lpc_compute_autocorrelation_asm
cglobal FLAC__lpc_restore_signal_asm
code_section
@@ -28,65 +29,430 @@ cglobal FLAC__lpc_compute_autocorrelation_asm
;
; void FLAC__lpc_compute_autocorrelation_asm(const real data[], unsigned data_len, unsigned lag, real autoc[])
; {
; real d;
; unsigned i;
; real d;
; unsigned sample, coeff;
; const unsigned limit = data_len - lag;
;
; assert(lag > 0);
; assert(lag <= data_len);
;
; while(lag--) {
; for(i = lag, d = 0.0; i < data_len; i++)
; d += data[i] * data[i - lag];
; autoc[lag] = d;
; }
; for(coeff = 0; coeff < lag; coeff++)
; autoc[coeff] = 0.0;
; for(sample = 0; sample <= limit; sample++) {
; d = data[sample];
; for(coeff = 0; coeff < lag; coeff++)
; autoc[coeff] += d * data[sample+coeff];
; }
; for(; sample < data_len; sample++) {
; d = data[sample];
; for(coeff = 0; coeff < data_len - sample; coeff++)
; autoc[coeff] += d * data[sample+coeff];
; }
; }
;
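; The loops above compute the same autocorrelation as the older lag-major form
; (while(lag--) { for(i = lag, d = 0.0; ...) d += data[i]*data[i-lag]; ... }),
; just reorganized sample-major so that d = data[sample] is loaded once per
; sample and autoc[] is walked sequentially. An illustrative C check of the
; equivalence (assuming "real" is float; the names below are made up for the test):
;
;   #include <stdio.h>
;   typedef float real;
;
;   static void autoc_lag_major(const real d[], unsigned n, unsigned lag, real a[])
;   {
;       unsigned i, l;
;       for(l = lag; l-- > 0; ) {
;           real s = 0.0f;
;           for(i = l; i < n; i++)
;               s += d[i] * d[i - l];
;           a[l] = s;
;       }
;   }
;
;   static void autoc_sample_major(const real d[], unsigned n, unsigned lag, real a[])
;   {
;       unsigned s, c;
;       const unsigned limit = n - lag;
;       for(c = 0; c < lag; c++) a[c] = 0.0f;
;       for(s = 0; s <= limit; s++)
;           for(c = 0; c < lag; c++) a[c] += d[s] * d[s + c];
;       for(; s < n; s++)
;           for(c = 0; c < n - s; c++) a[c] += d[s] * d[s + c];
;   }
;
;   int main(void)
;   {
;       real x[16], a1[8], a2[8];
;       unsigned i;
;       for(i = 0; i < 16; i++) x[i] = (real)(i % 5) - 2.0f;
;       autoc_lag_major(x, 16, 8, a1);
;       autoc_sample_major(x, 16, 8, a2);
;       for(i = 0; i < 8; i++)
;           printf("%u: %f %f\n", i, a1[i], a2[i]); /* should match up to rounding order */
;       return 0;
;   }
;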
FLAC__lpc_compute_autocorrelation_asm:
; esp + 20 == data[]
; esp + 24 == data_len
; esp + 28 == lag
; esp + 32 == autoc[]
push ebp
lea ebp, [esp + 8]
push eax
push ebx
push ecx
push edx
push esi
push edi
mov esi, [ebp] ; esi == data
mov ecx, [ebp + 4] ; ecx == data_len
mov edx, [ebp + 8] ; edx == lag
mov edi, [ebp + 12] ; edi == autoc
mov edx, [esp + 28] ; edx == lag
mov edi, [esp + 32] ; edi == autoc
; for(coeff = 0; coeff < lag; coeff++)
; autoc[coeff] = 0.0;
mov ecx, edx ; ecx = # of dwords of 0 to write
xor eax, eax
rep stosd
; const unsigned limit = data_len - lag;
mov esi, [esp + 24] ; esi == data_len
mov ecx, esi
sub ecx, edx ; ecx == limit
mov edi, [esp + 32] ; edi == autoc
inc ecx ; we are looping <= limit so we add one to the counter
; for(sample = 0; sample <= limit; sample++) {
; d = data[sample];
; for(coeff = 0; coeff < lag; coeff++)
; autoc[coeff] += d * data[sample+coeff];
; }
mov eax, [esp + 20] ; eax == &data[sample] <- &data[0]
xor ebp, ebp ; ebp == sample <- 0
ALIGN 16
.outer_loop:
test edx, edx
jz .outer_end
dec edx ; lag--
mov ebx, edx ; ebx == i <- lag
xor eax, eax ; eax == i-lag
fldz ; ST = d <- 0.0
push eax ; save &data[sample], for inner_loop:
; eax == &data[sample+coeff] <- &data[sample]
fld dword [eax] ; ST = d <- data[sample]
mov edx, [esp + 32] ; edx <- lag (note the +4 due to the above 'push eax')
mov ebx, edi ; ebx == &autoc[coeff] <- &autoc[0]
and edx, 3 ; edx <- lag % 4
jz .inner_start
cmp edx, 1
je .warmup_1
cmp edx, 2
je .warmup_2
.warmup_3:
fld st0 ; ST = d d
fmul dword [eax] ; ST = d*data[sample+coeff] d
add eax, byte 4 ; (sample+coeff)++
fadd dword [ebx] ; ST = autoc[coeff]+d*data[sample+coeff] d
fstp dword [ebx] ; autoc[coeff]+=d*data[sample+coeff] ST = d
add ebx, byte 4 ; coeff++
.warmup_2:
fld st0 ; ST = d d
fmul dword [eax] ; ST = d*data[sample+coeff] d
add eax, byte 4 ; (sample+coeff)++
fadd dword [ebx] ; ST = autoc[coeff]+d*data[sample+coeff] d
fstp dword [ebx] ; autoc[coeff]+=d*data[sample+coeff] ST = d
add ebx, byte 4 ; coeff++
.warmup_1:
fld st0 ; ST = d d
fmul dword [eax] ; ST = d*data[sample+coeff] d
add eax, byte 4 ; (sample+coeff)++
fadd dword [ebx] ; ST = autoc[coeff]+d*data[sample+coeff] d
fstp dword [ebx] ; autoc[coeff]+=d*data[sample+coeff] ST = d
add ebx, byte 4 ; coeff++
.inner_start:
mov edx, [esp + 32] ; edx <- lag (note the +4 due to the above 'push eax')
shr edx, 2 ; edx <- lag / 4
jz .inner_end
ALIGN 16
.inner_loop:
cmp ebx, ecx
jae short .inner_end
fld qword [esi + ebx * 8] ; ST = data[i] d
fmul qword [esi + eax * 8] ; ST = data[i]*data[i-lag] d
faddp st1 ; d += data[i]*data[i-lag] ST = d
inc eax
inc ebx
jmp .inner_loop
fld st0 ; ST = d d
fmul dword [eax] ; ST = d*data[sample+coeff] d
add eax, byte 4 ; (sample+coeff)++
fadd dword [ebx] ; ST = autoc[coeff]+d*data[sample+coeff] d
fstp dword [ebx] ; autoc[coeff]+=d*data[sample+coeff] ST = d
add ebx, byte 4 ; coeff++
fld st0 ; ST = d d
fmul dword [eax] ; ST = d*data[sample+coeff] d
add eax, byte 4 ; (sample+coeff)++
fadd dword [ebx] ; ST = autoc[coeff]+d*data[sample+coeff] d
fstp dword [ebx] ; autoc[coeff]+=d*data[sample+coeff] ST = d
add ebx, byte 4 ; coeff++
fld st0 ; ST = d d
fmul dword [eax] ; ST = d*data[sample+coeff] d
add eax, byte 4 ; (sample+coeff)++
fadd dword [ebx] ; ST = autoc[coeff]+d*data[sample+coeff] d
fstp dword [ebx] ; autoc[coeff]+=d*data[sample+coeff] ST = d
add ebx, byte 4 ; coeff++
fld st0 ; ST = d d
fmul dword [eax] ; ST = d*data[sample+coeff] d
add eax, byte 4 ; (sample+coeff)++
fadd dword [ebx] ; ST = autoc[coeff]+d*data[sample+coeff] d
fstp dword [ebx] ; autoc[coeff]+=d*data[sample+coeff] ST = d
add ebx, byte 4 ; coeff++
dec edx
jnz .inner_loop
.inner_end:
pop eax ; restore &data[sample]
fstp st0 ; pop d, ST = empty
inc ebp ; sample++
add eax, byte 4 ; &data[sample++]
dec ecx
jnz .outer_loop
fstp qword [edi + edx * 8]
jmp .outer_loop
.outer_end:
; for(; sample < data_len; sample++) {
; d = data[sample];
; for(coeff = 0; coeff < data_len - sample; coeff++)
; autoc[coeff] += d * data[sample+coeff];
; }
mov ecx, [esp + 28] ; ecx <- lag
dec ecx ; ecx <- lag - 1
jz .end ; skip loop if 0
ALIGN 16
.outer_loop2:
push eax ; save &data[sample], for inner_loop2:
; eax == &data[sample+coeff] <- &data[sample]
fld dword [eax] ; ST = d <- data[sample]
mov edx, esi ; edx <- data_len
sub edx, ebp ; edx <- data_len-sample
mov ebx, edi ; ebx == &autoc[coeff] <- &autoc[0]
ALIGN 16
.inner_loop2:
fld st0 ; ST = d d
fmul dword [eax] ; ST = d*data[sample+coeff] d
add eax, byte 4 ; (sample+coeff)++
fadd dword [ebx] ; ST = autoc[coeff]+d*data[sample+coeff] d
fstp dword [ebx] ; autoc[coeff]+=d*data[sample+coeff] ST = d
add ebx, byte 4 ; coeff++
dec edx
jnz .inner_loop2
pop eax ; restore &data[sample]
fstp st0 ; pop d, ST = empty
inc ebp ; sample++
add eax, byte 4 ; &data[sample++]
dec ecx
jnz .outer_loop2
.end:
pop edi
pop esi
pop ebx
pop ebp
ret
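; The inner coefficient loop above is scheduled as: run (lag % 4) iterations in
; the .warmup_* fall-through, then the remaining lag/4 iterations four at a
; time in .inner_loop. The same structure in C (illustrative sketch only; the
; real code keeps d on the FPU stack and walks raw pointers):
;
;   static void accumulate_one_sample(float d, const float *data_s, float *autoc, unsigned lag)
;   {
;       unsigned n4 = lag / 4, c = 0;
;       switch(lag % 4) {                    /* warmup: falls through 0-3 times */
;       case 3: autoc[c] += d * data_s[c]; c++;
;       case 2: autoc[c] += d * data_s[c]; c++;
;       case 1: autoc[c] += d * data_s[c]; c++;
;       }
;       while(n4--) {                        /* main body, unrolled by 4 */
;           autoc[c+0] += d * data_s[c+0];
;           autoc[c+1] += d * data_s[c+1];
;           autoc[c+2] += d * data_s[c+2];
;           autoc[c+3] += d * data_s[c+3];
;           c += 4;
;       }
;   }
;
;   /* data_s is &data[sample]; this would be called once per sample by the outer loop. */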
; NOTE: this SSE version is not even tested yet and only works for lag == 8
FLAC__lpc_compute_autocorrelation_sse:
; esp + 4 == data[]
; esp + 8 == data_len
; esp + 12 == lag
; esp + 16 == autoc[]
; for(coeff = 0; coeff < lag; coeff++)
; autoc[coeff] = 0.0;
xorps xmm6, xmm6
xorps xmm7, xmm7
mov edx, [esp + 8] ; edx == data_len
mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
add eax, 4
movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
shufps xmm0, xmm0, 0 ; xmm0 = data[0],data[0],data[0],data[0]
movaps xmm1, xmm0 ; xmm1 = data[0],data[0],data[0],data[0]
xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
.warmup: ; xmm3:xmm2 = data[sample-[7..0]]
movaps xmm4, xmm0
movaps xmm5, xmm1 ; xmm5:xmm4 = xmm1:xmm0 = data[sample]*8
mulps xmm4, xmm2
mulps xmm5, xmm3 ; xmm5:xmm4 = xmm1:xmm0 * xmm3:xmm2
addps xmm6, xmm4
addps xmm7, xmm5 ; xmm7:xmm6 += xmm1:xmm0 * xmm3:xmm2
dec edx
;* there's no need to even check for this because we know that lag == 8
;* and data_len >= lag, so our 1-sample warmup cannot finish the loop
; jz .loop_end
ALIGN 16
.loop_8:
; read the next sample
movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
add eax, 4
shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
movaps xmm1, xmm0 ; xmm1 = data[sample],data[sample],data[sample],data[sample]
; now shift the lagged samples
movaps xmm4, xmm2
movaps xmm5, xmm3
shufps xmm2, xmm4, 93h ; rotate xmm2 one lane: 0 <- 3, 1 <- 0, 2 <- 1, 3 <- 2
shufps xmm3, xmm5, 93h ; rotate xmm3 the same way
movss xmm3, xmm2
movss xmm2, xmm0
movaps xmm4, xmm0
movaps xmm5, xmm1 ; xmm5:xmm4 = xmm1:xmm0 = data[sample]*8
mulps xmm4, xmm2
mulps xmm5, xmm3 ; xmm5:xmm4 = xmm1:xmm0 * xmm3:xmm2
addps xmm6, xmm4
addps xmm7, xmm5 ; xmm7:xmm6 += xmm1:xmm0 * xmm3:xmm2
dec edx
jnz .loop_8
.loop_end:
; store autoc
mov edx, [esp + 16] ; edx == autoc
movups [edx], xmm6 ; autoc[0..3]
movups [edx + 16], xmm7 ; autoc[4..7]
.end:
ret
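; What the routine above is aiming at, written as C with SSE intrinsics
; (illustrative sketch for lag == 8 only; two vector accumulators hold
; autoc[0..3] and autoc[4..7], two vectors hold data[sample-0..7], and
; not-yet-seen samples are treated as 0):
;
;   #include <xmmintrin.h>
;
;   static void autoc8_sse_sketch(const float *data, unsigned data_len, float autoc[8])
;   {
;       __m128 acc03 = _mm_setzero_ps();           /* autoc[0..3] */
;       __m128 acc47 = _mm_setzero_ps();           /* autoc[4..7] */
;       __m128 lag03 = _mm_setzero_ps();           /* data[sample-0..3] */
;       __m128 lag47 = _mm_setzero_ps();           /* data[sample-4..7] */
;       unsigned sample;
;       for(sample = 0; sample < data_len; sample++) {
;           __m128 d = _mm_set1_ps(data[sample]);  /* broadcast data[sample] */
;           /* rotate the lagged window by one lane; the lane pushed out of
;              lag03 becomes lane 0 of lag47, then the new sample enters lag03 */
;           lag03 = _mm_shuffle_ps(lag03, lag03, _MM_SHUFFLE(2, 1, 0, 3));
;           lag47 = _mm_shuffle_ps(lag47, lag47, _MM_SHUFFLE(2, 1, 0, 3));
;           lag47 = _mm_move_ss(lag47, lag03);
;           lag03 = _mm_move_ss(lag03, d);
;           acc03 = _mm_add_ps(acc03, _mm_mul_ps(d, lag03));
;           acc47 = _mm_add_ps(acc47, _mm_mul_ps(d, lag47));
;       }
;       _mm_storeu_ps(&autoc[0], acc03);
;       _mm_storeu_ps(&autoc[4], acc47);
;   }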
; **********************************************************************
;
; void FLAC__lpc_restore_signal(const int32 residual[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 data[])
; {
; unsigned i, j;
; int32 sum;
;
; assert(order > 0);
;
; for(i = 0; i < data_len; i++) {
; sum = 0;
; for(j = 0; j < order; j++)
; sum += qlp_coeff[j] * data[i-j-1];
; data[i] = residual[i] + (sum >> lp_quantization);
; }
; }
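; A compilable C form of the pseudocode above (assuming "int32" is a 32-bit
; signed integer, e.g. int32_t). Note that data[] must point past 'order'
; warmup samples, since data[i-j-1] reaches back to data[-order] on the first
; iteration (the asm below reads [edi - 4], [edi - 8], ... accordingly):
;
;   #include <stdint.h>
;   #include <assert.h>
;
;   static void lpc_restore_signal_ref(const int32_t residual[], unsigned data_len,
;                                      const int32_t qlp_coeff[], unsigned order,
;                                      int lp_quantization, int32_t data[])
;   {
;       unsigned i, j;
;       int32_t sum;
;       assert(order > 0);
;       for(i = 0; i < data_len; i++) {
;           sum = 0;
;           for(j = 0; j < order; j++)
;               sum += qlp_coeff[j] * data[(int)i - (int)j - 1];
;           data[i] = residual[i] + (sum >> lp_quantization);
;       }
;   }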
FLAC__lpc_restore_signal_asm:
; [esp + 20] == residual[]
; [esp + 24] == data_len
; [esp + 28] == qlp_coeff[]
; [esp + 32] == order
; [esp + 36] == lp_quantization
; [esp + 40] == data[]
push ebp
push ebx
push esi
push edi
mov esi, [esp + 20] ; esi == residual
mov ebx, [esp + 24] ; ebx == data_len
mov eax, [esp + 32] ; eax == order
mov edi, [esp + 40] ; edi == data
cmp eax, byte 32 ; for order <= 32 there is a faster routine
jbe short .b
; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
ALIGN 16
.loop_i_a
xor ebp, ebp
mov ecx, [esp + 32]
mov edx, ecx
shl edx, 2
add edx, [esp + 28]
neg ecx
ALIGN 16
.loop_j_a
sub edx, 4
mov eax, [edx]
imul eax, [edi + 4 * ecx]
add ebp, eax
inc ecx
jnz .loop_j_a
mov cl, [esp + 36]
sar ebp, cl
add ebp, [esi]
mov [edi], ebp
add edi, byte 4
add esi, byte 4
dec ebx
jnz .loop_i_a
jmp .end
.b
sub esi, edi ; esi <- residual - data, so [esi + edi] below == residual[i]
neg eax ; eax <- -order
lea edx, [eax + eax * 8 + .jumper_0] ; edx <- .jumper_0 - 9*order (each unrolled tap is 9 bytes)
inc edx ; the final tap is only 8 bytes, so step forward one byte
mov eax, [esp + 28] ; eax <- qlp_coeff
xor ebp, ebp ; ebp == sum <- 0
jmp edx ; execute exactly 'order' taps, then fall into .jumper_0
;.jumper_32
mov ecx, [eax + 124] ;32
imul ecx, [edi - 128]
add ebp, ecx
mov ecx, [eax + 120]
imul ecx, [edi - 124]
add ebp, ecx
mov ecx, [eax + 116] ;30
imul ecx, [edi - 120]
add ebp, ecx
mov ecx, [eax + 112]
imul ecx, [edi - 116]
add ebp, ecx
mov ecx, [eax + 108]
imul ecx, [edi - 112]
add ebp, ecx
mov ecx, [eax + 104]
imul ecx, [edi - 108]
add ebp, ecx
mov ecx, [eax + 100]
imul ecx, [edi - 104]
add ebp, ecx
mov ecx, [eax + 96] ;25
imul ecx, [edi - 100]
add ebp, ecx
mov ecx, [eax + 92]
imul ecx, [edi - 96]
add ebp, ecx
mov ecx, [eax + 88]
imul ecx, [edi - 92]
add ebp, ecx
mov ecx, [eax + 84]
imul ecx, [edi - 88]
add ebp, ecx
mov ecx, [eax + 80]
imul ecx, [edi - 84]
add ebp, ecx
mov ecx, [eax + 76] ;20
imul ecx, [edi - 80]
add ebp, ecx
mov ecx, [eax + 72]
imul ecx, [edi - 76]
add ebp, ecx
mov ecx, [eax + 68]
imul ecx, [edi - 72]
add ebp, ecx
mov ecx, [eax + 64]
imul ecx, [edi - 68]
add ebp, ecx
mov ecx, [eax + 60]
imul ecx, [edi - 64]
add ebp, ecx
mov ecx, [eax + 56] ;15
imul ecx, [edi - 60]
add ebp, ecx
mov ecx, [eax + 52]
imul ecx, [edi - 56]
add ebp, ecx
mov ecx, [eax + 48]
imul ecx, [edi - 52]
add ebp, ecx
mov ecx, [eax + 44]
imul ecx, [edi - 48]
add ebp, ecx
mov ecx, [eax + 40]
imul ecx, [edi - 44]
add ebp, ecx
mov ecx, [eax + 36] ;10
imul ecx, [edi - 40]
add ebp, ecx
mov ecx, [eax + 32]
imul ecx, [edi - 36]
add ebp, ecx
mov ecx, [eax + 28]
imul ecx, [edi - 32]
add ebp, ecx
mov ecx, [eax + 24]
imul ecx, [edi - 28]
add ebp, ecx
mov ecx, [eax + 20]
imul ecx, [edi - 24]
add ebp, ecx
mov ecx, [eax + 16] ;5
imul ecx, [edi - 20]
add ebp, ecx
mov ecx, [eax + 12]
imul ecx, [edi - 16]
add ebp, ecx
mov ecx, [eax + 8]
imul ecx, [edi - 12]
add ebp, ecx
mov ecx, [eax + 4]
imul ecx, [edi - 8]
add ebp, ecx
mov ecx, [eax] ; no disp8 byte here, so this last tap is 8 bytes instead of 9 (hence the 'inc edx' above)
imul ecx, [edi - 4]
add ebp, ecx
.jumper_0
mov cl, [esp + 36]
sar ebp, cl
add ebp, [esi + edi]
mov [edi], ebp
add edi, byte 4
dec ebx
jz short .end
xor ebp, ebp
jmp edx
.end
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
pop ebp
ret
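; How the order <= 32 path works: the 32 coefficient taps are fully unrolled
; before .jumper_0, and each tap (mov ecx, [eax + disp8] / imul ecx, [edi - disp8]
; / add ebp, ecx) encodes to 9 bytes, so 'lea edx, [eax + eax * 8 + .jumper_0]'
; with eax == -order points 9*order bytes before .jumper_0; the 'inc edx'
; compensates for the last tap ('mov ecx, [eax]', no displacement byte) being
; only 8 bytes. Jumping to edx therefore executes exactly 'order' taps per
; sample. The same trick in C is a Duff's-device-style switch that falls
; through the right number of multiply-accumulates (illustrative sketch,
; limited to order <= 8 to keep it short):
;
;   #include <stdint.h>
;
;   static int32_t lpc_predict_sketch(const int32_t *qlp_coeff, const int32_t *data, unsigned order)
;   {
;       int32_t sum = 0;
;       switch(order) {                           /* "jump into" the unrolled block... */
;       case 8: sum += qlp_coeff[7] * data[-8];   /* ...then fall through */
;       case 7: sum += qlp_coeff[6] * data[-7];
;       case 6: sum += qlp_coeff[5] * data[-6];
;       case 5: sum += qlp_coeff[4] * data[-5];
;       case 4: sum += qlp_coeff[3] * data[-4];
;       case 3: sum += qlp_coeff[2] * data[-3];
;       case 2: sum += qlp_coeff[1] * data[-2];
;       case 1: sum += qlp_coeff[0] * data[-1];
;       }
;       return sum;                               /* caller applies >> lp_quantization */
;   }
;
;   /* data points at the sample being reconstructed, i.e. &data[i] in the pseudocode. */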