mirror of
https://github.com/claunia/flac.git
synced 2025-12-16 18:54:26 +00:00
add Miroslav's versions of FLAC__lpc_compute_residual_from_qlp_coefficients
This commit is contained in:
@@ -22,6 +22,8 @@
|
||||
|
||||
cglobal FLAC__lpc_compute_autocorrelation_asm_i386
|
||||
cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse
|
||||
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386
|
||||
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx
|
||||
cglobal FLAC__lpc_restore_signal_asm_i386
|
||||
cglobal FLAC__lpc_restore_signal_asm_i386_mmx
|
||||
|
||||
@@ -268,6 +270,391 @@ cident FLAC__lpc_compute_autocorrelation_asm_i386_sse:
|
||||
.end:
|
||||
ret
|
||||
|
||||
;void FLAC__lpc_compute_residual_from_qlp_coefficients(const int32 data[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 residual[])
;
;	for(i = 0; i < data_len; i++) {
;		sum = 0;
;		for(j = 0; j < order; j++)
;			sum += qlp_coeff[j] * data[i-j-1];
;		residual[i] = data[i] - (sum >> lp_quantization);
;	}
;
; Stack-argument (cdecl style) i386 routine; ebp, ebx, esi and edi are saved
; and restored, eax, ecx and edx are used as scratch.
;
; Preconditions visible from the code:
;   - data[-order .. -1] must be readable (warm-up samples before data[0]).
;   - order == 0 is not handled specially: it takes the order-1 path and
;     therefore still reads qlp_coeff[0] and data[-1].
;     NOTE(review): callers presumably guarantee order > 0 -- confirm.
;   - data_len == 0 returns immediately without touching residual[].
;
; The local label .begin is also an entry point for the _mmx variant, which
; jumps here with esi/edi/eax/ebx already loaded, the same four registers
; pushed, and ebx != 0.
;
	ALIGN	16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386:
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = data[]
	mov	edi, [esp + 40]			; edi = residual[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	; Every loop below is a do-while driven by "dec ebx / jnz", so a zero
	; count would wrap around and run ~2^32 iterations. Bail out first.
	test	ebx, ebx
	jz	near .end			; nothing to do if data_len == 0

.begin:
	cmp	eax, byte 1
	jg	short .i_1more

	; ---- order <= 1: one multiply per sample -------------------------
	mov	ecx, [esp + 28]
	mov	edx, [ecx]			; edx = qlp_coeff[0]
	mov	eax, [esi - 4]			; eax = data[-1]
	mov	cl, [esp + 36]			; cl = lp_quantization
	ALIGN	16
.i_1_loop_i:
	imul	eax, edx			; eax = qlp_coeff[0] * data[i-1]
	sar	eax, cl
	neg	eax
	add	eax, [esi]			; eax = data[i] - (sum >> lp_quantization)
	mov	[edi], eax			; residual[i] = eax
	mov	eax, [esi]			; carry data[i] into the next iteration
	add	edi, byte 4
	add	esi, byte 4
	dec	ebx
	jnz	.i_1_loop_i

	jmp	.end

.i_1more:
	cmp	eax, byte 32			; for order <= 32 there is a faster routine
	jbe	short .i_32

	; ---- order > 32: generic two-level loop --------------------------
	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN	16
.i_32more_loop_i:
	xor	ebp, ebp			; ebp = sum = 0
	mov	ecx, [esp + 32]
	mov	edx, ecx
	shl	edx, 2
	add	edx, [esp + 28]			; edx = &qlp_coeff[order]
	neg	ecx				; ecx = -order, counts up to 0
	ALIGN	16
.i_32more_loop_j:
	sub	edx, byte 4			; walk qlp_coeff[] downwards...
	mov	eax, [edx]
	imul	eax, [esi + 4 * ecx]		; ...while data[i + ecx] walks upwards
	add	ebp, eax
	inc	ecx
	jnz	short .i_32more_loop_j

	mov	cl, [esp + 36]			; cl = lp_quantization
	sar	ebp, cl
	neg	ebp
	add	ebp, [esi]
	mov	[edi], ebp			; residual[i] = data[i] - (sum >> lp_quantization)
	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.i_32more_loop_i

	jmp	.end

	; ---- 2 <= order <= 32: jump into an unrolled multiply chain -------
	; Each mov/imul/add group below encodes to 9 bytes, except the last one
	; ("mov ecx, [eax]" has no displacement byte) which is 8 bytes. The
	; entry point for a given order is therefore .jumper_0 - 9*order + 1.
	; Do not insert, remove or re-encode anything between "jmp edx" and
	; .jumper_0 without recomputing this.
	; NOTE(review): .jumper_0 is used as an absolute address here, so this
	; needs a load-time relocation if the object is built position-independent.
.i_32:
	sub	edi, esi			; edi = residual - data, so [edi + esi] is residual[i]
	neg	eax
	lea	edx, [eax + eax * 8 + .jumper_0]	; edx = .jumper_0 - 9*order
	inc	edx				; compensate for the one missing byte
	mov	eax, [esp + 28]			; eax = qlp_coeff[]
	xor	ebp, ebp			; ebp = sum = 0
	jmp	edx

	mov	ecx, [eax + 124]
	imul	ecx, [esi - 128]
	add	ebp, ecx
	mov	ecx, [eax + 120]
	imul	ecx, [esi - 124]
	add	ebp, ecx
	mov	ecx, [eax + 116]
	imul	ecx, [esi - 120]
	add	ebp, ecx
	mov	ecx, [eax + 112]
	imul	ecx, [esi - 116]
	add	ebp, ecx
	mov	ecx, [eax + 108]
	imul	ecx, [esi - 112]
	add	ebp, ecx
	mov	ecx, [eax + 104]
	imul	ecx, [esi - 108]
	add	ebp, ecx
	mov	ecx, [eax + 100]
	imul	ecx, [esi - 104]
	add	ebp, ecx
	mov	ecx, [eax + 96]
	imul	ecx, [esi - 100]
	add	ebp, ecx
	mov	ecx, [eax + 92]
	imul	ecx, [esi - 96]
	add	ebp, ecx
	mov	ecx, [eax + 88]
	imul	ecx, [esi - 92]
	add	ebp, ecx
	mov	ecx, [eax + 84]
	imul	ecx, [esi - 88]
	add	ebp, ecx
	mov	ecx, [eax + 80]
	imul	ecx, [esi - 84]
	add	ebp, ecx
	mov	ecx, [eax + 76]
	imul	ecx, [esi - 80]
	add	ebp, ecx
	mov	ecx, [eax + 72]
	imul	ecx, [esi - 76]
	add	ebp, ecx
	mov	ecx, [eax + 68]
	imul	ecx, [esi - 72]
	add	ebp, ecx
	mov	ecx, [eax + 64]
	imul	ecx, [esi - 68]
	add	ebp, ecx
	mov	ecx, [eax + 60]
	imul	ecx, [esi - 64]
	add	ebp, ecx
	mov	ecx, [eax + 56]
	imul	ecx, [esi - 60]
	add	ebp, ecx
	mov	ecx, [eax + 52]
	imul	ecx, [esi - 56]
	add	ebp, ecx
	mov	ecx, [eax + 48]
	imul	ecx, [esi - 52]
	add	ebp, ecx
	mov	ecx, [eax + 44]
	imul	ecx, [esi - 48]
	add	ebp, ecx
	mov	ecx, [eax + 40]
	imul	ecx, [esi - 44]
	add	ebp, ecx
	mov	ecx, [eax + 36]
	imul	ecx, [esi - 40]
	add	ebp, ecx
	mov	ecx, [eax + 32]
	imul	ecx, [esi - 36]
	add	ebp, ecx
	mov	ecx, [eax + 28]
	imul	ecx, [esi - 32]
	add	ebp, ecx
	mov	ecx, [eax + 24]
	imul	ecx, [esi - 28]
	add	ebp, ecx
	mov	ecx, [eax + 20]
	imul	ecx, [esi - 24]
	add	ebp, ecx
	mov	ecx, [eax + 16]
	imul	ecx, [esi - 20]
	add	ebp, ecx
	mov	ecx, [eax + 12]
	imul	ecx, [esi - 16]
	add	ebp, ecx
	mov	ecx, [eax + 8]
	imul	ecx, [esi - 12]
	add	ebp, ecx
	mov	ecx, [eax + 4]
	imul	ecx, [esi - 8]
	add	ebp, ecx
	mov	ecx, [eax]			;there is one byte missing
	imul	ecx, [esi - 4]
	add	ebp, ecx
.jumper_0:

	mov	cl, [esp + 36]			; cl = lp_quantization
	sar	ebp, cl
	neg	ebp
	add	ebp, [esi]
	mov	[edi + esi], ebp		; residual[i] = data[i] - (sum >> lp_quantization)
	add	esi, byte 4

	dec	ebx
	jz	short .end
	xor	ebp, ebp			; reset sum and re-enter the chain for the next sample
	jmp	edx

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
|
||||
|
||||
; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel must be <= 16.  Especially note that this routine cannot be used
; for side-channel coded 16bps channels since the effective bps is 17.
;
; MMX variant of FLAC__lpc_compute_residual_from_qlp_coefficients: same
; arguments and stack layout as the plain i386 routine. Two residuals are
; produced per main-loop iteration; an odd trailing sample (or data_len == 1)
; is finished by jumping into the scalar routine's .begin label.
;
; What the code does with its inputs:
;   - Only the low 16 bits of each qlp_coeff[] entry are used (push word).
;   - History samples are narrowed to 16 bits (packssdw saturates; the
;     shift/or window update keeps only the low word).
;   - data[-4 .. -1] is always read, whatever the order, and for order > 4 the
;     inner loop reads back to data[i - order rounded up to a multiple of 4].
;     NOTE(review): callers must make that much history readable -- confirm.
;   - order == 0 is not handled (the coefficient copy loop would not
;     terminate sensibly). NOTE(review): presumably asserted by callers.
;   - data_len == 0 returns immediately without touching residual[].
;
	ALIGN	16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx:
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = data[]
	mov	edi, [esp + 40]			; edi = residual[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	; Without this check "dec ebx" below would turn 0 into 0xffffffff: the
	; MMX loop would store two residuals out of bounds and the scalar tail
	; would then run with a wrapped-around count.
	test	ebx, ebx
	jz	near .end			; nothing to do if data_len == 0

	dec	ebx				; ebx = data_len - 1
	test	ebx, ebx
	jz	near .last_one			; a single sample goes straight to the scalar code

	mov	edx, [esp + 28]			; edx = qlp_coeff[]
	movd	mm6, [esp + 36]			; mm6 = lp_quantization (shift count for psrad)
	mov	ebp, esp			; remember esp; restored at .mmx_end

	and	esp, 0xfffffff8			; 8-byte align the coefficient scratch area

	; Push the low word of qlp_coeff[0..order-1]; the stack grows down, so
	; qlp_coeff[0] ends up at the highest address.
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	; Pad with zero words until the count is a multiple of 4; eax becomes the
	; padded order.
	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = qlp_coeff[3..0] as four 16-bit words
	movd	mm4, [esi - 16]
	punpckldq	mm4, [esi - 12]
	movd	mm0, [esi - 8]
	punpckldq	mm0, [esi - 4]
	packssdw	mm4, mm0		; mm4 = data[-4..-1] packed to 16-bit words

	cmp	eax, byte 4
	jnbe	short .mmx_4more

	; ---- padded order == 4: only the four coefficients in mm5 ---------
	align	16
.mmx_4_loop_i:
	movd	mm1, [esi]
	movq	mm3, mm4			; mm3 = history window for sample i
	punpckldq	mm1, [esi + 4]		; mm1 = data[i+1]:data[i]
	psrlq	mm4, 16
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0			; slide window by one: append low word of data[i]
	movq	mm2, mm4			; mm2 = history window for sample i+1
	psrlq	mm4, 16
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5
	pmaddwd	mm2, mm5
	psllq	mm0, 16
	por	mm4, mm0			; slide again: append low word of data[i+1]
	movq	mm0, mm3
	punpckldq	mm3, mm2
	punpckhdq	mm0, mm2
	paddd	mm3, mm0			; mm3 = sum(i+1):sum(i)
	psrad	mm3, mm6			; >> lp_quantization
	psubd	mm1, mm3			; mm1 = data - (sum >> lp_quantization)
	movd	[edi], mm1			; residual[i]
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1			; residual[i+1]

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2
	jg	.mmx_4_loop_i
	jmp	.mmx_end

	; ---- padded order > 4: mm5 handles the first four coefficients, the
	; inner loop walks the remaining ones four at a time -----------------
.mmx_4more:
	shl	eax, 2
	neg	eax
	add	eax, byte 16			; eax = 16 - 4 * padded order (byte offset, <= 0)

	align	16
.mmx_4more_loop_i:
	movd	mm1, [esi]
	punpckldq	mm1, [esi + 4]		; mm1 = data[i+1]:data[i]
	movq	mm3, mm4			; mm3 = history window for sample i
	psrlq	mm4, 16
	movq	mm0, mm1
	psllq	mm0, 48
	por	mm4, mm0			; slide window by one: append low word of data[i]
	movq	mm2, mm4			; mm2 = history window for sample i+1
	psrlq	mm4, 16
	pxor	mm0, mm0
	punpckhdq	mm0, mm1
	pmaddwd	mm3, mm5
	pmaddwd	mm2, mm5
	psllq	mm0, 16
	por	mm4, mm0			; slide again: append low word of data[i+1]

	mov	ecx, esi
	add	ecx, eax			; ecx = oldest data position for this sample pair
	mov	edx, esp			; edx = packed coefficients, lowest address first

	align	16
.mmx_4more_loop_j:
	movd	mm0, [ecx - 16]
	movd	mm7, [ecx - 8]
	punpckldq	mm0, [ecx - 12]
	punpckldq	mm7, [ecx - 4]
	packssdw	mm0, mm7		; four history samples for sample i
	pmaddwd	mm0, [edx]
	punpckhdq	mm7, mm7
	paddd	mm3, mm0
	movd	mm0, [ecx - 12]
	punpckldq	mm0, [ecx - 8]
	punpckldq	mm7, [ecx]
	packssdw	mm0, mm7		; same window shifted by one, for sample i+1
	pmaddwd	mm0, [edx]
	paddd	mm2, mm0

	add	edx, byte 8			; next four coefficients
	add	ecx, byte 16			; next four samples
	cmp	ecx, esi
	jnz	.mmx_4more_loop_j

	movq	mm0, mm3
	punpckldq	mm3, mm2
	punpckhdq	mm0, mm2
	paddd	mm3, mm0			; mm3 = sum(i+1):sum(i)
	psrad	mm3, mm6			; >> lp_quantization
	psubd	mm1, mm3			; mm1 = data - (sum >> lp_quantization)
	movd	[edi], mm1			; residual[i]
	punpckhdq	mm1, mm1
	movd	[edi + 4], mm1			; residual[i+1]

	add	edi, byte 8
	add	esi, byte 8

	sub	ebx, 2
	jg	near .mmx_4more_loop_i

.mmx_end:
	emms					; leave MMX state before returning / running scalar code
	mov	esp, ebp			; drop the coefficient scratch area
.last_one:
	; ebx is -1 here when every sample was handled in pairs and 0 when one
	; sample is left, so after the inc it is exactly the remaining count.
	mov	eax, [esp + 32]			; eax = order (reloaded; eax was reused above)
	inc	ebx
	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386.begin

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
|
||||
|
||||
; **********************************************************************
|
||||
;
|
||||
; void FLAC__lpc_restore_signal(const int32 residual[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 data[])
|
||||
|
||||
Reference in New Issue
Block a user