; Source: mirror of https://github.com/claunia/flac.git (synced 2025-12-16 18:54:26 +00:00)
; 771 lines, 18 KiB, NASM
; libFLAC - Free Lossless Audio Codec library
|
|
; Copyright (C) 2001 Josh Coalson
|
|
;
|
|
; This library is free software; you can redistribute it and/or
|
|
; modify it under the terms of the GNU Library General Public
|
|
; License as published by the Free Software Foundation; either
|
|
; version 2 of the License, or (at your option) any later version.
|
|
;
|
|
; This library is distributed in the hope that it will be useful,
|
|
; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
; Library General Public License for more details.
|
|
;
|
|
; You should have received a copy of the GNU Library General Public
|
|
; License along with this library; if not, write to the
|
|
; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
; Boston, MA 02111-1307, USA.
|
|
|
|
; [CR] is a note to flag that the instruction can be easily reordered
|
|
|
|
%include "nasm.h"
|
|
|
|
; NOTE(review): data_section is not defined in this file; presumably a macro from nasm.h -- confirm
data_section
|
|
|
|
; NOTE(review): cglobal presumably exports the symbol below to C callers (nasm.h macro) -- confirm
cglobal FLAC__lpc_compute_autocorrelation_asm
|
|
|
|
; NOTE(review): code_section presumably switches to the code section (nasm.h macro) -- confirm
code_section
|
|
|
|
; **********************************************************************
|
|
;
|
|
; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
|
|
; {
|
|
; FLAC__real d;
|
|
; unsigned sample, coeff;
|
|
; const unsigned limit = data_len - lag;
|
|
;
|
|
; assert(lag > 0);
|
|
; assert(lag <= data_len);
|
|
;
|
|
; for(coeff = 0; coeff < lag; coeff++)
|
|
; autoc[coeff] = 0.0;
|
|
; for(sample = 0; sample <= limit; sample++){
|
|
; d = data[sample];
|
|
; for(coeff = 0; coeff < lag; coeff++)
|
|
; autoc[coeff] += d * data[sample+coeff];
|
|
; }
|
|
; for(; sample < data_len; sample++){
|
|
; d = data[sample];
|
|
; for(coeff = 0; coeff < data_len - sample; coeff++)
|
|
; autoc[coeff] += d * data[sample+coeff];
|
|
; }
|
|
; }
|
|
;
|
|
; Entry point. After the prologue the four stack arguments are addressed
; through ebp:
;	[ebp]		data
;	[ebp + 4]	data_len
;	[ebp + 8]	lag
;	[ebp + 12]	autoc
; ebp, ebx, esi and edi are saved here and restored at .end.
FLAC__lpc_compute_autocorrelation_asm:
	push	ebp
	lea	ebp, [esp + 8]			; step over saved ebp + return address: ebp -> data argument
	push	ebx
	push	esi
	push	edi

	mov	esi, [ebp]			; esi = data
	mov	ecx, [ebp + 4]			; ecx = data_len
	mov	edx, [ebp + 8]			; edx = lag
	mov	edi, [ebp + 12]			; edi = autoc

	; dispatch on lag: each unrolled path handles one value, larger values fall through the chain
	cmp	edx, 1
	ja	short .lag_above_1
.lag_eq_1:
	; lag == 1: autoc[0] is the sum of the squares of all data_len samples
	fldz					; st0 = running autoc[0]
	ALIGN 16
.lag_1_main:
	fld	dword [esi]			; st0 = d = data[sample]
	fmul	st0, st0			; st0 = d * d
	add	esi, byte 4			; [CR] sample++
	faddp	st1, st0			; autoc[0] += d * d
	dec	ecx
	jnz	.lag_1_main
	fstp	dword [edi]			; write autoc[0]; FPU stack is empty again
	jmp	.end
|
|
|
|
.lag_above_1:
	cmp	edx, 2
	ja	short .lag_above_2
.lag_eq_2:
	; lag == 2.  esi = data, edi = autoc, ecx = data_len.
	; The current sample stays on the FPU stack between iterations, so each
	; sample is read from memory once.
	fldz					; future autoc[1]
	fldz					; future autoc[0]
	fld	dword [esi]			; st0 = data[0], st1 = autoc[0], st2 = autoc[1]
	dec	ecx				; data_len - 1 iterations, one per adjacent pair
	ALIGN 16
.lag_2_main:
	add	esi, byte 4			; [CR] sample++
	fld	st0				; duplicate d
	fmul	st0, st0			; d * d
	faddp	st2, st0			; autoc[0] += d * d
	fld	dword [esi]			; st0 = next sample, st1 = d
	fmul	st1, st0			; st1 = d * next
	fxch					; st0 = d * next, st1 = next
	faddp	st3, st0			; autoc[1] += d * next; next becomes the new d
	dec	ecx
	jnz	.lag_2_main
	; the last sample only contributes its square
	fmul	st0, st0
	faddp	st1, st0			; autoc[0] += d * d
	fstp	dword [edi]			; autoc[0]
	fstp	dword [edi + 4]			; autoc[1]
	jmp	.end
|
|
|
|
.lag_above_2:
	cmp	edx, 3
	ja	short .lag_above_3
.lag_eq_3:
	; lag == 3.  esi = data, edi = autoc, ecx = data_len.
	; Accumulators live on the FPU stack: st0 = autoc[0], st1 = autoc[1], st2 = autoc[2].
	fldz					; future autoc[2]
	fldz					; future autoc[1]
	fldz					; future autoc[0]
	sub	ecx, byte 2			; main loop runs data_len - 2 times; the tail finishes the rest
	ALIGN 16
.lag_3_main:
	fld	dword [esi]			; d = data[sample]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; autoc[0] += d * d
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; autoc[1] += d * data[sample + 1]
	fld	dword [esi + 8]
	fmulp	st1, st0			; consumes d
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; autoc[2] += d * data[sample + 2]
	dec	ecx
	jnz	.lag_3_main
	; tail: the last two samples, which have fewer than three terms each
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; autoc[0] += d * d
	fld	dword [esi + 4]			; st0 = final sample, st1 = d
	fmul	st1, st0
	fxch
	faddp	st3, st0			; autoc[1] += d * final
	fmul	st0, st0
	faddp	st1, st0			; autoc[0] += final * final
	fstp	dword [edi]			; autoc[0]
	fstp	dword [edi + 4]			; autoc[1]
	fstp	dword [edi + 8]			; autoc[2]
	jmp	.end
|
|
|
|
.lag_above_3:
	cmp	edx, 4
	ja	near .lag_above_4
.lag_eq_4:
	; lag == 4.  esi = data, edi = autoc, ecx = data_len.
	; Accumulators live on the FPU stack: st0..st3 = autoc[0..3].
	fldz					; future autoc[3]
	fldz					; future autoc[2]
	fldz					; future autoc[1]
	fldz					; future autoc[0]
	sub	ecx, byte 3			; main loop runs data_len - 3 times; the tail finishes the rest
	ALIGN 16
.lag_4_main:
	fld	dword [esi]			; d = data[sample]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; autoc[0] += d * d
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; autoc[1] += d * data[sample + 1]
	fld	dword [esi + 8]
	fmul	st0, st1
	faddp	st4, st0			; autoc[2] += d * data[sample + 2]
	fld	dword [esi + 12]
	fmulp	st1, st0			; consumes d
	add	esi, byte 4			; [CR] sample++
	faddp	st4, st0			; autoc[3] += d * data[sample + 3]
	dec	ecx
	jnz	.lag_4_main

	; tail, three terms left for this sample
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; autoc[1]
	fld	dword [esi + 8]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; autoc[2]

	; tail, last two samples
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; autoc[0]
	fld	dword [esi + 4]			; st0 = final sample, st1 = d
	fmul	st1, st0
	fxch
	faddp	st3, st0			; autoc[1]
	fmul	st0, st0
	faddp	st1, st0			; autoc[0] += final * final

	fstp	dword [edi]			; autoc[0]
	fstp	dword [edi + 4]			; autoc[1]
	fstp	dword [edi + 8]			; autoc[2]
	fstp	dword [edi + 12]		; autoc[3]
	jmp	.end
|
|
|
|
.lag_above_4:
	cmp	edx, 5
	ja	near .lag_above_5
.lag_eq_5:
	; lag == 5.  esi = data, edi = autoc, ecx = data_len.
	; Accumulators live on the FPU stack: st0..st4 = autoc[0..4].
	sub	ecx, byte 4			; main loop runs data_len - 4 times; the tail finishes the rest
	fldz					; future autoc[4]
	fldz					; future autoc[3]
	fldz					; future autoc[2]
	fldz					; future autoc[1]
	fldz					; future autoc[0]
	ALIGN 16
.lag_5_main:
	fld	dword [esi]			; d = data[sample]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; autoc[0] += d * d
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; autoc[1]
	fld	dword [esi + 8]
	fmul	st0, st1
	faddp	st4, st0			; autoc[2]
	fld	dword [esi + 12]
	fmul	st0, st1
	faddp	st5, st0			; autoc[3]
	fld	dword [esi + 16]
	fmulp	st1, st0			; consumes d
	add	esi, byte 4			; [CR] sample++
	faddp	st5, st0			; autoc[4]
	dec	ecx
	jnz	.lag_5_main

	; tail, four terms
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; autoc[1]
	fld	dword [esi + 8]
	fmul	st0, st1
	faddp	st4, st0			; autoc[2]
	fld	dword [esi + 12]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st4, st0			; autoc[3]

	; tail, three terms
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; autoc[1]
	fld	dword [esi + 8]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; autoc[2]

	; tail, last two samples
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; autoc[0]
	fld	dword [esi + 4]			; st0 = final sample, st1 = d
	fmul	st1, st0
	fxch
	faddp	st3, st0			; autoc[1]
	fmul	st0, st0
	faddp	st1, st0			; autoc[0] += final * final

	fstp	dword [edi]			; autoc[0]
	fstp	dword [edi + 4]			; autoc[1]
	fstp	dword [edi + 8]			; autoc[2]
	fstp	dword [edi + 12]		; autoc[3]
	fstp	dword [edi + 16]		; autoc[4]
	jmp	.end
|
|
|
|
.lag_above_5:
	cmp	edx, 6
	ja	near .lag_above_6		; target is far beyond short-jump range; spelled "near" like the lag 4 / lag 5 dispatches
.lag_eq_6:
	; lag == 6.  esi = data, edi = autoc, ecx = data_len.
	; Accumulators live on the FPU stack: st0..st5 = autoc[0..5].  Together
	; with d and one product the loop uses all eight x87 registers.
	fldz					; will accumulate autoc[5]
	fldz					; will accumulate autoc[4]
	fldz					; will accumulate autoc[3]
	fldz					; will accumulate autoc[2]
	fldz					; will accumulate autoc[1]
	fldz					; will accumulate autoc[0]
	sub	ecx, byte 5			; main loop runs data_len - 5 times; the tail finishes the rest
	ALIGN 16
.lag_6_loop:
	fld	dword [esi]			; d = data[sample]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[2]
	fld	dword [esi + 12]
	fmul	st0, st1
	faddp	st5, st0			; add to autoc[3]
	fld	dword [esi + 16]
	fmul	st0, st1
	faddp	st6, st0			; add to autoc[4]
	fld	dword [esi + 20]
	fmulp	st1, st0			; consumes d
	add	esi, byte 4			; [CR] sample++
	faddp	st6, st0			; add to autoc[5]
	dec	ecx
	jnz	.lag_6_loop

	; clean up the leftovers: five terms
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[2]
	fld	dword [esi + 12]
	fmul	st0, st1
	faddp	st5, st0			; add to autoc[3]
	fld	dword [esi + 16]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st5, st0			; add to autoc[4]

	; four terms
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmul	st0, st1
	faddp	st4, st0			; add to autoc[2]
	fld	dword [esi + 12]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st4, st0			; add to autoc[3]

	; three terms
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]
	fmul	st0, st1
	faddp	st3, st0			; add to autoc[1]
	fld	dword [esi + 8]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; add to autoc[2]

	; last two samples
	fld	dword [esi]
	fld	st0
	fmul	st0, st0
	faddp	st2, st0			; add to autoc[0]
	fld	dword [esi + 4]			; st0 = final sample, st1 = d
	fmul	st1, st0
	fxch
	faddp	st3, st0			; add to autoc[1]
	fmul	st0, st0
	faddp	st1, st0			; add to autoc[0]

	fstp	dword [edi]			; autoc[0]
	fstp	dword [edi + 4]			; autoc[1]
	fstp	dword [edi + 8]			; autoc[2]
	fstp	dword [edi + 12]		; autoc[3]
	fstp	dword [edi + 16]		; autoc[4]
	fstp	dword [edi + 20]		; autoc[5]
	jmp	.end
|
|
|
|
.lag_above_6:
	; Generic path for lag > 6.  On entry: esi = data, edi = autoc, edx = lag.
	; It follows the three reference C loops quoted in the comments below.
	;
	;	for(coeff = 0; coeff < lag; coeff++)
	;		autoc[coeff] = 0.0;
	; Every autoc access in this routine is dword-sized, so exactly lag
	; dwords are cleared.  A count of lag * 2 would make rep stosd write
	; lag dwords past autoc[lag - 1].
	mov	ecx, edx			; ecx = lag = # of dwords of 0 to write
	xor	eax, eax
	rep	stosd
	mov	edi, [ebp + 12]			; rep stosd advanced edi; edi == autoc again
	;	const unsigned limit = data_len - lag;
	mov	ecx, [ebp + 4]			; ecx == data_len
	sub	ecx, edx
	inc	ecx				; we are looping <= limit so we add one to the counter
	;	for(sample = 0; sample <= limit; sample++){
	;		d = data[sample];
	;		for(coeff = 0; coeff < lag; coeff++)
	;			autoc[coeff] += d * data[sample+coeff];
	;	}
	xor	eax, eax			; eax == sample <- 0
	ALIGN 16
.outer_loop:
	push	eax				; save sample (eax doubles as coeff in the inner loop)
	fld	dword [esi + eax * 4]		; ST = d <- data[sample]
	mov	ebx, eax			; ebx == sample+coeff <- sample
	mov	edx, [ebp + 8]			; edx <- lag (inner trip count)
	xor	eax, eax			; eax == coeff <- 0
	ALIGN 16
.inner_loop:
	fld	st0				; ST = d d
	fmul	dword [esi + ebx * 4]		; ST = d*data[sample+coeff] d
	fadd	dword [edi + eax * 4]		; ST = autoc[coeff]+d*data[sample+coeff] d
	fstp	dword [edi + eax * 4]		; autoc[coeff]+=d*data[sample+coeff]  ST = d
	inc	ebx				; (sample+coeff)++
	inc	eax				; coeff++
	dec	edx
	jnz	.inner_loop
	pop	eax				; restore sample
	fstp	st0				; pop d, ST = empty
	inc	eax				; sample++
	dec	ecx				; dec/jnz instead of "loop", like every other loop in this file
	jnz	.outer_loop
	;	for(; sample < data_len; sample++){
	;		d = data[sample];
	;		for(coeff = 0; coeff < data_len - sample; coeff++)
	;			autoc[coeff] += d * data[sample+coeff];
	;	}
	mov	ecx, [ebp + 8]			; ecx <- lag
	dec	ecx				; ecx <- lag - 1 samples remain after the first loop
	jz	.outer_end			; skip loop if 0
.outer_loop2:
	push	eax				; save sample
	fld	dword [esi + eax * 4]		; ST = d <- data[sample]
	mov	ebx, eax			; ebx == sample+coeff <- sample
	mov	edx, [ebp + 4]			; edx <- data_len
	sub	edx, eax			; edx <- data_len-sample (inner trip count)
	xor	eax, eax			; eax == coeff <- 0
.inner_loop2:
	fld	st0				; ST = d d
	fmul	dword [esi + ebx * 4]		; ST = d*data[sample+coeff] d
	fadd	dword [edi + eax * 4]		; ST = autoc[coeff]+d*data[sample+coeff] d
	fstp	dword [edi + eax * 4]		; autoc[coeff]+=d*data[sample+coeff]  ST = d
	inc	ebx				; (sample+coeff)++
	inc	eax				; coeff++
	dec	edx
	jnz	.inner_loop2
	pop	eax				; restore sample
	fstp	st0				; pop d, ST = empty
	inc	eax				; sample++
	dec	ecx
	jnz	.outer_loop2
.outer_end:
	jmp	.end
|
|
|
|
.lag_eq_6_plus_1:
	; Accumulates autoc[6] only.
	; NOTE(review): no instruction in this function branches to this label.
	mov	esi, [ebp]			; esi = data
	mov	ecx, [ebp + 4]			; ecx = data_len
	mov	edi, [ebp + 12]			; edi = autoc
	sub	ecx, byte 6			; data_len - 6 products feed autoc[6]
	fldz					; st0 = running autoc[6]
	ALIGN 16
.lag_6_1_main:
	fld	dword [esi]			; data[sample]
	fld	dword [esi + 24]		; data[sample + 6]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st1, st0			; autoc[6] += data[sample] * data[sample + 6]
	dec	ecx
	jnz	.lag_6_1_main
	fstp	dword [edi + 24]		; autoc[6]
	jmp	.end
|
|
|
|
.lag_eq_6_plus_2:
	; Accumulates autoc[6] and autoc[7] only (st0 = autoc[6], st1 = autoc[7]).
	; NOTE(review): no instruction in this function branches to this label.
	mov	esi, [ebp]			; esi = data
	mov	ecx, [ebp + 4]			; ecx = data_len
	mov	edi, [ebp + 12]			; edi = autoc
	sub	ecx, byte 7			; main loop runs data_len - 7 times
	fldz					; future autoc[7]
	fldz					; future autoc[6]
	ALIGN 16
.lag_6_2_main:
	fld	dword [esi]			; d = data[sample]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6] += d * data[sample + 6]
	fld	dword [esi + 28]
	fmulp	st1, st0			; consumes d
	add	esi, byte 4			; [CR] sample++
	faddp	st2, st0			; autoc[7] += d * data[sample + 7]
	dec	ecx
	jnz	.lag_6_2_main
	; tail: one remaining product for autoc[6]
	fld	dword [esi]
	fld	dword [esi + 24]
	fmulp	st1, st0
	faddp	st1, st0			; autoc[6]
	fstp	dword [edi + 24]		; autoc[6]
	fstp	dword [edi + 28]		; autoc[7]
	jmp	.end
|
|
|
|
.lag_eq_6_plus_3:
	; Accumulates autoc[6..8] only (st0..st2 = autoc[6..8]).
	; NOTE(review): no instruction in this function branches to this label.
	mov	esi, [ebp]			; esi = data
	mov	ecx, [ebp + 4]			; ecx = data_len
	mov	edi, [ebp + 12]			; edi = autoc
	sub	ecx, byte 8			; main loop runs data_len - 8 times
	fldz					; future autoc[8]
	fldz					; future autoc[7]
	fldz					; future autoc[6]
	ALIGN 16
.lag_6_3_main:
	fld	dword [esi]			; d = data[sample]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; autoc[7]
	fld	dword [esi + 32]
	fmulp	st1, st0			; consumes d
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; autoc[8]
	dec	ecx
	jnz	.lag_6_3_main

	; tail, two terms
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st2, st0			; autoc[7]

	; tail, one term
	fld	dword [esi]
	fld	dword [esi + 24]
	fmulp	st1, st0
	faddp	st1, st0			; autoc[6]

	fstp	dword [edi + 24]		; autoc[6]
	fstp	dword [edi + 28]		; autoc[7]
	fstp	dword [edi + 32]		; autoc[8]
	jmp	.end
|
|
|
|
.lag_eq_6_plus_4:
	; Accumulates autoc[6..9] only (st0..st3 = autoc[6..9]).
	; NOTE(review): no instruction in this function branches to this label.
	mov	esi, [ebp]			; esi = data
	mov	ecx, [ebp + 4]			; ecx = data_len
	mov	edi, [ebp + 12]			; edi = autoc
	sub	ecx, byte 9			; main loop runs data_len - 9 times
	fldz					; future autoc[9]
	fldz					; future autoc[8]
	fldz					; future autoc[7]
	fldz					; future autoc[6]
	ALIGN 16
.lag_6_4_main:
	fld	dword [esi]			; d = data[sample]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; autoc[7]
	fld	dword [esi + 32]
	fmul	st0, st1
	faddp	st4, st0			; autoc[8]
	fld	dword [esi + 36]
	fmulp	st1, st0			; consumes d
	add	esi, byte 4			; [CR] sample++
	faddp	st4, st0			; autoc[9]
	dec	ecx
	jnz	.lag_6_4_main

	; tail, three terms
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; autoc[7]
	fld	dword [esi + 32]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; autoc[8]

	; tail, two terms
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st2, st0			; autoc[7]

	; tail, one term
	fld	dword [esi]
	fld	dword [esi + 24]
	fmulp	st1, st0
	faddp	st1, st0			; autoc[6]

	fstp	dword [edi + 24]		; autoc[6]
	fstp	dword [edi + 28]		; autoc[7]
	fstp	dword [edi + 32]		; autoc[8]
	fstp	dword [edi + 36]		; autoc[9]
	jmp	.end
|
|
|
|
.lag_eq_6_plus_5:
	; Accumulates autoc[6..10] only (st0..st4 = autoc[6..10]).
	; NOTE(review): no instruction in this function branches to this label.
	mov	esi, [ebp]			; esi = data
	mov	ecx, [ebp + 4]			; ecx = data_len
	mov	edi, [ebp + 12]			; edi = autoc
	sub	ecx, byte 10			; main loop runs data_len - 10 times
	fldz					; future autoc[10]
	fldz					; future autoc[9]
	fldz					; future autoc[8]
	fldz					; future autoc[7]
	fldz					; future autoc[6]
	ALIGN 16
.lag_6_5_main:
	fld	dword [esi]			; d = data[sample]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; autoc[7]
	fld	dword [esi + 32]
	fmul	st0, st1
	faddp	st4, st0			; autoc[8]
	fld	dword [esi + 36]
	fmul	st0, st1
	faddp	st5, st0			; autoc[9]
	fld	dword [esi + 40]
	fmulp	st1, st0			; consumes d
	add	esi, byte 4			; [CR] sample++
	faddp	st5, st0			; autoc[10]
	dec	ecx
	jnz	.lag_6_5_main

	; tail, four terms
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; autoc[7]
	fld	dword [esi + 32]
	fmul	st0, st1
	faddp	st4, st0			; autoc[8]
	fld	dword [esi + 36]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st4, st0			; autoc[9]

	; tail, three terms
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; autoc[7]
	fld	dword [esi + 32]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; autoc[8]

	; tail, two terms
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st2, st0			; autoc[7]

	; tail, one term
	fld	dword [esi]
	fld	dword [esi + 24]
	fmulp	st1, st0
	faddp	st1, st0			; autoc[6]

	fstp	dword [edi + 24]		; autoc[6]
	fstp	dword [edi + 28]		; autoc[7]
	fstp	dword [edi + 32]		; autoc[8]
	fstp	dword [edi + 36]		; autoc[9]
	fstp	dword [edi + 40]		; autoc[10]
	jmp	.end
|
|
|
|
.lag_eq_6_plus_6:
	; Accumulates autoc[6..11] only (st0..st5 = autoc[6..11]).
	; NOTE(review): no instruction in this function branches to this label.
	mov	esi, [ebp]			; esi = data
	mov	ecx, [ebp + 4]			; ecx = data_len
	mov	edi, [ebp + 12]			; edi = autoc
	sub	ecx, byte 11			; main loop runs data_len - 11 times
	fldz					; future autoc[11]
	fldz					; future autoc[10]
	fldz					; future autoc[9]
	fldz					; future autoc[8]
	fldz					; future autoc[7]
	fldz					; future autoc[6]
	ALIGN 16
.lag_6_6_main:
	fld	dword [esi]			; d = data[sample]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; autoc[7]
	fld	dword [esi + 32]
	fmul	st0, st1
	faddp	st4, st0			; autoc[8]
	fld	dword [esi + 36]
	fmul	st0, st1
	faddp	st5, st0			; autoc[9]
	fld	dword [esi + 40]
	fmul	st0, st1
	faddp	st6, st0			; autoc[10]
	fld	dword [esi + 44]
	fmulp	st1, st0			; consumes d
	add	esi, byte 4			; [CR] sample++
	faddp	st6, st0			; autoc[11]
	dec	ecx
	jnz	.lag_6_6_main

	; tail, five terms
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; autoc[7]
	fld	dword [esi + 32]
	fmul	st0, st1
	faddp	st4, st0			; autoc[8]
	fld	dword [esi + 36]
	fmul	st0, st1
	faddp	st5, st0			; autoc[9]
	fld	dword [esi + 40]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st5, st0			; autoc[10]

	; tail, four terms
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; autoc[7]
	fld	dword [esi + 32]
	fmul	st0, st1
	faddp	st4, st0			; autoc[8]
	fld	dword [esi + 36]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st4, st0			; autoc[9]

	; tail, three terms
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmul	st0, st1
	faddp	st3, st0			; autoc[7]
	fld	dword [esi + 32]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st3, st0			; autoc[8]

	; tail, two terms
	fld	dword [esi]
	fld	dword [esi + 24]
	fmul	st0, st1
	faddp	st2, st0			; autoc[6]
	fld	dword [esi + 28]
	fmulp	st1, st0
	add	esi, byte 4			; [CR] sample++
	faddp	st2, st0			; autoc[7]

	; tail, one term
	fld	dword [esi]
	fld	dword [esi + 24]
	fmulp	st1, st0
	faddp	st1, st0			; autoc[6]

	fstp	dword [edi + 24]		; autoc[6]
	fstp	dword [edi + 28]		; autoc[7]
	fstp	dword [edi + 32]		; autoc[8]
	fstp	dword [edi + 36]		; autoc[9]
	fstp	dword [edi + 40]		; autoc[10]
	fstp	dword [edi + 44]		; autoc[11]
	jmp	.end
|
|
|
|
.end:
	; Shared exit for every path: undo the prologue's pushes in reverse
	; order, then return to the caller.
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
|
|
|
|
end
|