mirror of
https://github.com/claunia/flac.git
synced 2025-12-16 18:54:26 +00:00
initial import
This commit is contained in:
341
src/libFLAC/i386/fixed_asm.nasm
Normal file
341
src/libFLAC/i386/fixed_asm.nasm
Normal file
@@ -0,0 +1,341 @@
|
||||
; libFLAC - Free Lossless Audio Codec library
|
||||
; Copyright (C) 2001 Josh Coalson
|
||||
;
|
||||
; This library is free software; you can redistribute it and/or
|
||||
; modify it under the terms of the GNU Library General Public
|
||||
; License as published by the Free Software Foundation; either
|
||||
; version 2 of the License, or (at your option) any later version.
|
||||
;
|
||||
; This library is distributed in the hope that it will be useful,
|
||||
; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
; Library General Public License for more details.
|
||||
;
|
||||
; You should have received a copy of the GNU Library General Public
|
||||
; License along with this library; if not, write to the
|
||||
; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
; Boston, MA 02111-1307, USA.
|
||||
|
||||
; Project-wide NASM helper macros (data_section / code_section / cglobal are
; not defined in this file; presumably they come from nasm.h -- verify there).
%include "nasm.h"

data_section

; Export the routine under the same name as the label that is actually
; defined below.  The previous declaration named
; FLAC__fixed_compute_best_predictor (without the _asm suffix), a symbol
; this file never defines, so the routine itself was not exported.
; NOTE(review): if cglobal decorates the name on some object formats
; (e.g. a leading underscore), the label below must be decorated the same
; way -- confirm against nasm.h.
cglobal FLAC__fixed_compute_best_predictor_asm

code_section
|
||||
|
||||
; **********************************************************************
|
||||
;
|
||||
; unsigned FLAC__fixed_compute_best_predictor(const int32 data[], unsigned data_len, real residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
|
||||
; {
|
||||
; int32 last_error_0 = data[-1];
|
||||
; int32 last_error_1 = data[-1] - data[-2];
|
||||
; int32 last_error_2 = last_error_1 - (data[-2] - data[-3]);
|
||||
; int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
|
||||
; int32 error, save;
|
||||
; uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0;
|
||||
; unsigned i, order;
|
||||
;
|
||||
; for(i = 0; i < data_len; i++) {
|
||||
; error = data[i] ; total_error_0 += local_abs(error); save = error;
|
||||
; error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error;
|
||||
; error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error;
|
||||
; error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error;
|
||||
; error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
|
||||
; }
|
||||
;
|
||||
; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
|
||||
; order = 0;
|
||||
; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
|
||||
; order = 1;
|
||||
; else if(total_error_2 < min(total_error_3, total_error_4))
|
||||
; order = 2;
|
||||
; else if(total_error_3 < total_error_4)
|
||||
; order = 3;
|
||||
; else
|
||||
; order = 4;
|
||||
;
|
||||
; residual_bits_per_sample[0] = (real)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (real)total_error_0 / (real) data_len) / M_LN2 : 0.0);
|
||||
; residual_bits_per_sample[1] = (real)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (real)total_error_1 / (real) data_len) / M_LN2 : 0.0);
|
||||
; residual_bits_per_sample[2] = (real)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (real)total_error_2 / (real) data_len) / M_LN2 : 0.0);
|
||||
; residual_bits_per_sample[3] = (real)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (real)total_error_3 / (real) data_len) / M_LN2 : 0.0);
|
||||
; residual_bits_per_sample[4] = (real)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (real)total_error_4 / (real) data_len) / M_LN2 : 0.0);
|
||||
;
|
||||
; return order;
|
||||
; }
|
||||
;@@@ NOTE: not tested yet!
|
||||
; ----------------------------------------------------------------------
; unsigned FLAC__fixed_compute_best_predictor_asm(const int32 data[],
;                                                 unsigned data_len,
;                                                 real residual_bits_per_sample[5])
;
; Accumulates sum(|residual|) for the fixed predictors of order 0..4 over
; data[0..data_len-1], stores log2(ln2 * total / data_len) (or 0.0 when
; the total or data_len is 0) for each order, and returns the order with
; the smallest total.
;
; In:    all arguments on the stack (at entry: [esp+4] = data,
;        [esp+8] = data_len, [esp+12] = residual_bits_per_sample).
;        data[-4..-1] are read unconditionally to seed the predictors.
; Out:   eax = chosen order (0..4)
; Saves: ebp, ebx, esi, edi
; Clobb: ecx, edx, flags, mm0-mm7; x87 stack is left empty on return
; Needs: CMOVcc, MMX and PSHUFW
;
; NOTE: results are stored with 32-bit fst/fstp, i.e. this assumes `real`
;       is a 4-byte float -- confirm against the C typedef.
; NOTE: on ties this picks the LOWEST order whose total equals the
;       minimum, whereas the C reference text quoted above (strict '<'
;       comparisons) picks the highest such order.
; ----------------------------------------------------------------------
FLAC__fixed_compute_best_predictor_asm:

	; stack layout after the prologue below:
	;   esp +  0 .. 7  temp space for loading uint64s into the FPU
	;   esp +  8 .. 23 saved edi, esi, ebx, ebp
	;   esp + 24       return address
	;   esp + 28       data[]
	;   esp + 32       data_len
	;   esp + 36       residual_bits_per_sample[]
	push	ebp
	push	ebx
	push	esi
	push	edi
	sub	esp, byte 8

	; register roles inside the loop:
	;   eax   == error (running residual)
	;   ebx   == &data[i]
	;   ecx   == samples remaining
	;   edx   == scratch (-error / last_error_n)
	;   edi   == save
	;   mm0   == total_error_1:total_error_0
	;   mm1   == total_error_3:total_error_2
	;   mm2   == 0:total_error_4
	;   mm3/4 == unpack scratch
	;   mm5   == abs(error_1):abs(error_0), later abs(error_3):abs(error_2)
	;   mm6   == last_error_1:last_error_0
	;   mm7   == last_error_3:last_error_2

	pxor	mm0, mm0			; total_error_1 = total_error_0 = 0
	pxor	mm1, mm1			; total_error_3 = total_error_2 = 0
	pxor	mm2, mm2			; total_error_4 = 0

	mov	ebx, [esp + 28]			; ebx = data[]
	mov	ecx, [ebx - 4]			; ecx = data[-1] = last_error_0
	mov	eax, [ebx - 8]			; eax = data[-2]
	mov	ebp, [ebx - 16]			; ebp = data[-4]
	mov	ebx, [ebx - 12]			; ebx = data[-3]
	mov	edx, ecx
	sub	edx, eax			; edx = last_error_1 = data[-1] - data[-2]
	mov	esi, edx
	sub	esi, eax
	add	esi, ebx			; esi = last_error_2 = last_error_1 - (data[-2] - data[-3])
	shl	ebx, 1				; ebx = 2*data[-3]
	mov	edi, esi
	sub	edi, eax
	add	edi, ebx
	sub	edi, ebp			; edi = last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4])
	mov	ebx, [esp + 28]			; ebx = &data[0]

	; Pack the seeds BEFORE ecx is reused for data_len; loading data_len
	; first would seed last_error_0 with the sample count instead of data[-1].
	movd	mm6, ecx			; mm6 = 0:last_error_0
	movd	mm3, edx			; mm3 = 0:last_error_1
	movd	mm7, esi			; mm7 = 0:last_error_2
	movd	mm4, edi			; mm4 = 0:last_error_3
	punpckldq	mm6, mm3		; mm6 = last_error_1:last_error_0
	punpckldq	mm7, mm4		; mm7 = last_error_3:last_error_2

	mov	ecx, [esp + 32]			; ecx = data_len
	test	ecx, ecx
	jz	near .loop_end			; data_len == 0: touch no samples, totals stay 0

.loop:
	; ---- order 0 ----
	mov	eax, [ebx]			; eax = error_0 = data[i]
	add	ebx, 4
	mov	edx, eax
	mov	edi, eax			; save = error_0
	neg	edx				; edx = -error_0, SF clear iff error_0 <= 0
	cmovns	eax, edx			; eax = abs(error_0)
	movd	mm5, eax			; mm5 = 0:abs(error_0)
	movd	edx, mm6			; edx = last_error_0 (old)
	mov	eax, edi			; eax = error_0
	pshufw	mm3, mm6, 4eh			; 4eh = 1-0-3-2 (swap dwords): mm3 = last_error_0:last_error_1
	movd	mm6, edi			; mm6 = 0:save
	punpckldq	mm6, mm3		; mm6 = last_error_1:last_error_0(=save)

	; ---- order 1 ----
	sub	eax, edx			; error_1 = error_0 - last_error_0
	mov	edi, eax			; save = error_1
	mov	edx, eax
	neg	edx
	cmovns	eax, edx			; eax = abs(error_1)
	movd	mm4, eax			; mm4 = 0:abs(error_1)
	punpckldq	mm5, mm4		; mm5 = abs(error_1):abs(error_0)
	pshufw	mm3, mm6, 4eh			; mm3 = last_error_0:last_error_1
	movd	edx, mm3			; edx = last_error_1 (old)
	mov	eax, edi			; eax = error_1
	movd	mm4, edi			; mm4 = 0:save
	punpckldq	mm6, mm4		; mm6 = last_error_1(=save):last_error_0

	; ---- order 2 ----
	sub	eax, edx			; error_2 = error_1 - last_error_1
	mov	edi, eax			; save = error_2
	paddd	mm0, mm5			; total_error_1 += abs(error_1); total_error_0 += abs(error_0)
	mov	edx, eax
	neg	edx
	cmovns	eax, edx			; eax = abs(error_2)
	movd	mm5, eax			; mm5 = 0:abs(error_2)
	movd	edx, mm7			; edx = last_error_2 (old)
	mov	eax, edi			; eax = error_2
	pshufw	mm3, mm7, 4eh			; mm3 = last_error_2:last_error_3
	movd	mm7, edi			; mm7 = 0:save
	punpckldq	mm7, mm3		; mm7 = last_error_3:last_error_2(=save)

	; ---- order 3 ----
	sub	eax, edx			; error_3 = error_2 - last_error_2
	mov	edi, eax			; save = error_3
	mov	edx, eax
	neg	edx
	cmovns	eax, edx			; eax = abs(error_3)
	movd	mm4, eax			; mm4 = 0:abs(error_3)
	punpckldq	mm5, mm4		; mm5 = abs(error_3):abs(error_2)
	pshufw	mm3, mm7, 4eh			; mm3 = last_error_2:last_error_3
	movd	edx, mm3			; edx = last_error_3 (old)
	mov	eax, edi			; eax = error_3
	movd	mm4, edi			; mm4 = 0:save
	punpckldq	mm7, mm4		; mm7 = last_error_3(=save):last_error_2

	; ---- order 4 ----
	sub	eax, edx			; error_4 = error_3 - last_error_3
	paddd	mm1, mm5			; total_error_3 += abs(error_3); total_error_2 += abs(error_2)
	mov	edx, eax
	neg	edx
	cmovns	eax, edx			; eax = abs(error_4)
	movd	mm5, eax			; mm5 = 0:abs(error_4)
	paddd	mm2, mm5			; total_error_4 += abs(error_4)

	dec	ecx
	jnz	near .loop			; loop body is too long for a short Jcc
.loop_end:

	; Unpack the five totals into integer registers:
	;   ebx = total_error_0, ecx = total_error_1, edx = total_error_2,
	;   esi = total_error_3, edi = total_error_4
	movd	edi, mm2			; edi = total_error_4
	pshufw	mm4, mm1, 4eh			; 4eh = 1-0-3-2, mm4 = total_error_2:total_error_3
	movd	edx, mm1			; edx = total_error_2
	movd	esi, mm4			; esi = total_error_3
	pshufw	mm3, mm0, 4eh			; mm3 = total_error_0:total_error_1
	movd	ebx, mm0			; ebx = total_error_0
	movd	ecx, mm3			; ecx = total_error_1

	emms					; leave MMX state before any x87 instruction

	; eax = unsigned minimum of the five totals
	mov	eax, ebx			; eax = total_error_0
	cmp	ecx, ebx
	cmovb	eax, ecx			; eax = min(total_error_0..1)
	cmp	edx, eax
	cmovb	eax, edx			; eax = min(total_error_0..2)
	cmp	esi, eax
	cmovb	eax, esi			; eax = min(total_error_0..3)
	cmp	edi, eax
	cmovb	eax, edi			; eax = min(total_error_0..4)

	; ebp = first (lowest) order whose total equals the minimum.
	; Each test must fall through to the NEXT order's test on mismatch.
	cmp	eax, ebx
	jne	.not_order_0
	xor	ebp, ebp
	jmp	short .got_order
.not_order_0:
	cmp	eax, ecx
	jne	.not_order_1
	mov	ebp, 1
	jmp	short .got_order
.not_order_1:
	cmp	eax, edx
	jne	.not_order_2
	mov	ebp, 2
	jmp	short .got_order
.not_order_2:
	cmp	eax, esi
	jne	.not_order_3
	mov	ebp, 3
	jmp	short .got_order
.not_order_3:
	mov	ebp, 4
.got_order:

	; residual_bits_per_sample[n] =
	;   (data_len > 0 && total_error_n > 0) ? log2(ln2 * total_error_n / data_len) : 0.0
	; The x87 stack holds "0.0 data_len" (top first) between the per-order blocks.
	fild	dword [esp + 32]		; ST = data_len (NOTE: signed load, assumes data_len < 2^31)
	fldz					; ST = 0.0 data_len
	xor	eax, eax			; eax = 0: zero test value and high dword of the uint64 temp
	cmp	eax, [esp + 32]
	jne	.rbps_0

	; data_len == 0, so residual_bits_per_sample[*] = 0.0
	mov	ecx, 5				; eax still == 0, ecx = # of dwords of 0 to store
	mov	edi, [esp + 36]
	rep stosd
	jmp	.rbps_end			; still have to pop the two x87 values pushed above

.rbps_0:
	cmp	eax, ebx
	je	.total_error_0_is_0
	fld1					; ST = 1.0 0.0 data_len
	mov	[esp], ebx
	mov	[esp + 4], eax			; [esp + 0] = (uint64)total_error_0
	fild	qword [esp]			; ST = total_error_0 1.0 0.0 data_len
	fdiv	st3				; ST = total_error_0/data_len 1.0 0.0 data_len
	fldln2					; ST = ln2 total_error_0/data_len 1.0 0.0 data_len
	fmulp	st1				; ST = ln2*total_error_0/data_len 1.0 0.0 data_len
	fyl2x					; ST = log2(ln2*total_error_0/data_len) 0.0 data_len
	mov	ebx, [esp + 36]
	fstp	dword [ebx]			; residual_bits_per_sample[0] = log2(...)   ST = 0.0 data_len
	jmp	short .rbps_1
.total_error_0_is_0:
	mov	ebx, [esp + 36]
	fst	dword [ebx]			; residual_bits_per_sample[0] = 0.0   ST = 0.0 data_len

.rbps_1:
	cmp	eax, ecx
	je	.total_error_1_is_0
	fld1					; ST = 1.0 0.0 data_len
	mov	[esp], ecx
	mov	[esp + 4], eax			; [esp + 0] = (uint64)total_error_1
	fild	qword [esp]			; ST = total_error_1 1.0 0.0 data_len
	fdiv	st3				; ST = total_error_1/data_len 1.0 0.0 data_len
	fldln2					; ST = ln2 total_error_1/data_len 1.0 0.0 data_len
	fmulp	st1				; ST = ln2*total_error_1/data_len 1.0 0.0 data_len
	fyl2x					; ST = log2(ln2*total_error_1/data_len) 0.0 data_len
	mov	ebx, [esp + 36]
	fstp	dword [ebx + 4]			; residual_bits_per_sample[1] = log2(...)   ST = 0.0 data_len
	jmp	short .rbps_2
.total_error_1_is_0:
	mov	ebx, [esp + 36]
	fst	dword [ebx + 4]			; residual_bits_per_sample[1] = 0.0   ST = 0.0 data_len

.rbps_2:
	cmp	eax, edx
	je	.total_error_2_is_0
	fld1					; ST = 1.0 0.0 data_len
	mov	[esp], edx
	mov	[esp + 4], eax			; [esp + 0] = (uint64)total_error_2
	fild	qword [esp]			; ST = total_error_2 1.0 0.0 data_len
	fdiv	st3				; ST = total_error_2/data_len 1.0 0.0 data_len
	fldln2					; ST = ln2 total_error_2/data_len 1.0 0.0 data_len
	fmulp	st1				; ST = ln2*total_error_2/data_len 1.0 0.0 data_len
	fyl2x					; ST = log2(ln2*total_error_2/data_len) 0.0 data_len
	mov	ebx, [esp + 36]
	fstp	dword [ebx + 8]			; residual_bits_per_sample[2] = log2(...)   ST = 0.0 data_len
	jmp	short .rbps_3
.total_error_2_is_0:
	mov	ebx, [esp + 36]
	fst	dword [ebx + 8]			; residual_bits_per_sample[2] = 0.0   ST = 0.0 data_len

.rbps_3:
	cmp	eax, esi
	je	.total_error_3_is_0
	fld1					; ST = 1.0 0.0 data_len
	mov	[esp], esi
	mov	[esp + 4], eax			; [esp + 0] = (uint64)total_error_3
	fild	qword [esp]			; ST = total_error_3 1.0 0.0 data_len
	fdiv	st3				; ST = total_error_3/data_len 1.0 0.0 data_len
	fldln2					; ST = ln2 total_error_3/data_len 1.0 0.0 data_len
	fmulp	st1				; ST = ln2*total_error_3/data_len 1.0 0.0 data_len
	fyl2x					; ST = log2(ln2*total_error_3/data_len) 0.0 data_len
	mov	ebx, [esp + 36]
	fstp	dword [ebx + 12]		; residual_bits_per_sample[3] = log2(...)   ST = 0.0 data_len
	jmp	short .rbps_4
.total_error_3_is_0:
	mov	ebx, [esp + 36]
	fst	dword [ebx + 12]		; residual_bits_per_sample[3] = 0.0   ST = 0.0 data_len

.rbps_4:
	cmp	eax, edi
	je	.total_error_4_is_0
	fld1					; ST = 1.0 0.0 data_len
	mov	[esp], edi
	mov	[esp + 4], eax			; [esp + 0] = (uint64)total_error_4
	fild	qword [esp]			; ST = total_error_4 1.0 0.0 data_len
	fdiv	st3				; ST = total_error_4/data_len 1.0 0.0 data_len
	fldln2					; ST = ln2 total_error_4/data_len 1.0 0.0 data_len
	fmulp	st1				; ST = ln2*total_error_4/data_len 1.0 0.0 data_len
	fyl2x					; ST = log2(ln2*total_error_4/data_len) 0.0 data_len
	mov	ebx, [esp + 36]
	fstp	dword [ebx + 16]		; residual_bits_per_sample[4] = log2(...)   ST = 0.0 data_len
	jmp	short .rbps_end
.total_error_4_is_0:
	mov	ebx, [esp + 36]
	fst	dword [ebx + 16]		; residual_bits_per_sample[4] = 0.0   ST = 0.0 data_len

.rbps_end:
	fstp	st0				; ST = data_len
	fstp	st0				; ST = [empty]

.end:
	mov	eax, ebp			; return order
	add	esp, byte 8
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
|
||||
|
||||
end
|
||||
Reference in New Issue
Block a user