; libFLAC - Free Lossless Audio Codec library ; Copyright (C) 2001 Josh Coalson ; ; This library is free software; you can redistribute it and/or ; modify it under the terms of the GNU Library General Public ; License as published by the Free Software Foundation; either ; version 2 of the License, or (at your option) any later version. ; ; This library is distributed in the hope that it will be useful, ; but WITHOUT ANY WARRANTY; without even the implied warranty of ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ; Library General Public License for more details. ; ; You should have received a copy of the GNU Library General Public ; License along with this library; if not, write to the ; Free Software Foundation, Inc., 59 Temple Place - Suite 330, ; Boston, MA 02111-1307, USA. %include "nasm.h" data_section cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov code_section ; ********************************************************************** ; ; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, FLAC__real residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]) ; { ; FLAC__int32 last_error_0 = data[-1]; ; FLAC__int32 last_error_1 = data[-1] - data[-2]; ; FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]); ; FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]); ; FLAC__int32 error, save; ; FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0; ; unsigned i, order; ; ; for(i = 0; i < data_len; i++) { ; error = data[i] ; total_error_0 += local_abs(error); save = error; ; error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error; ; error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error; ; error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error; ; error -= last_error_3; total_error_4 += local_abs(error); 
last_error_3 = save; ; } ; ; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4)) ; order = 0; ; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4)) ; order = 1; ; else if(total_error_2 < min(total_error_3, total_error_4)) ; order = 2; ; else if(total_error_3 < total_error_4) ; order = 3; ; else ; order = 4; ; ; residual_bits_per_sample[0] = (FLAC__real)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0); ; residual_bits_per_sample[1] = (FLAC__real)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0); ; residual_bits_per_sample[2] = (FLAC__real)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0); ; residual_bits_per_sample[3] = (FLAC__real)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0); ; residual_bits_per_sample[4] = (FLAC__real)((data_len > 0 && total_error_4 > 0) ? 
;		log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0);
;
;	return order;
; }
;
; ----------------------------------------------------------------------
; IA-32 implementation of the reference C routine shown above, using
; CMOVcc for branch-free abs()/min() and MMX registers as 32-bit
; accumulators for the five error totals.
;
; Arguments (read from the stack):
;	data[]                      samples; data[-4] .. data[-1] are also read
;	                            to seed the last_error_* history
;	data_len                    number of samples to process
;	residual_bits_per_sample[]  receives 5 single-precision floats
; Returns (in eax): the chosen predictor order, 0..4.
;
; Notes:
;  * data_len == 0 is handled up front: all five outputs are set to 0.0
;    and order 4 is returned, which is what the reference C code yields
;    for an empty block.  Without this check the 'dec ecx / jnz' loop
;    below would wrap around and run ~2^32 iterations past the buffer.
;  * The error totals are summed with paddd, i.e. in 32-bit lanes that
;    wrap on overflow.
;  * When several totals tie for the minimum, this routine picks the
;    LOWEST such order (the compare chain below tests order 0 first),
;    whereas the strict '<' chain of the reference C picks the highest.
;  * data_len is loaded with a signed 32-bit 'fild', so it must be < 2^31.
; ----------------------------------------------------------------------
	ALIGN 16
cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov

	; esp + 36 == data[]
	; esp + 40 == data_len
	; esp + 44 == residual_bits_per_sample[]

	push	ebp
	push	ebx
	push	esi
	push	edi
	sub	esp, byte 16
	; qword [esp] == temp space for loading FLAC__uint64s to FPU regs
	; dword [esp] == last_error_0
	; dword [esp + 4] == last_error_1
	; dword [esp + 8] == last_error_2
	; dword [esp + 12] == last_error_3

	; eax == error
	; ebx == &data[i]
	; ecx == loop counter (i)
	; edx == temp
	; edi == save
	; ebp == order
	; mm0 == total_error_1:total_error_0
	; mm1 == total_error_3:total_error_2
	; mm2 == 0:total_error_4
	; mm3/4 == 0:unpackarea
	; mm5 == abs(error_1):abs(error_0)
	; mm5 == abs(error_3):abs(error_2)

	; Empty input: bail out before touching MMX state or reading data[],
	; otherwise the count-down loop below would underflow its counter.
	mov	ecx, [esp + 40]			; ecx = data_len
	test	ecx, ecx
	jz	near .data_len_is_0

	pxor	mm0, mm0			; total_error_1 = total_error_0 = 0
	pxor	mm1, mm1			; total_error_3 = total_error_2 = 0
	pxor	mm2, mm2			; total_error_4 = 0
	mov	ebx, [esp + 36]			; ebx = data[]
	mov	ecx, [ebx - 4]			; ecx == data[-1]  last_error_0 = data[-1]
	mov	eax, [ebx - 8]			; eax == data[-2]
	mov	ebp, [ebx - 16]			; ebp == data[-4]
	mov	ebx, [ebx - 12]			; ebx == data[-3]
	mov	edx, ecx
	sub	edx, eax			; last_error_1 = data[-1] - data[-2]
	mov	esi, edx
	sub	esi, eax
	add	esi, ebx			; last_error_2 = last_error_1 - (data[-2] - data[-3])
	shl	ebx, 1				; ebx = 2*data[-3]
	mov	edi, esi
	sub	edi, eax
	add	edi, ebx
	sub	edi, ebp			; last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
	mov	ebx, [esp + 36]			; ebx = data[]
	mov	[esp], ecx			; [esp] = last_error_0
	mov	[esp + 4], edx			; [esp + 4] = last_error_1
	mov	[esp + 8], esi			; [esp + 8] = last_error_2
	mov	[esp + 12], edi			; [esp + 12] = last_error_3
	mov	ecx, [esp + 40]			; ecx = data_len (known to be non-zero here)

	;	for(i = 0; i < data_len; i++) {
	;		error_0 = data[i]       ;                      save = error_0; total_error_0 += local_abs(error_0);
	;		error_1 -= last_error_0; last_error_0 = save; save = error_1; total_error_1 += local_abs(error_1);
	;		error_2 -= last_error_1; last_error_1 = save; save = error_2; total_error_2 += local_abs(error_2);
	;		error_3 -= last_error_2; last_error_2 = save; save = error_3; total_error_3 += local_abs(error_3);
	;		error_4 -= last_error_3; last_error_3 = save;                 total_error_4 += local_abs(error_4);
	;	}
	;
	; abs() idiom used throughout: edx = -eax; if the negation is
	; non-negative (SF clear) take it, otherwise keep eax.
	ALIGN 16
.loop:
	mov	eax, [ebx]			; eax = error_0 = data[i]
	add	ebx, 4
	mov	edi, eax			; edi == save = error_0
	mov	edx, eax			; edx = error_0
	neg	edx				; edx = -error_0
	cmovns	eax, edx			; eax = abs(error_0)
	movd	mm5, eax			; mm5 = 0:abs(error_0)
	mov	edx, [esp]			; edx = last_error_0
	mov	eax, edi			; eax = error(error_0)
	mov	[esp], edi			; [esp] == last_error_0 = save
	sub	eax, edx			; error -= last_error_0
	mov	edi, eax			; edi == save = error_1
	mov	edx, eax			; edx = error_1
	neg	edx				; edx = -error_1
	cmovns	eax, edx			; eax = abs(error_1)
	movd	mm4, eax			; mm4 = 0:abs(error_1)
	punpckldq	mm5, mm4		; mm5 = abs(error_1):abs(error_0)
	mov	edx, [esp + 4]			; edx = last_error_1
	mov	eax, edi			; eax = error(error_1)
	mov	[esp + 4], edi			; [esp + 4] == last_error_1 = save
	sub	eax, edx			; error -= last_error_1
	mov	edi, eax			; edi == save = error_2
	mov	edx, eax			; edx = error_2
	paddd	mm0, mm5			; [CR] total_error_1 += abs(error_1) ; total_error_0 += abs(error_0)
	neg	edx				; edx = -error_2
	cmovns	eax, edx			; eax = abs(error_2)
	movd	mm5, eax			; mm5 = 0:abs(error_2)
	mov	edx, [esp + 8]			; edx = last_error_2
	mov	eax, edi			; eax = error(error_2)
	mov	[esp + 8], edi			; [esp + 8] == last_error_2 = save
	sub	eax, edx			; error -= last_error_2
	mov	edi, eax			; edi == save = error_3
	mov	edx, eax			; edx = error_3
	neg	edx				; edx = -error_3
	cmovns	eax, edx			; eax = abs(error_3)
	movd	mm4, eax			; mm4 = 0:abs(error_3)
	punpckldq	mm5, mm4		; mm5 = abs(error_3):abs(error_2)
	mov	edx, [esp + 12]			; edx = last_error_3
	mov	eax, edi			; eax = error(error_3)
	mov	[esp + 12], edi			; [esp + 12] == last_error_3 = save
	sub	eax, edx			; error -= last_error_3
	mov	edx, eax			; edx = error_4
	paddd	mm1, mm5			; [CR] total_error_3 += abs(error_3) ; total_error_2 += abs(error_2)
	neg	edx				; edx = -error_4
	cmovns	eax, edx			; eax = abs(error_4)
	movd	mm5, eax			; mm5 = 0:abs(error_4)
	paddd	mm2, mm5			; total_error_4 += abs(error_4)
	dec	ecx
	jnz	near .loop

	;	if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
	;		order = 0;
	;	else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
	;		order = 1;
	;	else if(total_error_2 < min(total_error_3, total_error_4))
	;		order = 2;
	;	else if(total_error_3 < total_error_4)
	;		order = 3;
	;	else
	;		order = 4;
	movd	edi, mm2			; edi = total_error_4
	movq	mm4, mm1			; mm4 = total_error_3:total_error_2
	psrlq	mm4, 32				; mm4 = 0:total_error_3
	movd	edx, mm1			; edx = total_error_2
	movd	esi, mm4			; esi = total_error_3
	movq	mm3, mm0			; mm3 = total_error_1:total_error_0
	psrlq	mm3, 32				; mm3 = 0:total_error_1
	movd	ebx, mm0			; ebx = total_error_0
	movd	ecx, mm3			; ecx = total_error_1
	emms					; done with MMX; the x87 FPU is used below
	mov	eax, ebx			; eax = total_error_0
	cmp	ecx, ebx
	cmovb	eax, ecx			; eax = min(total_error_0, total_error_1)
	cmp	edx, eax
	cmovb	eax, edx			; eax = min(total_error_0, total_error_1, total_error_2)
	cmp	esi, eax
	cmovb	eax, esi			; eax = min(total_error_0, total_error_1, total_error_2, total_error_3)
	cmp	edi, eax
	cmovb	eax, edi			; eax = min(total_error_0, total_error_1, total_error_2, total_error_3, total_error_4)

	; Pick the first (lowest) order whose total equals the minimum.
	cmp	eax, ebx
	jne	.not_order_0
	xor	ebp, ebp
	jmp	short .got_order
.not_order_0:
	cmp	eax, ecx
	jne	.not_order_1
	mov	ebp, 1
	jmp	short .got_order
.not_order_1:
	cmp	eax, edx
	jne	.not_order_2
	mov	ebp, 2
	jmp	short .got_order
.not_order_2:
	cmp	eax, esi
	jne	.not_order_3
	mov	ebp, 3
	jmp	short .got_order
.not_order_3:
	mov	ebp, 4
.got_order:
	;	residual_bits_per_sample[0] = (FLAC__real)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (double)total_error_0  / (double)data_len) / M_LN2 : 0.0);
	;	residual_bits_per_sample[1] = (FLAC__real)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (double)total_error_1  / (double)data_len) / M_LN2 : 0.0);
	;	residual_bits_per_sample[2] = (FLAC__real)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (double)total_error_2  / (double)data_len) / M_LN2 : 0.0);
	;	residual_bits_per_sample[3] = (FLAC__real)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (double)total_error_3  / (double)data_len) / M_LN2 : 0.0);
	;	residual_bits_per_sample[4] = (FLAC__real)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (double)total_error_4  / (double)data_len) / M_LN2 : 0.0);
	;
	; data_len is non-zero here (the empty case branched away at entry).
	; eax is kept at 0 for the rest of the routine: it is both the high
	; dword when widening a total to 64 bits and the bit pattern of 0.0f.
	xor	eax, eax
	fild	dword [esp + 40]		; ST = data_len (NOTE: assumes data_len is <2gigs)
.rbps_0:
	test	ebx, ebx
	jz	.total_error_0_is_0
	fld1					; ST = 1.0 data_len
	mov	[esp], ebx
	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_0
	mov	ebx, [esp + 44]			; ebx = residual_bits_per_sample[] from here on
	fild	qword [esp]			; ST = total_error_0 1.0 data_len
	fdiv	st2				; ST = total_error_0/data_len 1.0 data_len
	fldln2					; ST = ln2 total_error_0/data_len 1.0 data_len
	fmulp	st1				; ST = ln2*total_error_0/data_len 1.0 data_len
	fyl2x					; ST = log2(ln2*total_error_0/data_len) data_len
	fstp	dword [ebx]			; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len)   ST = data_len
	jmp	short .rbps_1
.total_error_0_is_0:
	mov	ebx, [esp + 44]			; ebx = residual_bits_per_sample[] from here on
	mov	[ebx], eax			; residual_bits_per_sample[0] = 0.0
.rbps_1:
	test	ecx, ecx
	jz	.total_error_1_is_0
	fld1					; ST = 1.0 data_len
	mov	[esp], ecx
	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_1
	fild	qword [esp]			; ST = total_error_1 1.0 data_len
	fdiv	st2				; ST = total_error_1/data_len 1.0 data_len
	fldln2					; ST = ln2 total_error_1/data_len 1.0 data_len
	fmulp	st1				; ST = ln2*total_error_1/data_len 1.0 data_len
	fyl2x					; ST = log2(ln2*total_error_1/data_len) data_len
	fstp	dword [ebx + 4]			; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len)   ST = data_len
	jmp	short .rbps_2
.total_error_1_is_0:
	mov	[ebx + 4], eax			; residual_bits_per_sample[1] = 0.0
.rbps_2:
	test	edx, edx
	jz	.total_error_2_is_0
	fld1					; ST = 1.0 data_len
	mov	[esp], edx
	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_2
	fild	qword [esp]			; ST = total_error_2 1.0 data_len
	fdiv	st2				; ST = total_error_2/data_len 1.0 data_len
	fldln2					; ST = ln2 total_error_2/data_len 1.0 data_len
	fmulp	st1				; ST = ln2*total_error_2/data_len 1.0 data_len
	fyl2x					; ST = log2(ln2*total_error_2/data_len) data_len
	fstp	dword [ebx + 8]			; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len)   ST = data_len
	jmp	short .rbps_3
.total_error_2_is_0:
	mov	[ebx + 8], eax			; residual_bits_per_sample[2] = 0.0
.rbps_3:
	test	esi, esi
	jz	.total_error_3_is_0
	fld1					; ST = 1.0 data_len
	mov	[esp], esi
	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_3
	fild	qword [esp]			; ST = total_error_3 1.0 data_len
	fdiv	st2				; ST = total_error_3/data_len 1.0 data_len
	fldln2					; ST = ln2 total_error_3/data_len 1.0 data_len
	fmulp	st1				; ST = ln2*total_error_3/data_len 1.0 data_len
	fyl2x					; ST = log2(ln2*total_error_3/data_len) data_len
	fstp	dword [ebx + 12]		; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len)   ST = data_len
	jmp	short .rbps_4
.total_error_3_is_0:
	mov	[ebx + 12], eax			; residual_bits_per_sample[3] = 0.0
.rbps_4:
	test	edi, edi
	jz	.total_error_4_is_0
	fld1					; ST = 1.0 data_len
	mov	[esp], edi
	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_4
	fild	qword [esp]			; ST = total_error_4 1.0 data_len
	fdiv	st2				; ST = total_error_4/data_len 1.0 data_len
	fldln2					; ST = ln2 total_error_4/data_len 1.0 data_len
	fmulp	st1				; ST = ln2*total_error_4/data_len 1.0 data_len
	fyl2x					; ST = log2(ln2*total_error_4/data_len) data_len
	fstp	dword [ebx + 16]		; residual_bits_per_sample[4] = log2(ln2*total_error_4/data_len)   ST = data_len
	jmp	short .rbps_end
.total_error_4_is_0:
	mov	[ebx + 16], eax			; residual_bits_per_sample[4] = 0.0
.rbps_end:
	fstp	st0				; ST = [empty]
	jmp	short .end
.data_len_is_0:
	; data_len == 0, so residual_bits_per_sample[*] = 0.0
	; Reached only from the entry check, so neither the MMX registers
	; nor the FPU stack have been touched and no emms/fstp is needed.
	xor	eax, eax			; eax = bit pattern of 0.0f
	mov	edi, [esp + 44]			; edi = residual_bits_per_sample[]
	mov	[edi], eax
	mov	[edi + 4], eax
	mov	[edi + 8], eax
	mov	[edi + 12], eax
	mov	[edi + 16], eax
	mov	ebp, 4				; all totals are 0, so the reference C falls through to order = 4

.end:
	mov	eax, ebp			; return order
	add	esp, byte 16
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

end