diff --git a/src/libFLAC/i386/fixed_asm.nasm b/src/libFLAC/i386/fixed_asm.nasm index 7316b540..68c6895e 100644 --- a/src/libFLAC/i386/fixed_asm.nasm +++ b/src/libFLAC/i386/fixed_asm.nasm @@ -106,27 +106,35 @@ FLAC__fixed_compute_best_predictor_asm: add edi, ebx sub edi, ebp ; last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]); mov ebx, [esp + 28] ; ebx = data[] - mov ecx, [esp + 32] ; ecx = data_len movd mm6, ecx ; mm6 = 0:last_error_0 movd mm3, edx ; mm3 = 0:last_error_1 movd mm7, esi ; mm7 = 0:last_error_2 movd mm4, edi ; mm4 = 0:last_error_3 punpckldq mm6, mm3 ; mm6 = last_error_1:last_error_0 punpckldq mm7, mm4 ; mm7 = last_error_3:last_error_2 + mov ecx, [esp + 32] ; ecx = data_len + ; for(i = 0; i < data_len; i++) { + ; error_0 = data[i] ; save = error_0; total_error_0 += local_abs(error_0); + ; error_1 -= last_error_0; last_error_0 = save; save = error_1; total_error_1 += local_abs(error_1); + ; error_2 -= last_error_1; last_error_1 = save; save = error_2; total_error_2 += local_abs(error_2); + ; error_3 -= last_error_2; last_error_2 = save; save = error_3; total_error_3 += local_abs(error_3); + ; error_4 -= last_error_3; last_error_3 = save; total_error_4 += local_abs(error_4); + ; } .loop: mov eax, [ebx] ; eax = error_0 = data[i] add ebx, 4 - mov edx, eax ; edx = error_0 mov edi, eax ; edi == save = error_0 + mov edx, eax ; edx = error_0 neg edx ; edx = -error_0 cmovns eax, edx ; eax = abs(error_0) movd mm5, eax ; mm5 = 0:abs(error_0) movd edx, mm6 ; edx = last_error_0 mov eax, edi ; eax = error(error_0) - pshufw mm3, mm6, 4eh ; 4eh=1-0-3-2, mm3 = last_error_0:last_error_1 + movaps mm3, mm6 ; mm3 = last_error_1:last_error_0 + psrlq mm3, 32 ; mm3 = 0:last_error_1 movd mm6, edi ; mm6 = 0:last_error_0(=save) - punpckldq mm6, mm3 ; mm6 = last_error_1:last_error_0 + punpckldq mm6, mm3 ; mm6 = last_error_1:last_error_0(=save) sub eax, edx ; error -= last_error_0 mov edi, eax ; edi == save = error_1 mov edx, eax ; edx = error_1 @@ -134,21 +142,23 @@ FLAC__fixed_compute_best_predictor_asm: cmovns eax, edx ; eax = abs(error_1) movd mm4, eax ; mm4 = 0:abs(error_1) punpckldq mm5, mm4 ; mm5 = abs(error_1):abs(error_0) - pshufw mm3, mm6, 4eh ; 4eh=1-0-3-2, mm3 = last_error_0:last_error_1 + movaps mm3, mm6 ; mm3 = last_error_1:last_error_0 + psrlq mm3, 32 ; mm3 = 0:last_error_1 movd edx, mm3 ; edx = last_error_1 mov eax, edi ; eax = error(error_1) movd mm4, edi ; mm4 = 0:save punpckldq mm6, mm4 ; mm6 = last_error_1(=save):last_error_0 sub eax, edx ; error -= last_error_1 mov edi, eax ; edi == save = error_2 - paddd mm0, mm5 ; [CR] total_error_1 += abs(error_1) ; total_error_0 += abs(error_0) mov edx, eax ; edx = error_2 + paddd mm0, mm5 ; [CR] total_error_1 += abs(error_1) ; total_error_0 += abs(error_0) neg edx ; edx = -error_2 cmovns eax, edx ; eax = abs(error_2) movd mm5, eax ; mm5 = 0:abs(error_2) movd edx, mm7 ; edx = last_error_2 mov eax, edi ; eax = error(error_2) - pshufw mm3, mm7, 4eh ; 4eh=1-0-3-2, mm3 = last_error_2:last_error_3 + movaps mm3, mm7 ; mm3 = last_error_3:last_error_2 + psrlq mm3, 32 ; mm3 = 0:last_error_3 movd mm7, edi ; mm7 = 0:last_error_2(=save) punpckldq mm7, mm3 ; mm7 = last_error_3:last_error_2 sub eax, edx ; error -= last_error_2 @@ -158,14 +168,15 @@ FLAC__fixed_compute_best_predictor_asm: cmovns eax, edx ; eax = abs(error_3) movd mm4, eax ; mm4 = 0:abs(error_3) punpckldq mm5, mm4 ; mm5 = abs(error_3):abs(error_2) - pshufw mm3, mm7, 4eh ; 4eh=1-0-3-2, mm3 = last_error_2:last_error_3 + movaps mm3, mm7 + psrlq mm3, 32 ; mm3 = 0:last_error_3 movd edx, mm3 ; edx = last_error_3 mov eax, edi ; eax = error(error_3) movd mm4, edi ; mm4 = 0:save punpckldq mm7, mm4 ; mm7 = last_error_3(=save):last_error_2 sub eax, edx ; error -= last_error_3 - paddd mm1, mm5 ; [CR] total_error_3 += abs(error_3) ; total_error_2 += abs(error_2) mov edx, eax ; edx = error_4 + paddd mm1, mm5 ; [CR] total_error_3 += abs(error_3) ; total_error_2 += abs(error_2) neg edx ; edx = -error_4 cmovns eax, edx ; eax = abs(error_4) movd mm5, eax ; mm5 = 0:abs(error_4) @@ -184,10 +195,12 @@ FLAC__fixed_compute_best_predictor_asm: ; else ; order = 4; movd edi, mm2 ; edi = total_error_4 - pshufw mm4, mm1, 4eh ; 4eh=1-0-4-2, mm3 = total_error_2:total_error_3 + movaps mm4, mm1 ; mm4 = total_error_3:total_error_2 + psrlq mm4, 32 ; mm4 = 0:total_error_3 movd edx, mm1 ; edx = total_error_2 movd esi, mm4 ; esi = total_error_3 - pshufw mm3, mm0, 4eh ; 4eh=1-0-3-2, mm3 = total_error_0:total_error_1 + movaps mm3, mm0 ; mm3 = total_error_1:total_error_0 + psrlq mm3, 32 ; mm3 = 0:total_error_1 movd ebx, mm0 ; ebx = total_error_0 movd ecx, mm3 ; ecx = total_error_1 emms @@ -207,17 +220,17 @@ FLAC__fixed_compute_best_predictor_asm: jmp short .got_order .not_order_0: cmp eax, ecx - jne .not_order_0 + jne .not_order_1 mov ebp, 1 jmp short .got_order .not_order_1: cmp eax, edx - jne .not_order_0 + jne .not_order_2 mov ebp, 2 jmp short .got_order .not_order_2: cmp eax, esi - jne .not_order_0 + jne .not_order_3 mov ebp, 3 jmp short .got_order .not_order_3: @@ -228,90 +241,88 @@ FLAC__fixed_compute_best_predictor_asm: ; residual_bits_per_sample[2] = (real)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (real)total_error_2 / (real) data_len) / M_LN2 : 0.0); ; residual_bits_per_sample[3] = (real)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (real)total_error_3 / (real) data_len) / M_LN2 : 0.0); ; residual_bits_per_sample[4] = (real)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (real)total_error_4 / (real) data_len) / M_LN2 : 0.0); - fild dword [esp + 32] ; ST = data_len (NOTE: assumes data_len is <2gigs) - fldz ; ST = 0.0 data_len xor eax, eax cmp eax, [esp + 32] je near .data_len_is_0 + fild dword [esp + 32] ; ST = data_len (NOTE: assumes data_len is <2gigs) .rbps_0: - cmp eax, ebx - je .total_error_0_is_0 - fld1 ; ST = 1.0 0.0 data_len + test ebx, ebx + jz .total_error_0_is_0 + fld1 ; ST = 1.0 data_len mov [esp], ebx mov [esp + 4], eax ; [esp + 0] = (uint64)total_error_0 - fild qword [esp] ; ST = total_error_0 1.0 0.0 data_len - fdiv st3 ; ST = total_error_0/data_len 1.0 0.0 data_len - fldln2 ; ST = ln2 total_error_0/data_len 1.0 0.0 data_len - fmulp st1 ; ST = ln2*total_error_0/data_len 1.0 0.0 data_len - fyl2x ; ST = log2(ln2*total_error_0/data_len) 0.0 data_len mov ebx, [esp + 36] - fstp dword [ebx] ; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len) ST = 0.0 data_len + fild qword [esp] ; ST = total_error_0 1.0 data_len + fdiv st2 ; ST = total_error_0/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_0/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_0/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_0/data_len) data_len + fstp dword [ebx] ; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len) ST = data_len jmp short .rbps_1 .total_error_0_is_0: mov ebx, [esp + 36] - fst dword [ebx] ; ST = 0.0 data_len + mov [ebx], eax ; residual_bits_per_sample[0] = 0.0 .rbps_1: - cmp eax, ecx - je .total_error_1_is_0 - fld1 ; ST = 1.0 0.0 data_len + test ecx, ecx + jz .total_error_1_is_0 + fld1 ; ST = 1.0 data_len mov [esp], ecx mov [esp + 4], eax ; [esp + 0] = (uint64)total_error_1 - fild qword [esp] ; ST = total_error_1 1.0 0.0 data_len - fdiv st3 ; ST = total_error_1/data_len 1.0 0.0 data_len - fldln2 ; ST = ln2 total_error_1/data_len 1.0 0.0 data_len - fmulp st1 ; ST = ln2*total_error_1/data_len 1.0 0.0 data_len - fyl2x ; ST = log2(ln2*total_error_1/data_len) 0.0 data_len - fstp dword [ebx + 4] ; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len) ST = 0.0 data_len + fild qword [esp] ; ST = total_error_1 1.0 data_len + fdiv st2 ; ST = total_error_1/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_1/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_1/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_1/data_len) data_len + fstp dword [ebx + 4] ; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len) ST = data_len jmp short .rbps_2 .total_error_1_is_0: - fst dword [ebx + 4] ; residual_bits_per_sample[1] = 0.0 ST = 0.0 data_len + mov [ebx + 4], eax ; residual_bits_per_sample[1] = 0.0 .rbps_2: - cmp eax, edx - je .total_error_2_is_0 - fld1 ; ST = 1.0 0.0 data_len + test edx, edx + jz .total_error_2_is_0 + fld1 ; ST = 1.0 data_len mov [esp], edx mov [esp + 4], eax ; [esp + 0] = (uint64)total_error_2 - fild qword [esp] ; ST = total_error_2 1.0 0.0 data_len - fdiv st3 ; ST = total_error_2/data_len 1.0 0.0 data_len - fldln2 ; ST = ln2 total_error_2/data_len 1.0 0.0 data_len - fmulp st1 ; ST = ln2*total_error_2/data_len 1.0 0.0 data_len - fyl2x ; ST = log2(ln2*total_error_2/data_len) 0.0 data_len - fstp dword [ebx + 8] ; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len) ST = 0.0 data_len + fild qword [esp] ; ST = total_error_2 1.0 data_len + fdiv st2 ; ST = total_error_2/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_2/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_2/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_2/data_len) data_len + fstp dword [ebx + 8] ; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len) ST = data_len jmp short .rbps_3 .total_error_2_is_0: - fst dword [ebx + 8] ; residual_bits_per_sample[2] = 0.0 ST = 0.0 data_len + mov [ebx + 8], eax ; residual_bits_per_sample[2] = 0.0 .rbps_3: - cmp eax, esi - je .total_error_3_is_0 - fld1 ; ST = 1.0 0.0 data_len + test esi, esi + jz .total_error_3_is_0 + fld1 ; ST = 1.0 data_len mov [esp], esi mov [esp + 4], eax ; [esp + 0] = (uint64)total_error_3 - fild qword [esp] ; ST = total_error_3 1.0 0.0 data_len - fdiv st3 ; ST = total_error_3/data_len 1.0 0.0 data_len - fldln2 ; ST = ln2 total_error_3/data_len 1.0 0.0 data_len - fmulp st1 ; ST = ln2*total_error_3/data_len 1.0 0.0 data_len - fyl2x ; ST = log2(ln2*total_error_3/data_len) 0.0 data_len - fstp dword [ebx + 12] ; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len) ST = 0.0 data_len + fild qword [esp] ; ST = total_error_3 1.0 data_len + fdiv st2 ; ST = total_error_3/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_3/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_3/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_3/data_len) data_len + fstp dword [ebx + 12] ; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len) ST = data_len jmp short .rbps_4 .total_error_3_is_0: - fst dword [ebx + 12] ; residual_bits_per_sample[3] = 0.0 ST = 0.0 data_len + mov [ebx + 12], eax ; residual_bits_per_sample[3] = 0.0 .rbps_4: - cmp eax, edi - je .total_error_4_is_0 - fld1 ; ST = 1.0 0.0 data_len + test edi, edi + jz .total_error_4_is_0 + fld1 ; ST = 1.0 data_len mov [esp], edi mov [esp + 4], eax ; [esp + 0] = (uint64)total_error_4 - fild qword [esp] ; ST = total_error_4 1.0 0.0 data_len - fdiv st3 ; ST = total_error_4/data_len 1.0 0.0 data_len - fldln2 ; ST = ln2 total_error_4/data_len 1.0 0.0 data_len - fmulp st1 ; ST = ln2*total_error_4/data_len 1.0 0.0 data_len - fyl2x ; ST = log2(ln2*total_error_4/data_len) 0.0 data_len - fstp dword [ebx + 16] ; residual_bits_per_sample[2] = log2(ln2*total_error_4/data_len) ST = 0.0 data_len + fild qword [esp] ; ST = total_error_4 1.0 data_len + fdiv st2 ; ST = total_error_4/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_4/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_4/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_4/data_len) data_len + fstp dword [ebx + 16] ; residual_bits_per_sample[4] = log2(ln2*total_error_4/data_len) ST = data_len jmp short .rbps_end .total_error_4_is_0: - fst dword [ebx + 16] ; residual_bits_per_sample[2] = 0.0 ST = 0.0 data_len + mov [ebx + 16], eax ; residual_bits_per_sample[4] = 0.0 .rbps_end: - fstp st0 ; ST = data_len fstp st0 ; ST = [empty] jmp short .end .data_len_is_0: