From 7e9278934e314f62f25932261b4c4197d06cd4f3 Mon Sep 17 00:00:00 2001 From: Erik de Castro Lopo Date: Tue, 7 Jan 2014 21:35:03 +1100 Subject: [PATCH] libFLAC : Add asm versions for two _wide() functions. GCC generates slow ia32 code for FLAC__lpc_restore_signal_wide() and FLAC__lpc_compute_residual_from_qlp_coefficients_wide() so 24-bit encoding/decoding is slower for GCC compile than for MSVS or ICC compile. This patch adds ia32 asm versions of these functions. Patch-from: lvqcl --- src/libFLAC/ia32/lpc_asm.nasm | 563 ++++++++++++++++++++++++++++++ src/libFLAC/include/private/lpc.h | 2 + src/libFLAC/stream_decoder.c | 3 +- src/libFLAC/stream_encoder.c | 1 + 4 files changed, 568 insertions(+), 1 deletion(-) diff --git a/src/libFLAC/ia32/lpc_asm.nasm b/src/libFLAC/ia32/lpc_asm.nasm index 2b7a8019..432437d8 100644 --- a/src/libFLAC/ia32/lpc_asm.nasm +++ b/src/libFLAC/ia32/lpc_asm.nasm @@ -43,8 +43,10 @@ cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx +cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 cglobal FLAC__lpc_restore_signal_asm_ia32 cglobal FLAC__lpc_restore_signal_asm_ia32_mmx +cglobal FLAC__lpc_restore_signal_wide_asm_ia32 code_section @@ -1603,4 +1605,565 @@ cident FLAC__lpc_restore_signal_asm_ia32_mmx pop ebp ret + +; ********************************************************************** +; +;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) +; { +; unsigned i, j; +; FLAC__int64 sum; +; +; FLAC__ASSERT(order > 0); +; +; for(i = 0; i < data_len; i++) { +; sum = 0; +; for(j = 0; j < order; j++) +; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; +; residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization); +; } +; } + ALIGN 16 +cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 + ;[esp + 40] residual[] + ;[esp + 36] lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] data[] + + ;ASSERT(order > 0) + ;ASSERT(order <= 32) + + push ebp + push ebx + push esi + push edi + + mov ebx, [esp + 24] ; ebx = data_len + test ebx, ebx + jz near .end ; do nothing if data_len == 0 + +.begin: + mov eax, [esp + 32] ; eax = order + cmp eax, 1 + jg short .i_32 + + mov esi, [esp + 40] ; esi = residual[] + mov edi, [esp + 20] ; edi = data[] + mov ecx, [esp + 28] ; ecx = qlp_coeff[] + mov ebp, [ecx] ; ebp = qlp_coeff[0] + mov eax, [edi - 4] ; eax = data[-1] + mov cl, [esp + 36] ; cl = lp_quantization + ALIGN 16 +.i_1_loop_i: + imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1] + shrd eax, edx, cl ; 0 <= lp_quantization <= 15 + neg eax + add eax, [edi] + mov [esi], eax + mov eax, [edi] + add esi, 4 + add edi, 4 + dec ebx + jnz .i_1_loop_i + jmp .end + +.mov_eip_to_eax: + mov eax, [esp] + ret + +.i_32: ; eax = order + neg eax + add eax, eax + lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] + call .mov_eip_to_eax +.get_eip0: + add ebp, eax + inc ebp ; compensate for the shorter opcode on the last iteration + + mov ebx, [esp + 28] ; ebx = qlp_coeff[] + mov edi, [esp + 20] ; edi = data[] + sub [esp + 40], edi ; residual[] -= data[] + + xor ecx, ecx + xor esi, esi + jmp ebp + +;eax = -- +;edx = -- +;ecx = 0 +;esi = 0 +; +;ebx = qlp_coeff[] +;edi = data[] +;ebp = 
@address + + mov eax, [ebx + 124] ; eax = qlp_coeff[31] + imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[31] * data[i-32] + + mov eax, [ebx + 120] ; eax = qlp_coeff[30] + imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[30] * data[i-31] + + mov eax, [ebx + 116] + imul dword [edi - 120] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 112] + imul dword [edi - 116] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 108] + imul dword [edi - 112] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 104] + imul dword [edi - 108] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 100] + imul dword [edi - 104] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 96] + imul dword [edi - 100] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 92] + imul dword [edi - 96] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 88] + imul dword [edi - 92] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 84] + imul dword [edi - 88] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 80] + imul dword [edi - 84] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 76] + imul dword [edi - 80] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 72] + imul dword [edi - 76] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 68] + imul dword [edi - 72] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 64] + imul dword [edi - 68] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 60] + imul dword [edi - 64] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 56] + imul dword [edi - 60] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 52] + imul dword [edi - 56] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 48] + imul dword [edi - 52] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 44] + imul dword [edi - 48] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 40] + imul dword [edi - 44] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 36] + imul dword [edi - 40] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 32] + imul dword [edi - 36] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 28] + imul dword [edi - 32] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 24] + imul dword [edi - 28] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 20] + imul dword [edi - 24] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 16] + imul dword [edi - 20] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 12] + imul dword [edi - 16] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 8] + imul dword [edi - 12] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 4] + imul dword [edi - 8] + add ecx, eax + adc esi, edx + + mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction) + imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1] + +.jumper_0: + mov edx, ecx +;esi:edx = sum + mov ecx, [esp + 36] ; cl = lp_quantization + shrd edx, esi, cl ; edx = (sum >> lp_quantization) +;eax = -- +;ecx = -- +;edx = sum >> lp_q +;esi = -- + neg edx ; edx = -(sum >> lp_quantization) + mov eax, [esp + 40] ; residual[] - data[] + add edx, [edi] ; edx = data[i] - (sum >> lp_quantization) + mov [edi + eax], edx + add edi, 4 + + dec dword [esp + 24] + jz short .end + xor ecx, ecx + xor esi, esi + jmp ebp + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + +; ********************************************************************** +; +; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 
qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) +; { +; unsigned i, j; +; FLAC__int64 sum; +; +; FLAC__ASSERT(order > 0); +; +; for(i = 0; i < data_len; i++) { +; sum = 0; +; for(j = 0; j < order; j++) +; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; +; data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization); +; } +; } + ALIGN 16 +cident FLAC__lpc_restore_signal_wide_asm_ia32 + ;[esp + 40] data[] + ;[esp + 36] lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] residual[] + + ;ASSERT(order > 0) + ;ASSERT(order <= 32) + + push ebp + push ebx + push esi + push edi + + mov ebx, [esp + 24] ; ebx = data_len + test ebx, ebx + jz near .end ; do nothing if data_len == 0 + +.begin: + mov eax, [esp + 32] ; eax = order + cmp eax, 1 + jg short .x87_32 + + mov esi, [esp + 20] ; esi = residual[] + mov edi, [esp + 40] ; edi = data[] + mov ecx, [esp + 28] ; ecx = qlp_coeff[] + mov ebp, [ecx] ; ebp = qlp_coeff[0] + mov eax, [edi - 4] ; eax = data[-1] + mov cl, [esp + 36] ; cl = lp_quantization + ALIGN 16 +.x87_1_loop_i: + imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1] + shrd eax, edx, cl ; 0 <= lp_quantization <= 15 +; + add eax, [esi] + mov [edi], eax +; + add esi, 4 + add edi, 4 + dec ebx + jnz .x87_1_loop_i + jmp .end + +.mov_eip_to_eax: + mov eax, [esp] + ret + +.x87_32: ; eax = order + neg eax + add eax, eax + lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] + call .mov_eip_to_eax +.get_eip0: + add ebp, eax + inc ebp ; compensate for the shorter opcode on the last iteration + + mov ebx, [esp + 28] ; ebx = qlp_coeff[] + mov edi, [esp + 40] ; esi = data[] + sub [esp + 20], edi ; residual[] -= data[] + + xor ecx, ecx + xor esi, esi + jmp ebp + +;eax = -- +;edx = -- +;ecx = 0 +;esi = 0 +; +;ebx = qlp_coeff[] +;edi = data[] +;ebp = @address + + mov eax, [ebx + 124] ; eax = qlp_coeff[31] + imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[31] * data[i-32] + + mov eax, [ebx + 120] ; eax = qlp_coeff[30] + imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[30] * data[i-31] + + mov eax, [ebx + 116] + imul dword [edi - 120] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 112] + imul dword [edi - 116] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 108] + imul dword [edi - 112] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 104] + imul dword [edi - 108] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 100] + imul dword [edi - 104] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 96] + imul dword [edi - 100] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 92] + imul dword [edi - 96] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 88] + imul dword [edi - 92] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 84] + imul dword [edi - 88] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 80] + imul dword [edi - 84] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 76] + imul dword [edi - 80] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 72] + imul dword [edi - 76] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 68] + imul dword [edi - 72] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 64] + imul dword [edi - 68] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 60] + imul dword [edi - 64] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 56] + imul dword [edi - 60] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 52] + imul dword [edi - 56] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 
48] + imul dword [edi - 52] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 44] + imul dword [edi - 48] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 40] + imul dword [edi - 44] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 36] + imul dword [edi - 40] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 32] + imul dword [edi - 36] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 28] + imul dword [edi - 32] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 24] + imul dword [edi - 28] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 20] + imul dword [edi - 24] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 16] + imul dword [edi - 20] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 12] + imul dword [edi - 16] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 8] + imul dword [edi - 12] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 4] + imul dword [edi - 8] + add ecx, eax + adc esi, edx + + mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction) + imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1] + +.jumper_0: + mov edx, ecx +;esi:edx = sum + mov ecx, [esp + 36] ; cl = lp_quantization + shrd edx, esi, cl ; edx = (sum >> lp_quantization) +;eax = -- +;ecx = -- +;edx = sum >> lp_q +;esi = -- +; + mov eax, [esp + 20] ; residual[] - data[] + add edx, [edi + eax] ; edx = residual[i] + (sum >> lp_quantization) + mov [edi], edx ; data[i] = residual[i] + (sum >> lp_quantization) + add edi, 4 + + dec dword [esp + 24] + jz short .end + xor ecx, ecx + xor esi, esi + jmp ebp + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + ; end diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h index 2e8c4b5e..27760b48 100644 --- a/src/libFLAC/include/private/lpc.h +++ b/src/libFLAC/include/private/lpc.h @@ -152,6 +152,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *da # ifdef FLAC__HAS_NASM void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]); void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]); +void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]); # endif # endif # if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN @@ -187,6 +188,7 @@ void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_l # ifdef FLAC__HAS_NASM void FLAC__lpc_restore_signal_asm_ia32(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]); void FLAC__lpc_restore_signal_asm_ia32_mmx(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]); +void FLAC__lpc_restore_signal_wide_asm_ia32(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]); # endif /* FLAC__HAS_NASM */ # elif defined FLAC__CPU_PPC void FLAC__lpc_restore_signal_asm_ppc_altivec_16(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], 
unsigned order, int lp_quantization, FLAC__int32 data[]); diff --git a/src/libFLAC/stream_decoder.c b/src/libFLAC/stream_decoder.c index be9e2965..d8cd7142 100644 --- a/src/libFLAC/stream_decoder.c +++ b/src/libFLAC/stream_decoder.c @@ -404,6 +404,7 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_( if(decoder->private_->cpuinfo.ia32.bswap) decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap; #endif + decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_asm_ia32; if(decoder->private_->cpuinfo.ia32.mmx) { decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32; decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32_mmx; @@ -416,7 +417,7 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_( } #endif #ifdef FLAC__HAS_X86INTRIN -# ifdef FLAC__SSE4_SUPPORTED +# if defined FLAC__SSE4_SUPPORTED && 0 /* now we have FLAC__lpc_restore_signal_wide_asm_ia32() which is slightly faster */ if(decoder->private_->cpuinfo.ia32.sse41) decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41; # endif diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index 191d0f20..cbf28156 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -891,6 +891,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( # ifdef FLAC__CPU_IA32 FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32); # ifdef FLAC__HAS_NASM + encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32; if(encoder->private_->cpuinfo.ia32.sse) { if(encoder->protected_->max_lpc_order < 4) encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4;
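
Since the new asm routines share the exact signatures of the portable C versions declared in private/lpc.h, they can be cross-checked sample-for-sample. Below is a minimal consistency-check sketch, not part of the patch: it assumes an ia32 build configured with NASM support (so FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32() is actually declared and linked), that it is compiled inside the libFLAC tree with its usual include paths and defines, and the file name, buffer sizes, coefficients and quantization value are invented for illustration only.

/*
 * wide_asm_check.c -- illustrative consistency check (hypothetical file,
 * not part of this patch).  Compares the portable C implementation of
 * FLAC__lpc_compute_residual_from_qlp_coefficients_wide() with the new
 * ia32 asm version on random 24-bit-ish input.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "FLAC/ordinals.h"   /* FLAC__int32 */
#include "private/lpc.h"     /* declares both the C and _asm_ia32 variants */

#define ORDER    8
#define HISTORY  32          /* >= order, so data[i-j-1] is always valid   */
#define LEN      4096

int main(void)
{
	FLAC__int32 data[HISTORY + LEN], res_c[LEN], res_asm[LEN];
	FLAC__int32 qlp_coeff[ORDER];
	unsigned i;

	srand(1);
	for(i = 0; i < HISTORY + LEN; i++)
		data[i] = (rand() & 0xffffff) - 0x800000;   /* ~24-bit samples  */
	for(i = 0; i < ORDER; i++)
		qlp_coeff[i] = (rand() & 0x7fff) - 0x4000;  /* ~15-bit coeffs   */

	/* both functions expect 'order' samples of history before data[0],
	 * so pass a pointer past the warm-up region */
	FLAC__lpc_compute_residual_from_qlp_coefficients_wide(
		data + HISTORY, LEN, qlp_coeff, ORDER, 12, res_c);
	FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32(
		data + HISTORY, LEN, qlp_coeff, ORDER, 12, res_asm);

	if(memcmp(res_c, res_asm, sizeof(res_c)) != 0) {
		printf("MISMATCH\n");
		return 1;
	}
	printf("OK\n");
	return 0;
}

Note that lp_quantization is kept within the 0..15 range documented in the asm source ("0 <= lp_quantization <= 15"), which is what makes the shrd-based shift of the 64-bit sum valid; the same check could be repeated for FLAC__lpc_restore_signal_wide()/FLAC__lpc_restore_signal_wide_asm_ia32() by feeding the residual back through the restore functions and comparing the reconstructed data.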