From 7e9278934e314f62f25932261b4c4197d06cd4f3 Mon Sep 17 00:00:00 2001 From: Erik de Castro Lopo Date: Tue, 7 Jan 2014 21:35:03 +1100 Subject: [PATCH] libFLAC : Add asm versions for two _wide() functions. GCC generates slow ia32 code for FLAC__lpc_restore_signal_wide() and FLAC__lpc_compute_residual_from_qlp_coefficients_wide() so 24-bit encoding/decoding is slower for GCC compile than for MSVS or ICC compile. This patch adds ia32 asm versions of these functions. Patch-from: lvqcl --- src/libFLAC/ia32/lpc_asm.nasm | 563 ++++++++++++++++++++++++++++++ src/libFLAC/include/private/lpc.h | 2 + src/libFLAC/stream_decoder.c | 3 +- src/libFLAC/stream_encoder.c | 1 + 4 files changed, 568 insertions(+), 1 deletion(-) diff --git a/src/libFLAC/ia32/lpc_asm.nasm b/src/libFLAC/ia32/lpc_asm.nasm index 2b7a8019..432437d8 100644 --- a/src/libFLAC/ia32/lpc_asm.nasm +++ b/src/libFLAC/ia32/lpc_asm.nasm @@ -43,8 +43,10 @@ cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx +cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 cglobal FLAC__lpc_restore_signal_asm_ia32 cglobal FLAC__lpc_restore_signal_asm_ia32_mmx +cglobal FLAC__lpc_restore_signal_wide_asm_ia32 code_section @@ -1603,4 +1605,565 @@ cident FLAC__lpc_restore_signal_asm_ia32_mmx pop ebp ret + +; ********************************************************************** +; +;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) +; { +; unsigned i, j; +; FLAC__int64 sum; +; +; FLAC__ASSERT(order > 0); +; +; for(i = 0; i < data_len; i++) { +; sum = 0; +; for(j = 0; j < order; j++) +; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; +; residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization); +; } +; } + ALIGN 16 +cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 + ;[esp + 40] residual[] + ;[esp + 36] lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] data[] + + ;ASSERT(order > 0) + ;ASSERT(order <= 32) + + push ebp + push ebx + push esi + push edi + + mov ebx, [esp + 24] ; ebx = data_len + test ebx, ebx + jz near .end ; do nothing if data_len == 0 + +.begin: + mov eax, [esp + 32] ; eax = order + cmp eax, 1 + jg short .i_32 + + mov esi, [esp + 40] ; esi = residual[] + mov edi, [esp + 20] ; edi = data[] + mov ecx, [esp + 28] ; ecx = qlp_coeff[] + mov ebp, [ecx] ; ebp = qlp_coeff[0] + mov eax, [edi - 4] ; eax = data[-1] + mov cl, [esp + 36] ; cl = lp_quantization + ALIGN 16 +.i_1_loop_i: + imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1] + shrd eax, edx, cl ; 0 <= lp_quantization <= 15 + neg eax + add eax, [edi] + mov [esi], eax + mov eax, [edi] + add esi, 4 + add edi, 4 + dec ebx + jnz .i_1_loop_i + jmp .end + +.mov_eip_to_eax: + mov eax, [esp] + ret + +.i_32: ; eax = order + neg eax + add eax, eax + lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] + call .mov_eip_to_eax +.get_eip0: + add ebp, eax + inc ebp ; compensate for the shorter opcode on the last iteration + + mov ebx, [esp + 28] ; ebx = qlp_coeff[] + mov edi, [esp + 20] ; edi = data[] + sub [esp + 40], edi ; residual[] -= data[] + + xor ecx, ecx + xor esi, esi + jmp ebp + +;eax = -- +;edx = -- +;ecx = 0 +;esi = 0 +; +;ebx = qlp_coeff[] +;edi = data[] +;ebp = 
@address + + mov eax, [ebx + 124] ; eax = qlp_coeff[31] + imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[31] * data[i-32] + + mov eax, [ebx + 120] ; eax = qlp_coeff[30] + imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[30] * data[i-31] + + mov eax, [ebx + 116] + imul dword [edi - 120] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 112] + imul dword [edi - 116] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 108] + imul dword [edi - 112] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 104] + imul dword [edi - 108] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 100] + imul dword [edi - 104] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 96] + imul dword [edi - 100] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 92] + imul dword [edi - 96] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 88] + imul dword [edi - 92] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 84] + imul dword [edi - 88] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 80] + imul dword [edi - 84] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 76] + imul dword [edi - 80] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 72] + imul dword [edi - 76] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 68] + imul dword [edi - 72] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 64] + imul dword [edi - 68] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 60] + imul dword [edi - 64] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 56] + imul dword [edi - 60] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 52] + imul dword [edi - 56] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 48] + imul dword [edi - 52] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 44] + imul dword [edi - 48] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 40] + imul dword [edi - 44] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 36] + imul dword [edi - 40] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 32] + imul dword [edi - 36] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 28] + imul dword [edi - 32] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 24] + imul dword [edi - 28] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 20] + imul dword [edi - 24] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 16] + imul dword [edi - 20] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 12] + imul dword [edi - 16] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 8] + imul dword [edi - 12] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 4] + imul dword [edi - 8] + add ecx, eax + adc esi, edx + + mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction) + imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1] + +.jumper_0: + mov edx, ecx +;esi:edx = sum + mov ecx, [esp + 36] ; cl = lp_quantization + shrd edx, esi, cl ; edx = (sum >> lp_quantization) +;eax = -- +;ecx = -- +;edx = sum >> lp_q +;esi = -- + neg edx ; edx = -(sum >> lp_quantization) + mov eax, [esp + 40] ; residual[] - data[] + add edx, [edi] ; edx = data[i] - (sum >> lp_quantization) + mov [edi + eax], edx + add edi, 4 + + dec dword [esp + 24] + jz short .end + xor ecx, ecx + xor esi, esi + jmp ebp + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + +; ********************************************************************** +; +; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 
qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) +; { +; unsigned i, j; +; FLAC__int64 sum; +; +; FLAC__ASSERT(order > 0); +; +; for(i = 0; i < data_len; i++) { +; sum = 0; +; for(j = 0; j < order; j++) +; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; +; data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization); +; } +; } + ALIGN 16 +cident FLAC__lpc_restore_signal_wide_asm_ia32 + ;[esp + 40] data[] + ;[esp + 36] lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] residual[] + + ;ASSERT(order > 0) + ;ASSERT(order <= 32) + + push ebp + push ebx + push esi + push edi + + mov ebx, [esp + 24] ; ebx = data_len + test ebx, ebx + jz near .end ; do nothing if data_len == 0 + +.begin: + mov eax, [esp + 32] ; eax = order + cmp eax, 1 + jg short .x87_32 + + mov esi, [esp + 20] ; esi = residual[] + mov edi, [esp + 40] ; edi = data[] + mov ecx, [esp + 28] ; ecx = qlp_coeff[] + mov ebp, [ecx] ; ebp = qlp_coeff[0] + mov eax, [edi - 4] ; eax = data[-1] + mov cl, [esp + 36] ; cl = lp_quantization + ALIGN 16 +.x87_1_loop_i: + imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1] + shrd eax, edx, cl ; 0 <= lp_quantization <= 15 +; + add eax, [esi] + mov [edi], eax +; + add esi, 4 + add edi, 4 + dec ebx + jnz .x87_1_loop_i + jmp .end + +.mov_eip_to_eax: + mov eax, [esp] + ret + +.x87_32: ; eax = order + neg eax + add eax, eax + lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] + call .mov_eip_to_eax +.get_eip0: + add ebp, eax + inc ebp ; compensate for the shorter opcode on the last iteration + + mov ebx, [esp + 28] ; ebx = qlp_coeff[] + mov edi, [esp + 40] ; esi = data[] + sub [esp + 20], edi ; residual[] -= data[] + + xor ecx, ecx + xor esi, esi + jmp ebp + +;eax = -- +;edx = -- +;ecx = 0 +;esi = 0 +; +;ebx = qlp_coeff[] +;edi = data[] +;ebp = @address + + mov eax, [ebx + 124] ; eax = qlp_coeff[31] + imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[31] * data[i-32] + + mov eax, [ebx + 120] ; eax = qlp_coeff[30] + imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[30] * data[i-31] + + mov eax, [ebx + 116] + imul dword [edi - 120] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 112] + imul dword [edi - 116] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 108] + imul dword [edi - 112] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 104] + imul dword [edi - 108] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 100] + imul dword [edi - 104] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 96] + imul dword [edi - 100] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 92] + imul dword [edi - 96] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 88] + imul dword [edi - 92] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 84] + imul dword [edi - 88] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 80] + imul dword [edi - 84] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 76] + imul dword [edi - 80] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 72] + imul dword [edi - 76] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 68] + imul dword [edi - 72] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 64] + imul dword [edi - 68] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 60] + imul dword [edi - 64] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 56] + imul dword [edi - 60] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 52] + imul dword [edi - 56] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 
48] + imul dword [edi - 52] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 44] + imul dword [edi - 48] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 40] + imul dword [edi - 44] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 36] + imul dword [edi - 40] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 32] + imul dword [edi - 36] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 28] + imul dword [edi - 32] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 24] + imul dword [edi - 28] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 20] + imul dword [edi - 24] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 16] + imul dword [edi - 20] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 12] + imul dword [edi - 16] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 8] + imul dword [edi - 12] + add ecx, eax + adc esi, edx + + mov eax, [ebx + 4] + imul dword [edi - 8] + add ecx, eax + adc esi, edx + + mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction) + imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1] + add ecx, eax + adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1] + +.jumper_0: + mov edx, ecx +;esi:edx = sum + mov ecx, [esp + 36] ; cl = lp_quantization + shrd edx, esi, cl ; edx = (sum >> lp_quantization) +;eax = -- +;ecx = -- +;edx = sum >> lp_q +;esi = -- +; + mov eax, [esp + 20] ; residual[] - data[] + add edx, [edi + eax] ; edx = residual[i] + (sum >> lp_quantization) + mov [edi], edx ; data[i] = residual[i] + (sum >> lp_quantization) + add edi, 4 + + dec dword [esp + 24] + jz short .end + xor ecx, ecx + xor esi, esi + jmp ebp + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + ; end diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h index 2e8c4b5e..27760b48 100644 --- a/src/libFLAC/include/private/lpc.h +++ b/src/libFLAC/include/private/lpc.h @@ -152,6 +152,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *da # ifdef FLAC__HAS_NASM void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]); void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]); +void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]); # endif # endif # if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN @@ -187,6 +188,7 @@ void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_l # ifdef FLAC__HAS_NASM void FLAC__lpc_restore_signal_asm_ia32(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]); void FLAC__lpc_restore_signal_asm_ia32_mmx(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]); +void FLAC__lpc_restore_signal_wide_asm_ia32(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]); # endif /* FLAC__HAS_NASM */ # elif defined FLAC__CPU_PPC void FLAC__lpc_restore_signal_asm_ppc_altivec_16(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], 
unsigned order, int lp_quantization, FLAC__int32 data[]); diff --git a/src/libFLAC/stream_decoder.c b/src/libFLAC/stream_decoder.c index be9e2965..d8cd7142 100644 --- a/src/libFLAC/stream_decoder.c +++ b/src/libFLAC/stream_decoder.c @@ -404,6 +404,7 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_( if(decoder->private_->cpuinfo.ia32.bswap) decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap; #endif + decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_asm_ia32; if(decoder->private_->cpuinfo.ia32.mmx) { decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32; decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32_mmx; @@ -416,7 +417,7 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_( } #endif #ifdef FLAC__HAS_X86INTRIN -# ifdef FLAC__SSE4_SUPPORTED +# if defined FLAC__SSE4_SUPPORTED && 0 /* now we have FLAC__lpc_restore_signal_wide_asm_ia32() which is slightly faster */ if(decoder->private_->cpuinfo.ia32.sse41) decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41; # endif diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index 191d0f20..cbf28156 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -891,6 +891,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( # ifdef FLAC__CPU_IA32 FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32); # ifdef FLAC__HAS_NASM + encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32; if(encoder->private_->cpuinfo.ia32.sse) { if(encoder->protected_->max_lpc_order < 4) encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4;
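
Since the new asm routines share the exact signatures of the portable C versions declared in private/lpc.h, they can be cross-checked sample-for-sample. Below is a minimal consistency-check sketch, not part of the patch: it assumes an ia32 build configured with NASM support (so FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32() is actually declared and linked), that it is compiled inside the libFLAC tree with its usual include paths and defines, and the file name, buffer sizes, coefficients and quantization value are invented for illustration only.

/*
 * wide_asm_check.c -- illustrative consistency check (hypothetical file,
 * not part of this patch).  Compares the portable C implementation of
 * FLAC__lpc_compute_residual_from_qlp_coefficients_wide() with the new
 * ia32 asm version on random 24-bit-ish input.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "FLAC/ordinals.h"   /* FLAC__int32 */
#include "private/lpc.h"     /* declares both the C and _asm_ia32 variants */

#define ORDER    8
#define HISTORY  32          /* >= order, so data[i-j-1] is always valid   */
#define LEN      4096

int main(void)
{
	FLAC__int32 data[HISTORY + LEN], res_c[LEN], res_asm[LEN];
	FLAC__int32 qlp_coeff[ORDER];
	unsigned i;

	srand(1);
	for(i = 0; i < HISTORY + LEN; i++)
		data[i] = (rand() & 0xffffff) - 0x800000;   /* ~24-bit samples  */
	for(i = 0; i < ORDER; i++)
		qlp_coeff[i] = (rand() & 0x7fff) - 0x4000;  /* ~15-bit coeffs   */

	/* both functions expect 'order' samples of history before data[0],
	 * so pass a pointer past the warm-up region */
	FLAC__lpc_compute_residual_from_qlp_coefficients_wide(
		data + HISTORY, LEN, qlp_coeff, ORDER, 12, res_c);
	FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32(
		data + HISTORY, LEN, qlp_coeff, ORDER, 12, res_asm);

	if(memcmp(res_c, res_asm, sizeof(res_c)) != 0) {
		printf("MISMATCH\n");
		return 1;
	}
	printf("OK\n");
	return 0;
}

Note that lp_quantization is kept within the 0..15 range documented in the asm source ("0 <= lp_quantization <= 15"), which is what makes the shrd-based shift of the 64-bit sum valid; the same check could be repeated for FLAC__lpc_restore_signal_wide()/FLAC__lpc_restore_signal_wide_asm_ia32() by feeding the residual back through the restore functions and comparing the reconstructed data.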