optimizations

2025-12-16 18:14:25 +00:00 · 2009-09-14 19:11:03 +00:00
parent ded902bec2
commit eba87d6db0
3 changed files with 326 additions and 291 deletions
--- a/CUETools.FlaCuda/FlaCudaWriter.cs
+++ b/CUETools.FlaCuda/FlaCudaWriter.cs
@@ -59,6 +59,7 @@ namespace CUETools.Codecs.FlaCuda
 		int max_frame_size;
 		byte[] frame_buffer = null;
 		BitWriter frame_writer = null;
 		int frame_count = 0;
@@ -70,7 +71,6 @@ namespace CUETools.Codecs.FlaCuda
 		// allocated by flake_encode_init and freed by flake_encode_close
 		byte[] header;
 		int[] verifyBuffer;
 		int[] residualBuffer;
 		float[] windowBuffer;
 		byte[] md5_buffer;
@@ -85,7 +85,6 @@ namespace CUETools.Codecs.FlaCuda
 		Crc16 crc16;
 		MD5 md5;
 		FlacFrame _frame;
 		FlakeReader verify;
 		SeekPoint[] seek_table;
@@ -94,31 +93,18 @@ namespace CUETools.Codecs.FlaCuda
 		bool inited = false;
 		CUDA cuda;
-		CUfunction cudaComputeAutocor;
+		FlaCudaTask task1;
-		CUfunction cudaComputeLPC;
+		FlaCudaTask task2;
-		CUfunction cudaEstimateResidual;
+
 		CUfunction cudaSumResidualChunks;
 		CUfunction cudaSumResidual;
 		CUfunction cudaEncodeResidual;
 		CUdeviceptr cudaSamples;
 		CUdeviceptr cudaWindow;
 		CUdeviceptr cudaAutocorTasks;
 		CUdeviceptr cudaAutocorOutput;
 		CUdeviceptr cudaResidualTasks;
 		CUdeviceptr cudaResidualOutput;
 		IntPtr samplesBufferPtr = IntPtr.Zero;
 		IntPtr autocorTasksPtr = IntPtr.Zero;
 		IntPtr residualTasksPtr = IntPtr.Zero;
 		CUstream cudaStream;
 		CUstream cudaStream1;
 		int nResidualTasks = 0;
 		int nAutocorTasks = 0;
 		int maxFrames = 8;
-		const int MAX_BLOCKSIZE = 4608 * 4;
+		public const int MAX_BLOCKSIZE = 4608 * 4;
-		const int maxResidualParts = (MAX_BLOCKSIZE + 32 * 3) / (32 * 3);
+		internal const int maxFrames = 8;
-		const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32);
+		internal const int maxResidualParts = (MAX_BLOCKSIZE + 32 * 3) / (32 * 3);
 		internal const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32);
 		public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
 		{
@@ -145,7 +131,6 @@ namespace CUETools.Codecs.FlaCuda
 			crc8 = new Crc8();
 			crc16 = new Crc16();
 			_frame = new FlacFrame(channels * 2);
 		}
 		public int TotalSize
@@ -192,11 +177,12 @@ namespace CUETools.Codecs.FlaCuda
 		{
 			if (inited)
 			{
-				while (samplesInBuffer > 0)
+				if (samplesInBuffer > 0)
 				{
 					eparams.block_size = samplesInBuffer;
-					output_frames();
+					output_frames(1);
 				}
 				samplesInBuffer = 0;
 				if (_IO.CanSeek)
 				{
@@ -226,16 +212,8 @@ namespace CUETools.Codecs.FlaCuda
 				_IO.Close();
 				cuda.Free(cudaWindow);
-				cuda.Free(cudaSamples);
+				task1.Dispose();
-				cuda.Free(cudaAutocorTasks);
+				task2.Dispose();
 				cuda.Free(cudaAutocorOutput);
 				cuda.Free(cudaResidualTasks);
 				cuda.Free(cudaResidualOutput);
 				CUDADriver.cuMemFreeHost(samplesBufferPtr);
 				CUDADriver.cuMemFreeHost(residualTasksPtr);
 				CUDADriver.cuMemFreeHost(autocorTasksPtr);
 				cuda.DestroyStream(cudaStream);
 				cuda.DestroyStream(cudaStream1);
 				cuda.Dispose();
 				inited = false;
 			}
@@ -258,16 +236,8 @@ namespace CUETools.Codecs.FlaCuda
 			{
 				_IO.Close();
 				cuda.Free(cudaWindow);
-				cuda.Free(cudaSamples);
+				task1.Dispose();
-				cuda.Free(cudaAutocorTasks);
+				task2.Dispose();
 				cuda.Free(cudaAutocorOutput);
 				cuda.Free(cudaResidualTasks);
 				cuda.Free(cudaResidualOutput);
 				CUDADriver.cuMemFreeHost(samplesBufferPtr);
 				CUDADriver.cuMemFreeHost(residualTasksPtr);
 				CUDADriver.cuMemFreeHost(autocorTasksPtr);
 				cuda.DestroyStream(cudaStream);
 				cuda.DestroyStream(cudaStream1);
 				cuda.Dispose();
 				inited = false;
 			}
@@ -450,17 +420,18 @@ namespace CUETools.Codecs.FlaCuda
 		/// <param name="samples"></param>
 		/// <param name="pos"></param>
 		/// <param name="block"></param>
- 		unsafe void copy_samples(int[,] samples, int pos, int block)
+ 		unsafe void copy_samples(int[,] samples, int pos, int block, FlaCudaTask task)
 		{
-			int* fsamples = (int*)samplesBufferPtr;
+			int* s = ((int*)task.samplesBufferPtr) + samplesInBuffer;
 			fixed (int *src = &samples[pos, 0])
 			{
-				if (channels == 2)
+				if (channels == 2 && eparams.do_midside)
-					AudioSamples.Deinterlace(fsamples + samplesInBuffer, fsamples + FlaCudaWriter.MAX_BLOCKSIZE + samplesInBuffer, src, block);
+					channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE,
 						s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, src, block);
 				else
 					for (int ch = 0; ch < channels; ch++)
 					{
-						int* psamples = fsamples + ch * FlaCudaWriter.MAX_BLOCKSIZE + samplesInBuffer;
+						int* psamples = s + ch * FlaCudaWriter.MAX_BLOCKSIZE;
 						for (int i = 0; i < block; i++)
 							psamples[i] = src[i * channels + ch];
 					}
@@ -509,20 +480,19 @@ namespace CUETools.Codecs.FlaCuda
 			return k_opt;
 		}
-		unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize)
+		unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int* src, int blocksize)
 		{
 			for (int i = 0; i < blocksize; i++)
 			{
-				leftM[i] = (leftS[i] + rightS[i]) >> 1;
+				int l = *(src++);
-				rightM[i] = leftS[i] - rightS[i];
+				int r = *(src++);
 				leftS[i] = l;
 				rightS[i] = r;
 				leftM[i] = (l + r) >> 1;
 				rightM[i] = l - r;
 			}
 		}
 		unsafe void encode_residual_verbatim(int* res, int* smp, uint n)
 		{
 			AudioSamples.MemCpy(res, smp, (int) n);
 		}
 		unsafe void encode_residual_fixed(int* res, int* smp, int n, int order)
 		{
 			int i;
@@ -624,9 +594,8 @@ namespace CUETools.Codecs.FlaCuda
 			}
 		}
-		static unsafe uint calc_rice_params(ref RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order)
+		static unsafe uint calc_rice_params(ref RiceContext rc, ref RiceContext tmp_rc, int pmin, int pmax, int* data, uint n, uint pred_order)
 		{
 			RiceContext tmp_rc = new RiceContext(), tmp_rc2;
 			uint* udata = stackalloc uint[(int)n];
 			uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
 			//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
@@ -649,7 +618,7 @@ namespace CUETools.Codecs.FlaCuda
 				{
 					opt_porder = i;
 					opt_bits = bits;
-					tmp_rc2 = rc;
+					RiceContext tmp_rc2 = rc;
 					rc = tmp_rc;
 					tmp_rc = tmp_rc2;
 				}
@@ -666,42 +635,6 @@ namespace CUETools.Codecs.FlaCuda
 			return porder;
 		}
 		static unsafe uint calc_rice_params_fixed(ref RiceContext rc, int pmin, int pmax,
 			int* data, int n, int pred_order, uint bps)
 		{
 			pmin = get_max_p_order(pmin, n, pred_order);
 			pmax = get_max_p_order(pmax, n, pred_order);
 			uint bits = (uint)pred_order * bps + 6;
 			bits += calc_rice_params(ref rc, pmin, pmax, data, (uint)n, (uint)pred_order);
 			return bits;
 		}
 		static unsafe uint calc_rice_params_lpc(ref RiceContext rc, int pmin, int pmax,
 			int* data, int n, int pred_order, uint bps, uint precision)
 		{
 			pmin = get_max_p_order(pmin, n, pred_order);
 			pmax = get_max_p_order(pmax, n, pred_order);
 			uint bits = (uint)pred_order * bps + 4 + 5 + (uint)pred_order * precision + 6;
 			bits += calc_rice_params(ref rc, pmin, pmax, data, (uint)n, (uint)pred_order);
 			return bits;
 		}
 		// select LPC precision based on block size
 		static uint get_precision(int blocksize)
 		{
 			uint lpc_precision;
 			if (blocksize <= 192) lpc_precision = 7U;
 			else if (blocksize <= 384) lpc_precision = 8U;
 			else if (blocksize <= 576) lpc_precision = 9U;
 			else if (blocksize <= 1152) lpc_precision = 10U;
 			else if (blocksize <= 2304) lpc_precision = 11U;
 			else if (blocksize <= 4608) lpc_precision = 12U;
 			else if (blocksize <= 8192) lpc_precision = 13U;
 			else if (blocksize <= 16384) lpc_precision = 14U;
 			else lpc_precision = 15;
 			return lpc_precision;
 		}
 		unsafe void output_frame_header(FlacFrame frame, BitWriter bitwriter)
 		{
 			bitwriter.writebits(15, 0x7FFC);
@@ -799,14 +732,10 @@ namespace CUETools.Codecs.FlaCuda
 				bitwriter.writebits_signed(sub.obits, sub.samples[i]);
 			// LPC coefficients
-			int cbits = 1;
+			bitwriter.writebits(4, sub.best.cbits - 1);
 			for (int i = 0; i < sub.best.order; i++)
 				while (cbits < 16 && sub.best.coefs[i] != (sub.best.coefs[i] << (32 - cbits)) >> (32 - cbits))
 					cbits++;
 			bitwriter.writebits(4, cbits - 1);
 			bitwriter.writebits_signed(5, sub.best.shift);
 			for (int i = 0; i < sub.best.order; i++)
-				bitwriter.writebits_signed(cbits, sub.best.coefs[i]);
+				bitwriter.writebits_signed(sub.best.cbits, sub.best.coefs[i]);
 			// residual
 			output_residual(frame, bitwriter, sub);
@@ -829,6 +758,9 @@ namespace CUETools.Codecs.FlaCuda
 				if (sub.wbits > 0)
 					bitwriter.writebits((int)sub.wbits, 1);
 				//if (frame_writer.Length >= frame_buffer.Length)
 				//    throw new Exception("buffer overflow");
 				// subframe
 				switch (sub.best.type)
 				{
@@ -845,6 +777,8 @@ namespace CUETools.Codecs.FlaCuda
 						output_subframe_lpc(frame, bitwriter, sub);
 						break;
 				}
 				//if (frame_writer.Length >= frame_buffer.Length)
 				//    throw new Exception("buffer overflow");
 			}
 		}
@@ -914,10 +848,10 @@ namespace CUETools.Codecs.FlaCuda
 			_windowcount++;
 		}
-		unsafe void initialize_autocorTasks(int blocksize, int channelsCount, int max_order, int nFrames)
+		unsafe void initialize_autocorTasks(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
 		{
-			computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)autocorTasksPtr;
+			computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)task.autocorTasksPtr;
-			encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
+			encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)task.residualTasksPtr;
 			nAutocorTasks = 0;
 			nResidualTasks = 0;
 			for (int iFrame = 0; iFrame < nFrames; iFrame++)
@@ -975,9 +909,13 @@ namespace CUETools.Codecs.FlaCuda
 					}
 				}
 			}
-			cuda.CopyHostToDeviceAsync(cudaAutocorTasks, autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), cudaStream);
+			if (sizeof(encodeResidualTaskStruct) * nResidualTasks > task.residualTasksLen)
-			cuda.CopyHostToDeviceAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
+				throw new Exception("oops");
-			cuda.SynchronizeStream(cudaStream);
+			if (sizeof(computeAutocorTaskStruct) * nAutocorTasks > task.autocorTasksLen)
 				throw new Exception("oops");
 			cuda.CopyHostToDeviceAsync(task.cudaAutocorTasks, task.autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), task.stream);
 			cuda.CopyHostToDeviceAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), task.stream);
 			task.blocksize = blocksize;
 		}
 		unsafe void encode_residual(FlacFrame frame)
@@ -991,11 +929,15 @@ namespace CUETools.Codecs.FlaCuda
 					case SubframeType.Verbatim:
 						break;
 					case SubframeType.Fixed:
-						encode_residual_fixed(frame.subframes[ch].best.residual, frame.subframes[ch].samples,
+						{
-							frame.blocksize, frame.subframes[ch].best.order);
+							encode_residual_fixed(frame.subframes[ch].best.residual, frame.subframes[ch].samples,
-						frame.subframes[ch].best.size = calc_rice_params_fixed(
+								frame.blocksize, frame.subframes[ch].best.order);
-							ref frame.subframes[ch].best.rc, eparams.min_partition_order, eparams.max_partition_order,
+
-							frame.subframes[ch].best.residual, frame.blocksize, frame.subframes[ch].best.order, frame.subframes[ch].obits);
+							int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
 							int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
 							uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 6;
 							frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
 						}
 						break;
 					case SubframeType.LPC:
 						fixed (int* coefs = frame.subframes[ch].best.coefs)
@@ -1007,18 +949,28 @@ namespace CUETools.Codecs.FlaCuda
 								lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
 							else
 								lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
-							frame.subframes[ch].best.size = calc_rice_params_lpc(
+
-								ref frame.subframes[ch].best.rc, eparams.min_partition_order, eparams.max_partition_order,
+							int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
-								frame.subframes[ch].best.residual, frame.blocksize, frame.subframes[ch].best.order, frame.subframes[ch].obits, (uint)frame.subframes[ch].best.cbits);
+							int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
 							uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits + 6;
 							frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
 						}
 						break;
 				}
 				if (frame.subframes[ch].best.size > frame.subframes[ch].obits * (uint)frame.blocksize)
 				{
 #if DEBUG
 					throw new Exception("larger than verbatim");
 #endif
 					frame.subframes[ch].best.type = SubframeType.Verbatim;
 					frame.subframes[ch].best.size = frame.subframes[ch].obits * (uint)frame.blocksize;
 				}
 			}
 		}
-		unsafe void select_best_methods(FlacFrame frame, int channelsCount, int max_order, int iFrame)
+		unsafe void select_best_methods(FlacFrame frame, int channelsCount, int max_order, int iFrame, FlaCudaTask task)
 		{
-			encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
+			encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)task.residualTasksPtr;
 			for (int ch = 0; ch < channelsCount; ch++)
 			{
 				int i;
@@ -1089,14 +1041,13 @@ namespace CUETools.Codecs.FlaCuda
 			}
 		}
-		unsafe void estimate_residual(int blocksize, int channelsCount, int max_order, int nFrames)
+		unsafe void estimate_residual(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
 		{
 			if (blocksize <= 4)
 				return;
-			compute_autocorellation(blocksize, channelsCount, max_order, nFrames);
+			compute_autocorellation(blocksize, channelsCount, max_order, nFrames, task);
 			uint cbits = get_precision(blocksize) + 1;
 			int threads_y;
 			if (max_order >= 4 && max_order <= 8)
 				threads_y = max_order;
@@ -1118,31 +1069,29 @@ namespace CUETools.Codecs.FlaCuda
 			if (partCount > maxResidualParts)
 				throw new Exception("invalid combination of block size and LPC order");
-			cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 0, (uint)cudaResidualOutput.Pointer);
+			cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 0, (uint)task.cudaResidualOutput.Pointer);
-			cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 1, (uint)cudaSamples.Pointer);
+			cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 1, (uint)task.cudaSamples.Pointer);
-			cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 2, (uint)cudaResidualTasks.Pointer);
+			cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 2, (uint)task.cudaResidualTasks.Pointer);
-			cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 3, (uint)max_order);
+			cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 3, (uint)max_order);
-			cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 4, (uint)blocksize);
+			cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 4, (uint)blocksize);
-			cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 5, (uint)partSize);
+			cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 5, (uint)partSize);
-			cuda.SetParameterSize(cudaEstimateResidual, sizeof(uint) * 6);
+			cuda.SetParameterSize(task.cudaEstimateResidual, sizeof(uint) * 6);
-			cuda.SetFunctionBlockShape(cudaEstimateResidual, 32, threads_y, 1);
+			cuda.SetFunctionBlockShape(task.cudaEstimateResidual, 32, threads_y, 1);
-			cuda.SetParameter(cudaSumResidual, 0, (uint)cudaResidualTasks.Pointer);
+			cuda.SetParameter(task.cudaSumResidual, 0, (uint)task.cudaResidualTasks.Pointer);
-			cuda.SetParameter(cudaSumResidual, sizeof(uint), (uint)cudaResidualOutput.Pointer);
+			cuda.SetParameter(task.cudaSumResidual, sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
-			cuda.SetParameter(cudaSumResidual, sizeof(uint) * 2, (uint)partSize);
+			cuda.SetParameter(task.cudaSumResidual, sizeof(uint) * 2, (uint)partSize);
-			cuda.SetParameter(cudaSumResidual, sizeof(uint) * 3, (uint)partCount);
+			cuda.SetParameter(task.cudaSumResidual, sizeof(uint) * 3, (uint)partCount);
-			cuda.SetParameterSize(cudaSumResidual, sizeof(uint) * 4U);
+			cuda.SetParameterSize(task.cudaSumResidual, sizeof(uint) * 4U);
-			cuda.SetFunctionBlockShape(cudaSumResidual, 64, 1, 1);
+			cuda.SetFunctionBlockShape(task.cudaSumResidual, 64, 1, 1);
 			// issue work to the GPU
-			cuda.LaunchAsync(cudaEstimateResidual, partCount, (nResidualTasks / threads_y * nFrames) / maxFrames, cudaStream);
+			cuda.LaunchAsync(task.cudaEstimateResidual, partCount, (nResidualTasks / threads_y * nFrames) / maxFrames, task.stream);
-			//cuda.LaunchAsync(cudaSumResidualChunks, partCount, nResidualTasks, cudaStream);
+			cuda.LaunchAsync(task.cudaSumResidual, 1, (nResidualTasks * nFrames) / maxFrames, task.stream);
-			cuda.LaunchAsync(cudaSumResidual, 1, (nResidualTasks * nFrames) / maxFrames, cudaStream);
+			cuda.CopyDeviceToHostAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nResidualTasks * nFrames) / maxFrames)), task.stream);
 			cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nResidualTasks * nFrames) / maxFrames)), cudaStream);
 			cuda.SynchronizeStream(cudaStream);
 		}
-		unsafe void compute_autocorellation(int blocksize, int channelsCount, int max_order, int nFrames)
+		unsafe void compute_autocorellation(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
 		{
 			int autocorThreads = 256;
 			int partSize = 2 * autocorThreads - max_order;
@@ -1155,46 +1104,43 @@ namespace CUETools.Codecs.FlaCuda
 			if (blocksize <= 4)
 				return;
-			cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocorOutput.Pointer);
+			cuda.SetParameter(task.cudaComputeAutocor, 0, (uint)task.cudaAutocorOutput.Pointer);
-			cuda.SetParameter(cudaComputeAutocor, sizeof(uint), (uint)cudaSamples.Pointer);
+			cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint), (uint)task.cudaSamples.Pointer);
-			cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer);
+			cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer);
-			cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 3, (uint)cudaAutocorTasks.Pointer);
+			cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 3, (uint)task.cudaAutocorTasks.Pointer);
-			cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order);
+			cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order);
-			cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)blocksize);
+			cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)blocksize);
-			cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)partSize);
+			cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)partSize);
-			cuda.SetParameterSize(cudaComputeAutocor, (uint)(sizeof(uint) * 4) + sizeof(uint) * 3);
+			cuda.SetParameterSize(task.cudaComputeAutocor, (uint)(sizeof(uint) * 4) + sizeof(uint) * 3);
-			cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1);
+			cuda.SetFunctionBlockShape(task.cudaComputeAutocor, autocorThreads, 1, 1);
-			cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaResidualTasks.Pointer);
+			cuda.SetParameter(task.cudaComputeLPC, 0, (uint)task.cudaResidualTasks.Pointer);
-			cuda.SetParameter(cudaComputeLPC, sizeof(uint), (uint)cudaAutocorOutput.Pointer);
+			cuda.SetParameter(task.cudaComputeLPC, sizeof(uint), (uint)task.cudaAutocorOutput.Pointer);
-			cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 2, (uint)cudaAutocorTasks.Pointer);
+			cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 2, (uint)task.cudaAutocorTasks.Pointer);
-			cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 3, (uint)max_order);
+			cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3, (uint)max_order);
-			cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)partCount);
+			cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)partCount);
-			cuda.SetParameterSize(cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2);
+			cuda.SetParameterSize(task.cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2);
-			cuda.SetFunctionBlockShape(cudaComputeLPC, (partCount + 31) & ~31, 1, 1);
+			cuda.SetFunctionBlockShape(task.cudaComputeLPC, (partCount + 31) & ~31, 1, 1);
 			// issue work to the GPU
-			cuda.CopyHostToDeviceAsync(cudaSamples, samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), cudaStream);
+			cuda.LaunchAsync(task.cudaComputeAutocor, partCount, (nAutocorTasks * nFrames) / maxFrames, task.stream);
-			cuda.LaunchAsync(cudaComputeAutocor, partCount, (nAutocorTasks * nFrames) / maxFrames, cudaStream);
+			cuda.LaunchAsync(task.cudaComputeLPC, 1, (nAutocorTasks * nFrames) / maxFrames, task.stream);
 			cuda.LaunchAsync(cudaComputeLPC, 1, (nAutocorTasks * nFrames) / maxFrames, cudaStream);
 			//cuda.SynchronizeStream(cudaStream);
 			//cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream1);
 		}
-		unsafe int encode_frame(bool doMidside, int channelCount, int iFrame)
+		unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task)
 		{
 			fixed (int* r = residualBuffer)
 			{
-				FlacFrame frame = _frame;
+				FlacFrame frame = task.frame;
 				frame.InitSize(eparams.block_size, eparams.variable_block_size != 0);
 				for (int ch = 0; ch < channelCount; ch++)
 				{
-					int* s = ((int*)samplesBufferPtr) + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * eparams.block_size;
+					int* s = ((int*)task.samplesBufferPtr) + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * eparams.block_size;
 					frame.subframes[ch].Init(s, r + ch * FlaCudaWriter.MAX_BLOCKSIZE,
 						bits_per_sample + (doMidside && ch == 3 ? 1U : 0U), 0);// get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize));
 				}
-				select_best_methods(frame, channelCount, eparams.max_prediction_order, iFrame);
+				select_best_methods(frame, channelCount, eparams.max_prediction_order, iFrame, task);
 				if (doMidside)
 				{
@@ -1206,11 +1152,13 @@ namespace CUETools.Codecs.FlaCuda
 				encode_residual(frame);
-				BitWriter bitwriter = new BitWriter(frame_buffer, 0, max_frame_size);
+				frame_writer.Reset();
-				output_frame_header(frame, bitwriter);
+				output_frame_header(frame, frame_writer);
-				output_subframes(frame, bitwriter);
+				output_subframes(frame, frame_writer);
-				output_frame_footer(bitwriter);
+				output_frame_footer(frame_writer);
 				if (frame_writer.Length >= frame_buffer.Length)
 					throw new Exception("buffer overflow");
 				if (frame_buffer != null)
 				{
@@ -1219,26 +1167,22 @@ namespace CUETools.Codecs.FlaCuda
 					else
 						frame_count++;
 				}
-				return bitwriter.Length;
+				return frame_writer.Length;
 			}
 		}
-		unsafe int output_frames()
+		unsafe void send_to_GPU(int nFrames, FlaCudaTask task)
 		{
 			bool doMidside = channels == 2 && eparams.do_midside;
 			int channelCount = doMidside ? 2 * channels : channels;
 			int nFrames = Math.Min(samplesInBuffer / eparams.block_size, maxFrames);
-			if (nFrames < 1)
+			cuda.CopyHostToDeviceAsync(task.cudaSamples, task.samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount), task.stream);
-				throw new Exception("oops");
+		}
-			if (verify != null)
+		unsafe void run_GPU_task(int nFrames, FlaCudaTask task)
-			{
+		{
-				int* r = (int*)samplesBufferPtr;
+			bool doMidside = channels == 2 && eparams.do_midside;
-				fixed (int* s = verifyBuffer)
+			int channelCount = doMidside ? 2 * channels : channels;
 					for (int ch = 0; ch < channels; ch++)
 						AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, samplesInBuffer);
 			}
 			if (eparams.block_size != _windowsize && eparams.block_size > 4)
 				fixed (float* window = windowBuffer)
@@ -1253,17 +1197,25 @@ namespace CUETools.Codecs.FlaCuda
 					if (_windowcount == 0)
 						throw new Exception("invalid windowfunction");
 					cuda.CopyHostToDevice<float>(cudaWindow, windowBuffer);
 					initialize_autocorTasks(eparams.block_size, channelCount, eparams.max_prediction_order, maxFrames);
 				}
 			if (eparams.block_size != task.blocksize)
 				initialize_autocorTasks(eparams.block_size, channelCount, eparams.max_prediction_order, maxFrames, task);
-			if (doMidside)
+			if (verify != null)
-			{				
+			{
-				int* s = ((int*)samplesBufferPtr);
+				int* r = (int*)task.samplesBufferPtr;
-				channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE,
+				fixed (int* s = task.verifyBuffer)
-					s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size * nFrames);
+					for (int ch = 0; ch < channels; ch++)
 						AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size * nFrames);
 			}
-			
+
-			estimate_residual(eparams.block_size, channelCount, eparams.max_prediction_order, nFrames);
+			estimate_residual(eparams.block_size, channelCount, eparams.max_prediction_order, nFrames, task);
 		}
 		unsafe int process_result(int nFrames, FlaCudaTask task)
 		{
 			bool doMidside = channels == 2 && eparams.do_midside;
 			int channelCount = doMidside ? 2 * channels : channels;
 			int bs = 0;
 			for (int iFrame = 0; iFrame < nFrames; iFrame++)
@@ -1271,7 +1223,7 @@ namespace CUETools.Codecs.FlaCuda
 				//if (0 != eparams.variable_block_size && 0 == (eparams.block_size & 7) && eparams.block_size >= 128)
 				//    fs = encode_frame_vbs();
 				//else
-				int fs = encode_frame(doMidside, channelCount, iFrame);
+				int fs = encode_frame(doMidside, channelCount, iFrame, task);
 				bs += eparams.block_size;
 				if (seek_table != null && _IO.CanSeek)
@@ -1300,7 +1252,7 @@ namespace CUETools.Codecs.FlaCuda
 					int decoded = verify.DecodeFrame(frame_buffer, 0, fs);
 					if (decoded != fs || verify.Remaining != (ulong)eparams.block_size)
 						throw new Exception("validation failed!");
-					fixed (int* s = verifyBuffer, r = verify.Samples)
+					fixed (int* s = task.verifyBuffer, r = verify.Samples)
 					{
 						for (int ch = 0; ch < channels; ch++)
 							if (AudioSamples.MemCmp(s + iFrame * eparams.block_size + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, eparams.block_size))
@@ -1308,56 +1260,33 @@ namespace CUETools.Codecs.FlaCuda
 					}
 				}
 			}
 			if (bs < samplesInBuffer)
 			{
 				int* s = (int*)samplesBufferPtr;
 				for (int ch = 0; ch < channels; ch++)
 					AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, s + bs + ch * FlaCudaWriter.MAX_BLOCKSIZE, samplesInBuffer - bs);
 			}
 			samplesInBuffer -= bs;
 			return bs;
 		}
 		unsafe void output_frames(int nFrames)
 		{
 			send_to_GPU(nFrames, task1);
 			run_GPU_task(nFrames, task1);
 			cuda.SynchronizeStream(task1.stream);
 			process_result(nFrames, task1);
 		}
 		public unsafe void Write(int[,] buff, int pos, int sampleCount)
 		{
 			bool doMidside = channels == 2 && eparams.do_midside;
 			int channelCount = doMidside ? 2 * channels : channels;
 			if (!inited)
 			{
 				cuda = new CUDA(true, InitializationFlags.None);
-				cuda.CreateContext(0, CUCtxFlags.BlockingSync);
+				cuda.CreateContext(0, CUCtxFlags.SchedAuto);
 				using (Stream cubin = GetType().Assembly.GetManifestResourceStream(GetType(), "flacuda.cubin"))
 				using (StreamReader sr = new StreamReader(cubin))
 					cuda.LoadModule(new ASCIIEncoding().GetBytes(sr.ReadToEnd()));
 				//cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
-				cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
+				task1 = new FlaCudaTask(cuda, channelCount);
-				cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
+				task2 = new FlaCudaTask(cuda, channelCount);
 				cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
 				cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
 				cudaSumResidualChunks = cuda.GetModuleFunction("cudaSumResidualChunks");
 				cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
 				cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
 				cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
 				cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * maxFrames));
 				cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * maxAutocorParts));
 				cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * (lpc.MAX_LPC_WINDOWS + 1) * maxFrames));
 				cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * maxResidualParts));
 				//cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4) * maxResidualParts));
 				CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
 				if (cuErr == CUResult.Success)
 					cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * maxFrames));
 				if (cuErr == CUResult.Success)
 					cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * maxFrames));
 				if (cuErr != CUResult.Success)
 				{
 					if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
 					if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
 					if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
 					throw new CUDAException(cuErr);
 				}				
 				cudaStream = cuda.CreateStream();
 				cudaStream1 = cuda.CreateStream();
 				if (_IO == null)
 					_IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
 				int header_size = flake_encode_init();
@@ -1366,13 +1295,13 @@ namespace CUETools.Codecs.FlaCuda
 					first_frame_offset = _IO.Position;
 				inited = true;
 			}
-
+			int have_data = 0;
 			int len = sampleCount;
 			while (len > 0)
 			{
-				int block = Math.Min(len, FlaCudaWriter.MAX_BLOCKSIZE - samplesInBuffer);
+				int block = Math.Min(len, Math.Min(FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size * maxFrames) - samplesInBuffer);
-				copy_samples(buff, pos, block);
+				copy_samples(buff, pos, block, task1);
 				if (md5 != null)
 				{
@@ -1383,14 +1312,43 @@ namespace CUETools.Codecs.FlaCuda
 				len -= block;
 				pos += block;
-				while (samplesInBuffer >= eparams.block_size)
+				int nFrames = samplesInBuffer / eparams.block_size;
-					output_frames();
+				if (nFrames > 0)
 				{
 #if DEBUG
 					if (nFrames > maxFrames)
 						throw new Exception("oops");
 #endif
 					send_to_GPU(nFrames, task1);
 					cuda.SynchronizeStream(task2.stream);
 					run_GPU_task(nFrames, task1);
 					if (have_data > 0)
 						process_result(have_data, task2);
 					int bs = eparams.block_size * nFrames;
 					if (bs < samplesInBuffer)
 					{
 						int* s1 = (int*)task1.samplesBufferPtr;
 						int* s2 = (int*)task2.samplesBufferPtr;
 						for (int ch = 0; ch < channelCount; ch++)
 							AudioSamples.MemCpy(s2 + ch * FlaCudaWriter.MAX_BLOCKSIZE, s1 + bs + ch * FlaCudaWriter.MAX_BLOCKSIZE, samplesInBuffer - bs);
 					}
 					samplesInBuffer -= bs;
 					have_data = nFrames;
 					FlaCudaTask tmp = task1;
 					task1 = task2;
 					task2 = tmp;
 				}
 			}
 			if (have_data > 0)
 			{
 				cuda.SynchronizeStream(task2.stream);
 				process_result(have_data, task2);
 			}
 		}
 		public string Path { get { return _path; } }
-		string vendor_string = "FlaCuda#0.1";
+		string vendor_string = "FlaCuda#0.4";
 		int select_blocksize(int samplerate, int time_ms)
 		{
@@ -1620,12 +1578,10 @@ namespace CUETools.Codecs.FlaCuda
 				md5 = new MD5CryptoServiceProvider();
 			if (eparams.do_verify)
 			{
 				verify = new FlakeReader(channels, bits_per_sample);
 				verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channels];
 			}
-			frame_buffer = new byte[max_frame_size];
+			frame_buffer = new byte[max_frame_size + 1];
 			frame_writer = new BitWriter(frame_buffer, 0, max_frame_size + 1);
 			return header_len;
 		}
@@ -1740,9 +1696,8 @@ namespace CUETools.Codecs.FlaCuda
 			{
 				case 0:
 					do_midside = false;
 					window_function = WindowFunction.Bartlett;
 					max_partition_order = 4;
-					max_prediction_order = 6;
+					max_prediction_order = 4;
 					break;
 				case 1:
 					do_midside = false;
@@ -1753,24 +1708,24 @@ namespace CUETools.Codecs.FlaCuda
 				case 2:
 					window_function = WindowFunction.Bartlett;
 					max_partition_order = 4;
-					max_prediction_order = 4;
+					max_prediction_order = 5;
 					break;
 				case 3:
 					window_function = WindowFunction.Bartlett;
 					max_partition_order = 4;
-					max_prediction_order = 5;
+					max_prediction_order = 7;
 					break;
 				case 4:
 					window_function = WindowFunction.Bartlett;
 					max_partition_order = 4;
-					max_prediction_order = 7;
+					max_prediction_order = 8;
 					break;
 				case 5:
 					window_function = WindowFunction.Bartlett;
 					max_prediction_order = 8;
 					break;
-				case 6:					
+				case 6:
-					max_prediction_order = 8;
+					window_function = WindowFunction.Bartlett;
 					max_prediction_order = 12;
 					break;
 				case 7:					
 					max_prediction_order = 10;
@@ -1810,4 +1765,81 @@ namespace CUETools.Codecs.FlaCuda
 		public fixed int reserved[11];
 		public fixed int coefs[32];
 	};
 	internal class FlaCudaTask
 	{
 		CUDA cuda;
 		public CUfunction cudaComputeAutocor;
 		public CUfunction cudaComputeLPC;
 		public CUfunction cudaEstimateResidual;
 		//public CUfunction cudaSumResidualChunks;
 		public CUfunction cudaSumResidual;
 		//public CUfunction cudaEncodeResidual;
 		public CUdeviceptr cudaSamples;
 		public CUdeviceptr cudaAutocorTasks;
 		public CUdeviceptr cudaAutocorOutput;
 		public CUdeviceptr cudaResidualTasks;
 		public CUdeviceptr cudaResidualOutput;
 		public IntPtr samplesBufferPtr = IntPtr.Zero;
 		public IntPtr autocorTasksPtr = IntPtr.Zero;
 		public IntPtr residualTasksPtr = IntPtr.Zero;
 		public CUstream stream;
 		public int[] verifyBuffer;
 		public int blocksize = 0;
 		public FlacFrame frame;
 		public int autocorTasksLen;
 		public int residualTasksLen;
 		public int samplesBufferLen;
 		unsafe public FlaCudaTask(CUDA _cuda, int channelCount)
 		{
 			cuda = _cuda;
 			autocorTasksLen = sizeof(computeAutocorTaskStruct) * channelCount * lpc.MAX_LPC_WINDOWS * FlaCudaWriter.maxFrames;
 			residualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * lpc.MAX_LPC_ORDER * (lpc.MAX_LPC_WINDOWS + 1) * FlaCudaWriter.maxFrames;
 			samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
 			cudaSamples = cuda.Allocate((uint)samplesBufferLen);
 			cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen);
 			cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FlaCudaWriter.maxAutocorParts));
 			cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
 			cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * FlaCudaWriter.maxResidualParts));
 			CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen);
 			if (cuErr == CUResult.Success)
 				cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)autocorTasksLen);
 			if (cuErr == CUResult.Success)
 				cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)residualTasksLen);
 			if (cuErr != CUResult.Success)
 			{
 				if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
 				if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
 				if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
 				throw new CUDAException(cuErr);
 			}
 			cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
 			cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
 			cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
 			cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
 			//cudaSumResidualChunks = cuda.GetModuleFunction("cudaSumResidualChunks");
 			//cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
 			stream = cuda.CreateStream();
 			verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; // should be channels, not channelCount. And should null if not doing verify!
 			frame = new FlacFrame(channelCount);
 		}
 		public void Dispose()
 		{
 			cuda.Free(cudaSamples);
 			cuda.Free(cudaAutocorTasks);
 			cuda.Free(cudaAutocorOutput);
 			cuda.Free(cudaResidualTasks);
 			cuda.Free(cudaResidualOutput);
 			CUDADriver.cuMemFreeHost(samplesBufferPtr);
 			CUDADriver.cuMemFreeHost(residualTasksPtr);
 			CUDADriver.cuMemFreeHost(autocorTasksPtr);
 			cuda.DestroyStream(stream);
 		}
 	}
 }
--- a/CUETools.FlaCuda/flacuda.cu
+++ b/CUETools.FlaCuda/flacuda.cu
@@ -81,15 +81,12 @@ extern "C" __global__ void cudaComputeAutocor(
 	//if (tid < 256) shared.product[tid] += shared.product[tid + 256]; __syncthreads();
 	if (tid < 128) shared.product[tid] += shared.product[tid + 128]; __syncthreads();
 	if (tid < 64) shared.product[tid] += shared.product[tid + 64]; __syncthreads();
-	if (tid < 32) 
+	if (tid < 32) shared.product[tid] += shared.product[tid + 32]; __syncthreads();
-	{
+	shared.product[tid] += shared.product[tid + 16];
-	    shared.product[tid] += shared.product[tid + 32];
+	shared.product[tid] += shared.product[tid + 8];
-	    shared.product[tid] += shared.product[tid + 16];
+	shared.product[tid] += shared.product[tid + 4];
-	    shared.product[tid] += shared.product[tid + 8];
+	shared.product[tid] += shared.product[tid + 2];
-	    shared.product[tid] += shared.product[tid + 4];
+	if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1]; 
 	    shared.product[tid] += shared.product[tid + 2];
 	    if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1]; 
 	}
 	__syncthreads();
    }
    // return results
@@ -167,7 +164,7 @@ extern "C" __global__ void cudaComputeLPC(
 	    shared.ldr[tid] += (tid < order) * __fmul_rz(reff, shared.ldr[order - 1 - tid]) + (tid  == order) * reff;
 	    // Quantization
-	    int precision = 13;
+	    int precision = 13 - (order > 8);
 	    int taskNo = shared.task.residualOffs + order;
 	    shared.bits[tid] = __mul24((33 - __clz(__float2int_rn(fabs(shared.ldr[tid]) * (1 << 15))) - precision), tid <= order);
 	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
--- a/CUETools.FlaCuda/flacuda.cubin
+++ b/CUETools.FlaCuda/flacuda.cubin
@@ -5,7 +5,7 @@ code {
 	name = cudaComputeAutocor
 	lmem = 0
 	smem = 3264
-	reg  = 9
+	reg  = 10
 	bar  = 1
 	const {
 			segname = const
@@ -47,17 +47,18 @@ code {
 		0xc0000a01 0x00000780 0x10039003 0x00000780 
 		0x1000f801 0x0403c780 0x00020c05 0xc0000782 
 		0x04001601 0xe4200780 0x861ffe03 0x00000000 
-		0x307cd1fd 0x6c2047c8 0x10081003 0x00000280 
+		0x307cd1fd 0x6c2047c8 0x10085003 0x00000280 
 		0x300209fd 0x6c00c7e8 0x30040dfd 0x6c0187f8 
 		0x308105fd 0x6c40c7c8 0x00000019 0x20000780 
 		0x2101f011 0x00000003 0x1000f815 0x0403c780 
 		0x308205fd 0x6c40c7c8 0x0000001d 0x20000780 
-		0x308305fd 0x6c40c7d8 0x20000a21 0x04008780 
+		0x308305fd 0x6c40c7c8 0x00000021 0x20000780 
-		0x20009001 0x00000013 0x00020009 0xc0000780 
+		0x307c05fd 0x6c0087d8 0x20000a25 0x04008780 
-		0x1800d601 0x0423c780 0x0002100d 0xc0000780 
+		0x20009201 0x00000013 0x00020009 0xc0000780 
-		0xc400d621 0x00200780 0x00000609 0xc0000780 
+		0x1800d601 0x0423c780 0x0002120d 0xc0000780 
-		0x1c00d601 0x0423c780 0x1000f821 0x0403f280 
+		0xc400d625 0x00200780 0x00000609 0xc0000780 
-		0xe800d601 0x00220780 0x10001001 0x0403e280 
+		0x1c00d601 0x0423c780 0x1000f825 0x0403f280 
 		0xe800d601 0x00224780 0x10001201 0x0403e280 
 		0x08041601 0xe4200780 0x861ffe03 0x00000000 
 		0x00000c01 0xa00007c0 0x00000609 0xc0000680 
 		0xd8145811 0x20000680 0xd810580d 0x20000680 
@@ -67,21 +68,22 @@ code {
 		0xd8125811 0x20000680 0xd810580d 0x20000680 
 		0x1000c001 0x0423c684 0xbc00c001 0x00200680 
 		0x08041601 0xe4200680 0x861ffe03 0x00000000 
-		0xa007c003 0x00000000 0x1007c003 0x00001100 
+		0x00001001 0xa00007c0 0x00000609 0xc0000680 
-		0x00000609 0xc0000780 0xd8115811 0x20000780 
+		0xd8115811 0x20000680 0xd810580d 0x20000680 
-		0xd810580d 0x20000780 0x1000c001 0x0423c784 
+		0x1000c001 0x0423c684 0xbc00c001 0x00200680 
-		0xbc00c001 0x00200780 0x08041601 0xe4200780 
+		0x08041601 0xe4200680 0x861ffe03 0x00000000 
 		0x00000609 0xc0000780 0xd810580d 0x20000780 
 		0x1c00e001 0x0423c780 0xbc00c001 0x00200780 
 		0x08041601 0xe4200780 0x1d00f000 0xbd006000 
 		0x08041601 0xe4200780 0x1d00e800 0xbd006000 
 		0x08041601 0xe4200780 0x1d00e400 0xbd006000 
-		0x08041601 0xe4200780 0x307c05fd 0x6c0147c8 
+		0x08041601 0xe4200780 0xa0080003 0x00000000 
-		0x1007c003 0x00000280 0xd010580d 0x20000780 
+		0x10080003 0x00001100 0xd010580d 0x20000780 
 		0x1c00c201 0x0423c780 0x00020a09 0xc0000780 
 		0xbc00c001 0x00200780 0x08061601 0xe4200780 
 		0xf0000001 0xe0000002 0x861ffe03 0x00000000 
 		0x20018a15 0x00000003 0x30040bfd 0x6c0147c8 
-		0x10047003 0x00000280 0x3002d1fd 0x6c2047c8 
+		0x10049003 0x00000280 0x3002d1fd 0x6c2047c8 
 		0x30000003 0x00000280 0x10004e01 0x0023c780 
 		0x60004805 0x00204780 0x2101f001 0x00000003 
 		0x40030011 0x00000780 0x60020211 0x00010780 
@@ -285,12 +287,12 @@ code {
 			segname = const
 			segnum  = 1
 			offset  = 0
-			bytes   = 52
+			bytes   = 56
 		mem {
 			0x00000003 0x0000001f 0x0000003f 0x00000040 
-			0x00000001 0x00000020 0x7e800000 0x0000000f 
+			0x00000001 0x00000020 0x7e800000 0x00000008 
-			0x00001fff 0xffffe000 0x3e800000 0x0000009e 
+			0x0000000c 0x0000000f 0xfffff000 0x00000fff 
-			0x00000008 
+			0x3e800000 0x0000009e 
 		}
 	}
 	bincode {
@@ -345,7 +347,7 @@ code {
 		0x213fee11 0x0fffffff 0x1000f815 0x0403c780 
 		0xd0047005 0x20000780 0xb08601fd 0x605107d8 
 		0x10000005 0x0403c780 0xa400c019 0xe4204780 
-		0xc08a0c19 0x00401680 0xc08a0205 0x00401680 
+		0xc08c0c19 0x00401680 0xc08c0205 0x00401680 
 		0x90000204 0xc0010c04 0xd0047005 0x20000780 
 		0xc401c019 0x0020c780 0xb0060000 0x20458818 
 		0x300605fd 0x6c0187d8 0xa0077003 0x00000000 
@@ -362,9 +364,11 @@ code {
 		0xa800da05 0xc4304780 0xc0000205 0x04700003 
 		0xa0000205 0x8c0047d0 0x2000d619 0x04214780 
 		0xa0000205 0x44065680 0x30170205 0xec101680 
-		0x31000205 0x0442d680 0x10000a05 0x2440d100 
+		0x31000205 0x04435680 0x10000a05 0x2440d100 
-		0x30020a1d 0x6c0187d0 0x30148205 0x00000003 
+		0x30870bfd 0x6c4107d8 0x100d801d 0x00000003 
-		0xd0840e1d 0x04400780 0x40070205 0x00018780 
+		0x1000101d 0x2440d280 0x20000e05 0x04004780 
 		0x30020a1d 0x6c0187e0 0xd0840e1d 0x04400780 
 		0x30218205 0x00000003 0x40070205 0x00018780 
 		0x00000609 0xc0000780 0x08005a01 0xe4204780 
 		0xd801680d 0x20000780 0x1c00e005 0x0423c780 
 		0x3c01c005 0x8c200780 0x08005a01 0xe4204780 
@@ -375,23 +379,25 @@ code {
 		0x08005a01 0xe4204780 0x1c00c205 0x0423c780 
 		0x3c01c005 0x8c200780 0x08005a01 0xe4204780 
 		0xd0016809 0x20000780 0x390fe005 0x00000003 
-		0x30870205 0xac400780 0x1001801d 0x00000003 
+		0x30890205 0xac400780 0x1001801d 0x00000003 
 		0x307c0205 0x8c000780 0x30010e1d 0xc4000780 
-		0xa0000e1d 0x44014780 0xc407da1d 0x00200780 
+		0xa0000e21 0x44014780 0x103f801d 0x000001ff 
-		0xa0000e1d 0xac004780 0x30880e1d 0xac400780 
+		0xc408da21 0x00200780 0x1000161d 0x2440d280 
-		0xa00b3003 0x00000000 0x30890e1d 0x8c400780 
+		0xa0001021 0xac004780 0x30080e21 0xac000780 
-		0x100b3003 0x00001100 0x30070c21 0xc4100780 
+		0x1000801d 0x0ffffe03 0x1000141d 0x2440d280 
 		0x30080e1d 0x8c000780 0xa00bb003 0x00000000 
 		0x100bb003 0x00002100 0x30070c21 0xc4100780 
 		0x30060c25 0xc4100780 0x20099020 0x2108e820 
 		0x20000621 0x04020780 0x20009021 0x00000007 
 		0xd00e101d 0xa0c00780 0xf0000001 0xe0000002 
 		0x30070c21 0xc4100680 0x30060c25 0xc4100680 
 		0x20001021 0x04024680 0x2000c821 0x04220680 
-		0x21001021 0x04430680 0xd00e1005 0xa0c00680 
+		0x21001021 0x0441c680 0xd00e1005 0xa0c00680 
 		0x307c0ffd 0x6c0087d8 0xa0000e05 0x44065500 
-		0x30170205 0xec101500 0x31000205 0x0442d500 
+		0x30170205 0xec101500 0x31000205 0x04435500 
 		0x10000a05 0x2440d280 0xd007001d 0x0402c780 
 		0x307c0ffd 0x6c0087d8 0xa0000e1d 0x44065500 
-		0x30170e1d 0xec101500 0x31000e1d 0x0442d500 
+		0x30170e1d 0xec101500 0x31000e1d 0x04435500 
 		0x10000a1d 0x2440d280 0x30070205 0x8c000780 
 		0x00000605 0xc0000780 0x30218205 0x00000003 
 		0x04005a01 0xe4204780 0xd4016809 0x20000780 
@@ -402,8 +408,8 @@ code {
 		0x04005a01 0xe4204780 0x1800c405 0x0423c780 
 		0x3801c005 0x8c200780 0x04005a01 0xe4204780 
 		0x1800c205 0x0423c780 0x3801c005 0x8c200780 
-		0x04005a01 0xe4204780 0xa00e2003 0x00000000 
+		0x04005a01 0xe4204780 0xa00ea003 0x00000000 
-		0x100e2003 0x00000100 0x30070c05 0xc4100780 
+		0x100ea003 0x00000100 0x30070c05 0xc4100780 
 		0x30060c19 0xc4100780 0x20000205 0x04018780 
 		0xd0016805 0x20000780 0x2101e818 0x1500e004 
 		0x200c8c19 0x00000003 0xd00e0c05 0xa0c00780