diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs
index 8e11785..664b7de 100644
--- a/CUETools.FlaCuda/FlaCudaWriter.cs
+++ b/CUETools.FlaCuda/FlaCudaWriter.cs
@@ -1,9 +1,28 @@
+/**
+ * CUETools.FlaCuda: FLAC audio encoder using CUDA
+ * Copyright (c) 2009 Gregory S. Chudov
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Security.Cryptography;
 using System.Text;
-using System.Runtime.InteropServices;
+//using System.Runtime.InteropServices;
 using CUETools.Codecs;
 using CUETools.Codecs.FLAKE;
 using GASS.CUDA;
@@ -78,17 +97,20 @@ namespace CUETools.Codecs.FlaCuda
         CUfunction cudaEncodeResidual;
         CUdeviceptr cudaSamples;
         CUdeviceptr cudaWindow;
-        CUdeviceptr cudaAutocor;
+        CUdeviceptr cudaAutocorTasks;
+        CUdeviceptr cudaAutocorOutput;
         CUdeviceptr cudaResidualTasks;
         CUdeviceptr cudaResidualOutput;
         IntPtr samplesBufferPtr = IntPtr.Zero;
-        IntPtr autocorBufferPtr = IntPtr.Zero;
-        IntPtr residualOutputPtr = IntPtr.Zero;
+        IntPtr autocorTasksPtr = IntPtr.Zero;
+        IntPtr autocorOutputPtr = IntPtr.Zero;
         IntPtr residualTasksPtr = IntPtr.Zero;
+        IntPtr residualOutputPtr = IntPtr.Zero;
         CUstream cudaStream;
 
         const int MAX_BLOCKSIZE = 8192;
-        const int maxResidualTasks = MAX_BLOCKSIZE / (256 - 32);
+        const int maxResidualParts = MAX_BLOCKSIZE / (256 - 32);
+        const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32);
 
         public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
         {
@@ -187,13 +209,15 @@ namespace CUETools.Codecs.FlaCuda
                 cuda.Free(cudaWindow);
                 cuda.Free(cudaSamples);
-                cuda.Free(cudaAutocor);
+                cuda.Free(cudaAutocorTasks);
+                cuda.Free(cudaAutocorOutput);
                 cuda.Free(cudaResidualTasks);
                 cuda.Free(cudaResidualOutput);
-                CUDADriver.cuMemFreeHost(autocorBufferPtr);
+                CUDADriver.cuMemFreeHost(autocorOutputPtr);
                 CUDADriver.cuMemFreeHost(residualOutputPtr);
                 CUDADriver.cuMemFreeHost(samplesBufferPtr);
                 CUDADriver.cuMemFreeHost(residualTasksPtr);
+                CUDADriver.cuMemFreeHost(autocorTasksPtr);
                 cuda.DestroyStream(cudaStream);
                 cuda.Dispose();
                 inited = false;
@@ -218,13 +242,15 @@ namespace CUETools.Codecs.FlaCuda
             _IO.Close();
             cuda.Free(cudaWindow);
             cuda.Free(cudaSamples);
-            cuda.Free(cudaAutocor);
+            cuda.Free(cudaAutocorTasks);
+            cuda.Free(cudaAutocorOutput);
             cuda.Free(cudaResidualTasks);
             cuda.Free(cudaResidualOutput);
-            CUDADriver.cuMemFreeHost(autocorBufferPtr);
+            CUDADriver.cuMemFreeHost(autocorOutputPtr);
             CUDADriver.cuMemFreeHost(residualOutputPtr);
             CUDADriver.cuMemFreeHost(samplesBufferPtr);
             CUDADriver.cuMemFreeHost(residualTasksPtr);
+            CUDADriver.cuMemFreeHost(autocorTasksPtr);
             cuda.DestroyStream(cudaStream);
             cuda.Dispose();
             inited = false;
@@ -253,18 +279,6 @@ namespace CUETools.Codecs.FlaCuda
             get { return _blocksize == 0 ? eparams.block_size : _blocksize; }
         }
 
-        public OrderMethod OrderMethod
-        {
-            get { return eparams.order_method; }
-            set { eparams.order_method = value; }
-        }
-
-        public PredictionType PredictionType
-        {
-            get { return eparams.prediction_type; }
-            set { eparams.prediction_type = value; }
-        }
-
         public StereoMethod StereoMethod
         {
             get { return eparams.stereo_method; }
@@ -323,38 +337,6 @@ namespace CUETools.Codecs.FlaCuda
             set { eparams.variable_block_size = value; }
         }
 
-        public int MinPredictionOrder
-        {
-            get
-            {
-                return PredictionType == PredictionType.Fixed ?
-                    MinFixedOrder : MinLPCOrder;
-            }
-            set
-            {
-                if (PredictionType == PredictionType.Fixed)
-                    MinFixedOrder = value;
-                else
-                    MinLPCOrder = value;
-            }
-        }
-
-        public int MaxPredictionOrder
-        {
-            get
-            {
-                return PredictionType == PredictionType.Fixed ?
-                    MaxFixedOrder : MaxLPCOrder;
-            }
-            set
-            {
-                if (PredictionType == PredictionType.Fixed)
-                    MaxFixedOrder = value;
-                else
-                    MaxLPCOrder = value;
-            }
-        }
-
         public int MinLPCOrder
         {
             get
@@ -383,48 +365,6 @@ namespace CUETools.Codecs.FlaCuda
             }
         }
 
-        public int EstimationDepth
-        {
-            get
-            {
-                return eparams.estimation_depth;
-            }
-            set
-            {
-                if (value > 32 || value < 1)
-                    throw new Exception("invalid estimation_depth " + value.ToString());
-                eparams.estimation_depth = value;
-            }
-        }
-
-        public int MinFixedOrder
-        {
-            get
-            {
-                return eparams.min_fixed_order;
-            }
-            set
-            {
-                if (value < 0 || value > eparams.max_fixed_order)
-                    throw new Exception("invalid MinFixedOrder " + value.ToString());
-                eparams.min_fixed_order = value;
-            }
-        }
-
-        public int MaxFixedOrder
-        {
-            get
-            {
-                return eparams.max_fixed_order;
-            }
-            set
-            {
-                if (value > 4 || value < eparams.min_fixed_order)
-                    throw new Exception("invalid MaxFixedOrder " + value.ToString());
-                eparams.max_fixed_order = value;
-            }
-        }
-
         public int MinPartitionOrder
         {
             get { return eparams.min_partition_order; }
@@ -549,18 +489,6 @@ namespace CUETools.Codecs.FlaCuda
             return k_opt;
         }
 
-        unsafe uint calc_decorr_score(FlacFrame frame, int ch)
-        {
-            int* s = frame.subframes[ch].samples;
-            int n = frame.blocksize;
-            ulong sum = 0;
-            for (int i = 2; i < n; i++)
-                sum += (ulong)Math.Abs(s[i] - 2 * s[i - 1] + s[i - 2]);
-            uint nbits;
-            find_optimal_rice_param((uint)(2 * sum), (uint)n, out nbits);
-            return nbits;
-        }
-
         unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize)
         {
             for (int i = 0; i < blocksize; i++)
@@ -754,205 +682,6 @@ namespace CUETools.Codecs.FlaCuda
             return lpc_precision;
         }
 
-        unsafe void encode_residual_lpc_sub(FlacFrame frame, float * lpcs, int iWindow, int order, int ch)
-        {
-            // select LPC precision based on block size
-            uint lpc_precision = get_precision(frame.blocksize);
-
-            for (int i_precision = eparams.lpc_min_precision_search; i_precision <= eparams.lpc_max_precision_search && lpc_precision + i_precision < 16; i_precision++)
-                // check if we already calculated with this order, window and precision
-                if ((frame.subframes[ch].lpc_ctx[iWindow].done_lpcs[i_precision] & (1U << (order - 1))) == 0)
-                {
-                    frame.subframes[ch].lpc_ctx[iWindow].done_lpcs[i_precision] |= (1U << (order - 1));
-
-                    uint cbits = lpc_precision + (uint)i_precision;
-
-                    frame.current.type = SubframeType.LPC;
-                    frame.current.order = order;
-                    frame.current.window = iWindow;
-
-                    fixed (int* coefs = frame.current.coefs)
-                    {
-                        lpc.quantize_lpc_coefs(lpcs + (frame.current.order - 1) * lpc.MAX_LPC_ORDER,
-                            frame.current.order, cbits, coefs, out frame.current.shift, 15, 0);
-
-                        if (frame.current.shift < 0 || frame.current.shift > 15)
-                            throw new Exception("negative shift");
-
-                        ulong csum = 0;
-                        for (int i = frame.current.order; i > 0; i--)
-                            csum += (ulong)Math.Abs(coefs[i - 1]);
-
-                        if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32)
-                            lpc.encode_residual_long(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift);
-                        else
-                            lpc.encode_residual(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift);
-                    }
-
-                    frame.current.size = calc_rice_params_lpc(ref frame.current.rc, eparams.min_partition_order, eparams.max_partition_order,
-                        frame.current.residual, frame.blocksize, frame.current.order, frame.subframes[ch].obits, cbits);
-
-                    frame.ChooseBestSubframe(ch);
-                }
-        }
-
-        unsafe void encode_residual_fixed_sub(FlacFrame frame, int order, int ch)
-        {
-            if ((frame.subframes[ch].done_fixed & (1U << order)) != 0)
-                return; // already calculated;
-
-            frame.current.order = order;
-            frame.current.type = SubframeType.Fixed;
-
-            encode_residual_fixed(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order);
-
-            frame.current.size = calc_rice_params_fixed(ref frame.current.rc, eparams.min_partition_order, eparams.max_partition_order,
-                frame.current.residual, frame.blocksize, frame.current.order, frame.subframes[ch].obits);
-
-            frame.subframes[ch].done_fixed |= (1U << order);
-
-            frame.ChooseBestSubframe(ch);
-        }
-
-        unsafe void encode_residual(FlacFrame frame, int ch, PredictionType predict, OrderMethod omethod, int pass)
-        {
-            int* smp = frame.subframes[ch].samples;
-            int i, n = frame.blocksize;
-            // save best.window, because we can overwrite it later with fixed frame
-            int best_window = frame.subframes[ch].best.type == SubframeType.LPC ? frame.subframes[ch].best.window : -1;
-
-            // CONSTANT
-            for (i = 1; i < n; i++)
-            {
-                if (smp[i] != smp[0]) break;
-            }
-            if (i == n)
-            {
-                frame.subframes[ch].best.type = SubframeType.Constant;
-                frame.subframes[ch].best.residual[0] = smp[0];
-                frame.subframes[ch].best.size = frame.subframes[ch].obits;
-                return;
-            }
-
-            // VERBATIM
-            frame.current.type = SubframeType.Verbatim;
-            frame.current.size = frame.subframes[ch].obits * (uint)frame.blocksize;
-            frame.ChooseBestSubframe(ch);
-
-            if (n < 5 || predict == PredictionType.None)
-                return;
-
-            // FIXED
-            if (predict == PredictionType.Fixed ||
-                (predict == PredictionType.Search && pass != 1) ||
-                //predict == PredictionType.Search ||
-                //(pass == 2 && frame.subframes[ch].best.type == SubframeType.Fixed) ||
-                n <= eparams.max_prediction_order)
-            {
-                int max_fixed_order = Math.Min(eparams.max_fixed_order, 4);
-                int min_fixed_order = Math.Min(eparams.min_fixed_order, max_fixed_order);
-
-                for (i = min_fixed_order; i <= max_fixed_order; i++)
-                    encode_residual_fixed_sub(frame, i, ch);
-            }
-
-            // LPC
-            if (n > eparams.max_prediction_order &&
-                (predict == PredictionType.Levinson ||
-                predict == PredictionType.Search)
-                //predict == PredictionType.Search ||
-                //(pass == 2 && frame.subframes[ch].best.type == SubframeType.LPC))
-                )
-            {
-                float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER];
-                int min_order = eparams.min_prediction_order;
-                int max_order = eparams.max_prediction_order;
-
-                for (int iWindow = 0; iWindow < _windowcount; iWindow++)
-                {
-                    if (pass == 2 && iWindow != best_window)
-                        continue;
-
-                    LpcContext lpc_ctx = frame.subframes[ch].lpc_ctx[iWindow];
-
-                    lpc_ctx.GetReflection(max_order, smp, n, frame.window_buffer + iWindow * FlaCudaWriter.MAX_BLOCKSIZE * 2);
-                    lpc_ctx.ComputeLPC(lpcs);
-
-                    switch (omethod)
-                    {
-                        case OrderMethod.Max:
-                            // always use maximum order
-                            encode_residual_lpc_sub(frame, lpcs, iWindow, max_order, ch);
-                            break;
-                        case OrderMethod.Estimate:
-                            // estimated orders
-                            // Search at reflection coeff thresholds (where they cross 0.10)
-                            {
-                                int found = 0;
-                                for (i = max_order; i >= min_order && found < eparams.estimation_depth; i--)
-                                    if (lpc_ctx.IsInterestingOrder(i, max_order))
-                                    {
-                                        encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch);
-                                        found++;
-                                    }
-                                if (0 == found)
-                                    encode_residual_lpc_sub(frame, lpcs, iWindow, min_order, ch);
-                            }
-                            break;
-                        case OrderMethod.EstSearch2:
-                            // Search at reflection coeff thresholds (where they cross 0.10)
-                            {
-                                int found = 0;
-                                for (i = min_order; i <= max_order && found < eparams.estimation_depth; i++)
-                                    if (lpc_ctx.IsInterestingOrder(i))
-                                    {
-                                        encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch);
-                                        found++;
-                                    }
-                                if (0 == found)
-                                    encode_residual_lpc_sub(frame, lpcs, iWindow, min_order, ch);
-                            }
-                            break;
-                        case OrderMethod.Search:
-                            // brute-force optimal order search
-                            for (i = max_order; i >= min_order; i--)
-                                encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch);
-                            break;
-                        case OrderMethod.LogFast:
-                            // Try max, est, 32,16,8,4,2,1
-                            encode_residual_lpc_sub(frame, lpcs, iWindow, max_order, ch);
-                            for (i = lpc.MAX_LPC_ORDER; i >= min_order; i >>= 1)
-                                if (i < max_order)
-                                    encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch);
-                            break;
-                        case OrderMethod.LogSearch:
-                            // do LogFast first
-                            encode_residual_lpc_sub(frame, lpcs, iWindow, max_order, ch);
-                            for (i = lpc.MAX_LPC_ORDER; i >= min_order; i >>= 1)
-                                if (i < max_order)
-                                    encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch);
-                            // if found a good order, try to search around it
-                            if (frame.subframes[ch].best.type == SubframeType.LPC)
-                            {
-                                // log search (written by Michael Niedermayer for FFmpeg)
-                                for (int step = lpc.MAX_LPC_ORDER; step > 0; step >>= 1)
-                                {
-                                    int last = frame.subframes[ch].best.order;
-                                    if (step <= (last + 1) / 2)
-                                        for (i = last - step; i <= last + step; i += step)
-                                            if (i >= min_order && i <= max_order)
-                                                encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch);
-                                }
-                            }
-                            break;
-                        default:
-                            throw new Exception("unknown ordermethod");
-                    }
-                }
-            }
-        }
-
         unsafe void output_frame_header(FlacFrame frame, BitWriter bitwriter)
         {
             bitwriter.writebits(15, 0x7FFC);
@@ -1107,45 +836,6 @@ namespace CUETools.Codecs.FlaCuda
             bitwriter.flush();
         }
 
-        unsafe void encode_residual_pass1(FlacFrame frame, int ch)
-        {
-            int max_prediction_order = eparams.max_prediction_order;
-            int max_fixed_order = eparams.max_fixed_order;
-            int min_fixed_order = eparams.min_fixed_order;
-            int lpc_min_precision_search = eparams.lpc_min_precision_search;
-            int lpc_max_precision_search = eparams.lpc_max_precision_search;
-            int max_partition_order = eparams.max_partition_order;
-            int estimation_depth = eparams.estimation_depth;
-            eparams.min_fixed_order = 2;
-            eparams.max_fixed_order = 2;
-            eparams.lpc_min_precision_search = eparams.lpc_max_precision_search;
-            eparams.max_prediction_order = 8;
-            eparams.estimation_depth = 1;
-            encode_residual(frame, ch, eparams.prediction_type, OrderMethod.Estimate, 1);
-            eparams.min_fixed_order = min_fixed_order;
-            eparams.max_fixed_order = max_fixed_order;
-            eparams.max_prediction_order = max_prediction_order;
-            eparams.lpc_min_precision_search = lpc_min_precision_search;
-            eparams.lpc_max_precision_search = lpc_max_precision_search;
-            eparams.max_partition_order = max_partition_order;
-            eparams.estimation_depth = estimation_depth;
-        }
-
-        unsafe void encode_residual_pass2(FlacFrame frame, int ch)
-        {
-            encode_residual(frame, ch, eparams.prediction_type, eparams.order_method, 2);
-        }
-
-        unsafe void encode_residual_onepass(FlacFrame frame, int ch)
-        {
-            if (_windowcount > 1)
-            {
-                encode_residual_pass1(frame, ch);
-                encode_residual_pass2(frame, ch);
-            } else
-                encode_residual(frame, ch, eparams.prediction_type, eparams.order_method, 0);
-        }
-
         unsafe uint measure_frame_size(FlacFrame frame, bool do_midside)
         {
             // crude estimation of header/footer size
@@ -1263,8 +953,11 @@ namespace CUETools.Codecs.FlaCuda
                 }
             }
 
+            if (frame.blocksize <= 4)
+                return;
+
             // LPC
-            for (int ch = 0; ch < 4; ch++)
+            for (int ch = 0; ch < channelsCount; ch++)
             {
                 for (int iWindow = 0; iWindow < _windowcount; iWindow++)
                 {
@@ -1300,14 +993,14 @@ namespace CUETools.Codecs.FlaCuda
             }
 
             // FIXED
-            for (int ch = 0; ch < 4; ch++)
+            for (int ch = 0; ch < channelsCount; ch++)
             {
                 for (int order = 1; order <= 4 && order < frame.blocksize; order++)
                 {
                     int index = (order - 1) + 4 * ch;
                     int nbits = 0;
                     for (int p = 0; p < partCount; p++)
-                        nbits += ((int*)residualOutputPtr)[p + partCount * (index + max_order * _windowcount * 4)];
+                        nbits += ((int*)residualOutputPtr)[p + partCount * (index + max_order * _windowcount * channelsCount)];
                     nbits += order * (int)frame.subframes[ch].obits + 6;
                     if (frame.subframes[ch].best.size > nbits)
                     {
@@ -1319,14 +1012,20 @@ namespace CUETools.Codecs.FlaCuda
             }
         }
 
-        unsafe void estimate_residual(FlacFrame frame, int channelsCount, int max_order, int orders2, int blocks, out int partCount)
+        unsafe void estimate_residual(FlacFrame frame, int channelsCount, int max_order, int autocorPartCount, out int partCount)
         {
+            encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
             uint cbits = get_precision(frame.blocksize) + 1;
             int nResidualTasks = 0;
            int residualThreads = 256;
-            int partSize = residualThreads - 32;
-
+            int partSize = residualThreads - max_order;
+            partCount = (frame.blocksize + partSize - 1) / partSize;
+            if (partCount > maxResidualParts)
+                throw new Exception("internal error");
+
+            if (frame.blocksize <= 4)
+                return;
 
             // LPC
             for (int ch = 0; ch < channelsCount; ch++)
@@ -1336,15 +1035,14 @@ namespace CUETools.Codecs.FlaCuda
                 for (int order = 0; order <= max_order; order++)
                 {
                     ac[order] = 0;
-                    for (int i_block = 0; i_block < blocks; i_block++)
-                        ac[order] += ((float*)autocorBufferPtr)[orders2 * (i_block + blocks * (ch + channelsCount * iWindow)) + order];
+                    for (int i_block = 0; i_block < autocorPartCount; i_block++)
+                        ac[order] += ((float*)autocorOutputPtr)[order + (max_order + 1) * (i_block + autocorPartCount * (iWindow + _windowcount * ch))];
                 }
                 frame.subframes[ch].lpc_ctx[iWindow].ComputeReflection(max_order, ac);
                 float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER];
                 frame.subframes[ch].lpc_ctx[iWindow].ComputeLPC(lpcs);
                 for (int order = 1; order <= max_order; order++)
                 {
-                    encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
                     residualTasks[nResidualTasks].residualOrder = order - 1;
                     residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
@@ -1360,7 +1058,6 @@ namespace CUETools.Codecs.FlaCuda
             {
                 for (int order = 1; order <= 4; order++)
                 {
-                    encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
                     residualTasks[nResidualTasks].residualOrder = order - 1;
                     residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
                     residualTasks[nResidualTasks].shift = 0;
@@ -1404,29 +1101,48 @@ namespace CUETools.Codecs.FlaCuda
             cuda.SynchronizeStream(cudaStream);
         }
 
-        unsafe void compute_autocorellation(FlacFrame frame, int channelsCount, int orders, out int blocks)
+        unsafe void initialize_autocorTasks()
         {
-            // TODO!!!! replace 4 with channelsCount.
-            int threads = 512;
-            int threads_x = orders;
-            int threads_y = threads / threads_x;
-            int blocks_y = ((threads_x - 1) * (threads_y - 1)) / threads_y;
+            computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)autocorTasksPtr;
+            int nAutocorTasks = 0;
+            for (int ch = 0; ch < (channels == 2 ? 4 : channels); ch++)
+                for (int iWindow = 0; iWindow < _windowcount; iWindow++)
+                {
+                    autocorTasks[nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
+                    autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE;
+                    nAutocorTasks++;
+                }
+            cuda.CopyHostToDeviceAsync(cudaAutocorTasks, autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), cudaStream);
+            cuda.SynchronizeStream(cudaStream);
+        }
 
-            blocks = (frame.blocksize + blocks_y * threads_y - 1) / (blocks_y * threads_y);
+        unsafe void compute_autocorellation(FlacFrame frame, int channelsCount, int max_order, out int partCount)
+        {
+            int autocorThreads = 256;
+            int partSize = autocorThreads - max_order;
+            int nAutocorTasks = _windowcount * channelsCount;
 
-            cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocor.Pointer);
+            partCount = (frame.blocksize + partSize - 1) / partSize;
+            if (partCount > maxAutocorParts)
+                throw new Exception("internal error");
+
+            if (frame.blocksize <= 4)
+                return;
+
+            cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocorOutput.Pointer);
             cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaSamples.Pointer);
             cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaWindow.Pointer);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3, (uint)frame.blocksize);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint), (uint)FlaCudaWriter.MAX_BLOCKSIZE);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint) * 2, (uint)(blocks_y * threads_y));
-            cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 3) + sizeof(uint) * 3);
-            cuda.SetFunctionBlockShape(cudaComputeAutocor, threads_x, threads_y, 1);
+            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3, (uint)cudaAutocorTasks.Pointer);
+            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 4, (uint)max_order);
+            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 4 + sizeof(uint), (uint)frame.blocksize);
+            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 4 + sizeof(uint) * 2, (uint)partSize);
+            cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 4) + sizeof(uint) * 3);
+            cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1);
 
             // issue work to the GPU
-            cuda.CopyHostToDeviceAsync(cudaSamples, samplesBufferPtr, (uint)FlaCudaWriter.MAX_BLOCKSIZE * 4 * sizeof(int), cudaStream);
-            cuda.LaunchAsync(cudaComputeAutocor, blocks, 4 * _windowcount, cudaStream);
-            cuda.CopyDeviceToHostAsync(cudaAutocor, autocorBufferPtr, (uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * 4 * _windowcount * blocks), cudaStream);
+            cuda.CopyHostToDeviceAsync(cudaSamples, samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), cudaStream);
+            cuda.LaunchAsync(cudaComputeAutocor, partCount, nAutocorTasks, cudaStream);
+            cuda.CopyDeviceToHostAsync(cudaAutocorOutput, autocorOutputPtr, (uint)(sizeof(float) * partCount * (max_order + 1) * nAutocorTasks), cudaStream);
             cuda.SynchronizeStream(cudaStream);
         }
 
@@ -1450,40 +1166,33 @@ namespace CUETools.Codecs.FlaCuda
                 if (_windowcount == 0)
                     throw new Exception("invalid windowfunction");
                 cuda.CopyHostToDevice(cudaWindow, windowBuffer);
+                initialize_autocorTasks();
             }
 
-            if (channels != 2 || frame.blocksize <= 32 || eparams.stereo_method == StereoMethod.Independent)
-            {
-                frame.window_buffer = window;
-                frame.current.residual = r + channels * FlaCudaWriter.MAX_BLOCKSIZE;
-                frame.ch_mode = channels != 2 ? ChannelMode.NotStereo : ChannelMode.LeftRight;
-                for (int ch = 0; ch < channels; ch++)
-                    frame.subframes[ch].Init(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE,
-                        bits_per_sample, get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize));
-
-                for (int ch = 0; ch < channels; ch++)
-                    encode_residual_onepass(frame, ch);
-            }
-            else
-            {
+            bool doMidside = channels == 2 && eparams.stereo_method != StereoMethod.Independent;
+            int channelCount = doMidside ? 2 * channels : channels;
+            if (doMidside)
                 channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE, s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize);
-                frame.window_buffer = window;
-                frame.current.residual = r + 4 * FlaCudaWriter.MAX_BLOCKSIZE;
-                for (int ch = 0; ch < 4; ch++)
-                    frame.subframes[ch].Init(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE,
-                        bits_per_sample + (ch == 3 ? 1U : 0U), get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize));
-                int blocks, orders = 8;
-                while (orders < eparams.max_prediction_order + 1 && orders < 32)
-                    orders <<= 1;
-                compute_autocorellation(frame, 4, orders, out blocks);
-                int partCount, max_order = Math.Min(orders - 1, eparams.max_prediction_order);
-                estimate_residual(frame, 4, max_order, orders, blocks, out partCount);
-                select_best_methods(frame, 4, max_order, partCount);
+
+            frame.window_buffer = window;
+            for (int ch = 0; ch < channelCount; ch++)
+                frame.subframes[ch].Init(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE,
+                    bits_per_sample + (doMidside && ch == 3 ? 1U : 0U), get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize));
+
+            int autocorPartCount, residualPartCount;
+            compute_autocorellation(frame, channelCount, eparams.max_prediction_order, out autocorPartCount);
+            estimate_residual(frame, channelCount, eparams.max_prediction_order, autocorPartCount, out residualPartCount);
+            select_best_methods(frame, channelCount, eparams.max_prediction_order, residualPartCount);
+
+            if (doMidside)
+            {
                 measure_frame_size(frame, true);
                 frame.ChooseSubframes();
-                encode_residual(frame);
             }
+            else
+                frame.ch_mode = channels != 2 ? ChannelMode.NotStereo : ChannelMode.LeftRight;
+
+            encode_residual(frame);
 
             BitWriter bitwriter = new BitWriter(frame_buffer, 0, max_frame_size);
@@ -1574,24 +1283,28 @@ namespace CUETools.Codecs.FlaCuda
             cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
             cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
             cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
-            cudaAutocor = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 22);
             cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
             cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
+            cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
+            cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
             cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS));
-            cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS * maxResidualTasks));
-            CUResult cuErr = CUDADriver.cuMemAllocHost(ref autocorBufferPtr, (uint)(sizeof(float)*(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * 22));
+            cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_WINDOWS * maxResidualParts));
+            CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
             if (cuErr == CUResult.Success)
-                cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
+                cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
             if (cuErr == CUResult.Success)
-                cuErr = CUDADriver.cuMemAllocHost(ref residualOutputPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER * maxResidualTasks));
+                cuErr = CUDADriver.cuMemAllocHost(ref autocorOutputPtr, (uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * maxAutocorParts));
             if (cuErr == CUResult.Success)
                 cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER));
+            if (cuErr == CUResult.Success)
+                cuErr = CUDADriver.cuMemAllocHost(ref residualOutputPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER * maxResidualParts));
             if (cuErr != CUResult.Success)
             {
-                if (autocorBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorBufferPtr);
-                if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr);
-                if (residualOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualOutputPtr);
-                if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr);
+                if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
+                if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
+                if (autocorOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorOutputPtr); autocorOutputPtr = IntPtr.Zero;
+                if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
+                if (residualOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualOutputPtr); residualOutputPtr = IntPtr.Zero;
                 throw new CUDAException(cuErr);
             }
             cudaStream = cuda.CreateStream();
@@ -1627,7 +1340,7 @@ namespace CUETools.Codecs.FlaCuda
 
         public string Path { get { return _path; } }
 
-        string vendor_string = "Flake#0.1";
+        string vendor_string = "FlaCuda#0.1";
 
         int select_blocksize(int samplerate, int time_ms)
         {
@@ -1879,20 +1592,6 @@ namespace CUETools.Codecs.FlaCuda
         // higher prediction orders
        public int compression;
 
-        // prediction order selection method
-        // set by user prior to calling flake_encode_init
-        // if set to less than 0, it is chosen based on compression.
-        // valid values are 0 to 5
-        // 0 = use maximum order only
-        // 1 = use estimation
-        // 2 = 2-level
-        // 3 = 4-level
-        // 4 = 8-level
-        // 5 = full search
-        // 6 = log search
-        public OrderMethod order_method;
-
         // stereo decorrelation method
         // set by user prior to calling flake_encode_init
         // if set to less than 0, it is chosen based on compression.
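[Annotation] The alloc_buffers hunk above moves every host staging buffer to page-locked memory via cuMemAllocHost; pinned pages are what allow the CopyHostToDeviceAsync/CopyDeviceToHostAsync calls on cudaStream to be genuinely asynchronous. A minimal sketch of that upload/launch/download round-trip, using only the GASS.CUDA calls the patch itself uses; the RoundTrip helper, its buffer names, and the GASS.CUDA.Types using are illustrative assumptions, not part of the encoder:

    using System;
    using GASS.CUDA;
    using GASS.CUDA.Types; // assumed namespace for CUdeviceptr/CUstream/CUfunction

    static class PinnedCopyExample
    {
        // Hypothetical helper: stage n ints up, run an already-loaded kernel, read n floats back.
        public static void RoundTrip(CUDA cuda, CUfunction kernel, int n)
        {
            IntPtr hostIn = IntPtr.Zero, hostOut = IntPtr.Zero;
            CUResult cuErr = CUDADriver.cuMemAllocHost(ref hostIn, (uint)(sizeof(int) * n));
            if (cuErr == CUResult.Success)
                cuErr = CUDADriver.cuMemAllocHost(ref hostOut, (uint)(sizeof(float) * n));
            if (cuErr != CUResult.Success)
                throw new CUDAException(cuErr);
            CUdeviceptr devIn = cuda.Allocate((uint)(sizeof(int) * n));
            CUdeviceptr devOut = cuda.Allocate((uint)(sizeof(float) * n));
            CUstream stream = cuda.CreateStream();
            // ... fill hostIn through its pointer here ...
            cuda.CopyHostToDeviceAsync(devIn, hostIn, (uint)(sizeof(int) * n), stream);
            cuda.LaunchAsync(kernel, (n + 255) / 256, 1, stream); // grid sized per 256-thread parts
            cuda.CopyDeviceToHostAsync(devOut, hostOut, (uint)(sizeof(float) * n), stream);
            cuda.SynchronizeStream(stream); // hostOut is valid only after this returns
            cuda.DestroyStream(stream);
            cuda.Free(devIn);
            cuda.Free(devOut);
            CUDADriver.cuMemFreeHost(hostIn);
            CUDADriver.cuMemFreeHost(hostOut);
        }
    }

Releasing pinned buffers with cuMemFreeHost, as the Close/Delete hunks above now do for all five pointers, matters more than with managed memory: page-locked pages are a scarce OS resource.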
@@ -1930,28 +1629,6 @@ namespace CUETools.Codecs.FlaCuda
         // valid values are 1 to 32
         public int max_prediction_order;
 
-        // Number of LPC orders to try (for estimate mode)
-        // set by user prior to calling flake_encode_init
-        // if set to less than 0, it is chosen based on compression.
-        // valid values are 1 to 32
-        public int estimation_depth;
-
-        // minimum fixed prediction order
-        // set by user prior to calling flake_encode_init
-        // if set to less than 0, it is chosen based on compression.
-        // valid values are 0 to 4
-        public int min_fixed_order;
-
-        // maximum fixed prediction order
-        // set by user prior to calling flake_encode_init
-        // if set to less than 0, it is chosen based on compression.
-        // valid values are 0 to 4
-        public int max_fixed_order;
-
-        // type of linear prediction
-        // set by user prior to calling flake_encode_init
-        public PredictionType prediction_type;
-
         // minimum partition order
         // set by user prior to calling flake_encode_init
         // if set to less than 0, it is chosen based on compression.
@@ -1994,16 +1671,11 @@ namespace CUETools.Codecs.FlaCuda
 
             // default to level 5 params
             window_function = WindowFunction.Flattop | WindowFunction.Tukey;
-            order_method = OrderMethod.Estimate;
-            stereo_method = StereoMethod.Evaluate;
+            stereo_method = StereoMethod.Search;
             block_size = 0;
             block_time_ms = 105;
-            prediction_type = PredictionType.Search;
             min_prediction_order = 1;
             max_prediction_order = 12;
-            estimation_depth = 1;
-            min_fixed_order = 2;
-            max_fixed_order = 2;
             min_partition_order = 0;
             max_partition_order = 6;
             variable_block_size = 0;
@@ -2018,12 +1690,10 @@ namespace CUETools.Codecs.FlaCuda
             {
                 case 0:
                     block_time_ms = 53;
-                    prediction_type = PredictionType.Fixed;
                     stereo_method = StereoMethod.Independent;
                     max_partition_order = 4;
                     break;
                 case 1:
-                    prediction_type = PredictionType.Levinson;
                     stereo_method = StereoMethod.Independent;
                     window_function = WindowFunction.Bartlett;
                     max_prediction_order = 8;
@@ -2052,9 +1722,6 @@ namespace CUETools.Codecs.FlaCuda
                 case 7:
                     break;
                 case 8:
-                    estimation_depth = 3;
-                    min_fixed_order = 0;
-                    max_fixed_order = 4;
                     lpc_max_precision_search = 2;
                     break;
                 case 9:
@@ -2062,16 +1729,11 @@ namespace CUETools.Codecs.FlaCuda
                     max_prediction_order = 32;
                     break;
                 case 10:
-                    min_fixed_order = 0;
-                    max_fixed_order = 4;
                     max_prediction_order = 32;
                     //lpc_max_precision_search = 2;
                     break;
                 case 11:
-                    min_fixed_order = 0;
-                    max_fixed_order = 4;
                     max_prediction_order = 32;
-                    estimation_depth = 5;
                     //lpc_max_precision_search = 2;
                     variable_block_size = 4;
                     break;
@@ -2080,6 +1742,12 @@ namespace CUETools.Codecs.FlaCuda
             return 0;
         }
     }
+
+    unsafe struct computeAutocorTaskStruct
+    {
+        public int samplesOffs;
+        public int windowOffs;
+    };
 
     unsafe struct encodeResidualTaskStruct
     {
diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu
index 85da056..22c3134 100644
--- a/CUETools.FlaCuda/flacuda.cu
+++ b/CUETools.FlaCuda/flacuda.cu
@@ -1,86 +1,87 @@
-/*
- * Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
+/**
+ * CUETools.FlaCuda: FLAC audio encoder using CUDA
+ * Copyright (c) 2009 Gregory S. Chudov
  *
- * NOTICE TO USER:
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
  *
- * This source code is subject to NVIDIA ownership rights under U.S. and
- * international Copyright laws. Users and possessors of this source code
- * are hereby granted a nonexclusive, royalty-free license to use this code
- * in individual and commercial software.
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
  *
- * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
- * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
- * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
- * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
- * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
- * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
- * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
- * OR PERFORMANCE OF THIS SOURCE CODE.
- *
- * U.S. Government End Users. This source code is a "commercial item" as
- * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
- * "commercial computer software" and "commercial computer software
- * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
- * and is provided to the U.S. Government only as a commercial end item.
- * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
- * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
- * source code with only those rights set forth herein.
- *
- * Any use of this source code in individual and commercial software must
- * include, in the user documentation and internal comments to the code,
- * the above Disclaimer and U.S. Government End Users Notice.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef _FLACUDA_KERNEL_H_
 #define _FLACUDA_KERNEL_H_
 
+typedef struct
+{
+    int samplesOffs;
+    int windowOffs;
+} computeAutocorTaskStruct;
+
 extern "C" __global__ void cudaComputeAutocor(
     float *output,
     const int *samples,
     const float *window,
+    computeAutocorTaskStruct *tasks,
+    int max_order, // should be <= 32
     int frameSize,
-    int frameOffset,
-    int blocks)
+    int partSize // should be <= blockDim - max_order
+)
 {
     __shared__ struct {
-	float data[512];
-	float matrix[512];
+	float data[256];
+	float product[256];
+	float product2[256];
+	float sum[33];
+	computeAutocorTaskStruct task;
     } shared;
-    const int iWin = blockIdx.y >> 2;
-    const int iCh = blockIdx.y & 3;
-    const int smpBase = iCh * frameOffset;
-    const int winBase = iWin * 2 * frameOffset;
-    const int pos = blockIdx.x * blocks;
-    const int tid = threadIdx.x + threadIdx.y * blockDim.x;
-
-    // fetch blockDim.x*blockDim.y samples
-    shared.data[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] * window[winBase + pos + tid] : 0.0f;
+    const int tid = threadIdx.x;
+    // fetch task data
+    if (tid < sizeof(shared.task) / sizeof(int))
+	((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
     __syncthreads();
-    float s = 0.0f;
-    for (int i = 0; i < blocks; i += blockDim.y)
-	s += shared.data[i + threadIdx.y] * shared.data[i + threadIdx.y + threadIdx.x];
-    shared.matrix[tid] = s;
+    const int pos = blockIdx.x * partSize;
+    const int productLen = min(frameSize - pos - max_order, partSize);
+    const int dataLen = productLen + max_order;
+
+    // fetch samples
+    shared.data[tid] = tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] * window[shared.task.windowOffs + pos + tid] : 0.0f;
     __syncthreads();
-    // reduction in shared mem
-    for(unsigned int s=blockDim.y/2; s>1; s>>=1)
+    for (int lag = 0; lag <= max_order; lag++)
     {
-	if (threadIdx.y < s)
-	    shared.matrix[tid] += shared.matrix[tid + s * blockDim.x];
+	shared.product[tid] = tid < productLen ? shared.data[tid] * shared.data[tid + lag] : 0.0f;
	__syncthreads();
+
+	// product sum: reduction in shared mem
+	//if (tid < 256) shared.product[tid] += shared.product[tid + 256]; __syncthreads();
+	if (tid < 128) shared.product[tid] += shared.product[tid + 128]; __syncthreads();
+	if (tid < 64) shared.product[tid] += shared.product[tid + 64]; __syncthreads();
+	if (tid < 32) shared.product[tid] += shared.product[tid + 32]; __syncthreads();
+	shared.product[tid] += shared.product[tid + 16];
+	shared.product[tid] += shared.product[tid + 8];
+	shared.product[tid] += shared.product[tid + 4];
+	shared.product[tid] += shared.product[tid + 2];
+	if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1];
     }
     // return results
-    if (threadIdx.y == 0)
-	output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x] = shared.matrix[threadIdx.x] + shared.matrix[threadIdx.x + blockDim.x];
+    if (tid <= max_order)
+	output[(blockIdx.x + blockIdx.y * gridDim.x) * (max_order + 1) + tid] = shared.sum[tid];
 }
 
 typedef struct
 {
-    int residualOrder;
+    int residualOrder; // <= 32
     int samplesOffs;
     int shift;
     int reserved;
@@ -92,7 +93,7 @@ extern "C" __global__ void cudaEncodeResidual(
     int*samples,
     encodeResidualTaskStruct *tasks,
     int frameSize,
-    int partSize
+    int partSize // should be <= blockDim - max_order
 )
 {
     __shared__ struct {
@@ -147,206 +148,4 @@ extern "C" __global__ void cudaEncodeResidual(
     output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
 }
 
-#if 0
-extern "C" __global__ void cudaComputeAutocor3int(const int * samples, const float * window, int* output, int frameSize, int frameOffset, int channels)
-{
-    extern __shared__ short shared[];
-    int *ishared = (int*)shared;
-    const int lag = blockIdx.x;
-
-    // fetch signal, multiply by window and split high bits/low bits
-    const int iWin = blockIdx.y >> 2; // blockIdx.y/channels;
-    const int iCh = (blockIdx.y - iWin * channels);
-    const int smpBase = iCh * frameOffset;
-    const int winBase = iWin * 2 * frameOffset;
-
-    for(int i = threadIdx.x; i < frameSize; i += blockDim.x)
-    {
-	float val = samples[smpBase + i] * window[winBase + i];
-	int ival = __float2int_rz(fabs(val));
-	int sg = (1 - 2 *signbit(val));
-	//int ival = (int) val;
-	//int sg = ival < 0 ? -1 : 1;
-	ival = ival < 0 ? -ival : ival;
-	shared[i*2] = __mul24(sg, (ival >> 9));
-	shared[i*2+1] = __mul24(sg, (ival & ((1 << 9) - 1)));
-    }
-    __syncthreads();
-
-    // correlation
-    int sum1 = 0;
-    int sum2 = 0;
-    int sum3 = 0;
-    for (int i = threadIdx.x; i < frameSize - lag; i += blockDim.x)
-    {
-	sum1 += __mul24(shared[2*i], shared[2*(i+lag)]);
-	sum2 += __mul24(shared[2*i+1], shared[2*(i+lag)]);
-	sum2 += __mul24(shared[2*i], shared[2*(i+lag)+1]);
-	sum3 += __mul24(shared[2*i+1], shared[2*(i+lag)+1]);
-    }
-    __syncthreads();
-    ishared[threadIdx.x] = sum1;
-    ishared[threadIdx.x + blockDim.x] = sum2;
-    ishared[threadIdx.x + blockDim.x * 2] = sum3;
-    __syncthreads();
-
-    // reduction in shared mem
-    for(unsigned int s=blockDim.x/2; s>0; s>>=1)
-    {
-	if (threadIdx.x < s)
-	{
-	    ishared[threadIdx.x] += ishared[threadIdx.x + s];
-	    ishared[threadIdx.x + blockDim.x] += ishared[threadIdx.x + s + blockDim.x];
-	    ishared[threadIdx.x + blockDim.x * 2] += ishared[threadIdx.x + s + blockDim.x * 2];
-	}
-	__syncthreads();
-    }
-
-    if (threadIdx.x == 0) {
-	output[(blockIdx.x + blockIdx.y * gridDim.x) * 3] = ishared[threadIdx.x];
-	output[(blockIdx.x + blockIdx.y * gridDim.x) * 3 + 1] = ishared[threadIdx.x + blockDim.x];
-	output[(blockIdx.x + blockIdx.y * gridDim.x) * 3 + 2] = ishared[threadIdx.x + blockDim.x * 2];
-    }
-}
-
-__device__ float Bartlett(int i, int blockSize)
-{
-    float n = fminf(i, blockSize - i);
-    float k = 2.0f / blockSize * (blockSize / 2.0f - n);
-    k = 1.0f - k * k;
-    return k*k;
-}
-
-extern "C" __global__ void cudaComputeAutocorPart(const int * samples, const float * window, float* output, int frameSize, int frameOffset, int iCh, int iWin)
-{
-    extern __shared__ float fshared[];
-    // fetch signal, multiply by window
-    //const int iWin = blockIdx.y;
-    //const int iCh = blockIdx.x;
-    const int smpBase = iCh * frameOffset;
-    const int winBase = iWin * 2 * frameOffset;
-    float * matrix = fshared + 129;
-
-    // initialize results matrix
-    matrix[threadIdx.x + threadIdx.y * (blockDim.x + 1)] = 0.0f;
-
-    // prefetch blockDim.x + blockDim.y samples
-    int tid = threadIdx.x + threadIdx.y * blockDim.x;
-    if (tid < blockDim.x + blockDim.y)
-    {
-	if (blockIdx.x * blockDim.y + tid < frameSize)
-	    fshared[tid] = samples[smpBase + blockIdx.x * blockDim.y + tid] * window[winBase + blockIdx.x * blockDim.y + tid];
-	else
-	    fshared[tid] = 0.0f;
-    }
-
-    __syncthreads();
-
-    matrix[threadIdx.x + threadIdx.y * (1 + blockDim.x)] += fshared[threadIdx.y] * fshared[threadIdx.y + threadIdx.x];
-
-    __syncthreads();
-
-    // reduction in shared mem
-    for(unsigned int s=blockDim.y/2; s>0; s>>=1)
-    {
-	if (threadIdx.y < s)
-	    matrix[threadIdx.x + threadIdx.y * (1 + blockDim.x)] += matrix[threadIdx.x + (s + threadIdx.y) * (1 + blockDim.x)];
-	__syncthreads();
-    }
-
-    // return results
-    if (threadIdx.y == 0)
-	output[blockIdx.x * blockDim.x + threadIdx.x] = matrix[threadIdx.x];
-}
-
-extern "C" __global__ void cudaComputeAutocor2(const int * samples, const float * window, float* output, int frameSize, int frameOffset)
-{
-    extern __shared__ float fshared[];
-    // fetch signal, multiply by window
-    const int iWin = blockIdx.y;
-    const int iCh = blockIdx.x;
-    const int smpBase = iCh * frameOffset;
-    const int winBase = iWin * 2 * frameOffset;
-
-    for(int i = threadIdx.x + threadIdx.y * blockDim.x; i < frameSize; i += blockDim.x * blockDim.y)
-	fshared[i] = samples[smpBase + i] * window[winBase + i];
-
-    __syncthreads();
-
-    const int lag = threadIdx.y;
-
-    // correlation
-    float sum = 0.0f;
-    for (int i = threadIdx.x; i < frameSize - lag; i += blockDim.x)
-	sum += fshared[i] * fshared[i+lag];
-    __syncthreads();
-
-    fshared[threadIdx.x + threadIdx.y * blockDim.x] = sum;
-
-    __syncthreads();
-
-    // reduction in shared mem
-    for(unsigned int s=blockDim.x/2; s>0; s>>=1)
-    {
-	if (threadIdx.x < s)
-	    fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + s + threadIdx.y * blockDim.x];
-	__syncthreads();
-    }
-
-    //    if (threadIdx.x < 32)
-    //    {
-	//if (blockDim.x >= 64) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 32 + threadIdx.y * blockDim.x];
-	//if (blockDim.x >= 32) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 16 + threadIdx.y * blockDim.x];
-	//if (blockDim.x >= 16) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 8 + threadIdx.y * blockDim.x];
-	//if (blockDim.x >= 8) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 4 + threadIdx.y * blockDim.x];
-	//if (blockDim.x >= 4) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 2 + threadIdx.y * blockDim.x];
-	//if (blockDim.x >= 2) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 1 + threadIdx.y * blockDim.x];
-    //    }
-
-    if (threadIdx.x == 0) {
-	output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.y + threadIdx.y]
-	    = fshared[threadIdx.x + threadIdx.y * blockDim.x];
-    }
-}
-
-extern "C" __global__ void cudaComputeAutocorOld(const int * samples, const float * window, float* output, int frameSize, int frameOffset, int channels)
-{
-    extern __shared__ float fshared[];
-    const int lag = blockIdx.x;
-
-    // fetch signal, multiply by window
-    const int iWin = blockIdx.y >> 2; // blockIdx.y/channels;
-    const int iCh = (blockIdx.y - iWin * channels);
-    const int smpBase = iCh * frameOffset;
-    const int winBase = iWin * 2 * frameOffset;
-
-    for(int i = threadIdx.x; i < frameSize; i += blockDim.x)
-	fshared[i] = samples[smpBase + i] * window[winBase + i];
-
-    __syncthreads();
-
-    // correlation
-    float sum = 0.0f;
-    for (int i = threadIdx.x; i < frameSize - lag; i += blockDim.x)
-	sum += fshared[i] * fshared[i+lag];
-    __syncthreads();
-
-    fshared[threadIdx.x] = sum;
-
-    __syncthreads();
-
-    // reduction in shared mem
-    for(unsigned int s=blockDim.x/2; s>0; s>>=1)
-    {
-	if (threadIdx.x < s)
-	    fshared[threadIdx.x] += fshared[threadIdx.x + s];
-	__syncthreads();
-    }
-
-    if (threadIdx.x == 0) {
-	output[blockIdx.x + blockIdx.y * gridDim.x] = fshared[threadIdx.x];
-    }
-}
 #endif
-
-#endif // _FLACUDA_KERNEL_H_
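[Annotation] On the rewritten cudaComputeAutocor: each block loads partSize + max_order windowed samples and, for every lag 0..max_order, reduces partSize lagged products in shared memory; the host (estimate_residual in FlaCudaWriter.cs) then adds the per-partition sums back together. Because productLen is capped at frameSize - pos - max_order, every lag accumulates over the same frameSize - max_order positions, so the combined result is the truncated sum sketched below rather than the textbook n - lag autocorrelation; the hard-coded 128/64/32 reduction steps also assume exactly 256 threads, matching autocorThreads = 256 on the host, and the otherwise-unused product2 array appears to exist only to absorb the unguarded tid + 16 ... tid + 2 reads in the warp-synchronous tail. A serial restatement of what the kernel computes, in C# to match the host side; KernelAutocor is a hypothetical name used only for this sketch:

    // What the partitioned kernel produces once the host sums all parts:
    // for every lag, lagged products over the same n - maxOrder positions.
    static float[] KernelAutocor(float[] windowedSamples, int maxOrder)
    {
        int n = windowedSamples.Length; // frameSize
        float[] ac = new float[maxOrder + 1];
        for (int lag = 0; lag <= maxOrder; lag++)
            for (int i = 0; i < n - maxOrder; i++)
                ac[lag] += windowedSamples[i] * windowedSamples[i + lag];
        return ac;
    }

Using the same number of product terms for every lag keeps the per-lag normalization consistent before ComputeReflection runs Levinson-Durbin on the result, at the cost of dropping the final max_order - lag products each lag would otherwise see.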