diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
index 65b5cf2..bbd537b 100644
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -141,7 +141,6 @@ namespace CUETools.Codecs.FLACCL
 		public const int MAX_BLOCKSIZE = 4096 * 16;
 		internal const int maxFrames = 128;
-		internal const int maxAutocorParts = (MAX_BLOCKSIZE + 255) / 256;
 
 		public FLACCLWriter(string path, Stream IO, AudioPCMConfig pcm)
 		{
@@ -1116,12 +1115,6 @@ namespace CUETools.Codecs.FLACCL
 			if (task.frameSize <= 4)
 				return;
 
-			//int autocorPartSize = (2 * 256 - eparams.max_prediction_order) & ~15;
-			int autocorPartSize = 32 * 7;
-			int autocorPartCount = (task.frameSize + autocorPartSize - 1) / autocorPartSize;
-			if (autocorPartCount > maxAutocorParts)
-				throw new Exception("internal error");
-
 			int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
 			int calcPartitionPartSize = task.frameSize >> max_porder;
 			while (calcPartitionPartSize < 16 && max_porder > 0)
@@ -1140,12 +1133,10 @@ namespace CUETools.Codecs.FLACCL
 			cudaChannelDecorr.SetArg(2, (uint)MAX_BLOCKSIZE);
 
 			task.cudaComputeLPC.SetArg(0, task.cudaResidualTasks);
-			task.cudaComputeLPC.SetArg(1, (uint)task.nResidualTasksPerChannel);
-			task.cudaComputeLPC.SetArg(2, task.cudaAutocorOutput);
-			task.cudaComputeLPC.SetArg(3, (uint)eparams.max_prediction_order);
-			task.cudaComputeLPC.SetArg(4, task.cudaLPCData);
-			task.cudaComputeLPC.SetArg(5, (uint)_windowcount);
-			task.cudaComputeLPC.SetArg(6, (uint)autocorPartCount);
+			task.cudaComputeLPC.SetArg(1, task.cudaAutocorOutput);
+			task.cudaComputeLPC.SetArg(2, task.cudaLPCData);
+			task.cudaComputeLPC.SetArg(3, (uint)task.nResidualTasksPerChannel);
+			task.cudaComputeLPC.SetArg(4, (uint)_windowcount);
 
 			//task.cudaComputeLPCLattice.SetArg(0, task.cudaResidualTasks);
 			//task.cudaComputeLPCLattice.SetArg(1, (uint)task.nResidualTasksPerChannel);
@@ -1156,12 +1147,11 @@
 			//cuda.SetFunctionBlockShape(task.cudaComputeLPCLattice, 256, 1, 1);
 
 			task.cudaQuantizeLPC.SetArg(0, task.cudaResidualTasks);
-			task.cudaQuantizeLPC.SetArg(1, (uint)task.nResidualTasksPerChannel);
-			task.cudaQuantizeLPC.SetArg(2, (uint)task.nTasksPerWindow);
-			task.cudaQuantizeLPC.SetArg(3, task.cudaLPCData);
-			task.cudaQuantizeLPC.SetArg(4, (uint)eparams.max_prediction_order);
-			task.cudaQuantizeLPC.SetArg(5, (uint)eparams.lpc_min_precision_search);
-			task.cudaQuantizeLPC.SetArg(6, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search));
+			task.cudaQuantizeLPC.SetArg(1, task.cudaLPCData);
+			task.cudaQuantizeLPC.SetArg(2, (uint)task.nResidualTasksPerChannel);
+			task.cudaQuantizeLPC.SetArg(3, (uint)task.nTasksPerWindow);
+			task.cudaQuantizeLPC.SetArg(4, (uint)eparams.lpc_min_precision_search);
+			task.cudaQuantizeLPC.SetArg(5, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search));
 
 			task.cudaCopyBestMethod.SetArg(0, task.cudaBestResidualTasks);
 			task.cudaCopyBestMethod.SetArg(1, task.cudaResidualTasks);
@@ -1216,7 +1206,7 @@ namespace CUETools.Codecs.FLACCL
 			// geometry???
 			task.openCLCQ.EnqueueBarrier();
-			task.EnqueueComputeAutocor(autocorPartCount, channelsCount, cudaWindow, eparams.max_prediction_order);
+			task.EnqueueComputeAutocor(channelsCount, cudaWindow, eparams.max_prediction_order);
 			//float* autoc = stackalloc float[1024];
 			//task.openCLCQ.EnqueueBarrier();
@@ -1524,6 +1514,7 @@ namespace CUETools.Codecs.FLACCL
 			if (OpenCL.NumberOfPlatforms < 1)
 				throw new Exception("no opencl platforms found");
 
+			int groupSize = 64;
 			OCLMan = new OpenCLManager();
 			// Attempt to save binaries after compilation, as well as load precompiled binaries
 			// to avoid compilation. Usually you'll want this to be true.
@@ -1543,7 +1534,9 @@ namespace CUETools.Codecs.FLACCL
 			OCLMan.RequireImageSupport = false;
 			// The Defines string gets prepended to any and all sources that are compiled
 			// and serve as a convenient way to pass configuration information to the compilation process
-			OCLMan.Defines = "#define MAX_ORDER " + eparams.max_prediction_order.ToString();
+			OCLMan.Defines =
+				"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
+				"#define GROUP_SIZE " + groupSize.ToString() + "\n";
 			// The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
 			OCLMan.BuildOptions = "";
@@ -1596,13 +1589,13 @@ namespace CUETools.Codecs.FLACCL
 			if (_IO.CanSeek)
 				first_frame_offset = _IO.Position;
 
-			task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
-			task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
+			task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize);
+			task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize);
 			if (_settings.CPUThreads > 0)
 			{
 				cpu_tasks = new FLACCLTask[_settings.CPUThreads];
 				for (int i = 0; i < cpu_tasks.Length; i++)
-					cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
+					cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize);
 			}
 			cudaWindow = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY, sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
@@ -2276,8 +2269,11 @@ namespace CUETools.Codecs.FLACCL
 		public bool done = false;
 		public bool exit = false;
 
-		unsafe public FLACCLTask(Program _openCLProgram, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify)
+		public int groupSize = 128;
+
+		unsafe public FLACCLTask(Program _openCLProgram, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify, int groupSize)
 		{
+			this.groupSize = groupSize;
 			openCLProgram = _openCLProgram;
 			Device[] openCLDevices = openCLProgram.Context.Platform.QueryDevices(DeviceType.GPU);
 			openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], CommandQueueProperties.PROFILING_ENABLE);
@@ -2296,7 +2292,7 @@ namespace CUETools.Codecs.FLACCL
 			cudaPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen);
 			cudaRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen);
 			cudaBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4);
-			cudaAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FLACCLWriter.maxAutocorParts + FLACCLWriter.maxFrames));
+			cudaAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FLACCLWriter.maxFrames);
 			cudaResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen);
 			cudaBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen);
 			cudaResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FLACCLWriter.maxResidualParts*/ * FLACCLWriter.maxFrames);
@@ -2397,36 +2393,22 @@ namespace CUETools.Codecs.FLACCL
 			cudaFindWastedBits.SetArg(1, cudaSamples);
 			cudaFindWastedBits.SetArg(2, nResidualTasksPerChannel);
 
-			int workX = 128; // 256
 			int grpX = frameCount * channelsCount;
-			//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 128 }, new int[] { 128 });
-			//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 128 }, null);
-			openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * workX }, new int[] { workX });
-
-			//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 256 * 128 }, new int[] { 128 });
-			//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * workX }, null);
-			//cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1);
-			//cuda.LaunchAsync(task.cudaFindWastedBits, channelsCount * task.frameCount, 1, task.stream);
+			openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * groupSize }, new int[] { groupSize });
 		}
 
-		public void EnqueueComputeAutocor(int autocorPartCount, int channelsCount, Mem cudaWindow, int max_prediction_order)
+		public void EnqueueComputeAutocor(int channelsCount, Mem cudaWindow, int max_prediction_order)
 		{
 			cudaComputeAutocor.SetArg(0, cudaAutocorOutput);
 			cudaComputeAutocor.SetArg(1, cudaSamples);
 			cudaComputeAutocor.SetArg(2, cudaWindow);
 			cudaComputeAutocor.SetArg(3, cudaResidualTasks);
-			cudaComputeAutocor.SetArg(4, max_prediction_order);
-			cudaComputeAutocor.SetArg(5, (uint)nAutocorTasksPerChannel - 1);
-			cudaComputeAutocor.SetArg(6, (uint)nResidualTasksPerChannel);
-			int workX = autocorPartCount;
+			cudaComputeAutocor.SetArg(4, (uint)nAutocorTasksPerChannel - 1);
+			cudaComputeAutocor.SetArg(5, (uint)nResidualTasksPerChannel);
+
+			int workX = max_prediction_order / 4 + 1;
 			int workY = nAutocorTasksPerChannel * channelsCount * frameCount;
-			int ws = 32;
-			int wy = 4;
-			openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * ws, workY * wy }, new int[] { ws, wy });
-			//openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 3, null, new int[] { workX * ws, workY, max_prediction_order + 1 }, new int[] { ws, 1, 1 });
-
-			//cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1);
-			//cuda.LaunchAsync(task.cudaComputeAutocor, autocorPartCount, task.nAutocorTasksPerChannel * channelsCount * task.frameCount, task.stream);
+			openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * groupSize, workY }, new int[] { groupSize, 1 });
 		}
 
 		public void EnqueueEstimateResidual(int channelsCount, int max_prediction_order)
@@ -2435,11 +2417,8 @@
 		{
 			cudaEstimateResidual.SetArg(0, cudaResidualOutput);
 			cudaEstimateResidual.SetArg(1, cudaSamples);
 			cudaEstimateResidual.SetArg(2, cudaResidualTasks);
 
-			int threads_x = 128;
-			int workX = threads_x;
-			int workY = nResidualTasksPerChannel * channelsCount * frameCount;
-
-			openCLCQ.EnqueueNDRangeKernel(cudaEstimateResidual, 2, null, new int[] { workX, workY }, new int[] { threads_x, 1 });
+			int work = nResidualTasksPerChannel * channelsCount * frameCount;
+			openCLCQ.EnqueueNDRangeKernel(cudaEstimateResidual, 1, null, new int[] { groupSize * work }, new int[] { groupSize });
 		}
 
 		public void EnqueueChooseBestMethod(int channelsCount)
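For reference, the host-side convention these changes follow, sketched below and not part of the patch itself: a single groupSize chosen at startup is compiled into flac.cl as GROUP_SIZE through OCLMan.Defines, and every NDRange launch then uses that same value as its local work size so it matches the kernels' reqd_work_group_size attribute. The class and method names here are illustrative only, assuming the OpenCLNet types already used in FLACCLWriter.cs.

// Illustrative sketch (hypothetical helper); the compile-time define and the
// runtime local work size must be the same number, or clEnqueueNDRangeKernel fails.
using OpenCLNet;

static class GroupSizeSketch
{
	public static string BuildDefines(int maxOrder, int groupSize)
	{
		// Prepended to flac.cl before compilation, mirroring OCLMan.Defines above.
		return "#define MAX_ORDER " + maxOrder.ToString() + "\n" +
			"#define GROUP_SIZE " + groupSize.ToString() + "\n";
	}

	public static void EnqueueOneGroupPerTask(CommandQueue queue, Kernel kernel, int taskCount, int groupSize)
	{
		// One work-group of groupSize work-items per task; the global size is a
		// multiple of the local size, exactly as in EnqueueEstimateResidual above.
		queue.EnqueueNDRangeKernel(kernel, 1, null, new int[] { taskCount * groupSize }, new int[] { groupSize });
	}
}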
diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl
index 6c30b50..47257c5 100644
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -50,11 +50,7 @@ typedef struct
 typedef struct
 {
 	FLACCLSubframeData data;
-	union
-	{
-		int coefs[32]; // fixme: should be short?
-		int4 coefs4[8];
-	};
+	int coefs[32]; // fixme: should be short?
 } FLACCLSubframeTask;
 
 __kernel void cudaStereoDecorr(
@@ -103,15 +99,15 @@ __kernel void cudaChannelDecorr2(
 #define __ffs(a) (32 - clz(a & (-a)))
 //#define __ffs(a) (33 - clz(~a & (a - 1)))
 
-__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
+__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaFindWastedBits(
 	__global FLACCLSubframeTask *tasks,
 	__global int *samples,
 	int tasksPerChannel
 )
 {
-	__local volatile int wbits[128];
-	__local volatile int abits[128];
+	__local int abits[GROUP_SIZE];
+	__local int wbits[GROUP_SIZE];
 	__local FLACCLSubframeData task;
 
 	int tid = get_local_id(0);
@@ -129,12 +125,6 @@ void cudaFindWastedBits(
 	wbits[tid] = w;
 	abits[tid] = a;
 	barrier(CLK_LOCAL_MEM_FENCE);
-	//atom_or(shared.wbits, shared.wbits[tid]);
-	//atom_or(shared.abits, shared.abits[tid]);
-	//SUM256(shared.wbits, tid, |=);
-	//SUM256(shared.abits, tid, |=);
-	//SUM128(wbits, tid, |=);
-	//SUM128(abits, tid, |=);
 
 	for (int s = get_local_size(0) / 2; s > 0; s >>= 1)
 	{
@@ -146,160 +136,81 @@ void cudaFindWastedBits(
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
-	if (tid == 0)
-		task.wbits = max(0,__ffs(wbits[0]) - 1);
-	if (tid == 0)
-		task.abits = 32 - clz(abits[0]) - task.wbits;
-	// if (tid == 0)
-		//task.wbits = get_num_groups(0);
-	// if (tid == 0)
-		//task.abits = get_local_size(0);
-
-	barrier(CLK_LOCAL_MEM_FENCE);
-
+	w = max(0,__ffs(wbits[0]) - 1);
+	a = 32 - clz(abits[0]) - w;
 	if (tid < tasksPerChannel)
-		tasks[get_group_id(0) * tasksPerChannel + tid].data.wbits = task.wbits;
+		tasks[get_group_id(0) * tasksPerChannel + tid].data.wbits = w;
 	if (tid < tasksPerChannel)
-		tasks[get_group_id(0) * tasksPerChannel + tid].data.abits = task.abits;
+		tasks[get_group_id(0) * tasksPerChannel + tid].data.abits = a;
 }
 
-//__kernel __attribute__((reqd_work_group_size(32, 4, 1)))
-//void cudaComputeAutocor(
-//	__global float *output,
-//	__global const int *samples,
-//	__global const float *window,
-//	__global FLACCLSubframeTask *tasks,
-//	const int max_order, // should be <= 32
-//	const int windowCount, // windows (log2: 0,1)
-//	const int taskCount // tasks per block
-//)
-//{
-//	__local struct {
-//		float data[256];
-//		volatile float product[128];
-//		FLACCLSubframeData task;
-//		volatile int dataPos;
-//		volatile int dataLen;
-//	} shared;
-//	const int tid = get_local_id(0) + get_local_id(1) * 32;
-//	// fetch task data
-//	if (tid < sizeof(shared.task) / sizeof(int))
-//		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
-//	if (tid == 0)
-//	{
-//		shared.dataPos = get_group_id(0) * 7 * 32;
-//		shared.dataLen = min(shared.task.blocksize - shared.dataPos, 7 * 32 + max_order);
-//	}
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	// fetch samples
-//	shared.data[tid] = tid < shared.dataLen ? samples[tid] * window[tid]: 0.0f;
-//	int tid2 = tid + 128;
-//	shared.data[tid2] = tid2 < shared.dataLen ? samples[tid2] * window[tid2]: 0.0f;
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	for (int lag = 0; lag <= max_order; lag ++)
-//	{
-//		if (lag <= 12)
-//			shared.product[tid] = 0.0f;
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//	}
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	if (tid <= max_order)
-//		output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.product[tid];
-//}
-
-__kernel __attribute__((reqd_work_group_size(32, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaComputeAutocor(
 	__global float *output,
 	__global const int *samples,
 	__global const float *window,
 	__global FLACCLSubframeTask *tasks,
-	const int max_order, // should be <= 32
 	const int windowCount, // windows (log2: 0,1)
 	const int taskCount // tasks per block
 )
 {
-	__local struct {
-		float data[256];
-		volatile float product[128];
-		FLACCLSubframeData task;
-		volatile float result[33];
-		volatile int dataPos;
-		volatile int dataLen;
-		volatile int windowOffs;
-		volatile int samplesOffs;
-		//volatile int resultOffs;
-	} shared;
-	const int tid = get_local_id(0) + get_local_id(1) * 32;
+	__local float data[GROUP_SIZE * 2];
+	__local float product[GROUP_SIZE];
+	__local FLACCLSubframeData task;
+	const int tid = get_local_id(0);
 	// fetch task data
-	if (tid < sizeof(shared.task) / sizeof(int))
-		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
-	if (tid == 0)
-	{
-		shared.dataPos = get_group_id(0) * 7 * 32;
-		shared.windowOffs = (get_group_id(1) & ((1 << windowCount)-1)) * shared.task.blocksize + shared.dataPos;
-		shared.samplesOffs = shared.task.samplesOffs + shared.dataPos;
-		shared.dataLen = min(shared.task.blocksize - shared.dataPos, 7 * 32 + max_order);
-	}
-	//if (tid == 32)
-		//shared.resultOffs = __mul24(get_group_id(0) + __mul24(get_group_id(1), get_num_groups(0)), max_order + 1);
+	if (tid < sizeof(task) / sizeof(int))
+		((__local int*)&task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
 	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int bs = task.blocksize;
+	int windowOffs = (get_group_id(1) & ((1 << windowCount)-1)) * bs;
 
-	// fetch samples
-	shared.data[tid] = tid < shared.dataLen ? samples[shared.samplesOffs + tid] * window[shared.windowOffs + tid]: 0.0f;
-	int tid2 = tid + 128;
-	shared.data[tid2] = tid2 < shared.dataLen ? samples[shared.samplesOffs + tid2] * window[shared.windowOffs + tid2]: 0.0f;
-	barrier(CLK_LOCAL_MEM_FENCE);
+	data[tid] = tid < bs ? samples[task.samplesOffs + tid] * window[windowOffs + tid] : 0.0f;
-	const int ptr = get_local_id(0) * 7;
-	//if (get_local_id(1) == 0) for (int lag = 0; lag <= max_order; lag ++)
-	//for (int lag = get_local_id(1); lag <= max_order; lag += get_local_size(1))
-	for (int lag0 = 0; lag0 <= max_order; lag0 += get_local_size(1))
+	int tid0 = tid % (GROUP_SIZE >> 2);
+	int tid1 = tid / (GROUP_SIZE >> 2);
+	int lag0 = get_group_id(0) * 4;
+	__local float4 * dptr = ((__local float4 *)&data[0]) + tid0;
+	__local float4 * dptr1 = ((__local float4 *)&data[lag0 + tid1]) + tid0;
+
+	float prod = 0.0f;
+	for (int pos = 0; pos < bs; pos += GROUP_SIZE)
 	{
-		////const int productLen = min(shared.task.blocksize - get_group_id(0) * partSize - lag, partSize);
-		const int lag = lag0 + get_local_id(1);
-		const int ptr2 = ptr + lag;
-		shared.product[tid] =
-			shared.data[ptr + 0] * shared.data[ptr2 + 0] +
-			shared.data[ptr + 1] * shared.data[ptr2 + 1] +
-			shared.data[ptr + 2] * shared.data[ptr2 + 2] +
-			shared.data[ptr + 3] * shared.data[ptr2 + 3] +
-			shared.data[ptr + 4] * shared.data[ptr2 + 4] +
-			shared.data[ptr + 5] * shared.data[ptr2 + 5] +
-			shared.data[ptr + 6] * shared.data[ptr2 + 6];
+		// fetch samples
+		float nextData = pos + tid + GROUP_SIZE < bs ? samples[task.samplesOffs + pos + tid + GROUP_SIZE] * window[windowOffs + pos + tid + GROUP_SIZE] : 0.0f;
+		data[tid + GROUP_SIZE] = nextData;
 		barrier(CLK_LOCAL_MEM_FENCE);
-		for (int l = 16; l > 1; l >>= 1)
-		{
-			if (get_local_id(0) < l)
-				shared.product[tid] = shared.product[tid] + shared.product[tid + l];
-			barrier(CLK_LOCAL_MEM_FENCE);
-		}
-		// return results
-		if (get_local_id(0) == 0 && lag <= max_order)
-			shared.result[lag] = shared.product[tid] + shared.product[tid + 1];
+
+		prod += dot(*dptr, *dptr1);
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		data[tid] = nextData;
+	}
+	product[tid] = prod;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	for (int l = (GROUP_SIZE >> 3); l > 0; l >>= 1)
+	{
+		if (tid0 < l)
+			product[tid] = product[tid] + product[tid + l];
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
-	if (tid <= max_order)
-		output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.result[tid];
-	//output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.product[tid];
-	//output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.windowOffs;
+	if (tid < 4 && tid + lag0 <= MAX_ORDER)
+		output[get_group_id(1) * (MAX_ORDER + 1) + tid + lag0] = product[tid * (GROUP_SIZE >> 2)];
 }
 
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
 void cudaComputeLPC(
 	__global FLACCLSubframeTask *tasks,
-	int taskCount, // tasks per block
-	__global float*autoc,
-	int max_order, // should be <= 32
+	__global float *autoc,
 	__global float *lpcs,
-	int windowCount,
-	int partCount
+	int taskCount, // tasks per block
+	int windowCount
 )
 {
 	__local struct {
 		FLACCLSubframeData task;
-		volatile float parts[32];
 		volatile float ldr[32];
 		volatile float gen1[32];
 		volatile float error[32];
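For reference only, not part of the patch: each work-group of the rewritten cudaComputeAutocor now produces four lags of the plain windowed autocorrelation, that is, for lag = 0..MAX_ORDER it accumulates the sum over i of (window[i] * samples[i]) * (window[i + lag] * samples[i + lag]) across the whole block, using float4 dot products and a local reduction instead of the old per-part partial sums. A scalar C# equivalent, with hypothetical names, would be:

// Scalar reference for the values written to cudaAutocorOutput (illustrative only).
static float[] AutocorReference(int[] samples, float[] window, int blocksize, int maxOrder)
{
	// Windowed input, zero-padded past the end of the block,
	// matching the kernel's "< bs ? ... : 0.0f" loads.
	float[] d = new float[blocksize + maxOrder];
	for (int i = 0; i < blocksize; i++)
		d[i] = samples[i] * window[i];
	float[] autoc = new float[maxOrder + 1];
	for (int lag = 0; lag <= maxOrder; lag++)
		for (int i = 0; i < blocksize; i++)
			autoc[lag] += d[i] * d[i + lag];
	return autoc;
}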
@@ -311,40 +222,19 @@ void cudaComputeLPC(
 	// fetch task data
 	if (tid < sizeof(shared.task) / sizeof(int))
-		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
+		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1)))[tid];
 	if (tid == 0)
 	{
-		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (max_order + 1) * 32;
-		shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) * partCount;
+		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
+		shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 
-	// add up autocorrelation parts
+	if (get_local_id(0) <= MAX_ORDER)
+		shared.autoc[get_local_id(0)] = autoc[shared.autocOffs + get_local_id(0)];
+	if (get_local_id(0) + get_local_size(0) <= MAX_ORDER)
+		shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[shared.autocOffs + get_local_id(0) + get_local_size(0)];
 
-	// for (int order = get_local_id(0); order <= max_order; order += 32)
-	// {
-	//float sum = 0.0f;
-	//for (int pos = 0; pos < partCount; pos++)
-	//	sum += autoc[shared.autocOffs + pos * (max_order + 1) + order];
-	//shared.autoc[order] = sum;
-	// }
-
-	for (int order = 0; order <= max_order; order ++)
-	{
-		float part = 0.0f;
-		for (int pos = get_local_id(0); pos < partCount; pos += get_local_size(0))
-			part += autoc[shared.autocOffs + pos * (max_order + 1) + order];
-		shared.parts[tid] = part;
-		barrier(CLK_LOCAL_MEM_FENCE);
-		for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
-		{
-			if (get_local_id(0) < l)
-				shared.parts[tid] += shared.parts[tid + l];
-			barrier(CLK_LOCAL_MEM_FENCE);
-		}
-		if (get_local_id(0) == 0)
-			shared.autoc[order] = shared.parts[tid] + shared.parts[tid + 1];
-	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	// Compute LPC using Schur and Levinson-Durbin recursion
@@ -352,19 +242,19 @@
 	shared.ldr[get_local_id(0)] = 0.0f;
 	float error = shared.autoc[0];
 	barrier(CLK_LOCAL_MEM_FENCE);
-	for (int order = 0; order < max_order; order++)
+	for (int order = 0; order < MAX_ORDER; order++)
 	{
 		// Schur recursion
 		float reff = -shared.gen1[0] / error;
 		error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
 		float gen1;
-		if (get_local_id(0) < max_order - 1 - order)
+		if (get_local_id(0) < MAX_ORDER - 1 - order)
 		{
 			gen1 = shared.gen1[get_local_id(0) + 1] + reff * gen0;
 			gen0 += shared.gen1[get_local_id(0) + 1] * reff;
 		}
 		barrier(CLK_LOCAL_MEM_FENCE);
-		if (get_local_id(0) < max_order - 1 - order)
+		if (get_local_id(0) < MAX_ORDER - 1 - order)
 			shared.gen1[get_local_id(0)] = gen1;
 
 		// Store prediction error
@@ -385,118 +275,16 @@
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	// Output prediction error estimates
-	if (get_local_id(0) < max_order)
-		lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
+	if (get_local_id(0) < MAX_ORDER)
+		lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
 }
 
-//__kernel void cudaComputeLPCLattice(
-//	FLACCLSubframeTask *tasks,
-//	const int taskCount, // tasks per block
-//	const int *samples,
-//	const int windowCount,
-//	const int max_order, // should be <= 12
-//	float*lpcs
-//)
-//{
-//	__local struct {
-//		volatile FLACCLSubframeData task;
-//		volatile float F[512];
-//		volatile float arp[32];
-//		volatile float tmp[256];
-//		volatile float error[32];
-//		volatile int lpcOffs;
-//	} shared;
-//
-//	// fetch task data
-//	if (get_local_id(0) < sizeof(shared.task) / sizeof(int))
-//		((int*)&shared.task)[get_local_id(0)] = ((int*)(tasks + taskCount * get_group_id(1)))[get_local_id(0)];
-//	if (get_local_id(0) == 0)
-//		shared.lpcOffs = __mul24(__mul24(get_group_id(1) + 1, windowCount) - 1, max_order + 1) * 32;
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	// F = samples; B = samples
-//	float s1 = get_local_id(0) < shared.task.blocksize ? (samples[shared.task.samplesOffs + get_local_id(0)]) / 32768.0f : 0.0f;
-//	float s2 = get_local_id(0) + 256 < shared.task.blocksize ? (samples[shared.task.samplesOffs + get_local_id(0) + 256]) / 32768.0f : 0.0f;
-//	shared.F[get_local_id(0)] = s1;
-//	shared.F[get_local_id(0) + 256] = s2;
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	shared.tmp[get_local_id(0)] = FSQR(s1) + FSQR(s2);
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	SUM256(shared.tmp, get_local_id(0), +=);
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	float DEN = shared.tmp[0];
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	for (int order = 0; order < max_order; order++)
-//	{
-//		// reff = F(order+1:frameSize) * B(1:frameSize-order)' / DEN
-//		int idxF = get_local_id(0) + order + 1;
-//		int idxF2 = idxF + 256;
-//
-//		shared.tmp[get_local_id(0)] = idxF < shared.task.blocksize ? shared.F[idxF] * s1 : 0.0f;
-//		shared.tmp[get_local_id(0)] += idxF2 < shared.task.blocksize ? shared.F[idxF2] * s2 : 0.0f;
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//		SUM256(shared.tmp, get_local_id(0), +=);
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//		float reff = shared.tmp[0] / DEN;
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//
-//		// arp(order) = rc(order) = reff
-//		if (get_local_id(0) == 0)
-//			shared.arp[order] = reff;
-//			//shared.rc[order - 1] = shared.lpc[order - 1][order - 1] = reff;
-//
-//		// Levinson-Durbin recursion
-//		// arp(1:order-1) = arp(1:order-1) - reff * arp(order-1:-1:1)
-//		if (get_local_id(0) < order)
-//			shared.arp[get_local_id(0)] = shared.arp[get_local_id(0)] - reff * shared.arp[order - 1 - get_local_id(0)];
-//
-//		// Output coeffs
-//		if (get_local_id(0) <= order)
-//			lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = shared.arp[order - get_local_id(0)];
-//
-//		// F1 = F(order+1:frameSize) - reff * B(1:frameSize-order)
-//		// B(1:frameSize-order) = B(1:frameSize-order) - reff * F(order+1:frameSize)
-//		// F(order+1:frameSize) = F1
-//		if (idxF < shared.task.blocksize)
-//		{
-//			float f1 = shared.F[idxF];
-//			shared.F[idxF] -= reff * s1;
-//			s1 -= reff * f1;
-//		}
-//		if (idxF2 < shared.task.blocksize)
-//		{
-//			float f2 = shared.F[idxF2];
-//			shared.F[idxF2] -= reff * s2;
-//			s2 -= reff * f2;
-//		}
-//
-//		// DEN = F(order+1:frameSize) * F(order+1:frameSize)' + B(1:frameSize-order) * B(1:frameSize-order)' (BURG)
-//		shared.tmp[get_local_id(0)] = (idxF + 1 < shared.task.blocksize ? FSQR(shared.F[idxF]) + FSQR(s1) : 0);
-//		shared.tmp[get_local_id(0)] += (idxF2 + 1 < shared.task.blocksize ? FSQR(shared.F[idxF2]) + FSQR(s2) : 0);
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//		SUM256(shared.tmp, get_local_id(0), +=);
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//		DEN = shared.tmp[0] / 2;
-//		// shared.PE[order-1] = shared.tmp[0] / 2 / (frameSize - order + 1);
-//		if (get_local_id(0) == 0)
-//			shared.error[order] = DEN / (shared.task.blocksize - order);
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//	}
-//
-//	// Output prediction error estimates
-//	if (get_local_id(0) < max_order)
-//		lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
-//}
-
 __kernel __attribute__((reqd_work_group_size(32, 4, 1)))
 void cudaQuantizeLPC(
 	__global FLACCLSubframeTask *tasks,
+	__global float*lpcs,
 	int taskCount, // tasks per block
 	int taskCountLPC, // tasks per set of coeffs (<= 32)
-	__global float*lpcs,
-	int max_order, // should be <= 32
 	int minprecision,
 	int precisions
 )
@@ -515,32 +303,30 @@
 	if (tid < sizeof(shared.task) / sizeof(int))
 		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
 	if (tid == 0)
-		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) * 32;
+		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
 	barrier(CLK_LOCAL_MEM_FENCE);
 
+	// Select best orders based on Akaike's Criteria
 	if (get_local_id(1) == 0)
 	{
-		shared.index[get_local_id(0)] = min(max_order - 1, get_local_id(0));
-		shared.error[get_local_id(0)] = shared.task.blocksize * 64 + get_local_id(0);
-		shared.index[32 + get_local_id(0)] = min(max_order - 1, get_local_id(0));
-		shared.error[32 + get_local_id(0)] = shared.task.blocksize * 64 + get_local_id(0);
-
-		// Select best orders based on Akaike's Criteria
+		shared.index[tid] = min(MAX_ORDER - 1, tid);
+		shared.error[tid] = shared.task.blocksize * 64 + tid;
+		shared.index[32 + tid] = min(MAX_ORDER - 1, tid);
+		shared.error[32 + tid] = shared.task.blocksize * 64 + tid;
 
 		// Load prediction error estimates
-		if (get_local_id(0) < max_order)
-			shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)]) + get_local_id(0) * 5.12f * log(shared.task.blocksize);
-			//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
+		if (tid < MAX_ORDER)
+			shared.error[tid] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + tid]) + tid * 5.12f * log(shared.task.blocksize);
+			//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
 	}
-
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	// Sort using bitonic sort
 	for(int size = 2; size < 64; size <<= 1){
 		//Bitonic merge
-		int ddd = (get_local_id(0) & (size / 2)) == 0;
+		int ddd = (tid & (size / 2)) == 0;
 		for(int stride = size / 2; stride > 0; stride >>= 1){
-			int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+			int pos = 2 * tid - (tid & (stride - 1));
 			float e0, e1;
 			int i0, i1;
 			if (get_local_id(1) == 0)
@@ -551,7 +337,7 @@
 			{
 				e0 = shared.error[pos];
 				e1 = shared.error[pos + stride];
 				i0 = shared.index[pos];
 				i1 = shared.index[pos + stride];
 			}
 			barrier(CLK_LOCAL_MEM_FENCE);
-			if ((e0 >= e1) == ddd && get_local_id(1) == 0)
+			if (get_local_id(1) == 0 && (e0 >= e1) == ddd)
 			{
 				shared.error[pos] = e1;
 				shared.error[pos + stride] = e0;
@@ -566,7 +352,7 @@
 	{
 		for(int stride = 32; stride > 0; stride >>= 1){
 			//barrier(CLK_LOCAL_MEM_FENCE);
-			int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+			int pos = 2 * tid - (tid & (stride - 1));
 			float e0, e1;
 			int i0, i1;
 			if (get_local_id(1) == 0)
@@ -577,7 +363,7 @@
 			{
 				e0 = shared.error[pos];
 				e1 = shared.error[pos + stride];
 				i0 = shared.index[pos];
 				i1 = shared.index[pos + stride];
 			}
 			barrier(CLK_LOCAL_MEM_FENCE);
-			if (e0 >= e1 && get_local_id(1) == 0)
+			if (get_local_id(1) == 0 && e0 >= e1)
 			{
 				shared.error[pos] = e1;
 				shared.error[pos + stride] = e0;
@@ -653,21 +439,21 @@
 	}
 }
 
-__kernel __attribute__(( vec_type_hint (int4)))
+__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaEstimateResidual(
 	__global int*output,
 	__global int*samples,
 	__global FLACCLSubframeTask *tasks
 )
 {
-	__local float data[128 * 2];
-	__local int residual[128];
+	__local float data[GROUP_SIZE * 2];
+	__local int residual[GROUP_SIZE];
 	__local FLACCLSubframeTask task;
 	__local float4 coefsf4[8];
 	const int tid = get_local_id(0);
 	if (tid < sizeof(task)/sizeof(int))
-		((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
+		((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
 	barrier(CLK_GLOBAL_MEM_FENCE);
 
 	int ro = task.data.residualOrder;
@@ -677,11 +463,11 @@
 	if (tid < 32)
 		((__local float *)&coefsf4[0])[tid] = select(0.0f, ((float)task.coefs[tid]) / (1 << task.data.shift), tid < ro);
 	data[tid] = tid < bs ? (float)(samples[task.data.samplesOffs + tid] >> task.data.wbits) : 0.0f;
-	for (int pos = 0; pos < bs; pos += get_local_size(0))
+	for (int pos = 0; pos < bs; pos += GROUP_SIZE)
 	{
 		// fetch samples
-		float nextData = pos + tid + get_local_size(0) < bs ? (float)(samples[task.data.samplesOffs + pos + tid + get_local_size(0)] >> task.data.wbits) : 0.0f;
-		data[tid + get_local_size(0)] = nextData;
+		float nextData = pos + tid + GROUP_SIZE < bs ? (float)(samples[task.data.samplesOffs + pos + tid + GROUP_SIZE] >> task.data.wbits) : 0.0f;
+		data[tid + GROUP_SIZE] = nextData;
 		barrier(CLK_LOCAL_MEM_FENCE);
 
 		// compute residual
@@ -718,19 +504,19 @@
 		data[tid] = nextData;
 	}
 
-	int residualLen = (bs - ro) / get_local_size(0) + select(0, 1, tid < (bs - ro) % get_local_size(0));
+	int residualLen = (bs - ro) / GROUP_SIZE + select(0, 1, tid < (bs - ro) % GROUP_SIZE);
 	int k = clamp(convert_int_rtn(log2((res + 0.000001f) / (residualLen + 0.000001f))), 0, 14);
 	residual[tid] = residualLen * (k + 1) + (convert_int_rtz(res) >> k);
 	barrier(CLK_LOCAL_MEM_FENCE);
-	for (int l = get_local_size(0) / 2; l > 0; l >>= 1)
+	for (int l = GROUP_SIZE / 2; l > 0; l >>= 1)
 	{
 		if (tid < l)
 			residual[tid] += residual[tid + l];
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
 	if (tid == 0)
-		output[get_group_id(1)] = residual[0];
+		output[get_group_id(0)] = residual[0];
 }
 
 __kernel void cudaChooseBestMethod(