diff --git a/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj b/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj
index 8e31c17..ff9d027 100644
--- a/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj
+++ b/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj
@@ -65,6 +65,9 @@
     </ProjectReference>
   </ItemGroup>
   <ItemGroup>
+    <Content Include="flaccpu.cl">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </Content>
     <None Include="flac.cu">
     </None>
     <Content Include="flac.cl">
diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
index 5b38d73..0626210 100644
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -33,7 +33,14 @@ namespace CUETools.Codecs.FLACCL
 {
 	public class FLACCLWriterSettings
 	{
-		public FLACCLWriterSettings() { DoVerify = false; GPUOnly = true; DoMD5 = true; GroupSize = 64; }
+		public FLACCLWriterSettings() 
+		{ 
+			this.DoVerify = false; 
+			this.GPUOnly = true; 
+			this.DoMD5 = true; 
+			this.GroupSize = 64;
+			this.DeviceType = OpenCLDeviceType.GPU;
+		}
 
 		[DefaultValue(false)]
 		[DisplayName("Verify")]
@@ -56,6 +63,13 @@ namespace CUETools.Codecs.FLACCL
 		[SRDescription(typeof(Properties.Resources), "DescriptionDefines")]
 		public string Defines { get; set; }
 
+		[SRDescription(typeof(Properties.Resources), "DescriptionPlatform")]
+		public string Platform { get; set; }
+
+		[DefaultValue(OpenCLDeviceType.GPU)]
+		[SRDescription(typeof(Properties.Resources), "DescriptionDeviceType")]
+		public OpenCLDeviceType DeviceType { get; set; }
+
 		int cpu_threads = 1;
 		[DefaultValue(1)]
 		[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
@@ -74,6 +88,12 @@ namespace CUETools.Codecs.FLACCL
 		}
 	}
 
+	public enum OpenCLDeviceType : ulong
+	{
+		CPU = DeviceType.CPU,
+		GPU = DeviceType.GPU
+	}
+
 	[AudioEncoderClass("FLACCL", "flac", true, "0 1 2 3 4 5 6 7 8 9 10 11", "8", 2, typeof(FLACCLWriterSettings))]
 	//[AudioEncoderClass("FLACCL nonsub", "flac", true, "9 10 11", "9", 1, typeof(FLACCLWriterSettings))]
 	public class FLACCLWriter : IAudioDest
@@ -223,6 +243,11 @@ namespace CUETools.Codecs.FLACCL
 				if (value as FLACCLWriterSettings == null)
 					throw new Exception("Unsupported options " + value);
 				_settings = value as FLACCLWriterSettings;
+				if (_settings.DeviceType == OpenCLDeviceType.CPU)
+				{
+					_settings.GroupSize = 1;
+					_settings.GPUOnly = false;
+				}
 				eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
 			}
 		}
@@ -449,7 +474,7 @@ namespace CUETools.Codecs.FLACCL
 			}
 			set
 			{
-				if (value < 0 || value > eparams.max_fixed_order)
+				if (value < 0 || value > 4)
 					throw new Exception("invalid MinFixedOrder " + value.ToString());
 				eparams.min_fixed_order = value;
 			}
@@ -463,7 +488,7 @@ namespace CUETools.Codecs.FLACCL
 			}
 			set
 			{
-				if (value > 4 || value < eparams.min_fixed_order)
+				if (value > 4 || value < 0)
 					throw new Exception("invalid MaxFixedOrder " + value.ToString());
 				eparams.max_fixed_order = value;
 			}
@@ -963,7 +988,7 @@ namespace CUETools.Codecs.FLACCL
 
 			task.nResidualTasks = 0;
 			task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window);
-			task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order;
+			task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + (eparams.do_constant ? 1 : 0) + Math.Max(0, 1 + eparams.max_fixed_order - eparams.min_fixed_order);
 			//if (task.nResidualTasksPerChannel >= 4)
 			//    task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7;
 			for (int iFrame = 0; iFrame < nFrames; iFrame++)
@@ -1444,7 +1469,7 @@ namespace CUETools.Codecs.FLACCL
 				if (OpenCL.NumberOfPlatforms < 1)
 					throw new Exception("no opencl platforms found");
 
-				int groupSize = _settings.GroupSize;
+				int groupSize = _settings.DeviceType == OpenCLDeviceType.CPU ? 1 : _settings.GroupSize;
 				OCLMan = new OpenCLManager();
 				// Attempt to save binaries after compilation, as well as load precompiled binaries
 				// to avoid compilation. Usually you'll want this to be true. 
@@ -1475,12 +1500,30 @@ namespace CUETools.Codecs.FLACCL
 				OCLMan.BuildOptions = "";
 				OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location);
 				OCLMan.BinaryPath = System.IO.Path.Combine(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "CUE Tools"), "OpenCL");
-				OCLMan.CreateDefaultContext(0, DeviceType.GPU);
+				int platformId = 0;
+				if (_settings.Platform != null)
+				{
+					platformId = -1;
+					string platforms = "";
+					for (int i = 0; i < OpenCL.NumberOfPlatforms; i++)
+					{
+						var platform = OpenCL.GetPlatform(i);
+						platforms += " \"" + platform.Name + "\"";
+						if (platform.Name.Equals(_settings.Platform, StringComparison.InvariantCultureIgnoreCase))
+						{
+							platformId = i;
+							break;
+						}
+					}
+					if (platformId < 0)
+						throw new Exception("unknown platform \"" + _settings.Platform + "\". Platforms available:" + platforms);
+				}
+				OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
 
 				openCLContext = OCLMan.Context;
 				try
 				{
-					openCLProgram = OCLMan.CompileFile("flac.cl");
+					openCLProgram = OCLMan.CompileFile(_settings.DeviceType == OpenCLDeviceType.CPU ? "flaccpu.cl" : "flac.cl");
 				}
 				catch (OpenCLBuildException ex)
 				{
@@ -2218,13 +2261,12 @@ namespace CUETools.Codecs.FLACCL
 			this.channelsCount = channelsCount;
 			this.writer = writer;
 			openCLProgram = _openCLProgram;
-			Device[] openCLDevices = openCLProgram.Context.Platform.QueryDevices(DeviceType.GPU);
 #if DEBUG
 			var prop = CommandQueueProperties.PROFILING_ENABLE;
 #else
 			var prop = CommandQueueProperties.NONE;
 #endif
-			openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], prop);
+			openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLProgram.Context.Devices[0], prop);
 
 			residualTasksLen = sizeof(FLACCLSubframeTask) * channelsCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames;
 			bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * FLACCLWriter.maxFrames;
@@ -2260,12 +2302,15 @@ namespace CUETools.Codecs.FLACCL
 			clChooseBestMethod = openCLProgram.CreateKernel("clChooseBestMethod");
 			clCopyBestMethod = openCLProgram.CreateKernel("clCopyBestMethod");
 			clCopyBestMethodStereo = openCLProgram.CreateKernel("clCopyBestMethodStereo");
-			clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
-			clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
-			clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
-			clSumPartition = openCLProgram.CreateKernel("clSumPartition");
-			clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
-			clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
+			if (writer._settings.GPUOnly)
+			{
+				clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
+				clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
+				clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
+				clSumPartition = openCLProgram.CreateKernel("clSumPartition");
+				clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
+				clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
+			}
 
 			samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelsCount];
 			outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1];
@@ -2304,12 +2349,15 @@ namespace CUETools.Codecs.FLACCL
 			clChooseBestMethod.Dispose();
 			clCopyBestMethod.Dispose();
 			clCopyBestMethodStereo.Dispose();
-			clEncodeResidual.Dispose();
-			clCalcPartition.Dispose();
-			clCalcPartition16.Dispose();
-			clSumPartition.Dispose();
-			clFindRiceParameter.Dispose();
-			clFindPartitionOrder.Dispose();
+			if (writer._settings.GPUOnly)
+			{
+				clEncodeResidual.Dispose();
+				clCalcPartition.Dispose();
+				clCalcPartition16.Dispose();
+				clSumPartition.Dispose();
+				clFindRiceParameter.Dispose();
+				clFindPartitionOrder.Dispose();
+			}
 
 			clSamples.Dispose();
 			clSamplesBytes.Dispose();
@@ -2390,15 +2438,13 @@ namespace CUETools.Codecs.FLACCL
 				nWindowFunctions);
 
 			clComputeLPC.SetArgs(
-				clResidualTasks,
 				clAutocorOutput,
 				clLPCData,
-				nResidualTasksPerChannel,
 				nWindowFunctions);
 
 			openCLCQ.EnqueueNDRangeKernel(
 				clComputeLPC,
-				32, 1,
+				Math.Min(groupSize, 32), 1,
 				nWindowFunctions,
 				channelsCount * frameCount);
 
@@ -2412,7 +2458,7 @@ namespace CUETools.Codecs.FLACCL
 
 			openCLCQ.EnqueueNDRangeKernel(
 				clQuantizeLPC,
-				32, 1,
+				Math.Min(groupSize, 32), 1,
 				nWindowFunctions,
 				channelsCount * frameCount);
 
@@ -2433,7 +2479,7 @@ namespace CUETools.Codecs.FLACCL
 
 			openCLCQ.EnqueueNDRangeKernel(
 				clChooseBestMethod,
-				32, channelsCount * frameCount);
+				Math.Min(groupSize, 32), channelsCount * frameCount);
 
 			if (channels == 2 && channelsCount == 4)
 			{
@@ -2444,7 +2490,7 @@ namespace CUETools.Codecs.FLACCL
 
 				openCLCQ.EnqueueNDRangeKernel(
 					clCopyBestMethodStereo,
-					64, frameCount);
+					Math.Min(groupSize, 64), frameCount);
 			}
 			else
 			{
@@ -2455,7 +2501,7 @@ namespace CUETools.Codecs.FLACCL
 
 				openCLCQ.EnqueueNDRangeKernel(
 					clCopyBestMethod,
-					64, channels * frameCount);
+					Math.Min(groupSize, 64), channels * frameCount);
 			}
 
 			if (writer._settings.GPUOnly)
diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl
index 6d446ba..015761c 100644
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -1,6 +1,6 @@
 /**
  * CUETools.FLACCL: FLAC audio encoder using OpenCL
- * Copyright (c) 2009 Gregory S. Chudov
+ * Copyright (c) 2010 Gregory S. Chudov
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -24,8 +24,6 @@
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #endif
 
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-
 //#pragma OPENCL EXTENSION cl_amd_fp64 : enable
 
 typedef enum
@@ -125,7 +123,7 @@ void clFindWastedBits(
     barrier(CLK_LOCAL_MEM_FENCE);
 
     int w = 0, a = 0;
-    for (int pos = tid; pos + tid < task.blocksize; pos += GROUP_SIZE)
+    for (int pos = tid; pos < task.blocksize; pos += GROUP_SIZE)
     {
 	int smp = samples[task.samplesOffs + pos];
 	w |= smp;
@@ -213,38 +211,25 @@ void clComputeAutocor(
 
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
 void clComputeLPC(
-    __global FLACCLSubframeTask *tasks,
     __global float *autoc,
     __global float *lpcs,
-    int taskCount, // tasks per block
     int windowCount
 )
 {
     __local struct {
-	FLACCLSubframeData task;
 	volatile float ldr[32];
 	volatile float gen1[32];
 	volatile float error[32];
 	volatile float autoc[33];
-	volatile int lpcOffs;
-	volatile int autocOffs;
     } shared;
-    const int tid = get_local_id(0);// + get_local_id(1) * 32;
-    
-    // fetch task data
-    if (tid < sizeof(shared.task) / sizeof(int))
-	((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1)))[tid];
-    if (tid == 0)
-    {
-	shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
-	shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
+    const int tid = get_local_id(0);// + get_local_id(1) * 32;    
+    int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
+    int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
     
     if (get_local_id(0) <= MAX_ORDER)
-	shared.autoc[get_local_id(0)] = autoc[shared.autocOffs + get_local_id(0)];
+	shared.autoc[get_local_id(0)] = autoc[autocOffs + get_local_id(0)];
     if (get_local_id(0) + get_local_size(0) <= MAX_ORDER)
-	shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[shared.autocOffs + get_local_id(0) + get_local_size(0)];
+	shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[autocOffs + get_local_id(0) + get_local_size(0)];
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -296,14 +281,14 @@ void clComputeLPC(
 
 	// Output coeffs
 	if (get_local_id(0) <= order)
-	    lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)];
+	    lpcs[lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)];
 	//if (get_local_id(0) <= order + 1 && fabs(-shared.ldr[0]) > 3000)
 	//    printf("coef[%d] == %f, autoc == %f, error == %f\n", get_local_id(0), -shared.ldr[order - get_local_id(0)], shared.autoc[get_local_id(0)], shared.error[get_local_id(0)]);
     }
     barrier(CLK_LOCAL_MEM_FENCE);
     // Output prediction error estimates
     if (get_local_id(0) < MAX_ORDER)
-	lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
+	lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
 }
 
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
@@ -318,9 +303,10 @@ void clQuantizeLPC(
 {
     __local struct {
 	FLACCLSubframeData task;
-	volatile int tmpi[32];
 	volatile int index[64];
 	volatile float error[64];
+	volatile int maxcoef[32];
+	volatile int maxcoef2[32];
 	volatile int lpcOffs;
     } shared;
 
@@ -338,6 +324,8 @@ void clQuantizeLPC(
     shared.error[tid] = shared.task.blocksize * 64 + tid;
     shared.index[32 + tid] = MAX_ORDER - 1;
     shared.error[32 + tid] = shared.task.blocksize * 64 + tid + 32;
+    shared.maxcoef[tid] = 0;
+    shared.maxcoef2[tid] = 0;
 
     // Load prediction error estimates
     if (tid < MAX_ORDER)
@@ -399,21 +387,14 @@ void clQuantizeLPC(
 	// get 15 bits of each coeff
 	int coef = convert_int_rte(lpc * (1 << 15));
 	// remove sign bits
-	shared.tmpi[tid] = coef ^ (coef >> 31);
+	atomic_or(shared.maxcoef + i, coef ^ (coef >> 31));
 	barrier(CLK_LOCAL_MEM_FENCE);
-	// OR reduction
-	for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
-	{
-	    if (tid < l)
-		shared.tmpi[tid] |= shared.tmpi[tid + l];
-	    barrier(CLK_LOCAL_MEM_FENCE);
-	}
 	//SUM32(shared.tmpi,tid,|=);
 	// choose precision	
 	//int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); //  - convert_int_rte(shared.PE[order - 1])
 	int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits));
 	// calculate shift based on precision and number of leading zeroes in coeffs
-	int shift = max(0,min(15, clz(shared.tmpi[0] | shared.tmpi[1]) - 18 + cbits));
+	int shift = max(0,min(15, clz(shared.maxcoef[i]) - 18 + cbits));
 
 	//cbits = 13;
 	//shift = 15;
@@ -426,18 +407,10 @@ void clQuantizeLPC(
 	//shared.tmp[tid] = (tid != 0) * (shared.arp[tid - 1]*(1 << shared.task.shift) - shared.task.coefs[tid - 1]);
 	//shared.task.coefs[tid] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, convert_int_rte((shared.arp[tid]) * (1 << shared.task.shift) + shared.tmp[tid])));
 	// remove sign bits
-	shared.tmpi[tid] = coef ^ (coef >> 31);
+	atomic_or(shared.maxcoef2 + i, coef ^ (coef >> 31));
 	barrier(CLK_LOCAL_MEM_FENCE);
-	// OR reduction
-	for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
-	{
-	    if (tid < l)
-		shared.tmpi[tid] |= shared.tmpi[tid + l];
-	    barrier(CLK_LOCAL_MEM_FENCE);
-	}
-	//SUM32(shared.tmpi,tid,|=);
 	// calculate actual number of bits (+1 for sign)
-	cbits = 1 + 32 - clz(shared.tmpi[0] | shared.tmpi[1]);
+	cbits = 1 + 32 - clz(shared.maxcoef2[i]);
 
 	// output shift, cbits and output coeffs
 	int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
@@ -452,10 +425,6 @@ void clQuantizeLPC(
     }
 }
 
-#ifndef PARTORDER
-#define PARTORDER 4
-#endif
-
 __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void clEstimateResidual(
     __global int*output,
@@ -463,10 +432,10 @@ void clEstimateResidual(
     __global FLACCLSubframeTask *tasks
     )
 {
-    __local int data[GROUP_SIZE * 2];
+    __local float data[GROUP_SIZE * 2];
     __local FLACCLSubframeTask task;
-    __local int residual[GROUP_SIZE];
-    __local int len[GROUP_SIZE >> PARTORDER];
+    __local int psum[64];
+    __local float fcoef[32];
 
     const int tid = get_local_id(0);
     if (tid < sizeof(task)/sizeof(int))
@@ -476,101 +445,74 @@ void clEstimateResidual(
     int ro = task.data.residualOrder;
     int bs = task.data.blocksize;
 
-    if (tid < 32 && tid >= ro)
-	task.coefs[tid] = 0;
-    if (tid < (GROUP_SIZE >> PARTORDER))
-	len[tid] = 0;
-    data[tid] = 0;
+    if (tid < 32)
+	fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
+	//fcoef[tid] = select(0.0f, - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift), tid + ro >= MAX_ORDER && tid < MAX_ORDER);
+    if (tid < 64)
+	psum[tid] = 0;
+    data[tid] = 0.0f;
+
+    int partOrder = clz(64) - clz(bs - 1) + 1;
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    __local int4 * cptr = (__local int4 *)&task.coefs[0];
-    int4 cptr0 = cptr[0];
-#if MAX_ORDER > 4
-    int4 cptr1 = cptr[1];
+    float4 cptr0 = vload4(0, &fcoef[0]);
+    float4 cptr1 = vload4(1, &fcoef[0]);
 #if MAX_ORDER > 8
-    int4 cptr2 = cptr[2];
+    float4 cptr2 = vload4(2, &fcoef[0]);
 #endif
-#endif
-
     for (int pos = 0; pos < bs; pos += GROUP_SIZE)
     {
 	// fetch samples
 	int offs = pos + tid;
-	int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0;
+	float nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0.0f;
 	data[tid + GROUP_SIZE] = nextData;
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	// compute residual
-	__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
-	int4 sum = dptr[0] * cptr0
-#if MAX_ORDER > 4
-	    + dptr[1] * cptr1
+	__local float* dptr = &data[tid + GROUP_SIZE - ro];
+	float4 sum = cptr0 * vload4(0, dptr)
+	    + cptr1 * vload4(1, dptr)
 #if MAX_ORDER > 8
-	    + dptr[2] * cptr2
-#if MAX_ORDER > 12
-	    + dptr[3] * cptr[3]
-#if MAX_ORDER > 16
-	    + dptr[4] * cptr[4]
-	    + dptr[5] * cptr[5]
-	    + dptr[6] * cptr[6]
-	    + dptr[7] * cptr[7]
-#endif
-#endif
-#endif
+	    + cptr2 * vload4(2, dptr)
+  #if MAX_ORDER > 12
+	    + vload4(3, &fcoef[0]) * vload4(3, dptr)
+    #if MAX_ORDER > 16
+	    + vload4(4, &fcoef[0]) * vload4(4, dptr)
+	    + vload4(5, &fcoef[0]) * vload4(5, dptr)
+	    + vload4(6, &fcoef[0]) * vload4(6, dptr)
+	    + vload4(7, &fcoef[0]) * vload4(7, dptr)
+    #endif
+  #endif
 #endif
 	    ;
-    	
-	int t = nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift);
+
+	int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w);
+	barrier(CLK_LOCAL_MEM_FENCE);
+	data[tid] = nextData;
 	// ensure we're within frame bounds
 	t = select(0, t, offs >= ro && offs < bs);
 	// overflow protection
 	t = clamp(t, -0x7fffff, 0x7fffff);
 	// convert to unsigned
-	residual[tid] = (t << 1) ^ (t >> 31);
-	barrier(CLK_LOCAL_MEM_FENCE);
-	data[tid] = nextData;
-
-	// calculate rice partition bit length for every 16 samples
-	if (tid < (GROUP_SIZE >> PARTORDER))
-	{
-	    //__local int4 * chunk = (__local int4 *)&residual[tid << PARTORDER];
-	    __local int4 * chunk = ((__local int4 *)residual) + (tid << (PARTORDER - 2));
-#if PARTORDER == 3
-	    int4 sum = chunk[0] + chunk[1];
-#elif PARTORDER == 4
-	    int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3]; // [0 .. (1 << (PARTORDER - 2)) - 1]
-#elif PARTORDER == 5
-	    int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3] + chunk[4] + chunk[5] + chunk[6] + chunk[7];
-#else
-#error Invalid PARTORDER
-#endif
-	    int res = sum.x + sum.y + sum.z + sum.w;
-	    int k = clamp(clz(1 << PARTORDER) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
-#ifdef EXTRAMODE
-#if PARTORDER == 3
-	    sum = (chunk[0] >> k) + (chunk[1] >> k);
-#elif PARTORDER == 4
-	    sum = (chunk[0] >> k) + (chunk[1] >> k) + (chunk[2] >> k) + (chunk[3] >> k);
-#else
-#error Invalid PARTORDER
-#endif
-	    len[tid] += (k << PARTORDER) + sum.x + sum.y + sum.z + sum.w;
-#else
-	    len[tid] += (k << PARTORDER) + (res >> k);
-#endif
-	}
+	atomic_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31));
     }
 
+    // calculate rice partition bit length for every (1 << partOrder) samples
+    if (tid < 64)
+    {
+	int k = clamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
+	psum[tid] = (k << partOrder) + (psum[tid] >> k);
+    }
     barrier(CLK_LOCAL_MEM_FENCE);
-    for (int l = GROUP_SIZE >> (PARTORDER + 1); l > 0; l >>= 1)
+    for (int l = 32; l > 0; l >>= 1)
     {
 	if (tid < l)
-	    len[tid] += len[tid + l];
+	    psum[tid] += psum[tid + l];
 	barrier(CLK_LOCAL_MEM_FENCE);
     }
     if (tid == 0)
-	output[get_group_id(0)] = len[0] + (bs - ro);
+	output[get_group_id(0)] = psum[0] + (bs - ro);
 }
 
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
@@ -580,15 +522,16 @@ void clChooseBestMethod(
     int taskCount
     )
 {
-    __local struct {
-	volatile int index[32];
-	volatile int length[32];
-    } shared;
+    int best_length = 0x7fffffff;
+    int best_index = 0;
+    __local int partLen[32];
     __local FLACCLSubframeData task;
     const int tid = get_local_id(0);
     
-    shared.length[tid] = 0x7fffffff;
-    shared.index[tid] = tid;
+    // fetch part sum
+    if (tid < taskCount)
+	partLen[tid] = residual[tid + taskCount * get_group_id(0)];
+    barrier(CLK_LOCAL_MEM_FENCE);
     for (int taskNo = 0; taskNo < taskCount; taskNo++)
     {
 	// fetch task data
@@ -599,46 +542,27 @@ void clChooseBestMethod(
 
 	if (tid == 0)
 	{
-	    // fetch part sum
-	    int partLen = residual[taskNo + taskCount * get_group_id(0)];
-	    //// calculate part size
-	    //int residualLen = task[get_local_id(1)].data.blocksize - task[get_local_id(1)].data.residualOrder;
-	    //residualLen = residualLen * (task[get_local_id(1)].data.type != Constant || psum != 0);
-	    //// calculate rice parameter
-	    //int k = max(0, min(14, convert_int_rtz(log2((psum + 0.000001f) / (residualLen + 0.000001f) + 0.5f))));
-	    //// calculate part bit length
-	    //int partLen = residualLen * (k + 1) + (psum >> k);
-
+	    int pl = partLen[taskNo];
 	    int obits = task.obits - task.wbits;
-	    shared.length[taskNo] =
-		min(obits * task.blocksize,
-		    task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + partLen :
-		    task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
-		    task.type == Constant ? obits * select(1, task.blocksize, partLen != task.blocksize - task.residualOrder) : 
+	    int len = min(obits * task.blocksize,
+		    task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + pl :
+		    task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + pl :
+		    task.type == Constant ? obits * select(1, task.blocksize, pl != task.blocksize - task.residualOrder) : 
 		    obits * task.blocksize);
+
+	    tasks[taskNo + taskCount * get_group_id(0)].data.size = len;
+	    if (len < best_length)
+	    {
+		best_length = len;
+		best_index = taskNo;
+	    }
 	}
 
 	barrier(CLK_LOCAL_MEM_FENCE);
     }
-    //shared.index[get_local_id(0)] = get_local_id(0);
-    //shared.length[get_local_id(0)] = (get_local_id(0) < taskCount) ? tasks[get_local_id(0) + taskCount * get_group_id(0)].size : 0x7fffffff;
 
-    if (tid < taskCount)
-	tasks[tid + taskCount * get_group_id(0)].data.size = shared.length[tid];
-
-    int l1 = shared.length[tid];
-    for (int l = 16; l > 0; l >>= 1)
-    {
-	if (tid < l)
-	{
-	    int l2 = shared.length[tid + l];
-	    shared.index[tid] = shared.index[tid + select(0, l, l2 < l1)];
-	    shared.length[tid] = l1 = min(l1, l2);
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-    }
     if (tid == 0)
-	tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + shared.index[0];
+	tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + best_index;
 }
 
 __kernel __attribute__((reqd_work_group_size(64, 1, 1)))
@@ -788,13 +712,13 @@ void clCalcPartition(
     int psize // == task.blocksize >> max_porder?
     )
 {
-    __local int pl[(GROUP_SIZE / 16)][15];
+    __local int pl[(GROUP_SIZE / 8)][15];
     __local FLACCLSubframeData task;
 
     const int tid = get_local_id(0);
     if (tid < sizeof(task) / sizeof(int))
 	((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
-    if (tid < (GROUP_SIZE / 16))
+    if (tid < (GROUP_SIZE / 8))
     {
 	for (int k = 0; k <= 14; k++)
 	    pl[tid][k] = 0;
@@ -807,13 +731,14 @@ void clCalcPartition(
     {
 	// fetch residual
 	int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0;
-	// convert to unsigned
+	// overflow protection
 	s = clamp(s, -0x7fffff, 0x7fffff);
+	// convert to unsigned
 	s = (s << 1) ^ (s >> 31);
 	// calc number of unary bits for each residual sample with each rice paramater
-	int part = (offs - start) / psize;
+	int part = (offs - start) / psize + (tid & 1) * (GROUP_SIZE / 16);
 	for (int k = 0; k <= 14; k++)
-	    atom_add(&pl[part][k], s >> k);
+	    atomic_add(&pl[part][k], s >> k);
 	    //pl[part][k] += s >> k;
     }   
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -825,7 +750,8 @@ void clCalcPartition(
 	{
 	    // output length
 	    const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
-	    partition_lengths[pos + part] = min(0x7fffff, pl[tid][k]) + select(psize, psize - task.residualOrder, part == 0) * (k + 1);
+	    int plen = pl[tid][k] + pl[tid + (GROUP_SIZE / 16)][k];
+	    partition_lengths[pos + part] = min(0x7fffff, plen) + (psize - select(0, task.residualOrder, part == 0)) * (k + 1);
 	 //   if (get_group_id(1) == 0)
 		//printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1));
 	}
@@ -911,7 +837,7 @@ void clCalcPartition16(
 	// convert to unsigned
 	res[tid] = (s << 1) ^ (s >> 31);
 
-	// for (int k = 0; k < 15; k++) atom_add(&pl[x][k], s >> k);
+	// for (int k = 0; k < 15; k++) atomic_add(&pl[x][k], s >> k);
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 	data[tid] = nextData;
@@ -1001,25 +927,23 @@ void clFindPartitionOrder(
     int max_porder
     )
 {
-    __local int partlen[9];
+    __local int partlen[16];
     __local FLACCLSubframeData task;
 
     const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder);
     if (get_local_id(0) < sizeof(task) / sizeof(int))
 	((__local int*)&task)[get_local_id(0)] = ((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)];
-    if (get_local_id(0) < 9)
+    if (get_local_id(0) < 16)
 	partlen[get_local_id(0)] = 0;
     barrier(CLK_LOCAL_MEM_FENCE);
 
     // fetch partition lengths
-    for (int offs = 0; offs < (2 << max_porder); offs += GROUP_SIZE)
+    int lim = (2 << max_porder) - 1;
+    for (int offs = get_local_id(0); offs < lim; offs += GROUP_SIZE)
     {
-	if (offs + get_local_id(0) < (2 << max_porder) - 1)
-	{
-	    int len = rice_parameters[pos + offs + get_local_id(0)];
-	    int porder = 31 - clz((2 << max_porder) - 1 - offs - get_local_id(0));
-	    atom_add(&partlen[porder], len);
-	}
+	int len = rice_parameters[pos + offs];
+	int porder = 31 - clz(lim - offs);
+	atomic_add(&partlen[porder], len);
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
diff --git a/CUETools.Codecs.FLACCL/flaccpu.cl b/CUETools.Codecs.FLACCL/flaccpu.cl
new file mode 100644
index 0000000..a075299
--- /dev/null
+++ b/CUETools.Codecs.FLACCL/flaccpu.cl
@@ -0,0 +1,492 @@
+/**
+ * CUETools.FLACCL: FLAC audio encoder using OpenCL
+ * Copyright (c) 2010 Gregory S. Chudov
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _FLACCL_KERNEL_H_
+#define _FLACCL_KERNEL_H_
+
+#ifdef DEBUG
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#endif
+
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+
+typedef enum
+{
+    Constant = 0,
+    Verbatim = 1,
+    Fixed = 8,
+    LPC = 32
+} SubframeType;
+
+typedef struct
+{
+    int residualOrder; // <= 32
+    int samplesOffs;
+    int shift;
+    int cbits;
+    int size;
+    int type;
+    int obits;
+    int blocksize;
+    int best_index;
+    int channel;
+    int residualOffs;
+    int wbits;
+    int abits;
+    int porder;
+    int reserved[2];
+} FLACCLSubframeData;
+
+typedef struct
+{
+    FLACCLSubframeData data;
+    int coefs[32]; // fixme: should be short?
+} FLACCLSubframeTask;
+
+__kernel void clStereoDecorr(
+    __global int *samples,
+    __global short2 *src,
+    int offset
+)
+{
+    int pos = get_global_id(0);
+    if (pos < offset)
+    {
+	short2 s = src[pos];
+	samples[pos] = s.x;
+	samples[1 * offset + pos] = s.y;
+	samples[2 * offset + pos] = (s.x + s.y) >> 1;
+	samples[3 * offset + pos] = s.x - s.y;
+    }
+}
+
+__kernel void clChannelDecorr2(
+    __global int *samples,
+    __global short2 *src,
+    int offset
+)
+{
+    int pos = get_global_id(0);
+    if (pos < offset)
+    {
+	short2 s = src[pos];
+	samples[pos] = s.x;
+	samples[1 * offset + pos] = s.y;
+    }
+}
+
+//__kernel void clChannelDecorr(
+//    int *samples,
+//    short *src,
+//    int offset
+//)
+//{
+//    int pos = get_global_id(0);
+//    if (pos < offset)
+//	samples[get_group_id(1) * offset + pos] = src[pos * get_num_groups(1) + get_group_id(1)];
+//}
+
+#define __ffs(a) (32 - clz(a & (-a)))
+//#define __ffs(a) (33 - clz(~a & (a - 1)))
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void clFindWastedBits(
+    __global FLACCLSubframeTask *tasks,
+    __global int *samples,
+    int tasksPerChannel
+)
+{
+    __global FLACCLSubframeTask* ptask = &tasks[get_group_id(0) * tasksPerChannel];
+    int w = 0, a = 0;
+    for (int pos = 0; pos < ptask->data.blocksize; pos ++)
+    {
+	int smp = samples[ptask->data.samplesOffs + pos];
+	w |= smp;
+	a |= smp ^ (smp >> 31);
+    }    
+    w = max(0,__ffs(w) - 1);
+    a = 32 - clz(a) - w;
+    for (int i = 0; i < tasksPerChannel; i++)
+    {
+	ptask[i].data.wbits = w;
+	ptask[i].data.abits = a;
+    }
+}
+
+// get_num_groups(0) == number of tasks
+// get_num_groups(1) == number of windows
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void clComputeAutocor(
+    __global float *output,
+    __global const int *samples,
+    __global const float *window,
+    __global FLACCLSubframeTask *tasks,
+    const int taskCount // tasks per block
+)
+{
+    FLACCLSubframeData task = tasks[get_group_id(0) * taskCount].data;
+    int len = task.blocksize;
+    int windowOffs = get_group_id(1) * len;
+    float data1[4096 + 32];
+
+    // TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32]
+
+    for (int tid = 0; tid < len; tid++)
+	data1[tid] = samples[task.samplesOffs + tid] * window[windowOffs + tid];
+    data1[len] = 0.0f;
+
+    for (int i = 0; i <= MAX_ORDER; ++i)
+    {
+	double temp = 1.0;
+	double temp2 = 1.0;
+	float* finish = data1 + len - i;
+
+	for (float* pdata = data1; pdata < finish; pdata += 2)
+	{
+	    temp += pdata[i] * pdata[0];
+	    temp2 += pdata[i + 1] * pdata[1];
+	}
+	output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + i] = temp + temp2;
+    }
+}
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void clComputeLPC(
+    __global float *pautoc,
+    __global float *lpcs,
+    int windowCount
+)
+{
+    int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
+    int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
+    volatile double ldr[32];
+    volatile double gen0[32];
+    volatile double gen1[32];
+    volatile double err[32];
+    __global float* autoc = pautoc + autocOffs;
+    
+    for (int i = 0; i < MAX_ORDER; i++)
+    {
+	gen0[i] = gen1[i] = autoc[i + 1];
+	ldr[i] = 0.0;
+    }
+
+    // Compute LPC using Schur and Levinson-Durbin recursion
+    double error = autoc[0];
+    for (int order = 0; order < MAX_ORDER; order++)
+    {
+	// Schur recursion
+	double reff = -gen1[0] / error;
+	//error += gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
+	error *= (1 - reff * reff);
+	
+	for (int j = 0; j < MAX_ORDER - 1 - order; j++)
+	{
+	    gen1[j] = gen1[j + 1] + reff * gen0[j];
+	    gen0[j] = gen1[j + 1] * reff + gen0[j];
+	}
+
+	err[order] = error;
+
+	// Levinson-Durbin recursion
+
+	ldr[order] = reff;
+	for (int j = 0; j < order / 2; j++)
+	{
+	    double tmp = ldr[j];
+	    ldr[j] += reff * ldr[order - 1 - j];
+	    ldr[order - 1 - j] += reff * tmp;
+	}
+	if (0 != (order & 1))
+	    ldr[order / 2] += ldr[order / 2] * reff;
+	
+	// Output coeffs
+	for (int j = 0; j <= order; j++)
+	    lpcs[lpcOffs + order * 32 + j] = -ldr[order - j];
+    }
+    // Output prediction error estimates
+    for (int j = 0; j < MAX_ORDER; j++)
+	lpcs[lpcOffs + MAX_ORDER * 32 + j] = err[j];
+}
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void clQuantizeLPC(
+    __global FLACCLSubframeTask *tasks,
+    __global float*lpcs,
+    int taskCount, // tasks per block
+    int taskCountLPC, // tasks per set of coeffs (<= 32)
+    int minprecision,
+    int precisions
+)
+{
+    int bs = tasks[get_group_id(1) * taskCount].data.blocksize;
+    int abits = tasks[get_group_id(1) * taskCount].data.abits;
+    int lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
+    float error[MAX_ORDER];
+    int best_orders[MAX_ORDER];
+
+    // Load prediction error estimates based on Akaike's Criteria
+    for (int tid = 0; tid < MAX_ORDER; tid++)
+    {
+	error[tid] = bs * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log(bs);
+	best_orders[tid] = tid;
+    }
+    
+    // Select best orders 
+    for (int i = 0; i < MAX_ORDER && i < taskCountLPC; i++)
+    {
+	for (int j = i + 1; j < MAX_ORDER; j++)
+	{
+	    if (error[best_orders[j]] < error[best_orders[i]])
+	    {
+		int tmp = best_orders[j];
+		best_orders[j] = best_orders[i];
+		best_orders[i] = tmp;
+	    }
+	}				
+    }
+
+    // Quantization
+    for (int i = 0; i < taskCountLPC; i ++)
+    {
+	int order = best_orders[i >> precisions];
+	int tmpi = 0;
+	for (int tid = 0; tid <= order; tid ++)
+	{
+	    float lpc = lpcs[lpcOffs + order * 32 + tid];
+	    // get 15 bits of each coeff
+	    int c = convert_int_rte(lpc * (1 << 15));
+	    // remove sign bits
+	    tmpi |= c ^ (c >> 31);
+	}
+	// choose precision	
+	//int cbits = max(3, min(10, 5 + (abits >> 1))); //  - convert_int_rte(shared.PE[order - 1])
+	int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - abits));
+	// calculate shift based on precision and number of leading zeroes in coeffs
+	int shift = max(0,min(15, clz(tmpi) - 18 + cbits));
+
+	int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
+	tmpi = 0;
+	for (int tid = 0; tid <= order; tid ++)
+	{
+	    float lpc = lpcs[lpcOffs + order * 32 + tid];
+	    // quantize coeffs with given shift
+	    int c = convert_int_rte(clamp(lpc * (1 << shift), -1 << (cbits - 1), 1 << (cbits - 1)));
+	    // remove sign bits
+	    tmpi |= c ^ (c >> 31);
+	    tasks[taskNo].coefs[tid] = c;
+	}
+	// calculate actual number of bits (+1 for sign)
+	cbits = 1 + 32 - clz(tmpi);
+	// output shift, cbits, ro
+	tasks[taskNo].data.shift = shift;
+	tasks[taskNo].data.cbits = cbits;
+	tasks[taskNo].data.residualOrder = order + 1;
+    }
+}
+
+#define ESTIMATE_N(ro,sum) for (int pos = ro; pos < bs; pos ++) { \
+	__global int *ptr = data + pos - ro; \
+	int t = clamp((data[pos] - ((sum) >> task.data.shift)) >> task.data.wbits, -0x7fffff, 0x7fffff); \
+	len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31); \
+    }
+
+//	int sum = 0; for (int i = 0; i < ro; i++) sum += *(ptr++) * task.coefs[i];
+
+__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(1, 1, 1)))
+void clEstimateResidual(
+    __global int*output,
+    __global int*samples,
+    __global FLACCLSubframeTask *tasks
+    )
+{
+    FLACCLSubframeTask task = tasks[get_group_id(0)];
+    int ro = task.data.residualOrder;
+    int bs = task.data.blocksize;
+#define EPO 6
+    int len[1 << EPO];
+
+#if 0
+    //float data[4096 + 32];
+    //float fcoef[32];
+
+    // TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32]
+
+    for (int tid = 0; tid < bs; tid++)
+	data[tid] = (float)samples[task.data.samplesOffs + tid] / (1 << task.data.wbits);
+    for (int tid = 0; tid < 32; tid++)
+	fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
+    float4 c0 = vload4(0, &fcoef[0]);
+    float4 c1 = vload4(1, &fcoef[0]);
+    float4 c2 = vload4(2, &fcoef[0]);
+#else
+    __global int *data = &samples[task.data.samplesOffs];
+    for (int i = ro; i < 32; i++)
+	task.coefs[i] = 0;
+#endif
+    for (int i = 0; i < 1 << EPO; i++)
+	len[i] = 0;
+
+    switch (ro)
+    {
+	case 0: ESTIMATE_N(0, 0) break;
+	case 1: ESTIMATE_N(1, *ptr * task.coefs[0]) break;
+	case 2: ESTIMATE_N(2, *(ptr++) * task.coefs[0] + *ptr * task.coefs[1]) break;
+	case 3: ESTIMATE_N(3, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *ptr * task.coefs[2]) break;
+	case 4: ESTIMATE_N(4, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *ptr * task.coefs[3]) break;
+	case 5: ESTIMATE_N(5, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *ptr * task.coefs[4]) break;
+	case 6: ESTIMATE_N(6, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *ptr * task.coefs[5]) break;
+	case 7: ESTIMATE_N(7, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *ptr * task.coefs[6]) break;
+	case 8: ESTIMATE_N(8, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *ptr * task.coefs[7]) break;
+	case 9: ESTIMATE_N(9, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *ptr * task.coefs[8]) break;
+	case 10: ESTIMATE_N(10, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *ptr * task.coefs[9]) break;
+	case 11: ESTIMATE_N(11, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *ptr * task.coefs[10]) break;
+	case 12: ESTIMATE_N(12, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *ptr * task.coefs[11]) break;
+	default:
+	    for (int pos = ro; pos < bs; pos ++)
+	    {
+	#if 0
+		float sum = dot(vload4(0, data + pos - ro), c0)
+		    + dot(vload4(1, data + pos - ro), c1)
+		    + dot(vload4(2, data + pos - ro), c2)
+		    ;
+		int t = convert_int_rte(data[pos] + sum);
+	#else
+		__global int *ptr = data + pos - ro;
+		int sum = 
+		      *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3]
+		    + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7]
+		    + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *(ptr++) * task.coefs[11]
+		    ;
+		for (int i = 12; i < ro; i++)
+		    sum += *(ptr++) * task.coefs[i];
+		int t = (data[pos] - (sum >> task.data.shift)) >> task.data.wbits;
+	#endif
+		// overflow protection
+		t = clamp(t, -0x7fffff, 0x7fffff);
+		// convert to unsigned
+		t = (t << 1) ^ (t >> 31);
+		len[pos >> (12 - EPO)] += t;
+	    }
+	    break;
+    }
+
+    int total = 0;
+    for (int i = 0; i < 1 << EPO; i++)
+    {
+	int res = min(0x7fffff,len[i]);
+	int k = clamp(clz(1 << (12 - EPO)) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
+	total += (k << (12 - EPO)) + (res >> k);
+    }
+    output[get_group_id(0)] = min(0x7ffffff, total) + (bs - ro);
+}
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void clChooseBestMethod(
+    __global FLACCLSubframeTask *tasks,
+    __global int *residual,
+    int taskCount
+    )
+{
+    int best_length = 0x7fffff;
+    int best_no = 0;
+    for (int taskNo = 0; taskNo < taskCount; taskNo++)
+    {
+	// fetch task data
+	__global FLACCLSubframeTask* ptask = tasks + taskNo + taskCount * get_group_id(0);
+	// fetch part sum
+	int partLen = residual[taskNo + taskCount * get_group_id(0)];
+	int obits = ptask->data.obits - ptask->data.wbits;
+	int bs = ptask->data.blocksize;
+	int ro = ptask->data.residualOrder;
+	int len = min(obits * bs,
+		ptask->data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen :
+		ptask->data.type == LPC ? ro * obits + 4 + 5 + ro * ptask->data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
+		ptask->data.type == Constant ? obits * select(1, bs, partLen != bs - ro) :
+		obits * bs);
+	ptask->data.size = len;
+	if (len < best_length)
+	{
+	    best_length = len;
+	    best_no = taskNo;
+	}
+    }
+
+    tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + best_no;
+}
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void clCopyBestMethod(
+    __global FLACCLSubframeTask *tasks_out,
+    __global FLACCLSubframeTask *tasks,
+    int count
+    )
+{
+    int best_index = tasks[count * get_group_id(0)].data.best_index;
+    tasks_out[get_group_id(0)] = tasks[best_index];
+}
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void clCopyBestMethodStereo(
+    __global FLACCLSubframeTask *tasks_out,
+    __global FLACCLSubframeTask *tasks,
+    int count
+    )
+{
+    int best_index[4];
+    int best_size[4];
+    int lr_index[2];
+
+    for (int i = 0; i < 4; i++)
+    {
+	int best = tasks[count * (get_group_id(0) * 4 + i)].data.best_index;
+	best_index[i] = best;
+	best_size[i] = tasks[best].data.size;
+    }
+    
+    int bitsBest = best_size[2] + best_size[3]; // MidSide
+    lr_index[0] = best_index[2];
+    lr_index[1] = best_index[3];
+    if (bitsBest > best_size[3] + best_size[1]) // RightSide
+    {
+	bitsBest = best_size[3] + best_size[1];
+	lr_index[0] = best_index[3];
+	lr_index[1] = best_index[1];
+    }
+    if (bitsBest > best_size[0] + best_size[3]) // LeftSide
+    {
+	bitsBest = best_size[0] + best_size[3];
+	lr_index[0] = best_index[0];
+	lr_index[1] = best_index[3];
+    }
+    if (bitsBest > best_size[0] + best_size[1]) // LeftRight
+    {
+	bitsBest = best_size[0] + best_size[1];
+	lr_index[0] = best_index[0];
+	lr_index[1] = best_index[1];
+    }
+    tasks_out[2 * get_group_id(0)] = tasks[lr_index[0]];
+    tasks_out[2 * get_group_id(0)].data.residualOffs = tasks[best_index[0]].data.residualOffs;
+    tasks_out[2 * get_group_id(0) + 1] = tasks[lr_index[1]];
+    tasks_out[2 * get_group_id(0) + 1].data.residualOffs = tasks[best_index[1]].data.residualOffs;
+}
+
+#endif
diff --git a/CUETools.FLACCL.cmd/Program.cs b/CUETools.FLACCL.cmd/Program.cs
index 60719f7..377e2ef 100644
--- a/CUETools.FLACCL.cmd/Program.cs
+++ b/CUETools.FLACCL.cmd/Program.cs
@@ -72,6 +72,7 @@ namespace CUETools.FLACCL.cmd
 			string window_function = null;
 			string input_file = null;
 			string output_file = null;
+			string device_type = null;
 			int min_partition_order = -1, max_partition_order = -1,
 				min_lpc_order = -1, max_lpc_order = -1,
 				min_fixed_order = -1, max_fixed_order = -1,
@@ -112,6 +113,10 @@ namespace CUETools.FLACCL.cmd
 					settings.GroupSize = intarg;
 				else if (args[arg] == "--define" && arg + 2 < args.Length)
 					settings.Defines += "#define " + args[++arg] + " " + args[++arg] + "\n";
+				else if (args[arg] == "--opencl-platform" && ++arg < args.Length)
+					settings.Platform = args[arg];
+				else if (args[arg] == "--opencl-type" && ++arg < args.Length)
+					device_type = args[arg];
 				else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
 					output_file = args[arg];
 				else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
@@ -210,6 +215,8 @@ namespace CUETools.FLACCL.cmd
 
 			try
 			{
+				if (device_type != null)
+					settings.DeviceType = (OpenCLDeviceType)(Enum.Parse(typeof(OpenCLDeviceType), device_type, true));
 				encoder.Settings = settings;
 				if (level >= 0)
 					encoder.CompressionLevel = level;