CPU mode + GPU optimizations

2025-12-16 18:14:25 +00:00 · 2010-10-25 04:50:36 +00:00
parent c10a62b17f
commit f619c82ef3
5 changed files with 671 additions and 199 deletions
--- a/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj
+++ b/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj
@@ -65,6 +65,9 @@
    </ProjectReference>
  </ItemGroup>
  <ItemGroup>
    <Content Include="flaccpu.cl">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </Content>
    <None Include="flac.cu">
    </None>
    <Content Include="flac.cl">
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -33,7 +33,14 @@ namespace CUETools.Codecs.FLACCL
 {
 	public class FLACCLWriterSettings
 	{
-		public FLACCLWriterSettings() { DoVerify = false; GPUOnly = true; DoMD5 = true; GroupSize = 64; }
+		public FLACCLWriterSettings() 
 		{ 
 			this.DoVerify = false; 
 			this.GPUOnly = true; 
 			this.DoMD5 = true; 
 			this.GroupSize = 64;
 			this.DeviceType = OpenCLDeviceType.GPU;
 		}
 		[DefaultValue(false)]
 		[DisplayName("Verify")]
@@ -56,6 +63,13 @@ namespace CUETools.Codecs.FLACCL
 		[SRDescription(typeof(Properties.Resources), "DescriptionDefines")]
 		public string Defines { get; set; }
 		[SRDescription(typeof(Properties.Resources), "DescriptionPlatform")]
 		public string Platform { get; set; }
 		[DefaultValue(OpenCLDeviceType.GPU)]
 		[SRDescription(typeof(Properties.Resources), "DescriptionDeviceType")]
 		public OpenCLDeviceType DeviceType { get; set; }
 		int cpu_threads = 1;
 		[DefaultValue(1)]
 		[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
@@ -74,6 +88,12 @@ namespace CUETools.Codecs.FLACCL
 		}
 	}
 	public enum OpenCLDeviceType : ulong
 	{
 		CPU = DeviceType.CPU,
 		GPU = DeviceType.GPU
 	}
 	[AudioEncoderClass("FLACCL", "flac", true, "0 1 2 3 4 5 6 7 8 9 10 11", "8", 2, typeof(FLACCLWriterSettings))]
 	//[AudioEncoderClass("FLACCL nonsub", "flac", true, "9 10 11", "9", 1, typeof(FLACCLWriterSettings))]
 	public class FLACCLWriter : IAudioDest
@@ -223,6 +243,11 @@ namespace CUETools.Codecs.FLACCL
 				if (value as FLACCLWriterSettings == null)
 					throw new Exception("Unsupported options " + value);
 				_settings = value as FLACCLWriterSettings;
 				if (_settings.DeviceType == OpenCLDeviceType.CPU)
 				{
 					_settings.GroupSize = 1;
 					_settings.GPUOnly = false;
 				}
 				eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
 			}
 		}
@@ -449,7 +474,7 @@ namespace CUETools.Codecs.FLACCL
 			}
 			set
 			{
-				if (value < 0 || value > eparams.max_fixed_order)
+				if (value < 0 || value > 4)
 					throw new Exception("invalid MinFixedOrder " + value.ToString());
 				eparams.min_fixed_order = value;
 			}
@@ -463,7 +488,7 @@ namespace CUETools.Codecs.FLACCL
 			}
 			set
 			{
-				if (value > 4 || value < eparams.min_fixed_order)
+				if (value > 4 || value < 0)
 					throw new Exception("invalid MaxFixedOrder " + value.ToString());
 				eparams.max_fixed_order = value;
 			}
@@ -963,7 +988,7 @@ namespace CUETools.Codecs.FLACCL
 			task.nResidualTasks = 0;
 			task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window);
-			task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order;
+			task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + (eparams.do_constant ? 1 : 0) + Math.Max(0, 1 + eparams.max_fixed_order - eparams.min_fixed_order);
 			//if (task.nResidualTasksPerChannel >= 4)
 			//    task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7;
 			for (int iFrame = 0; iFrame < nFrames; iFrame++)
@@ -1444,7 +1469,7 @@ namespace CUETools.Codecs.FLACCL
 				if (OpenCL.NumberOfPlatforms < 1)
 					throw new Exception("no opencl platforms found");
-				int groupSize = _settings.GroupSize;
+				int groupSize = _settings.DeviceType == OpenCLDeviceType.CPU ? 1 : _settings.GroupSize;
 				OCLMan = new OpenCLManager();
 				// Attempt to save binaries after compilation, as well as load precompiled binaries
 				// to avoid compilation. Usually you'll want this to be true. 
@@ -1475,12 +1500,30 @@ namespace CUETools.Codecs.FLACCL
 				OCLMan.BuildOptions = "";
 				OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location);
 				OCLMan.BinaryPath = System.IO.Path.Combine(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "CUE Tools"), "OpenCL");
-				OCLMan.CreateDefaultContext(0, DeviceType.GPU);
+				int platformId = 0;
 				if (_settings.Platform != null)
 				{
 					platformId = -1;
 					string platforms = "";
 					for (int i = 0; i < OpenCL.NumberOfPlatforms; i++)
 					{
 						var platform = OpenCL.GetPlatform(i);
 						platforms += " \"" + platform.Name + "\"";
 						if (platform.Name.Equals(_settings.Platform, StringComparison.InvariantCultureIgnoreCase))
 						{
 							platformId = i;
 							break;
 						}
 					}
 					if (platformId < 0)
 						throw new Exception("unknown platform \"" + _settings.Platform + "\". Platforms available:" + platforms);
 				}
 				OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
 				openCLContext = OCLMan.Context;
 				try
 				{
-					openCLProgram = OCLMan.CompileFile("flac.cl");
+					openCLProgram = OCLMan.CompileFile(_settings.DeviceType == OpenCLDeviceType.CPU ? "flaccpu.cl" : "flac.cl");
 				}
 				catch (OpenCLBuildException ex)
 				{
@@ -2218,13 +2261,12 @@ namespace CUETools.Codecs.FLACCL
 			this.channelsCount = channelsCount;
 			this.writer = writer;
 			openCLProgram = _openCLProgram;
 			Device[] openCLDevices = openCLProgram.Context.Platform.QueryDevices(DeviceType.GPU);
 #if DEBUG
 			var prop = CommandQueueProperties.PROFILING_ENABLE;
 #else
 			var prop = CommandQueueProperties.NONE;
 #endif
-			openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], prop);
+			openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLProgram.Context.Devices[0], prop);
 			residualTasksLen = sizeof(FLACCLSubframeTask) * channelsCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames;
 			bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * FLACCLWriter.maxFrames;
@@ -2260,12 +2302,15 @@ namespace CUETools.Codecs.FLACCL
 			clChooseBestMethod = openCLProgram.CreateKernel("clChooseBestMethod");
 			clCopyBestMethod = openCLProgram.CreateKernel("clCopyBestMethod");
 			clCopyBestMethodStereo = openCLProgram.CreateKernel("clCopyBestMethodStereo");
 			if (writer._settings.GPUOnly)
 			{
 				clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
 				clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
 				clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
 				clSumPartition = openCLProgram.CreateKernel("clSumPartition");
 				clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
 				clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
 			}
 			samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelsCount];
 			outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1];
@@ -2304,12 +2349,15 @@ namespace CUETools.Codecs.FLACCL
 			clChooseBestMethod.Dispose();
 			clCopyBestMethod.Dispose();
 			clCopyBestMethodStereo.Dispose();
 			if (writer._settings.GPUOnly)
 			{
 				clEncodeResidual.Dispose();
 				clCalcPartition.Dispose();
 				clCalcPartition16.Dispose();
 				clSumPartition.Dispose();
 				clFindRiceParameter.Dispose();
 				clFindPartitionOrder.Dispose();
 			}
 			clSamples.Dispose();
 			clSamplesBytes.Dispose();
@@ -2390,15 +2438,13 @@ namespace CUETools.Codecs.FLACCL
 				nWindowFunctions);
 			clComputeLPC.SetArgs(
 				clResidualTasks,
 				clAutocorOutput,
 				clLPCData,
 				nResidualTasksPerChannel,
 				nWindowFunctions);
 			openCLCQ.EnqueueNDRangeKernel(
 				clComputeLPC,
-				32, 1,
+				Math.Min(groupSize, 32), 1,
 				nWindowFunctions,
 				channelsCount * frameCount);
@@ -2412,7 +2458,7 @@ namespace CUETools.Codecs.FLACCL
 			openCLCQ.EnqueueNDRangeKernel(
 				clQuantizeLPC,
-				32, 1,
+				Math.Min(groupSize, 32), 1,
 				nWindowFunctions,
 				channelsCount * frameCount);
@@ -2433,7 +2479,7 @@ namespace CUETools.Codecs.FLACCL
 			openCLCQ.EnqueueNDRangeKernel(
 				clChooseBestMethod,
-				32, channelsCount * frameCount);
+				Math.Min(groupSize, 32), channelsCount * frameCount);
 			if (channels == 2 && channelsCount == 4)
 			{
@@ -2444,7 +2490,7 @@ namespace CUETools.Codecs.FLACCL
 				openCLCQ.EnqueueNDRangeKernel(
 					clCopyBestMethodStereo,
-					64, frameCount);
+					Math.Min(groupSize, 64), frameCount);
 			}
 			else
 			{
@@ -2455,7 +2501,7 @@ namespace CUETools.Codecs.FLACCL
 				openCLCQ.EnqueueNDRangeKernel(
 					clCopyBestMethod,
-					64, channels * frameCount);
+					Math.Min(groupSize, 64), channels * frameCount);
 			}
 			if (writer._settings.GPUOnly)
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -1,6 +1,6 @@
 /**
 * CUETools.FLACCL: FLAC audio encoder using OpenCL
- * Copyright (c) 2009 Gregory S. Chudov
+ * Copyright (c) 2010 Gregory S. Chudov
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@@ -24,8 +24,6 @@
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #endif
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
 //#pragma OPENCL EXTENSION cl_amd_fp64 : enable
 typedef enum
@@ -125,7 +123,7 @@ void clFindWastedBits(
    barrier(CLK_LOCAL_MEM_FENCE);
    int w = 0, a = 0;
-    for (int pos = tid; pos + tid < task.blocksize; pos += GROUP_SIZE)
+    for (int pos = tid; pos < task.blocksize; pos += GROUP_SIZE)
    {
 	int smp = samples[task.samplesOffs + pos];
 	w |= smp;
@@ -213,38 +211,25 @@ void clComputeAutocor(
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
 void clComputeLPC(
    __global FLACCLSubframeTask *tasks,
    __global float *autoc,
    __global float *lpcs,
    int taskCount, // tasks per block
    int windowCount
 )
 {
    __local struct {
 	FLACCLSubframeData task;
 	volatile float ldr[32];
 	volatile float gen1[32];
 	volatile float error[32];
 	volatile float autoc[33];
 	volatile int lpcOffs;
 	volatile int autocOffs;
    } shared;
    const int tid = get_local_id(0);// + get_local_id(1) * 32;    
-    
+    int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
-    // fetch task data
+    int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
    if (tid < sizeof(shared.task) / sizeof(int))
 	((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1)))[tid];
    if (tid == 0)
    {
 	shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
 	shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_local_id(0) <= MAX_ORDER)
-	shared.autoc[get_local_id(0)] = autoc[shared.autocOffs + get_local_id(0)];
+	shared.autoc[get_local_id(0)] = autoc[autocOffs + get_local_id(0)];
    if (get_local_id(0) + get_local_size(0) <= MAX_ORDER)
-	shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[shared.autocOffs + get_local_id(0) + get_local_size(0)];
+	shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[autocOffs + get_local_id(0) + get_local_size(0)];
    barrier(CLK_LOCAL_MEM_FENCE);
@@ -296,14 +281,14 @@ void clComputeLPC(
 	// Output coeffs
 	if (get_local_id(0) <= order)
-	    lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)];
+	    lpcs[lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)];
 	//if (get_local_id(0) <= order + 1 && fabs(-shared.ldr[0]) > 3000)
 	//    printf("coef[%d] == %f, autoc == %f, error == %f\n", get_local_id(0), -shared.ldr[order - get_local_id(0)], shared.autoc[get_local_id(0)], shared.error[get_local_id(0)]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Output prediction error estimates
    if (get_local_id(0) < MAX_ORDER)
-	lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
+	lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
 }
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
@@ -318,9 +303,10 @@ void clQuantizeLPC(
 {
    __local struct {
 	FLACCLSubframeData task;
 	volatile int tmpi[32];
 	volatile int index[64];
 	volatile float error[64];
 	volatile int maxcoef[32];
 	volatile int maxcoef2[32];
 	volatile int lpcOffs;
    } shared;
@@ -338,6 +324,8 @@ void clQuantizeLPC(
    shared.error[tid] = shared.task.blocksize * 64 + tid;
    shared.index[32 + tid] = MAX_ORDER - 1;
    shared.error[32 + tid] = shared.task.blocksize * 64 + tid + 32;
    shared.maxcoef[tid] = 0;
    shared.maxcoef2[tid] = 0;
    // Load prediction error estimates
    if (tid < MAX_ORDER)
@@ -399,21 +387,14 @@ void clQuantizeLPC(
 	// get 15 bits of each coeff
 	int coef = convert_int_rte(lpc * (1 << 15));
 	// remove sign bits
-	shared.tmpi[tid] = coef ^ (coef >> 31);
+	atomic_or(shared.maxcoef + i, coef ^ (coef >> 31));
 	barrier(CLK_LOCAL_MEM_FENCE);
 	// OR reduction
 	for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
 	{
 	    if (tid < l)
 		shared.tmpi[tid] |= shared.tmpi[tid + l];
 	    barrier(CLK_LOCAL_MEM_FENCE);
 	}
 	//SUM32(shared.tmpi,tid,|=);
 	// choose precision	
 	//int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); //  - convert_int_rte(shared.PE[order - 1])
 	int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits));
 	// calculate shift based on precision and number of leading zeroes in coeffs
-	int shift = max(0,min(15, clz(shared.tmpi[0] | shared.tmpi[1]) - 18 + cbits));
+	int shift = max(0,min(15, clz(shared.maxcoef[i]) - 18 + cbits));
 	//cbits = 13;
 	//shift = 15;
@@ -426,18 +407,10 @@ void clQuantizeLPC(
 	//shared.tmp[tid] = (tid != 0) * (shared.arp[tid - 1]*(1 << shared.task.shift) - shared.task.coefs[tid - 1]);
 	//shared.task.coefs[tid] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, convert_int_rte((shared.arp[tid]) * (1 << shared.task.shift) + shared.tmp[tid])));
 	// remove sign bits
-	shared.tmpi[tid] = coef ^ (coef >> 31);
+	atomic_or(shared.maxcoef2 + i, coef ^ (coef >> 31));
 	barrier(CLK_LOCAL_MEM_FENCE);
 	// OR reduction
 	for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
 	{
 	    if (tid < l)
 		shared.tmpi[tid] |= shared.tmpi[tid + l];
 	    barrier(CLK_LOCAL_MEM_FENCE);
 	}
 	//SUM32(shared.tmpi,tid,|=);
 	// calculate actual number of bits (+1 for sign)
-	cbits = 1 + 32 - clz(shared.tmpi[0] | shared.tmpi[1]);
+	cbits = 1 + 32 - clz(shared.maxcoef2[i]);
 	// output shift, cbits and output coeffs
 	int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
@@ -452,10 +425,6 @@ void clQuantizeLPC(
    }
 }
 #ifndef PARTORDER
 #define PARTORDER 4
 #endif
 __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void clEstimateResidual(
    __global int*output,
@@ -463,10 +432,10 @@ void clEstimateResidual(
    __global FLACCLSubframeTask *tasks
    )
 {
-    __local int data[GROUP_SIZE * 2];
+    __local float data[GROUP_SIZE * 2];
    __local FLACCLSubframeTask task;
-    __local int residual[GROUP_SIZE];
+    __local int psum[64];
-    __local int len[GROUP_SIZE >> PARTORDER];
+    __local float fcoef[32];
    const int tid = get_local_id(0);
    if (tid < sizeof(task)/sizeof(int))
@@ -476,101 +445,74 @@ void clEstimateResidual(
    int ro = task.data.residualOrder;
    int bs = task.data.blocksize;
-    if (tid < 32 && tid >= ro)
+    if (tid < 32)
-	task.coefs[tid] = 0;
+	fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
-    if (tid < (GROUP_SIZE >> PARTORDER))
+	//fcoef[tid] = select(0.0f, - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift), tid + ro >= MAX_ORDER && tid < MAX_ORDER);
-	len[tid] = 0;
+    if (tid < 64)
-    data[tid] = 0;
+	psum[tid] = 0;
    data[tid] = 0.0f;
    int partOrder = clz(64) - clz(bs - 1) + 1;
    barrier(CLK_LOCAL_MEM_FENCE);
-    __local int4 * cptr = (__local int4 *)&task.coefs[0];
+    float4 cptr0 = vload4(0, &fcoef[0]);
-    int4 cptr0 = cptr[0];
+    float4 cptr1 = vload4(1, &fcoef[0]);
 #if MAX_ORDER > 4
    int4 cptr1 = cptr[1];
 #if MAX_ORDER > 8
-    int4 cptr2 = cptr[2];
+    float4 cptr2 = vload4(2, &fcoef[0]);
 #endif
 #endif
    for (int pos = 0; pos < bs; pos += GROUP_SIZE)
    {
 	// fetch samples
 	int offs = pos + tid;
-	int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0;
+	float nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0.0f;
 	data[tid + GROUP_SIZE] = nextData;
 	barrier(CLK_LOCAL_MEM_FENCE);
 	// compute residual
-	__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
+	__local float* dptr = &data[tid + GROUP_SIZE - ro];
-	int4 sum = dptr[0] * cptr0
+	float4 sum = cptr0 * vload4(0, dptr)
-#if MAX_ORDER > 4
+	    + cptr1 * vload4(1, dptr)
 	    + dptr[1] * cptr1
 #if MAX_ORDER > 8
-	    + dptr[2] * cptr2
+	    + cptr2 * vload4(2, dptr)
-#if MAX_ORDER > 12
+  #if MAX_ORDER > 12
-	    + dptr[3] * cptr[3]
+	    + vload4(3, &fcoef[0]) * vload4(3, dptr)
-#if MAX_ORDER > 16
+    #if MAX_ORDER > 16
-	    + dptr[4] * cptr[4]
+	    + vload4(4, &fcoef[0]) * vload4(4, dptr)
-	    + dptr[5] * cptr[5]
+	    + vload4(5, &fcoef[0]) * vload4(5, dptr)
-	    + dptr[6] * cptr[6]
+	    + vload4(6, &fcoef[0]) * vload4(6, dptr)
-	    + dptr[7] * cptr[7]
+	    + vload4(7, &fcoef[0]) * vload4(7, dptr)
-#endif
+    #endif
-#endif
+  #endif
 #endif
 #endif
 	    ;
-	int t = nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift);
+	int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w);
 	barrier(CLK_LOCAL_MEM_FENCE);
 	data[tid] = nextData;
 	// ensure we're within frame bounds
 	t = select(0, t, offs >= ro && offs < bs);
 	// overflow protection
 	t = clamp(t, -0x7fffff, 0x7fffff);
 	// convert to unsigned
-	residual[tid] = (t << 1) ^ (t >> 31);
+	atomic_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31));
-	barrier(CLK_LOCAL_MEM_FENCE);
+    }
 	data[tid] = nextData;
-	// calculate rice partition bit length for every 16 samples
+    // calculate rice partition bit length for every (1 << partOrder) samples
-	if (tid < (GROUP_SIZE >> PARTORDER))
+    if (tid < 64)
    {
-	    //__local int4 * chunk = (__local int4 *)&residual[tid << PARTORDER];
+	int k = clamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
-	    __local int4 * chunk = ((__local int4 *)residual) + (tid << (PARTORDER - 2));
+	psum[tid] = (k << partOrder) + (psum[tid] >> k);
 #if PARTORDER == 3
 	    int4 sum = chunk[0] + chunk[1];
 #elif PARTORDER == 4
 	    int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3]; // [0 .. (1 << (PARTORDER - 2)) - 1]
 #elif PARTORDER == 5
 	    int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3] + chunk[4] + chunk[5] + chunk[6] + chunk[7];
 #else
 #error Invalid PARTORDER
 #endif
 	    int res = sum.x + sum.y + sum.z + sum.w;
 	    int k = clamp(clz(1 << PARTORDER) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
 #ifdef EXTRAMODE
 #if PARTORDER == 3
 	    sum = (chunk[0] >> k) + (chunk[1] >> k);
 #elif PARTORDER == 4
 	    sum = (chunk[0] >> k) + (chunk[1] >> k) + (chunk[2] >> k) + (chunk[3] >> k);
 #else
 #error Invalid PARTORDER
 #endif
 	    len[tid] += (k << PARTORDER) + sum.x + sum.y + sum.z + sum.w;
 #else
 	    len[tid] += (k << PARTORDER) + (res >> k);
 #endif
    }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int l = GROUP_SIZE >> (PARTORDER + 1); l > 0; l >>= 1)
+    for (int l = 32; l > 0; l >>= 1)
    {
 	if (tid < l)
-	    len[tid] += len[tid + l];
+	    psum[tid] += psum[tid + l];
 	barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid == 0)
-	output[get_group_id(0)] = len[0] + (bs - ro);
+	output[get_group_id(0)] = psum[0] + (bs - ro);
 }
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
@@ -580,15 +522,16 @@ void clChooseBestMethod(
    int taskCount
    )
 {
-    __local struct {
+    int best_length = 0x7fffffff;
-	volatile int index[32];
+    int best_index = 0;
-	volatile int length[32];
+    __local int partLen[32];
    } shared;
    __local FLACCLSubframeData task;
    const int tid = get_local_id(0);
-    shared.length[tid] = 0x7fffffff;
+    // fetch part sum
-    shared.index[tid] = tid;
+    if (tid < taskCount)
 	partLen[tid] = residual[tid + taskCount * get_group_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int taskNo = 0; taskNo < taskCount; taskNo++)
    {
 	// fetch task data
@@ -599,46 +542,27 @@ void clChooseBestMethod(
 	if (tid == 0)
 	{
-	    // fetch part sum
+	    int pl = partLen[taskNo];
 	    int partLen = residual[taskNo + taskCount * get_group_id(0)];
 	    //// calculate part size
 	    //int residualLen = task[get_local_id(1)].data.blocksize - task[get_local_id(1)].data.residualOrder;
 	    //residualLen = residualLen * (task[get_local_id(1)].data.type != Constant || psum != 0);
 	    //// calculate rice parameter
 	    //int k = max(0, min(14, convert_int_rtz(log2((psum + 0.000001f) / (residualLen + 0.000001f) + 0.5f))));
 	    //// calculate part bit length
 	    //int partLen = residualLen * (k + 1) + (psum >> k);
 	    int obits = task.obits - task.wbits;
-	    shared.length[taskNo] =
+	    int len = min(obits * task.blocksize,
-		min(obits * task.blocksize,
+		    task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + pl :
-		    task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + partLen :
+		    task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + pl :
-		    task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
+		    task.type == Constant ? obits * select(1, task.blocksize, pl != task.blocksize - task.residualOrder) : 
 		    task.type == Constant ? obits * select(1, task.blocksize, partLen != task.blocksize - task.residualOrder) : 
 		    obits * task.blocksize);
 	    tasks[taskNo + taskCount * get_group_id(0)].data.size = len;
 	    if (len < best_length)
 	    {
 		best_length = len;
 		best_index = taskNo;
 	    }
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
    }
    //shared.index[get_local_id(0)] = get_local_id(0);
    //shared.length[get_local_id(0)] = (get_local_id(0) < taskCount) ? tasks[get_local_id(0) + taskCount * get_group_id(0)].size : 0x7fffffff;
    if (tid < taskCount)
 	tasks[tid + taskCount * get_group_id(0)].data.size = shared.length[tid];
    int l1 = shared.length[tid];
    for (int l = 16; l > 0; l >>= 1)
    {
 	if (tid < l)
 	{
 	    int l2 = shared.length[tid + l];
 	    shared.index[tid] = shared.index[tid + select(0, l, l2 < l1)];
 	    shared.length[tid] = l1 = min(l1, l2);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid == 0)
-	tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + shared.index[0];
+	tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + best_index;
 }
 __kernel __attribute__((reqd_work_group_size(64, 1, 1)))
@@ -788,13 +712,13 @@ void clCalcPartition(
    int psize // == task.blocksize >> max_porder?
    )
 {
-    __local int pl[(GROUP_SIZE / 16)][15];
+    __local int pl[(GROUP_SIZE / 8)][15];
    __local FLACCLSubframeData task;
    const int tid = get_local_id(0);
    if (tid < sizeof(task) / sizeof(int))
 	((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
-    if (tid < (GROUP_SIZE / 16))
+    if (tid < (GROUP_SIZE / 8))
    {
 	for (int k = 0; k <= 14; k++)
 	    pl[tid][k] = 0;
@@ -807,13 +731,14 @@ void clCalcPartition(
    {
 	// fetch residual
 	int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0;
-	// convert to unsigned
+	// overflow protection
 	s = clamp(s, -0x7fffff, 0x7fffff);
 	// convert to unsigned
 	s = (s << 1) ^ (s >> 31);
 	// calc number of unary bits for each residual sample with each rice paramater
-	int part = (offs - start) / psize;
+	int part = (offs - start) / psize + (tid & 1) * (GROUP_SIZE / 16);
 	for (int k = 0; k <= 14; k++)
-	    atom_add(&pl[part][k], s >> k);
+	    atomic_add(&pl[part][k], s >> k);
 	    //pl[part][k] += s >> k;
    }   
    barrier(CLK_LOCAL_MEM_FENCE);
@@ -825,7 +750,8 @@ void clCalcPartition(
 	{
 	    // output length
 	    const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
-	    partition_lengths[pos + part] = min(0x7fffff, pl[tid][k]) + select(psize, psize - task.residualOrder, part == 0) * (k + 1);
+	    int plen = pl[tid][k] + pl[tid + (GROUP_SIZE / 16)][k];
 	    partition_lengths[pos + part] = min(0x7fffff, plen) + (psize - select(0, task.residualOrder, part == 0)) * (k + 1);
 	 //   if (get_group_id(1) == 0)
 		//printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1));
 	}
@@ -911,7 +837,7 @@ void clCalcPartition16(
 	// convert to unsigned
 	res[tid] = (s << 1) ^ (s >> 31);
-	// for (int k = 0; k < 15; k++) atom_add(&pl[x][k], s >> k);
+	// for (int k = 0; k < 15; k++) atomic_add(&pl[x][k], s >> k);
 	barrier(CLK_LOCAL_MEM_FENCE);
 	data[tid] = nextData;
@@ -1001,25 +927,23 @@ void clFindPartitionOrder(
    int max_porder
    )
 {
-    __local int partlen[9];
+    __local int partlen[16];
    __local FLACCLSubframeData task;
    const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder);
    if (get_local_id(0) < sizeof(task) / sizeof(int))
 	((__local int*)&task)[get_local_id(0)] = ((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)];
-    if (get_local_id(0) < 9)
+    if (get_local_id(0) < 16)
 	partlen[get_local_id(0)] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    // fetch partition lengths
-    for (int offs = 0; offs < (2 << max_porder); offs += GROUP_SIZE)
+    int lim = (2 << max_porder) - 1;
    for (int offs = get_local_id(0); offs < lim; offs += GROUP_SIZE)
    {
-	if (offs + get_local_id(0) < (2 << max_porder) - 1)
+	int len = rice_parameters[pos + offs];
-	{
+	int porder = 31 - clz(lim - offs);
-	    int len = rice_parameters[pos + offs + get_local_id(0)];
+	atomic_add(&partlen[porder], len);
 	    int porder = 31 - clz((2 << max_porder) - 1 - offs - get_local_id(0));
 	    atom_add(&partlen[porder], len);
 	}
    }
    barrier(CLK_LOCAL_MEM_FENCE);
--- a/CUETools.Codecs.FLACCL/flaccpu.cl
+++ b/CUETools.Codecs.FLACCL/flaccpu.cl
@@ -0,0 +1,492 @@
 /**
 * CUETools.FLACCL: FLAC audio encoder using OpenCL
 * Copyright (c) 2010 Gregory S. Chudov
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #ifndef _FLACCL_KERNEL_H_
 #define _FLACCL_KERNEL_H_
 #ifdef DEBUG
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #endif
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_amd_fp64 : enable
 typedef enum
 {
    Constant = 0,
    Verbatim = 1,
    Fixed = 8,
    LPC = 32
 } SubframeType;
 typedef struct
 {
    int residualOrder; // <= 32
    int samplesOffs;
    int shift;
    int cbits;
    int size;
    int type;
    int obits;
    int blocksize;
    int best_index;
    int channel;
    int residualOffs;
    int wbits;
    int abits;
    int porder;
    int reserved[2];
 } FLACCLSubframeData;
 typedef struct
 {
    FLACCLSubframeData data;
    int coefs[32]; // fixme: should be short?
 } FLACCLSubframeTask;
 __kernel void clStereoDecorr(
    __global int *samples,
    __global short2 *src,
    int offset
 )
 {
    int pos = get_global_id(0);
    if (pos < offset)
    {
 	short2 s = src[pos];
 	samples[pos] = s.x;
 	samples[1 * offset + pos] = s.y;
 	samples[2 * offset + pos] = (s.x + s.y) >> 1;
 	samples[3 * offset + pos] = s.x - s.y;
    }
 }
 __kernel void clChannelDecorr2(
    __global int *samples,
    __global short2 *src,
    int offset
 )
 {
    int pos = get_global_id(0);
    if (pos < offset)
    {
 	short2 s = src[pos];
 	samples[pos] = s.x;
 	samples[1 * offset + pos] = s.y;
    }
 }
 //__kernel void clChannelDecorr(
 //    int *samples,
 //    short *src,
 //    int offset
 //)
 //{
 //    int pos = get_global_id(0);
 //    if (pos < offset)
 //	samples[get_group_id(1) * offset + pos] = src[pos * get_num_groups(1) + get_group_id(1)];
 //}
 #define __ffs(a) (32 - clz(a & (-a)))
 //#define __ffs(a) (33 - clz(~a & (a - 1)))
 __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
 void clFindWastedBits(
    __global FLACCLSubframeTask *tasks,
    __global int *samples,
    int tasksPerChannel
 )
 {
    __global FLACCLSubframeTask* ptask = &tasks[get_group_id(0) * tasksPerChannel];
    int w = 0, a = 0;
    for (int pos = 0; pos < ptask->data.blocksize; pos ++)
    {
 	int smp = samples[ptask->data.samplesOffs + pos];
 	w |= smp;
 	a |= smp ^ (smp >> 31);
    }    
    w = max(0,__ffs(w) - 1);
    a = 32 - clz(a) - w;
    for (int i = 0; i < tasksPerChannel; i++)
    {
 	ptask[i].data.wbits = w;
 	ptask[i].data.abits = a;
    }
 }
 // get_num_groups(0) == number of tasks
 // get_num_groups(1) == number of windows
 __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
 void clComputeAutocor(
    __global float *output,
    __global const int *samples,
    __global const float *window,
    __global FLACCLSubframeTask *tasks,
    const int taskCount // tasks per block
 )
 {
    FLACCLSubframeData task = tasks[get_group_id(0) * taskCount].data;
    int len = task.blocksize;
    int windowOffs = get_group_id(1) * len;
    float data1[4096 + 32];
    // TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32]
    for (int tid = 0; tid < len; tid++)
 	data1[tid] = samples[task.samplesOffs + tid] * window[windowOffs + tid];
    data1[len] = 0.0f;
    for (int i = 0; i <= MAX_ORDER; ++i)
    {
 	double temp = 1.0;
 	double temp2 = 1.0;
 	float* finish = data1 + len - i;
 	for (float* pdata = data1; pdata < finish; pdata += 2)
 	{
 	    temp += pdata[i] * pdata[0];
 	    temp2 += pdata[i + 1] * pdata[1];
 	}
 	output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + i] = temp + temp2;
    }
 }
 __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
 void clComputeLPC(
    __global float *pautoc,
    __global float *lpcs,
    int windowCount
 )
 {
    int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
    int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
    volatile double ldr[32];
    volatile double gen0[32];
    volatile double gen1[32];
    volatile double err[32];
    __global float* autoc = pautoc + autocOffs;
    for (int i = 0; i < MAX_ORDER; i++)
    {
 	gen0[i] = gen1[i] = autoc[i + 1];
 	ldr[i] = 0.0;
    }
    // Compute LPC using Schur and Levinson-Durbin recursion
    double error = autoc[0];
    for (int order = 0; order < MAX_ORDER; order++)
    {
 	// Schur recursion
 	double reff = -gen1[0] / error;
 	//error += gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
 	error *= (1 - reff * reff);
 	for (int j = 0; j < MAX_ORDER - 1 - order; j++)
 	{
 	    gen1[j] = gen1[j + 1] + reff * gen0[j];
 	    gen0[j] = gen1[j + 1] * reff + gen0[j];
 	}
 	err[order] = error;
 	// Levinson-Durbin recursion
 	ldr[order] = reff;
 	for (int j = 0; j < order / 2; j++)
 	{
 	    double tmp = ldr[j];
 	    ldr[j] += reff * ldr[order - 1 - j];
 	    ldr[order - 1 - j] += reff * tmp;
 	}
 	if (0 != (order & 1))
 	    ldr[order / 2] += ldr[order / 2] * reff;
 	// Output coeffs
 	for (int j = 0; j <= order; j++)
 	    lpcs[lpcOffs + order * 32 + j] = -ldr[order - j];
    }
    // Output prediction error estimates
    for (int j = 0; j < MAX_ORDER; j++)
 	lpcs[lpcOffs + MAX_ORDER * 32 + j] = err[j];
 }
 __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
 void clQuantizeLPC(
    __global FLACCLSubframeTask *tasks,
    __global float*lpcs,
    int taskCount, // tasks per block
    int taskCountLPC, // tasks per set of coeffs (<= 32)
    int minprecision,
    int precisions
 )
 {
    int bs = tasks[get_group_id(1) * taskCount].data.blocksize;
    int abits = tasks[get_group_id(1) * taskCount].data.abits;
    int lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
    float error[MAX_ORDER];
    int best_orders[MAX_ORDER];
    // Load prediction error estimates based on Akaike's Criteria
    for (int tid = 0; tid < MAX_ORDER; tid++)
    {
 	error[tid] = bs * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log(bs);
 	best_orders[tid] = tid;
    }
    // Select best orders 
    for (int i = 0; i < MAX_ORDER && i < taskCountLPC; i++)
    {
 	for (int j = i + 1; j < MAX_ORDER; j++)
 	{
 	    if (error[best_orders[j]] < error[best_orders[i]])
 	    {
 		int tmp = best_orders[j];
 		best_orders[j] = best_orders[i];
 		best_orders[i] = tmp;
 	    }
 	}				
    }
    // Quantization
    for (int i = 0; i < taskCountLPC; i ++)
    {
 	int order = best_orders[i >> precisions];
 	int tmpi = 0;
 	for (int tid = 0; tid <= order; tid ++)
 	{
 	    float lpc = lpcs[lpcOffs + order * 32 + tid];
 	    // get 15 bits of each coeff
 	    int c = convert_int_rte(lpc * (1 << 15));
 	    // remove sign bits
 	    tmpi |= c ^ (c >> 31);
 	}
 	// choose precision	
 	//int cbits = max(3, min(10, 5 + (abits >> 1))); //  - convert_int_rte(shared.PE[order - 1])
 	int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - abits));
 	// calculate shift based on precision and number of leading zeroes in coeffs
 	int shift = max(0,min(15, clz(tmpi) - 18 + cbits));
 	int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
 	tmpi = 0;
 	for (int tid = 0; tid <= order; tid ++)
 	{
 	    float lpc = lpcs[lpcOffs + order * 32 + tid];
 	    // quantize coeffs with given shift
 	    int c = convert_int_rte(clamp(lpc * (1 << shift), -1 << (cbits - 1), 1 << (cbits - 1)));
 	    // remove sign bits
 	    tmpi |= c ^ (c >> 31);
 	    tasks[taskNo].coefs[tid] = c;
 	}
 	// calculate actual number of bits (+1 for sign)
 	cbits = 1 + 32 - clz(tmpi);
 	// output shift, cbits, ro
 	tasks[taskNo].data.shift = shift;
 	tasks[taskNo].data.cbits = cbits;
 	tasks[taskNo].data.residualOrder = order + 1;
    }
 }
 #define ESTIMATE_N(ro,sum) for (int pos = ro; pos < bs; pos ++) { \
 	__global int *ptr = data + pos - ro; \
 	int t = clamp((data[pos] - ((sum) >> task.data.shift)) >> task.data.wbits, -0x7fffff, 0x7fffff); \
 	len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31); \
    }
 //	int sum = 0; for (int i = 0; i < ro; i++) sum += *(ptr++) * task.coefs[i];
 __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(1, 1, 1)))
 void clEstimateResidual(
    __global int*output,
    __global int*samples,
    __global FLACCLSubframeTask *tasks
    )
 {
    FLACCLSubframeTask task = tasks[get_group_id(0)];
    int ro = task.data.residualOrder;
    int bs = task.data.blocksize;
 #define EPO 6
    int len[1 << EPO];
 #if 0
    //float data[4096 + 32];
    //float fcoef[32];
    // TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32]
    for (int tid = 0; tid < bs; tid++)
 	data[tid] = (float)samples[task.data.samplesOffs + tid] / (1 << task.data.wbits);
    for (int tid = 0; tid < 32; tid++)
 	fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
    float4 c0 = vload4(0, &fcoef[0]);
    float4 c1 = vload4(1, &fcoef[0]);
    float4 c2 = vload4(2, &fcoef[0]);
 #else
    __global int *data = &samples[task.data.samplesOffs];
    for (int i = ro; i < 32; i++)
 	task.coefs[i] = 0;
 #endif
    for (int i = 0; i < 1 << EPO; i++)
 	len[i] = 0;
    switch (ro)
    {
 	case 0: ESTIMATE_N(0, 0) break;
 	case 1: ESTIMATE_N(1, *ptr * task.coefs[0]) break;
 	case 2: ESTIMATE_N(2, *(ptr++) * task.coefs[0] + *ptr * task.coefs[1]) break;
 	case 3: ESTIMATE_N(3, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *ptr * task.coefs[2]) break;
 	case 4: ESTIMATE_N(4, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *ptr * task.coefs[3]) break;
 	case 5: ESTIMATE_N(5, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *ptr * task.coefs[4]) break;
 	case 6: ESTIMATE_N(6, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *ptr * task.coefs[5]) break;
 	case 7: ESTIMATE_N(7, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *ptr * task.coefs[6]) break;
 	case 8: ESTIMATE_N(8, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *ptr * task.coefs[7]) break;
 	case 9: ESTIMATE_N(9, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *ptr * task.coefs[8]) break;
 	case 10: ESTIMATE_N(10, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *ptr * task.coefs[9]) break;
 	case 11: ESTIMATE_N(11, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *ptr * task.coefs[10]) break;
 	case 12: ESTIMATE_N(12, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *ptr * task.coefs[11]) break;
 	default:
 	    for (int pos = ro; pos < bs; pos ++)
 	    {
 	#if 0
 		float sum = dot(vload4(0, data + pos - ro), c0)
 		    + dot(vload4(1, data + pos - ro), c1)
 		    + dot(vload4(2, data + pos - ro), c2)
 		    ;
 		int t = convert_int_rte(data[pos] + sum);
 	#else
 		__global int *ptr = data + pos - ro;
 		int sum = 
 		      *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3]
 		    + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7]
 		    + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *(ptr++) * task.coefs[11]
 		    ;
 		for (int i = 12; i < ro; i++)
 		    sum += *(ptr++) * task.coefs[i];
 		int t = (data[pos] - (sum >> task.data.shift)) >> task.data.wbits;
 	#endif
 		// overflow protection
 		t = clamp(t, -0x7fffff, 0x7fffff);
 		// convert to unsigned
 		t = (t << 1) ^ (t >> 31);
 		len[pos >> (12 - EPO)] += t;
 	    }
 	    break;
    }
    int total = 0;
    for (int i = 0; i < 1 << EPO; i++)
    {
 	int res = min(0x7fffff,len[i]);
 	int k = clamp(clz(1 << (12 - EPO)) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
 	total += (k << (12 - EPO)) + (res >> k);
    }
    output[get_group_id(0)] = min(0x7ffffff, total) + (bs - ro);
 }
 __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
 void clChooseBestMethod(
    __global FLACCLSubframeTask *tasks,
    __global int *residual,
    int taskCount
    )
 {
    int best_length = 0x7fffff;
    int best_no = 0;
    for (int taskNo = 0; taskNo < taskCount; taskNo++)
    {
 	// fetch task data
 	__global FLACCLSubframeTask* ptask = tasks + taskNo + taskCount * get_group_id(0);
 	// fetch part sum
 	int partLen = residual[taskNo + taskCount * get_group_id(0)];
 	int obits = ptask->data.obits - ptask->data.wbits;
 	int bs = ptask->data.blocksize;
 	int ro = ptask->data.residualOrder;
 	int len = min(obits * bs,
 		ptask->data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen :
 		ptask->data.type == LPC ? ro * obits + 4 + 5 + ro * ptask->data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
 		ptask->data.type == Constant ? obits * select(1, bs, partLen != bs - ro) :
 		obits * bs);
 	ptask->data.size = len;
 	if (len < best_length)
 	{
 	    best_length = len;
 	    best_no = taskNo;
 	}
    }
    tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + best_no;
 }
 __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
 void clCopyBestMethod(
    __global FLACCLSubframeTask *tasks_out,
    __global FLACCLSubframeTask *tasks,
    int count
    )
 {
    int best_index = tasks[count * get_group_id(0)].data.best_index;
    tasks_out[get_group_id(0)] = tasks[best_index];
 }
 __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
 void clCopyBestMethodStereo(
    __global FLACCLSubframeTask *tasks_out,
    __global FLACCLSubframeTask *tasks,
    int count
    )
 {
    int best_index[4];
    int best_size[4];
    int lr_index[2];
    for (int i = 0; i < 4; i++)
    {
 	int best = tasks[count * (get_group_id(0) * 4 + i)].data.best_index;
 	best_index[i] = best;
 	best_size[i] = tasks[best].data.size;
    }
    int bitsBest = best_size[2] + best_size[3]; // MidSide
    lr_index[0] = best_index[2];
    lr_index[1] = best_index[3];
    if (bitsBest > best_size[3] + best_size[1]) // RightSide
    {
 	bitsBest = best_size[3] + best_size[1];
 	lr_index[0] = best_index[3];
 	lr_index[1] = best_index[1];
    }
    if (bitsBest > best_size[0] + best_size[3]) // LeftSide
    {
 	bitsBest = best_size[0] + best_size[3];
 	lr_index[0] = best_index[0];
 	lr_index[1] = best_index[3];
    }
    if (bitsBest > best_size[0] + best_size[1]) // LeftRight
    {
 	bitsBest = best_size[0] + best_size[1];
 	lr_index[0] = best_index[0];
 	lr_index[1] = best_index[1];
    }
    tasks_out[2 * get_group_id(0)] = tasks[lr_index[0]];
    tasks_out[2 * get_group_id(0)].data.residualOffs = tasks[best_index[0]].data.residualOffs;
    tasks_out[2 * get_group_id(0) + 1] = tasks[lr_index[1]];
    tasks_out[2 * get_group_id(0) + 1].data.residualOffs = tasks[best_index[1]].data.residualOffs;
 }
 #endif
--- a/CUETools.FLACCL.cmd/Program.cs
+++ b/CUETools.FLACCL.cmd/Program.cs
@@ -72,6 +72,7 @@ namespace CUETools.FLACCL.cmd
 			string window_function = null;
 			string input_file = null;
 			string output_file = null;
 			string device_type = null;
 			int min_partition_order = -1, max_partition_order = -1,
 				min_lpc_order = -1, max_lpc_order = -1,
 				min_fixed_order = -1, max_fixed_order = -1,
@@ -112,6 +113,10 @@ namespace CUETools.FLACCL.cmd
 					settings.GroupSize = intarg;
 				else if (args[arg] == "--define" && arg + 2 < args.Length)
 					settings.Defines += "#define " + args[++arg] + " " + args[++arg] + "\n";
 				else if (args[arg] == "--opencl-platform" && ++arg < args.Length)
 					settings.Platform = args[arg];
 				else if (args[arg] == "--opencl-type" && ++arg < args.Length)
 					device_type = args[arg];
 				else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
 					output_file = args[arg];
 				else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
@@ -210,6 +215,8 @@ namespace CUETools.FLACCL.cmd
 			try
 			{
 				if (device_type != null)
 					settings.DeviceType = (OpenCLDeviceType)(Enum.Parse(typeof(OpenCLDeviceType), device_type, true));
 				encoder.Settings = settings;
 				if (level >= 0)
 					encoder.CompressionLevel = level;