optimizations

2025-12-16 18:14:25 +00:00 · 2010-12-02 15:58:41 +00:00
parent 64a04f0912
commit 6783bba2e9
7 changed files with 283 additions and 209 deletions
--- a/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj
+++ b/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj
@@ -65,8 +65,6 @@
    </ProjectReference>
  </ItemGroup>
  <ItemGroup>
    <None Include="flac.cu">
    </None>
    <Content Include="flac.cl">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </Content>
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -41,7 +41,7 @@ namespace CUETools.Codecs.FLACCL
            this.MappedMemory = false;
 			this.DoMD5 = true; 
 			this.GroupSize = 128;
-			this.TaskSize = 32;
+			this.TaskSize = 8;
 			this.DeviceType = OpenCLDeviceType.GPU;
 		}
@@ -67,17 +67,19 @@ namespace CUETools.Codecs.FLACCL
        [SRDescription(typeof(Properties.Resources), "DescriptionMappedMemory")]
        public bool MappedMemory { get; set; }
 		[TypeConverter(typeof(FLACCLWriterSettingsGroupSizeConverter))]
 		[DefaultValue(128)]
 		[SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")]
 		public int GroupSize { get; set; }
-		[DefaultValue(32)]
+		[DefaultValue(8)]
 		[SRDescription(typeof(Properties.Resources), "DescriptionTaskSize")]
 		public int TaskSize { get; set; }
 		[SRDescription(typeof(Properties.Resources), "DescriptionDefines")]
 		public string Defines { get; set; }
 		[TypeConverter(typeof(FLACCLWriterSettingsPlatformConverter))]
 		[SRDescription(typeof(Properties.Resources), "DescriptionPlatform")]
 		public string Platform { get; set; }
@@ -103,6 +105,35 @@ namespace CUETools.Codecs.FLACCL
 		}
 	}
 	public class FLACCLWriterSettingsPlatformConverter : TypeConverter
 	{
 		public override bool GetStandardValuesSupported(ITypeDescriptorContext context)
 		{
 			return true;
 		}
 		public override StandardValuesCollection GetStandardValues(ITypeDescriptorContext context)
 		{			
 			var res = new List<string>();
 			foreach (var p in OpenCL.GetPlatforms())
 				res.Add(p.Name);
 			return new StandardValuesCollection(res);
 		}
 	}
 	public class FLACCLWriterSettingsGroupSizeConverter : TypeConverter
 	{
 		public override bool GetStandardValuesSupported(ITypeDescriptorContext context)
 		{
 			return true;
 		}
 		public override StandardValuesCollection GetStandardValues(ITypeDescriptorContext context)
 		{
 			return new StandardValuesCollection(new int[] { 64, 128, 256 });
 		}
 	}
 	public enum OpenCLDeviceType : ulong
 	{
 		CPU = DeviceType.CPU,
@@ -173,6 +204,8 @@ namespace CUETools.Codecs.FLACCL
 		FLACCLTask[] cpu_tasks;
 		int oldest_cpu_task = 0;
 		internal int framesPerTask;
 		AudioPCMConfig _pcm;
 		public const int MAX_BLOCKSIZE = 65536;
@@ -1037,7 +1070,7 @@ namespace CUETools.Codecs.FLACCL
 				if (task.nWindowFunctions == 0)
 					throw new Exception("invalid windowfunction");
                if (!_settings.MappedMemory)
-				    task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, true, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr);
+				    task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, false, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr);
 			}
 			task.nResidualTasks = 0;
@@ -1163,9 +1196,9 @@ namespace CUETools.Codecs.FLACCL
 				throw new Exception("oops");
            if (!_settings.MappedMemory)
-			    task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr);
+			    task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr);
 			if (!_settings.MappedMemory)
-				task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, true, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr);		
+				task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, false, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr);		
 		}
 		unsafe void encode_residual(FLACCLTask task)
@@ -1471,7 +1504,7 @@ namespace CUETools.Codecs.FLACCL
 			int channelsCount = doMidside ? 2 * channels : channels;
 			if (task.nResidualTasks == 0)
-				initializeSubframeTasks(task.frameSize, channelsCount, _settings.TaskSize, task);
+				initializeSubframeTasks(task.frameSize, channelsCount, framesPerTask, task);
 			estimate_residual(task, channelsCount);
 		}
@@ -1611,10 +1644,9 @@ namespace CUETools.Codecs.FLACCL
 				}
 				OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
-				bool haveAtom = false;
+				this.framesPerTask = (int)OCLMan.Context.Devices[0].MaxComputeUnits * _settings.TaskSize;
-				if (OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics"))
+
-					haveAtom = true;					
+				if (!OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics"))
 				else
 					_settings.GPUOnly = false;
 				// The Defines string gets prepended to any and all sources that are compiled
@@ -1625,13 +1657,20 @@ namespace CUETools.Codecs.FLACCL
 					"#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
 					(_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") +
 					(_settings.DoRice ? "#define DO_RICE\n" : "") +
 					(haveAtom ? "#define HAVE_ATOM\n" : "") +
 #if DEBUG
 					"#define DEBUG\n" +
 #endif
 					(_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") +
 					_settings.Defines + "\n";
 				var exts = new string[] { "cl_khr_local_int32_base_atomics", "cl_khr_local_int32_extended_atomics", "cl_khr_fp64", "cl_amd_fp64" };
 				foreach (string extension in exts)
 					if (OCLMan.Context.Devices[0].Extensions.Contains(extension))
 					{
 						OCLMan.Defines += "#pragma OPENCL EXTENSION " + extension + ": enable\n";
 						OCLMan.Defines += "#define HAVE_" + extension + "\n";
 					}
 				try
 				{
 					openCLProgram = OCLMan.CompileFile("flac.cl");
@@ -1698,7 +1737,7 @@ namespace CUETools.Codecs.FLACCL
 			int pos = 0;
 			while (pos < buff.Length)
 			{
-				int block = Math.Min(buff.Length - pos, eparams.block_size * _settings.TaskSize - samplesInBuffer);
+				int block = Math.Min(buff.Length - pos, eparams.block_size * framesPerTask - samplesInBuffer);
 				fixed (byte* buf = buff.Bytes)
 					AudioSamples.MemCpy(((byte*)task1.clSamplesBytesPtr) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign);
@@ -1707,7 +1746,7 @@ namespace CUETools.Codecs.FLACCL
 				pos += block;
 				int nFrames = samplesInBuffer / eparams.block_size;
-				if (nFrames >= _settings.TaskSize)
+				if (nFrames >= framesPerTask)
 					do_output_frames(nFrames);
 			}
 			if (md5 != null)
@@ -2405,7 +2444,7 @@ namespace CUETools.Codecs.FLACCL
 			openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLProgram.Context.Devices[0], prop);
            int MAX_ORDER = this.writer.eparams.max_prediction_order;
-			int MAX_FRAMES = this.writer._settings.TaskSize;
+			int MAX_FRAMES = this.writer.framesPerTask;
 			int MAX_CHANNELSIZE = MAX_FRAMES * writer.eparams.block_size;
 			residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES;
 			bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES;
@@ -2440,14 +2479,14 @@ namespace CUETools.Codecs.FLACCL
 				clSelectedTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen);
 				clRiceOutputPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen);
-                clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytesPinned, true, MapFlags.WRITE, 0, samplesBufferLen / 2);
+                clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytesPinned, true, MapFlags.READ_WRITE, 0, samplesBufferLen / 2);
-				clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidualPinned, true, MapFlags.WRITE, 0, residualBufferLen);
+				clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidualPinned, true, MapFlags.READ_WRITE, 0, residualBufferLen);
-                clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParamsPinned, true, MapFlags.WRITE, 0, riceParamsLen / 4);
+				clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParamsPinned, true, MapFlags.READ_WRITE, 0, riceParamsLen / 4);
-                clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasksPinned, true, MapFlags.WRITE, 0, residualTasksLen);
+				clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasksPinned, true, MapFlags.READ_WRITE, 0, residualTasksLen);
-                clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasksPinned, true, MapFlags.WRITE, 0, bestResidualTasksLen);
+				clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasksPinned, true, MapFlags.READ_WRITE, 0, bestResidualTasksLen);
-                clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctionsPinned, true, MapFlags.WRITE, 0, wndLen);
+				clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctionsPinned, true, MapFlags.READ_WRITE, 0, wndLen);
-				clSelectedTasksPtr = openCLCQ.EnqueueMapBuffer(clSelectedTasksPinned, true, MapFlags.WRITE, 0, selectedLen);
+				clSelectedTasksPtr = openCLCQ.EnqueueMapBuffer(clSelectedTasksPinned, true, MapFlags.READ_WRITE, 0, selectedLen);
-				clRiceOutputPtr = openCLCQ.EnqueueMapBuffer(clRiceOutputPinned, true, MapFlags.WRITE, 0, riceLen);
+				clRiceOutputPtr = openCLCQ.EnqueueMapBuffer(clRiceOutputPinned, true, MapFlags.READ_WRITE, 0, riceLen);
 			}
            else
            {
@@ -2904,7 +2943,7 @@ namespace CUETools.Codecs.FLACCL
 					openCLCQ.EnqueueNDRangeKernel(
 						clCalcOutputOffsets,
-						groupSize,
+						openCLCQ.Device.DeviceType == DeviceType.CPU ? groupSize : 32,
 						1);
 					clRiceEncoding.SetArgs(
--- a/CUETools.Codecs.FLACCL/Properties/Resources.Designer.cs
+++ b/CUETools.Codecs.FLACCL/Properties/Resources.Designer.cs
@@ -124,7 +124,7 @@ namespace CUETools.Codecs.FLACCL.Properties {
        }
        /// <summary>
-        ///   Looks up a localized string similar to OpenCL platform to use (ATI Stream, NVIDIA OpenCL, Intel OpenCL, etc).
+        ///   Looks up a localized string similar to OpenCL platform to use.
        /// </summary>
        internal static string DescriptionPlatform {
            get {
@@ -133,7 +133,7 @@ namespace CUETools.Codecs.FLACCL.Properties {
        }
        /// <summary>
-        ///   Looks up a localized string similar to Number of frames processed simultaniously (32, 64).
+        ///   Looks up a localized string similar to Number of frames processed per one multiprocessor.
        /// </summary>
        internal static string DescriptionTaskSize {
            get {
--- a/CUETools.Codecs.FLACCL/Properties/Resources.resx
+++ b/CUETools.Codecs.FLACCL/Properties/Resources.resx
@@ -139,10 +139,10 @@
    <value>Device uses host memory (Don't use)</value>
  </data>
  <data name="DescriptionPlatform" xml:space="preserve">
-    <value>OpenCL platform to use (ATI Stream, NVIDIA OpenCL, Intel OpenCL, etc)</value>
+    <value>OpenCL platform to use</value>
  </data>
  <data name="DescriptionTaskSize" xml:space="preserve">
-    <value>Number of frames processed simultaniously (32, 64)</value>
+    <value>Number of frames processed per one multiprocessor</value>
  </data>
  <data name="DoMD5Description" xml:space="preserve">
    <value>Calculate MD5 hash for audio stream</value>
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -28,10 +28,30 @@
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #endif
-#ifdef __CPU__
+#if defined(HAVE_cl_khr_local_int32_base_atomics) && defined(HAVE_cl_khr_local_int32_extended_atomics)
-#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#define HAVE_ATOM
 #endif
 #if defined(HAVE_cl_khr_fp64) || defined(HAVE_cl_amd_fp64)
 #define HAVE_DOUBLE
 #define ZEROD 0.0
 //#define FAST_DOUBLE
 #else
 #define double float
 #define double4 float4
 #define ZEROD 0.0f
 #endif
 #if defined(HAVE_DOUBLE) && defined(FAST_DOUBLE)
 #define fastdouble double
 #define fastdouble4 double4
 #define ZEROFD 0.0
 #else
 #define fastdouble float
 #define fastdouble4 float4
 #define ZEROFD 0.0f
 #endif
 //#if __OPENCL_VERSION__ == 110
 #ifdef AMD
 #define iclamp(a,b,c) clamp(a,b,c)
@@ -45,11 +65,6 @@
 #define WARP_SIZE 32
 #ifdef HAVE_ATOM
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
 #endif
 typedef enum
 {
    Constant = 0,
@@ -303,7 +318,6 @@ void clComputeAutocor(
 #else
 // get_num_groups(0) == number of tasks
 // get_num_groups(1) == number of windows
 #if 0
 __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void clComputeAutocor(
    __global float *output,
@@ -313,7 +327,7 @@ void clComputeAutocor(
    const int taskCount // tasks per block
 )
 {
-    __local float data[GROUP_SIZE * 2];
+    __local fastdouble data[GROUP_SIZE * 2];
    __local FLACCLSubframeData task;
    const int tid = get_local_id(0);
    // fetch task data
@@ -322,160 +336,50 @@ void clComputeAutocor(
    barrier(CLK_LOCAL_MEM_FENCE);
    int bs = task.blocksize;
-    int windowOffs = get_group_id(1) * bs;
+    data[tid] = ZEROFD;
 //   if (tid < GROUP_SIZE / 4)
 //   {
 	//float4 dd = 0.0f;
 	//if (tid * 4 < bs)
 	//    dd = vload4(tid, window + windowOffs) * convert_float4(vload4(tid, samples + task.samplesOffs));
 	//vstore4(dd, tid, &data[0]);
 //   }
    data[tid] = 0.0f;
    // This simpler code doesn't work somehow!!!
    //data[tid] = tid < bs ? samples[task.samplesOffs + tid] * window[windowOffs + tid] : 0.0f;
    const int THREADS_FOR_ORDERS = MAX_ORDER < 8 ? 8 : MAX_ORDER < 16 ? 16 : MAX_ORDER < 32 ? 32 : 64;
    float corr = 0.0f;
    float corr1 = 0.0f;
    for (int pos = 0; pos < bs; pos += GROUP_SIZE)
    {
 	// fetch samples
 	float nextData = pos + tid < bs ? samples[task.samplesOffs + pos + tid] * window[windowOffs + pos + tid] : 0.0f;
 	data[tid + GROUP_SIZE] = nextData;
 	barrier(CLK_LOCAL_MEM_FENCE);
 	int lag = tid & (THREADS_FOR_ORDERS - 1);
 	int tid1 = tid + GROUP_SIZE - lag;
 #ifdef AMD
 	float4 res = 0.0f;
 	for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
 	    res += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
 	corr += res.x + res.y + res.w + res.z;
 #else
 	float res = 0.0f;
 	for (int i = 0; i < THREADS_FOR_ORDERS; i++)
 	    res += data[tid1 - lag + i] * data[tid1 + i];
 	corr += res;
 #endif
 	if ((pos & (GROUP_SIZE * 15)) == 0)
 	{
 	    corr1 += corr;
 	    corr = 0.0f;
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	data[tid] = nextData;
    }
    data[tid] = corr + corr1;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = GROUP_SIZE / 2; i >= THREADS_FOR_ORDERS; i >>= 1)
    {
 	if (tid < i)
 	    data[tid] += data[tid + i];
 	barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid <= MAX_ORDER)
 	output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = data[tid];
 }
 #else
 __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void clComputeAutocor(
    __global float *output,
    __global const int *samples,
    __global const float *window,
    __global FLACCLSubframeTask *tasks,
    const int taskCount // tasks per block
 )
 {
    __local float data[GROUP_SIZE * 2 + 32];
    __local FLACCLSubframeData task;
    const int tid = get_local_id(0);
    // fetch task data
    if (tid < sizeof(task) / sizeof(int))
 	((__local int*)&task)[tid] = ((__global int*)(tasks + taskCount * get_group_id(0)))[tid];
    barrier(CLK_LOCAL_MEM_FENCE);
    int bs = task.blocksize;
    data[tid] = 0.0f;
    if (tid < 32)
 	data[GROUP_SIZE * 2 + tid] = 0.0f;
    const int THREADS_FOR_ORDERS = MAX_ORDER < 8 ? 8 : MAX_ORDER < 16 ? 16 : MAX_ORDER < 32 ? 32 : 64;
    int lag = tid & (THREADS_FOR_ORDERS - 1);
    int tid1 = tid + GROUP_SIZE - lag;
    int pos = 0;
    const __global float * wptr = &window[get_group_id(1) * bs];
-#ifdef AMD
+//    const __global int * sptr = &samples[task.samplesOffs];
-    float4 corr = 0.0f;
+    double corr = ZEROD;
-#else
+
    float corr = 0.0f;
 #endif
    float corr1 = 0.0f;
    for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE)
    {
 	// fetch samples
 	int off = pos + tid;
-//    const __global int * sptr = &samples[task.samplesOffs];
+	// fetch samples
-	float nextData = samples[task.samplesOffs + off] * wptr[off];
+	fastdouble nextData = samples[task.samplesOffs + off] * wptr[off];
 	data[tid + GROUP_SIZE] = nextData;
 	barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef AMD
+	fastdouble4 tmp = ZEROFD;
 	for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
-	    corr += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
+	    tmp += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
-#else
+	corr += (tmp.x + tmp.y) + (tmp.w + tmp.z);
 	for (int i = 0; i < THREADS_FOR_ORDERS; i++)
 	    corr += data[tid1 - lag + i] * data[tid1 + i];
 #endif
 	if ((pos & (GROUP_SIZE * 15)) == 0)
 	{
 #ifdef AMD
 	    corr1 += (corr.x + corr.y) + (corr.w + corr.z);
 #else
 	    corr1 += corr;
 #endif
 	    corr = 0.0f;
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	data[tid] = nextData;
    }
    if (pos < bs)
    {
 	// fetch samples
 	int off = pos + tid;
-	float nextData = off < bs ? samples[task.samplesOffs + off] * wptr[off] : 0.0f;
+	// fetch samples
 	double nextData = off < bs ? samples[task.samplesOffs + off] * wptr[off] : ZEROD;
 	data[tid + GROUP_SIZE] = nextData;
 	barrier(CLK_LOCAL_MEM_FENCE);
-	int lag = tid & (THREADS_FOR_ORDERS - 1);
+	fastdouble4 tmp = ZEROFD;
 	int tid1 = tid + GROUP_SIZE - lag;
 //#if 1
 #ifdef AMD
 	float4 res = 0.0f;
 	for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
-	    res += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
+	    tmp += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
-	corr1 += res.x + res.y + res.w + res.z;
+	corr += (tmp.x + tmp.y) + (tmp.w + tmp.z);
 #else
 	for (int i = 0; i < THREADS_FOR_ORDERS; i++)
 	    corr1 += data[tid1 - lag + i] * data[tid1 + i];
 #endif
 	barrier(CLK_LOCAL_MEM_FENCE);
 	data[tid] = nextData;
    }
-#ifdef AMD
+    data[tid] = corr;
    corr1 += corr.x + corr.y + corr.w + corr.z;
 #else
    corr1 += corr;
 #endif
    data[tid] = corr1;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = GROUP_SIZE / 2; i >= THREADS_FOR_ORDERS; i >>= 1)
    {
@@ -488,7 +392,6 @@ void clComputeAutocor(
 	output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = data[tid];
 }
 #endif
 #endif
 #ifdef FLACCL_CPU
 __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
@@ -558,8 +461,8 @@ void clComputeLPC(
 )
 {
    __local struct {
-	volatile float ldr[32];
+	volatile double ldr[32];
-	volatile float gen1[32];
+	volatile double gen1[32];
 	volatile float error[32];
 	volatile float autoc[33];
    } shared;
@@ -575,9 +478,9 @@ void clComputeLPC(
    barrier(CLK_LOCAL_MEM_FENCE);
    // Compute LPC using Schur and Levinson-Durbin recursion
-    float gen0 = shared.gen1[get_local_id(0)] = shared.autoc[get_local_id(0)+1];
+    double gen0 = shared.gen1[get_local_id(0)] = shared.autoc[get_local_id(0)+1];
-    shared.ldr[get_local_id(0)] = 0.0f;
+    shared.ldr[get_local_id(0)] = ZEROD;
-    float error = shared.autoc[0];
+    double error = shared.autoc[0];
 #ifdef DEBUGPRINT1
    int magic = shared.autoc[0] == 177286873088.0f;
@@ -589,10 +492,10 @@ void clComputeLPC(
    for (int order = 0; order < MAX_ORDER; order++)
    {
 	// Schur recursion
-	float reff = -shared.gen1[0] / error;
+	double reff = -shared.gen1[0] / error;
 	//error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
 	error *= (1 - reff * reff);
-	float gen1;
+	double gen1;
 	if (get_local_id(0) < MAX_ORDER - 1 - order)
 	{
 	    gen1 = shared.gen1[get_local_id(0) + 1] + reff * gen0;
@@ -613,7 +516,7 @@ void clComputeLPC(
 	    shared.error[order] = error;
 	// Levinson-Durbin recursion
-	float ldr = shared.ldr[get_local_id(0)];
+	double ldr = shared.ldr[get_local_id(0)];
        barrier(CLK_LOCAL_MEM_FENCE);
 	if (get_local_id(0) < order)
 	    shared.ldr[order - 1 - get_local_id(0)] += reff * ldr;
@@ -651,13 +554,21 @@ void clQuantizeLPC(
    float error[MAX_ORDER];
    int best_orders[MAX_ORDER];
    int best8 = 0;
    // Load prediction error estimates based on Akaike's Criteria
    for (int tid = 0; tid < MAX_ORDER; tid++)
    {
-	error[tid] = bs * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)bs);
+	error[tid] = bs * log(1.0f + lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)bs);
 	best_orders[tid] = tid;
 	if (tid < 8 && error[tid] < error[best8])
 	    best8 = tid;
    }
 #if 0
    for (int i = best8 + 1; i < MAX_ORDER; i++)
 	error[i] += 20.5f * log((float)bs);
 #endif
    // Select best orders 
    for (int i = 0; i < MAX_ORDER && i < taskCountLPC; i++)
    {
@@ -730,6 +641,7 @@ void clQuantizeLPC(
 #ifndef HAVE_ATOM
 	volatile int tmp[32];
 #endif
 //    	volatile int best8;
    } shared;
    const int tid = get_local_id(0);
@@ -752,6 +664,17 @@ void clQuantizeLPC(
    if (tid < MAX_ORDER)
 	shared.error[tid] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)shared.task.blocksize);
 	//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
 #if 0
    if (tid == 0)
    {
 	int b8 = 0;
        for (int i = 1; i < 8; i++)
 	    if (shared.error[i] < shared.error[b8])
 		b8 = i;
 	shared.best8 = b8;
    }
    shared.error[tid] += select(0.0f, 20.5f * log((float)shared.task.blocksize), tid > shared.best8);
 #endif
    barrier(CLK_LOCAL_MEM_FENCE);
    // Sort using bitonic sort
@@ -1452,6 +1375,7 @@ void clCalcPartition16(
    __local FLACCLSubframeTask task;
    __local int data[GROUP_SIZE * 2];
    __local int res[GROUP_SIZE];
    __local int pl[GROUP_SIZE >> 4][15];
    const int tid = get_local_id(0);
    if (tid < sizeof(task) / sizeof(int))
@@ -1524,8 +1448,16 @@ void clCalcPartition16(
 	sum = (vload4(0,chunk) >> k) + (vload4(1,chunk) >> k) + (vload4(2,chunk) >> k) + (vload4(3,chunk) >> k);
 	s = sum.x + sum.y + sum.z + sum.w;
 #if 0
 	if (k <= 14 && offs < bs)
 	    plptr[offs >> 4] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
 #else
 	if (k <= 14) pl[x][k] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
 	barrier(CLK_LOCAL_MEM_FENCE);
        int k1 = tid >> 3, x1 = tid & 7;
 	if (k1 <= 14 && (pos >> 4) + x1 < (1 << max_porder))
 	    partition_lengths[(15 << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1];
 #endif
 //    	if (task.data.blocksize == 16 && x == 0 && k <= 14)
 //	    printf("[%d] = %d = s:%d + %d * (k:%d + 1), ro=%d, offs=%d, lpos=%d\n", k, partition_lengths[lpos], s, (16 - select(0, ro, offs < 16)), k, ro, offs, lpos);
@@ -1849,13 +1781,12 @@ inline int len_utf8(int n)
 #else
    int bts = 31 - clz(n);
 #endif
-    if (bts < 7)
+    return select(8, 8 * ((bts + 4) / 5), bts > 6);
 	return 8;
    return 8 * ((bts + 4) / 5);
 }
 #ifdef FLACCL_CPU
 // get_global_id(0) * channels == task index
-__kernel 
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
 void clCalcOutputOffsets(
    __global int *residual,
    __global int *samples,
@@ -1875,8 +1806,7 @@ void clCalcOutputOffsets(
 	    ;
 	int bs = tasks[iFrame * channels].data.blocksize;
 	//public static readonly int[] flac_blocksizes = new int[15] { 0, 192, 576, 1152, 2304, 4608, 0, 0, 256, 512, 1024, 2048, 4096, 8192, 16384 };	
-	if (bs != 4096 && bs != 4608) // TODO: check all other standard sizes
+	offset += select(0, select(8, 16, bs >= 256), bs != 4096 && bs != 4608); // TODO: check all other standard sizes
 	    offset += select(8, 16, bs >= 256);
 	// assert (offset % 8) == 0
 	offset += 8;
@@ -1893,6 +1823,56 @@ void clCalcOutputOffsets(
 	offset += 16;
    }
 }
 #else
 // get_global_id(0) * channels == task index
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
 void clCalcOutputOffsets(
    __global int *residual,
    __global int *samples,
    __global FLACCLSubframeTask *tasks,
    int channels1,
    int frameCount,
    int firstFrame
    )
 {
    const int channels = 2;
    __local FLACCLSubframeData ltasks[2];
    __local volatile int mypos[2];
    int offset = 0;
    for (int iFrame = 0; iFrame < frameCount; iFrame++)
    {
 	if (get_local_id(0) < sizeof(ltasks[0]) / sizeof(int))
 	    for (int ch = 0; ch < channels; ch++)
 		((__local int*)&ltasks[ch])[get_local_id(0)] = ((__global int*)(&tasks[iFrame * channels + ch]))[get_local_id(0)];
 	//printf("len_utf8(%d) == %d\n", firstFrame + iFrame, len_utf8(firstFrame + iFrame));
 	offset += 15 + 1 + 4 + 4 + 4 + 3 + 1 + len_utf8(firstFrame + iFrame)
 	    // + 8-16 // custom block size
 	    // + 8-16 // custom sample rate
 	    ;
 	int bs = ltasks[0].blocksize;
 	//public static readonly int[] flac_blocksizes = new int[15] { 0, 192, 576, 1152, 2304, 4608, 0, 0, 256, 512, 1024, 2048, 4096, 8192, 16384 };	
 	offset += select(0, select(8, 16, bs >= 256), bs != 4096 && bs != 4608); // TODO: check all other standard sizes
 	// assert (offset % 8) == 0
 	offset += 8;
 	if (get_local_id(0) < channels)
 	{
 	    int ch = get_local_id(0);
 	    // Add 64 bits to separate frames if header is too small so they can intersect
 	    int mylen = 8 + ltasks[ch].wbits + 64 + ltasks[ch].size;
 	    mypos[ch] = mylen;
 	    for (int offset = 1; offset < WARP_SIZE && offset < channels; offset <<= 1)
 		if (ch >= offset) mypos[ch] += mypos[ch - offset];
 	    mypos[ch] += offset;
 	    tasks[iFrame * channels + ch].data.encodingOffset = mypos[ch] - ltasks[ch].size + ltasks[ch].headerLen;
 	}
 	offset = mypos[channels - 1];
 	offset = (offset + 7) & ~7;
 	offset += 16;
    }
 }
 #endif
 // get_group_id(0) == task index
 __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
@@ -2000,7 +1980,9 @@ void clRiceEncoding(
 #else
    __local unsigned int data[GROUP_SIZE];
    __local volatile int mypos[GROUP_SIZE+1];
-    //__local int brp[256];
+#if 0
    __local int brp[256];
 #endif
    __local volatile int warppos[WARP_SIZE];
    __local FLACCLSubframeData task;
@@ -2014,8 +1996,10 @@ void clRiceEncoding(
 	mypos[GROUP_SIZE] = 0;
    if (tid < WARP_SIZE)
 	warppos[tid] = 0;
- //   for (int offs = tid; offs < (1 << task.porder); offs ++)
+#if 0
-	//brp[offs] = best_rice_parameters[(get_group_id(0) << max_porder) + offs];
+    for (int offs = tid; offs < (1 << task.porder); offs ++)
 	brp[offs] = best_rice_parameters[(get_group_id(0) << max_porder) + offs];
 #endif
    data[tid] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    const int bs = task.blocksize;
@@ -2023,19 +2007,23 @@ void clRiceEncoding(
    int plen = bs >> task.porder;
    //int plenoffs = 12 - task.porder;
    unsigned int remainder = 0U;
-    for (int pos = 0; pos < bs; pos += GROUP_SIZE)
+    int pos;
    for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE)
    {
 	int offs = pos + tid;
-	int v = offs < bs ? residual[task.residualOffs + offs] : 0;
+	int v = residual[task.residualOffs + offs];
 	int part = offs / plen; // >> plenoffs;
-	//int k = brp[min(255, part)];
+#if 0
-	int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0;
+	int k = brp[part];
 #else
 	int k = best_rice_parameters[(get_group_id(0) << max_porder) + part];
 #endif
 	int pstart = offs == task.residualOrder || offs == part * plen;
 	v = (v << 1) ^ (v >> 31);
 	int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
 	mypos[tid] = mylen;
 	// Inclusive scan(+)
 #if 1
 	int lane = (tid & (WARP_SIZE - 1));
 	for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
 	    mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)];
@@ -2052,19 +2040,68 @@ void clRiceEncoding(
 	mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0);
 	int start32 = start >> 5;
 	start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2];
-#else
+
 	//if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32)
 	//    printf("Oops: %d\n", mypos[tid]);
 	data[tid] = select(0U, remainder, tid == 0);
 	barrier(CLK_LOCAL_MEM_FENCE);
-	for (int offset = 1; offset < GROUP_SIZE; offset <<= 1)
+	if (mylen)
 	{
-	    int t = tid >= offset ? mypos[tid - offset] : 0;
+	    if (pstart)
-	    barrier(CLK_LOCAL_MEM_FENCE);
+	    {
-	    mypos[tid] += t;
+		int kpos = mp - mylen;
-	    barrier(CLK_LOCAL_MEM_FENCE);
+		int kpos0 = (kpos >> 5) - start32;
 		int kpos1 = kpos & 31;
 		unsigned int kval = (unsigned int)k << 28;
 		unsigned int kval0 = kval >> kpos1;
 		unsigned int kval1 = kval << (32 - kpos1);
 		if (kval0) atom_or(&data[kpos0], kval0);
 		if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1);
 	    }
-	int mp = start + mypos[tid];
+	    int qpos = mp - k - 1;
-	int start32 = start / 32;
+	    int qpos0 = (qpos >> 5) - start32;
-	start += mypos[GROUP_SIZE - 1];
+	    int qpos1 = qpos & 31;
-#endif
+	    unsigned int qval = (1U << 31) | ((unsigned int)v << (31 - k));
 	    unsigned int qval0 = qval >> qpos1;
 	    unsigned int qval1= qval << (32 - qpos1);
 	    if (qval0) atom_or(&data[qpos0], qval0);
 	    if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	if ((start32 + tid) * 32 <= start)
 	    output[start32 + tid] = as_int(as_char4(data[tid]).wzyx);
 	remainder = data[start / 32 - start32];
    }
    if (pos < bs)
    {
 	int offs = pos + tid;
 	int v = offs < bs ? residual[task.residualOffs + offs] : 0;
 	int part = offs / plen; // >> plenoffs;
 	//int k = brp[min(255, part)];
 	int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0;
 	int pstart = offs == task.residualOrder || offs == part * plen;
 	v = (v << 1) ^ (v >> 31);
 	int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
 	mypos[tid] = mylen;
 	// Inclusive scan(+)
 	int lane = (tid & (WARP_SIZE - 1));
 	for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
 	    mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)];
 	int mp = mypos[tid];
 	if ((tid & (WARP_SIZE - 1)) == WARP_SIZE - 1)
 	    warppos[tid/WARP_SIZE] = mp;
 	barrier(CLK_LOCAL_MEM_FENCE);
 	if (tid < GROUP_SIZE/WARP_SIZE)
 	{
 	    for (int offset = 1; offset < GROUP_SIZE/WARP_SIZE; offset <<= 1)
 		warppos[tid] += warppos[select(GROUP_SIZE/WARP_SIZE, tid - offset, tid >= offset)];
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0);
 	int start32 = start >> 5;
 	start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2];
 	//if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32)
 	//    printf("Oops: %d\n", mypos[tid]);
 	data[tid] = select(0U, remainder, tid == 0);
--- a/CUETools.FLACCL.cmd/CUETools.FLACL.cmd.csproj
+++ b/CUETools.FLACCL.cmd/CUETools.FLACL.cmd.csproj
@@ -32,6 +32,7 @@
    <DefineConstants>TRACE</DefineConstants>
    <ErrorReport>prompt</ErrorReport>
    <WarningLevel>4</WarningLevel>
    <PlatformTarget>AnyCPU</PlatformTarget>
  </PropertyGroup>
  <ItemGroup>
    <Reference Include="System" />
--- a/CUETools.FLACCL.cmd/Program.cs
+++ b/CUETools.FLACCL.cmd/Program.cs
@@ -47,17 +47,16 @@ namespace CUETools.FLACCL.cmd
 			Console.WriteLine("OpenCL Options:");
 			Console.WriteLine();
 			Console.WriteLine(" --opencl-type <X>    CPU or GPU, default GPU");
-			Console.WriteLine(" --opencl-platform '' 'ATI Stream', 'NVIDIA Cuda', 'Intel OpenCL' etc");
+			Console.WriteLine(" --opencl-platform    'ATI Stream', 'NVIDIA CUDA', 'Intel OpenCL' etc");
 			Console.WriteLine(" --group-size #       Set GPU workgroup size (64,128,256)");
-			Console.WriteLine(" --task-size #        Set number of frames per GPU call, default 32");
+			Console.WriteLine(" --task-size #        Set number of frames per multiprocessor, default 8");
 			Console.WriteLine(" --slow-gpu           Some encoding stages are done on CPU");
-			Console.WriteLine(" --do-rice            Experimental mode, not recommended");
+			Console.WriteLine(" --fast-gpu           Experimental mode, not recommended");
 			Console.WriteLine(" --define <X> <Y>     OpenCL preprocessor definition");
 			Console.WriteLine();
 			Console.WriteLine("Advanced Options:");
 			Console.WriteLine();
 			Console.WriteLine(" -b #                 Block size");
 			Console.WriteLine(" -v #                 Variable block size mode (0,4)");
 			Console.WriteLine(" -s <method>          Stereo decorrelation (independent,search)");
 			Console.WriteLine(" -r #[,#]             Rice partition order {max} or {min},{max} (0..8)");
 			Console.WriteLine();
@@ -88,9 +87,7 @@ namespace CUETools.FLACCL.cmd
 				min_precision = -1, max_precision = -1,
 				orders_per_window = -1, orders_per_channel = -1,
 				blocksize = -1;
 #if DEBUG
 			int input_len = 4096, input_val = 0;
 #endif
 			int level = -1, padding = -1, vbr_mode = -1;
 			bool do_seektable = true;
 			bool buffered = false;
@@ -111,7 +108,7 @@ namespace CUETools.FLACCL.cmd
 					do_seektable = false;
 				else if (args[arg] == "--slow-gpu")
 					settings.GPUOnly = false;
-				else if (args[arg] == "--do-rice")
+				else if (args[arg] == "--fast-gpu")
 					settings.DoRice = true;
 				else if (args[arg] == "--no-md5")
 					settings.DoMD5 = false;
@@ -135,12 +132,10 @@ namespace CUETools.FLACCL.cmd
 					settings.MappedMemory = true;
 				else if (args[arg] == "--opencl-type" && ++arg < args.Length)
 					device_type = args[arg];
 #if DEBUG
 				else if (args[arg] == "--input-length" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
 					input_len = intarg;
 				else if (args[arg] == "--input-value" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
 					input_val = intarg;
 #endif
 				else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
 					output_file = args[arg];
 				else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
@@ -218,10 +213,8 @@ namespace CUETools.FLACCL.cmd
 			IAudioSource audioSource;
 			if (input_file == "-")
 				audioSource = new WAVReader("", Console.OpenStandardInput());
 #if DEBUG
 			else if (input_file == "nul")
 				audioSource = new SilenceGenerator(input_len, input_val);
 #endif
 			else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".wav")
 				audioSource = new WAVReader(input_file, null);
 			else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".flac")
@@ -326,6 +319,9 @@ namespace CUETools.FLACCL.cmd
 				Console.Error.Write("\r                                                                         \r");
 				Console.WriteLine("Error     : {0}", ex.Message);
 				Console.WriteLine("{0}", ex.BuildLogs[0]);
 				if (debug)
 					using (StreamWriter sw = new StreamWriter("debug.txt", true))
 						sw.WriteLine("{0}\n{1}\n{2}", ex.Message, ex.StackTrace, ex.BuildLogs[0]);
 				audioDest.Delete();
 				audioSource.Close();
 				return 4;
@@ -335,6 +331,9 @@ namespace CUETools.FLACCL.cmd
 			{
 			    Console.Error.Write("\r                                                                         \r");
 			    Console.WriteLine("Error     : {0}", ex.Message);
 				if (debug)
 					using (StreamWriter sw = new StreamWriter("debug.txt", true))
 						sw.WriteLine("{0}\n{1}", ex.Message, ex.StackTrace);
 			    audioDest.Delete();
 			    audioSource.Close();
 			    return 4;