From 4a47615f7c9b2652950b65efb30e2d1158986a9b Mon Sep 17 00:00:00 2001
From: chudov <devnull@localhost>
Date: Fri, 15 Oct 2010 19:56:36 +0000
Subject: [PATCH] opencl flac encoder

---
 CUETools.Codecs.FLACCL/FLACCLWriter.cs | 107 ++++++++----
 CUETools.Codecs.FLACCL/flac.cl         | 217 ++++++++++++++++++-------
 CUETools.FLACCL.cmd/Program.cs         |  21 +--
 3 files changed, 245 insertions(+), 100 deletions(-)

diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
index c525c71..0dc7f60 100644
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -53,6 +53,9 @@ namespace CUETools.Codecs.FLACCL
 		[SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")]
 		public int GroupSize { get; set; }
 
+		[SRDescription(typeof(Properties.Resources), "DescriptionDefines")]
+		public string Defines { get; set; }
+
 		int cpu_threads = 1;
 		[DefaultValue(1)]
 		[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
@@ -474,6 +477,12 @@ namespace CUETools.Codecs.FLACCL
 			}
 		}
 
+		public bool DoConstant // reads/writes the do_constant flag held in eparams
+		{
+			get { return eparams.do_constant; }
+			set { eparams.do_constant = value; }
+		}
+
 		public int MinPartitionOrder
 		{
 			get { return eparams.min_partition_order; }
@@ -1173,12 +1182,8 @@ namespace CUETools.Codecs.FLACCL
 				return;
 
 			int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
-			int calcPartitionPartSize = task.frameSize >> max_porder;
-			while (calcPartitionPartSize < 16 && max_porder > 0)
-			{
-				calcPartitionPartSize <<= 1;
+			while ((task.frameSize >> max_porder) < 16 && max_porder > 0)
 				max_porder--;
-			}
 
 			if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel
 			Kernel cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : null;// task.cudaChannelDecorr;
@@ -1212,12 +1217,6 @@ namespace CUETools.Codecs.FLACCL
 			task.cudaEncodeResidual.SetArg(1, task.cudaSamples);
 			task.cudaEncodeResidual.SetArg(2, task.cudaBestResidualTasks);
 
-			task.cudaCalcPartition.SetArg(0, task.cudaPartitions);
-			task.cudaCalcPartition.SetArg(1, task.cudaResidual);
-			task.cudaCalcPartition.SetArg(2, task.cudaBestResidualTasks);
-			task.cudaCalcPartition.SetArg(3, max_porder);
-			task.cudaCalcPartition.SetArg(4, calcPartitionPartSize);
-
 			task.cudaSumPartition.SetArg(0, task.cudaPartitions);
 			task.cudaSumPartition.SetArg(1, max_porder);
 
@@ -1276,10 +1275,19 @@ namespace CUETools.Codecs.FLACCL
 				task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethod, 2, null, new int[] { 64, channels * task.frameCount }, new int[] { 64, 1 });
 			if (_settings.GPUOnly)
 			{
-				task.openCLCQ.EnqueueBarrier();
-				task.openCLCQ.EnqueueNDRangeKernel(task.cudaEncodeResidual, 1, null, new int[] { task.groupSize * channels * task.frameCount }, new int[] { task.groupSize });
-				task.openCLCQ.EnqueueBarrier();
-				task.openCLCQ.EnqueueNDRangeKernel(task.cudaCalcPartition, 2, null, new int[] { task.groupSize * (1 << max_porder), channels * task.frameCount }, new int[] { task.groupSize, 1 });
+				task.max_porder = max_porder;
+				if (task.frameSize >> max_porder == 16)
+				{
+					task.openCLCQ.EnqueueBarrier();
+					task.EnqueueCalcPartition16(channels);
+				}
+				else
+				{
+					task.openCLCQ.EnqueueBarrier();
+					task.openCLCQ.EnqueueNDRangeKernel(task.cudaEncodeResidual, 1, null, new int[] { task.groupSize * channels * task.frameCount }, new int[] { task.groupSize });
+					task.openCLCQ.EnqueueBarrier();
+					task.EnqueueCalcPartition(channels);
+				}
 				if (max_porder > 0)
 				{
 					task.openCLCQ.EnqueueBarrier();
@@ -1293,7 +1301,6 @@ namespace CUETools.Codecs.FLACCL
 				task.openCLCQ.EnqueueBarrier();
 				task.openCLCQ.EnqueueReadBuffer(task.cudaResidual, false, 0, sizeof(int) * MAX_BLOCKSIZE * channels, task.residualBufferPtr.AddrOfPinnedObject());
 				task.openCLCQ.EnqueueReadBuffer(task.cudaBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * task.frameCount, task.bestRiceParamsPtr.AddrOfPinnedObject());
-			    task.max_porder = max_porder;
 			}
 			task.openCLCQ.EnqueueBarrier();
 			task.openCLCQ.EnqueueReadBuffer(task.cudaBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * task.frameCount, task.bestResidualTasksPtr.AddrOfPinnedObject());
@@ -1575,11 +1582,12 @@ namespace CUETools.Codecs.FLACCL
 				// and serve as a convenient way to pass configuration information to the compilation process
 				OCLMan.Defines =
 					"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
-					"#define GROUP_SIZE " + groupSize.ToString() + "\n";
+					"#define GROUP_SIZE " + groupSize.ToString() + "\n" +
+					_settings.Defines + "\n";
 				// The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
 				OCLMan.BuildOptions = "";
 				OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location);
-				//OCLMan.BinaryPath = ;
+				OCLMan.BinaryPath = System.IO.Path.Combine(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "CUE Tools"), "OpenCL");
 				OCLMan.CreateDefaultContext(0, DeviceType.GPU);
 
 				openCLContext = OCLMan.Context;
@@ -1778,7 +1786,7 @@ namespace CUETools.Codecs.FLACCL
 
 		public string Path { get { return _path; } }
 
-		public static readonly string vendor_string = "FLACCL#.91";
+		public static readonly string vendor_string = "FLACCL#0.1";
 
 		int select_blocksize(int samplerate, int time_ms)
 		{
@@ -2142,41 +2150,48 @@ namespace CUETools.Codecs.FLACCL
 					do_constant = false;
 					do_wasted = false;
 					do_midside = false;
+					window_function = WindowFunction.Bartlett;
 					orders_per_window = 1;
 					max_partition_order = 4;
 					max_prediction_order = 7;
-					min_fixed_order = 2;
+					min_fixed_order = 3;
 					max_fixed_order = 2;
 					break;
 				case 1:
+					do_constant = false;
 					do_wasted = false;
 					do_midside = false;
 					window_function = WindowFunction.Bartlett;
 					orders_per_window = 1;
-					max_prediction_order = 12;
+					min_fixed_order = 2;
+					max_fixed_order = 2;
+					max_prediction_order = 7;
 					max_partition_order = 4;
 					break;
 				case 2:
 					do_constant = false;
+					do_midside = false;
 					window_function = WindowFunction.Bartlett;
-					min_fixed_order = 3;
+					min_fixed_order = 2;
 					max_fixed_order = 2;
 					orders_per_window = 1;
-					max_prediction_order = 7;
+					max_prediction_order = 8;
 					max_partition_order = 4;
 					break;
 				case 3:
 					window_function = WindowFunction.Bartlett;
+					do_constant = false;
 					min_fixed_order = 2;
 					max_fixed_order = 2;
-					orders_per_window = 6;
-					max_prediction_order = 7;
+					orders_per_window = 1;
+					max_prediction_order = 8;
 					max_partition_order = 4;
 					break;
 				case 4:
+					do_constant = false;
 					min_fixed_order = 2;
 					max_fixed_order = 2;
-					orders_per_window = 3;
+					orders_per_window = 1;
 					max_prediction_order = 8;
 					max_partition_order = 4;
 					break;
@@ -2184,18 +2199,21 @@ namespace CUETools.Codecs.FLACCL
 					do_constant = false;
 					min_fixed_order = 2;
 					max_fixed_order = 2;
-					orders_per_window = 1;
+					orders_per_window = 2;
+					max_prediction_order = 8;
 					break;
 				case 6:
+					do_constant = false;
+					min_fixed_order = 2;
+					max_fixed_order = 2;
+					orders_per_window = 1;
+					break;
+				case 7:
+					do_constant = false;
 					min_fixed_order = 2;
 					max_fixed_order = 2;
 					orders_per_window = 3;
 					break;
-				case 7:
-					min_fixed_order = 2;
-					max_fixed_order = 2;
-					orders_per_window = 7;
-					break;
 				case 8:
 					orders_per_window = 12;
 					break;
@@ -2264,6 +2282,7 @@ namespace CUETools.Codecs.FLACCL
 		public Kernel cudaCopyBestMethodStereo;
 		public Kernel cudaEncodeResidual;
 		public Kernel cudaCalcPartition;
+		public Kernel cudaCalcPartition16;
 		public Kernel cudaSumPartition;
 		public Kernel cudaFindRiceParameter;
 		public Kernel cudaFindPartitionOrder;
@@ -2355,6 +2374,7 @@ namespace CUETools.Codecs.FLACCL
 			cudaCopyBestMethodStereo = openCLProgram.CreateKernel("cudaCopyBestMethodStereo");
 			cudaEncodeResidual = openCLProgram.CreateKernel("cudaEncodeResidual");
 			cudaCalcPartition = openCLProgram.CreateKernel("cudaCalcPartition");
+			cudaCalcPartition16 = openCLProgram.CreateKernel("cudaCalcPartition16");
 			cudaSumPartition = openCLProgram.CreateKernel("cudaSumPartition");
 			cudaFindRiceParameter = openCLProgram.CreateKernel("cudaFindRiceParameter");
 			cudaFindPartitionOrder = openCLProgram.CreateKernel("cudaFindPartitionOrder");
@@ -2398,6 +2418,7 @@ namespace CUETools.Codecs.FLACCL
 			cudaCopyBestMethodStereo.Dispose();
 			cudaEncodeResidual.Dispose();
 			cudaCalcPartition.Dispose();
+			cudaCalcPartition16.Dispose();
 			cudaSumPartition.Dispose();
 			cudaFindRiceParameter.Dispose();
 			cudaFindPartitionOrder.Dispose();
@@ -2464,6 +2485,28 @@ namespace CUETools.Codecs.FLACCL
 			openCLCQ.EnqueueNDRangeKernel(cudaChooseBestMethod, 2, null, new int[] { 32, channelsCount * frameCount }, new int[] { 32, 1 });
 		}
 
+		public void EnqueueCalcPartition16(int channels) // binds the five cudaCalcPartition16 arguments, then enqueues the kernel
+		{
+			int[] globalRange = new int[] { groupSize, channels * frameCount }; // groupSize work-items for each of the channels * frameCount tasks
+			cudaCalcPartition16.SetArg(0, cudaPartitions);
+			cudaCalcPartition16.SetArg(1, cudaResidual);
+			cudaCalcPartition16.SetArg(2, cudaSamples);
+			cudaCalcPartition16.SetArg(3, cudaBestResidualTasks);
+			cudaCalcPartition16.SetArg(4, max_porder);
+			openCLCQ.EnqueueNDRangeKernel(cudaCalcPartition16, 2, null, globalRange, new int[] { groupSize, 1 });
+		}
+
+		public void EnqueueCalcPartition(int channels) // binds the five cudaCalcPartition arguments, then enqueues the kernel
+		{
+			int groupsPerTask = 1 << max_porder; // dimension 0 spans this many work-groups of groupSize items
+			cudaCalcPartition.SetArg(0, cudaPartitions);
+			cudaCalcPartition.SetArg(1, cudaResidual);
+			cudaCalcPartition.SetArg(2, cudaBestResidualTasks);
+			cudaCalcPartition.SetArg(3, max_porder);
+			cudaCalcPartition.SetArg(4, frameSize >> max_porder);
+			openCLCQ.EnqueueNDRangeKernel(cudaCalcPartition, 2, null, new int[] { groupSize * groupsPerTask, channels * frameCount }, new int[] { groupSize, 1 });
+		}
+
 		public unsafe FLACCLSubframeTask* ResidualTasks
 		{
 			get
diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl
index d8e09d7..b826560 100644
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -449,7 +449,7 @@ void cudaQuantizeLPC(
     }
 }
 
-#define DONT_BEACCURATE
+#define BEACCURATE
 
 __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaEstimateResidual(
@@ -481,24 +481,35 @@ void cudaEstimateResidual(
     if (tid < GROUP_SIZE / 16)
 	len[tid] = 0;
 #else
-    float res = 0.0f;
+    long res = 0;
+#endif
+    data[tid] = 0;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    __local int4 * cptr = (__local int4 *)&task.coefs[0];
+    int4 cptr0 = cptr[0];
+#if MAX_ORDER > 4
+    int4 cptr1 = cptr[1];
+#if MAX_ORDER > 8
+    int4 cptr2 = cptr[2];
+#endif
 #endif
-    data[tid] = tid < bs ? samples[task.data.samplesOffs + tid] >> task.data.wbits : 0;
     for (int pos = 0; pos < bs; pos += GROUP_SIZE)
     {
 	// fetch samples
-	int nextData = pos + tid + GROUP_SIZE < bs ? samples[task.data.samplesOffs + pos + tid + GROUP_SIZE] >> task.data.wbits : 0;
+	int offs = pos + tid;
+	int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0;
 	data[tid + GROUP_SIZE] = nextData;
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	// compute residual
-	__local int4 * dptr = (__local int4 *)&data[tid];
-	__local int4 * cptr = (__local int4 *)&task.coefs[0];
-	int4 sum = dptr[0] * cptr[0]
+	__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
+	int4 sum = dptr[0] * cptr0
 #if MAX_ORDER > 4
-	    + dptr[1] * cptr[1]
+	    + dptr[1] * cptr1
 #if MAX_ORDER > 8
-	    + dptr[2] * cptr[2]
+	    + dptr[2] * cptr2
 #if MAX_ORDER > 12
 	    + dptr[3] * cptr[3]
 #if MAX_ORDER > 16
@@ -512,23 +523,23 @@ void cudaEstimateResidual(
 #endif
 	    ;
 	
-	int t = select(0, data[tid + ro] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift), pos + tid + ro < bs);
+	int t = select(0, data[tid + GROUP_SIZE] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift), offs >= ro && offs < bs);
 #ifdef BEACCURATE
-	residual[tid] = min((t << 1) ^ (t >> 31), 0x7fffff);
+	t = clamp(t, -0x7fffff, 0x7fffff);
+	residual[tid] = (t << 1) ^ (t >> 31);
 #else
-	res += fabs(t);
+	res += (t << 1) ^ (t >> 31);
 #endif
-	barrier(CLK_LOCAL_MEM_FENCE);
+	barrier(CLK_LOCAL_MEM_FENCE); // residual[] is __local and other work-items read it right after this barrier
 
 #ifdef BEACCURATE
 	if (tid < GROUP_SIZE / 16)
 	{
-	    __local int4 * chunk = ((__local int4 *)residual) + tid * 4;
+	    __local int4 * chunk = ((__local int4 *)residual) + (tid << 2);
 	    int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3];
 	    int res = sum.x + sum.y + sum.z + sum.w;
-	    int k = clamp(clz(16) - clz(res), 0, 14);
-	    len[tid] += 16 * k + (res >> k);
-	    k = clamp(clz(16) - clz(res), 0, 14);
+	    int k = clamp(27 - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
+	    len[tid] += (k << 4) + (res >> k);
 	}
 #endif
 
@@ -557,7 +568,7 @@ void cudaEstimateResidual(
     if (tid == 0)
     {
 	int residualLen = (bs - ro);
-	float sum = residual[0] * 2;// + residualLen / 2;
+	float sum = residual[0];// + residualLen / 2;
 	//int k = clamp(convert_int_rtn(log2((sum + 0.000001f) / (residualLen + 0.000001f))), 0, 14);
 	int k;
 	frexp((sum + 0.000001f) / residualLen, &k);
@@ -608,7 +619,7 @@ void cudaChooseBestMethod(
 		min(obits * task.blocksize,
 		    task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + partLen :
 		    task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
-		    task.type == Constant ? obits * (1 + task.blocksize * (partLen != 0)) : 
+		    task.type == Constant ? obits * select(1, task.blocksize, partLen != task.blocksize - task.residualOrder) : 
 		    obits * task.blocksize);
 	}
 
@@ -721,21 +732,50 @@ void cudaEncodeResidual(
     int bs = task.data.blocksize;
     int ro = task.data.residualOrder;
 
-    data[tid] = tid < bs ? samples[task.data.samplesOffs + tid] >> task.data.wbits : 0;
+    if (tid < 32 && tid >= ro)
+	task.coefs[tid] = 0;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    __local int4 * cptr = (__local int4 *)&task.coefs[0];
+    int4 cptr0 = cptr[0];
+#if MAX_ORDER > 4
+    int4 cptr1 = cptr[1];
+#if MAX_ORDER > 8
+    int4 cptr2 = cptr[2];
+#endif
+#endif
+
+    data[tid] = 0;
     for (int pos = 0; pos < bs; pos += GROUP_SIZE)
     {
 	// fetch samples
-	float nextData = pos + tid + GROUP_SIZE < bs ? samples[task.data.samplesOffs + pos + tid + GROUP_SIZE] >> task.data.wbits : 0;
+	int off = pos + tid;
+	int nextData = off < bs ? samples[task.data.samplesOffs + off] >> task.data.wbits : 0;
 	data[tid + GROUP_SIZE] = nextData;
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	// compute residual
-	int sum = 0;
-	for (int c = 0; c < ro; c++)
-	    sum += data[tid + c] * task.coefs[c];
-	sum = data[tid + ro] - (sum >> task.data.shift);
-	if (pos + tid + ro < bs)
-	    output[task.data.residualOffs + pos + tid + ro] = sum;
+	__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
+	int4 sum = dptr[0] * cptr0
+#if MAX_ORDER > 4
+	    + dptr[1] * cptr1
+#if MAX_ORDER > 8
+	    + dptr[2] * cptr2
+#if MAX_ORDER > 12
+	    + dptr[3] * cptr[3]
+#if MAX_ORDER > 16
+	    + dptr[4] * cptr[4]
+	    + dptr[5] * cptr[5]
+	    + dptr[6] * cptr[6]
+	    + dptr[7] * cptr[7]
+#endif
+#endif
+#endif
+#endif
+	    ;
+	if (off >= ro && off < bs)
+	    output[task.data.residualOffs + off] = data[tid + GROUP_SIZE] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift);
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 	data[tid] = nextData;
@@ -795,6 +835,98 @@ void cudaCalcPartition(
     }
 }
 
+// Fused residual + rice partition-length kernel for 16-sample partitions; get_group_id(1) == task index
+__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
+void cudaCalcPartition16(
+    __global int *partition_lengths, // out: per task, 15 rows (k = 0..14) of (2 << max_porder) ints
+    __global int *residual, // out: written at task.data.residualOffs + sample offset
+    __global int *samples, // in: read at task.data.samplesOffs + sample offset
+    __global FLACCLSubframeTask *tasks, // in: indexed by get_group_id(1)
+    int max_porder // <= 8
+    )
+{
+    __local FLACCLSubframeTask task;
+    __local int data[GROUP_SIZE * 2];
+    __local int res[GROUP_SIZE];
+
+    const int tid = get_local_id(0);
+    if (tid < sizeof(task) / sizeof(int))
+	((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int bs = task.data.blocksize;
+    int ro = task.data.residualOrder;
+    // zero the coefficients at and beyond the residual order so the fixed-width dot product below ignores them
+    if (tid >= ro && tid < 32)
+	task.coefs[tid] = 0;
+    // each run of 16 work-items covers one 16-sample chunk (x); k is the rice parameter this work-item evaluates
+    int k = tid % 16;
+    int x = tid / 16;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    __local int4 * cptr = (__local int4 *)&task.coefs[0];
+    int4 cptr0 = cptr[0];
+#if MAX_ORDER > 4
+    int4 cptr1 = cptr[1];
+#if MAX_ORDER > 8
+    int4 cptr2 = cptr[2];
+#endif
+#endif
+    // data[] is a sliding window: lower half = samples of the previous pass (zero at start), upper half = current pass
+    data[tid] = 0;
+    for (int pos = 0; pos < bs; pos += GROUP_SIZE)
+    {
+	int offs = pos + tid;
+	// fetch samples
+	int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0;
+	data[tid + GROUP_SIZE] = nextData;
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// compute residual
+	__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
+	int4 sum = dptr[0] * cptr0
+#if MAX_ORDER > 4
+	    + dptr[1] * cptr1
+#if MAX_ORDER > 8
+	    + dptr[2] * cptr2
+#if MAX_ORDER > 12
+	    + dptr[3] * cptr[3]
+#if MAX_ORDER > 16
+	    + dptr[4] * cptr[4]
+	    + dptr[5] * cptr[5]
+	    + dptr[6] * cptr[6]
+	    + dptr[7] * cptr[7]
+#endif
+#endif
+#endif
+#endif
+	    ;
+	int s = select(0, nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift), offs >= ro && offs < bs);
+
+	// output residual
+	if (offs < bs)
+	    residual[task.data.residualOffs + offs] = s;
+
+	// limit the magnitude before folding the sign into the low bit
+	s = clamp(s, -0x7fffff, 0x7fffff);
+	// convert to unsigned
+	res[tid] = (s << 1) ^ (s >> 31);
+	barrier(CLK_LOCAL_MEM_FENCE);
+	data[tid] = nextData;
+
+	// calc number of unary bits for each residual sample with each rice parameter
+	__local int4 * chunk = (__local int4 *)&res[x << 4];
+	sum = (chunk[0] >> k) + (chunk[1] >> k) + (chunk[2] >> k) + (chunk[3] >> k);
+	s = sum.x + sum.y + sum.z + sum.w;
+
+	// row k of this task, column offs / 16; skip the unused k == 15 lane and chunks that start past the block end
+	const int lpos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1)) + offs / 16;
+	if (k <= 14 && offs < bs)
+	    partition_lengths[lpos] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
+    }
+}
+
 // Sums partition lengths for a certain k == get_group_id(0)
 // Requires 128 threads
 // get_group_id(0) == k
@@ -949,36 +1081,5 @@ void cudaFindPartitionOrder(
 	if (offs + get_local_id(0) < (1 << porder))
 	    best_rice_parameters[(get_group_id(0) << max_porder) + offs + get_local_id(0)] = rice_parameters[pos - (2 << porder) + offs + get_local_id(0)];
     // FIXME: should be bytes?
- //   if (get_local_id(0) < (1 << porder))
-	//shared.tmp[get_local_id(0)] = rice_parameters[pos - (2 << porder) + get_local_id(0)];
- //   barrier(CLK_LOCAL_MEM_FENCE);
- //   if (get_local_id(0) < max(1, (1 << porder) >> 2))
- //   {
-	//char4 ch;
-	//ch.x = shared.tmp[(get_local_id(0) << 2)];
-	//ch.y = shared.tmp[(get_local_id(0) << 2) + 1];
-	//ch.z = shared.tmp[(get_local_id(0) << 2) + 2];
-	//ch.w = shared.tmp[(get_local_id(0) << 2) + 3];
-	//shared.ch[get_local_id(0)] = ch
- //   }	
- //   barrier(CLK_LOCAL_MEM_FENCE);
- //   if (get_local_id(0) < max(1, (1 << porder) >> 2))
-	//best_rice_parameters[(get_group_id(1) << max_porder) + get_local_id(0)] = shared.ch[get_local_id(0)];
 }
-
-//#endif
-//
-//#if 0
-//    if (get_local_id(0) < order)
-//    {
-//	for (int i = 0; i < order; i++)
-//	    if (get_local_id(0) >= i)
-//		sum[get_local_id(0) - i] += coefs[get_local_id(0)] * sample[order - i - 1];
-//	fot (int i = order; i < blocksize; i++)
-//	{
-//	    if (!get_local_id(0)) sample[order + i] = s = residual[order + i] + (sum[order + i] >> shift);
-//	    sum[get_local_id(0) + i + 1] += coefs[get_local_id(0)] * s;
-//	}
-//    }
-//#endif
 #endif
diff --git a/CUETools.FLACCL.cmd/Program.cs b/CUETools.FLACCL.cmd/Program.cs
index 87bdcea..00a8d37 100644
--- a/CUETools.FLACCL.cmd/Program.cs
+++ b/CUETools.FLACCL.cmd/Program.cs
@@ -83,6 +83,7 @@ namespace CUETools.FLACCL.cmd
 			bool do_seektable = true;
 			bool buffered = false;
 			bool ok = true;
+			int intarg;
 
 			for (int arg = 0; arg < args.Length; arg++)
 			{
@@ -108,12 +109,10 @@ namespace CUETools.FLACCL.cmd
 					ok = (++arg < args.Length) && int.TryParse(args[arg], out val);
 					settings.CPUThreads = val;
 				}
-				else if (args[arg] == "--group-size")
-				{
-					int val = settings.GroupSize;
-					ok = (++arg < args.Length) && int.TryParse(args[arg], out val);
-					settings.GroupSize = val;
-				}
+				else if (args[arg] == "--group-size" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
+					settings.GroupSize = intarg;
+				else if (args[arg] == "--define" && arg + 2 < args.Length)
+					settings.Defines += "#define " + args[++arg] + " " + args[++arg] + "\n";
 				else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
 					output_file = args[arg];
 				else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
@@ -167,7 +166,7 @@ namespace CUETools.FLACCL.cmd
 			}
 			if (!quiet)
 			{
-				Console.WriteLine("{0}, Copyright (C) 2009 Gregory S. Chudov.", FLACCLWriter.vendor_string);
+				Console.WriteLine("{0}, Copyright (C) 2010 Gregory S. Chudov.", FLACCLWriter.vendor_string);
 				Console.WriteLine("This is free software under the GNU GPLv3+ license; There is NO WARRANTY, to");
 				Console.WriteLine("the extent permitted by law. <http://www.gnu.org/licenses/> for details.");
 			}
@@ -317,19 +316,21 @@ namespace CUETools.FLACCL.cmd
 			if (debug)
 			{
 				Console.SetOut(stdout);
-				Console.Out.WriteLine("{0}\t{1}\t{2}\t{3}\t{4} ({5})\t{6} ({7})\t{8}..{9}\t{10}\t{11}",
+				Console.Out.WriteLine("{0}\t{1}\t{2}\t{3}\t{4} ({5})\t{6}/{7}+{12}{13}\t{8}..{9}\t{10}\t{11}",
 					encoder.TotalSize,
 					encoder.UserProcessorTime.TotalSeconds > 0 ? encoder.UserProcessorTime.TotalSeconds : totalElapsed.TotalSeconds,
 					encoder.StereoMethod.ToString().PadRight(15),
 					encoder.WindowFunction.ToString().PadRight(15),
 					encoder.MaxPartitionOrder,
 					settings.GPUOnly ? "GPU" : "CPU",
-					encoder.MaxLPCOrder,
 					encoder.OrdersPerWindow,
+					encoder.MaxLPCOrder,
 					encoder.MinPrecisionSearch,
 					encoder.MaxPrecisionSearch,
 					encoder.BlockSize,
-					encoder.VBRMode
+					encoder.VBRMode,
+					encoder.MaxFixedOrder - encoder.MinFixedOrder + 1,
+					encoder.DoConstant ? "c" : ""
 					);
 			}
 			return 0;