opencl flac encoder

2025-12-16 18:14:25 +00:00 · 2010-10-15 19:56:36 +00:00
parent badfb2fec8
commit 4a47615f7c
3 changed files with 245 additions and 100 deletions
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -53,6 +53,9 @@ namespace CUETools.Codecs.FLACCL
 		[SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")]
 		public int GroupSize { get; set; }
 		[SRDescription(typeof(Properties.Resources), "DescriptionDefines")]
 		public string Defines { get; set; }
 		int cpu_threads = 1;
 		[DefaultValue(1)]
 		[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
@@ -474,6 +477,12 @@ namespace CUETools.Codecs.FLACCL
 			}
 		}
 		public bool DoConstant
 		{
 			get { return eparams.do_constant; }
 			set { eparams.do_constant = value; }
 		}
 		public int MinPartitionOrder
 		{
 			get { return eparams.min_partition_order; }
@@ -1173,12 +1182,8 @@ namespace CUETools.Codecs.FLACCL
 				return;
 			int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
-			int calcPartitionPartSize = task.frameSize >> max_porder;
+			while ((task.frameSize >> max_porder) < 16 && max_porder > 0)
 			while (calcPartitionPartSize < 16 && max_porder > 0)
 			{
 				calcPartitionPartSize <<= 1;
 				max_porder--;
 			}
 			if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel
 			Kernel cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : null;// task.cudaChannelDecorr;
@@ -1212,12 +1217,6 @@ namespace CUETools.Codecs.FLACCL
 			task.cudaEncodeResidual.SetArg(1, task.cudaSamples);
 			task.cudaEncodeResidual.SetArg(2, task.cudaBestResidualTasks);
 			task.cudaCalcPartition.SetArg(0, task.cudaPartitions);
 			task.cudaCalcPartition.SetArg(1, task.cudaResidual);
 			task.cudaCalcPartition.SetArg(2, task.cudaBestResidualTasks);
 			task.cudaCalcPartition.SetArg(3, max_porder);
 			task.cudaCalcPartition.SetArg(4, calcPartitionPartSize);
 			task.cudaSumPartition.SetArg(0, task.cudaPartitions);
 			task.cudaSumPartition.SetArg(1, max_porder);
@@ -1275,11 +1274,20 @@ namespace CUETools.Codecs.FLACCL
 			else
 				task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethod, 2, null, new int[] { 64, channels * task.frameCount }, new int[] { 64, 1 });
 			if (_settings.GPUOnly)
 			{
 				task.max_porder = max_porder;
 				if (task.frameSize >> max_porder == 16)
 				{
 					task.openCLCQ.EnqueueBarrier();
 					task.EnqueueCalcPartition16(channels);
 				}
 				else
 				{
 					task.openCLCQ.EnqueueBarrier();
 					task.openCLCQ.EnqueueNDRangeKernel(task.cudaEncodeResidual, 1, null, new int[] { task.groupSize * channels * task.frameCount }, new int[] { task.groupSize });
 					task.openCLCQ.EnqueueBarrier();
-				task.openCLCQ.EnqueueNDRangeKernel(task.cudaCalcPartition, 2, null, new int[] { task.groupSize * (1 << max_porder), channels * task.frameCount }, new int[] { task.groupSize, 1 });
+					task.EnqueueCalcPartition(channels);
 				}
 				if (max_porder > 0)
 				{
 					task.openCLCQ.EnqueueBarrier();
@@ -1293,7 +1301,6 @@ namespace CUETools.Codecs.FLACCL
 				task.openCLCQ.EnqueueBarrier();
 				task.openCLCQ.EnqueueReadBuffer(task.cudaResidual, false, 0, sizeof(int) * MAX_BLOCKSIZE * channels, task.residualBufferPtr.AddrOfPinnedObject());
 				task.openCLCQ.EnqueueReadBuffer(task.cudaBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * task.frameCount, task.bestRiceParamsPtr.AddrOfPinnedObject());
 			    task.max_porder = max_porder;
 			}
 			task.openCLCQ.EnqueueBarrier();
 			task.openCLCQ.EnqueueReadBuffer(task.cudaBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * task.frameCount, task.bestResidualTasksPtr.AddrOfPinnedObject());
@@ -1575,11 +1582,12 @@ namespace CUETools.Codecs.FLACCL
 				// and serve as a convenient way to pass configuration information to the compilation process
 				OCLMan.Defines =
 					"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
-					"#define GROUP_SIZE " + groupSize.ToString() + "\n";
+					"#define GROUP_SIZE " + groupSize.ToString() + "\n" +
 					_settings.Defines + "\n";
 				// The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
 				OCLMan.BuildOptions = "";
 				OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location);
-				//OCLMan.BinaryPath = ;
+				OCLMan.BinaryPath = System.IO.Path.Combine(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "CUE Tools"), "OpenCL");
 				OCLMan.CreateDefaultContext(0, DeviceType.GPU);
 				openCLContext = OCLMan.Context;
@@ -1778,7 +1786,7 @@ namespace CUETools.Codecs.FLACCL
 		public string Path { get { return _path; } }
-		public static readonly string vendor_string = "FLACCL#.91";
+		public static readonly string vendor_string = "FLACCL#0.1";
 		int select_blocksize(int samplerate, int time_ms)
 		{
@@ -2142,41 +2150,48 @@ namespace CUETools.Codecs.FLACCL
 					do_constant = false;
 					do_wasted = false;
 					do_midside = false;
 					window_function = WindowFunction.Bartlett;
 					orders_per_window = 1;
 					max_partition_order = 4;
 					max_prediction_order = 7;
-					min_fixed_order = 2;
+					min_fixed_order = 3;
 					max_fixed_order = 2;
 					break;
 				case 1:
 					do_constant = false;
 					do_wasted = false;
 					do_midside = false;
 					window_function = WindowFunction.Bartlett;
 					orders_per_window = 1;
-					max_prediction_order = 12;
+					min_fixed_order = 2;
 					max_fixed_order = 2;
 					max_prediction_order = 7;
 					max_partition_order = 4;
 					break;
 				case 2:
 					do_constant = false;
 					do_midside = false;
 					window_function = WindowFunction.Bartlett;
-					min_fixed_order = 3;
+					min_fixed_order = 2;
 					max_fixed_order = 2;
 					orders_per_window = 1;
-					max_prediction_order = 7;
+					max_prediction_order = 8;
 					max_partition_order = 4;
 					break;
 				case 3:
 					window_function = WindowFunction.Bartlett;
 					do_constant = false;
 					min_fixed_order = 2;
 					max_fixed_order = 2;
-					orders_per_window = 6;
+					orders_per_window = 1;
-					max_prediction_order = 7;
+					max_prediction_order = 8;
 					max_partition_order = 4;
 					break;
 				case 4:
 					do_constant = false;
 					min_fixed_order = 2;
 					max_fixed_order = 2;
-					orders_per_window = 3;
+					orders_per_window = 1;
 					max_prediction_order = 8;
 					max_partition_order = 4;
 					break;
@@ -2184,18 +2199,21 @@ namespace CUETools.Codecs.FLACCL
 					do_constant = false;
 					min_fixed_order = 2;
 					max_fixed_order = 2;
-					orders_per_window = 1;
+					orders_per_window = 2;
 					max_prediction_order = 8;
 					break;
 				case 6:
 					do_constant = false;
 					min_fixed_order = 2;
 					max_fixed_order = 2;
 					orders_per_window = 1;
 					break;
 				case 7:
 					do_constant = false;
 					min_fixed_order = 2;
 					max_fixed_order = 2;
 					orders_per_window = 3;
 					break;
 				case 7:
 					min_fixed_order = 2;
 					max_fixed_order = 2;
 					orders_per_window = 7;
 					break;
 				case 8:
 					orders_per_window = 12;
 					break;
@@ -2264,6 +2282,7 @@ namespace CUETools.Codecs.FLACCL
 		public Kernel cudaCopyBestMethodStereo;
 		public Kernel cudaEncodeResidual;
 		public Kernel cudaCalcPartition;
 		public Kernel cudaCalcPartition16;
 		public Kernel cudaSumPartition;
 		public Kernel cudaFindRiceParameter;
 		public Kernel cudaFindPartitionOrder;
@@ -2355,6 +2374,7 @@ namespace CUETools.Codecs.FLACCL
 			cudaCopyBestMethodStereo = openCLProgram.CreateKernel("cudaCopyBestMethodStereo");
 			cudaEncodeResidual = openCLProgram.CreateKernel("cudaEncodeResidual");
 			cudaCalcPartition = openCLProgram.CreateKernel("cudaCalcPartition");
 			cudaCalcPartition16 = openCLProgram.CreateKernel("cudaCalcPartition16");
 			cudaSumPartition = openCLProgram.CreateKernel("cudaSumPartition");
 			cudaFindRiceParameter = openCLProgram.CreateKernel("cudaFindRiceParameter");
 			cudaFindPartitionOrder = openCLProgram.CreateKernel("cudaFindPartitionOrder");
@@ -2398,6 +2418,7 @@ namespace CUETools.Codecs.FLACCL
 			cudaCopyBestMethodStereo.Dispose();
 			cudaEncodeResidual.Dispose();
 			cudaCalcPartition.Dispose();
 			cudaCalcPartition16.Dispose();
 			cudaSumPartition.Dispose();
 			cudaFindRiceParameter.Dispose();
 			cudaFindPartitionOrder.Dispose();
@@ -2464,6 +2485,28 @@ namespace CUETools.Codecs.FLACCL
 			openCLCQ.EnqueueNDRangeKernel(cudaChooseBestMethod, 2, null, new int[] { 32, channelsCount * frameCount }, new int[] { 32, 1 });
 		}
 		public void EnqueueCalcPartition16(int channels)
 		{
 			cudaCalcPartition16.SetArg(0, cudaPartitions);
 			cudaCalcPartition16.SetArg(1, cudaResidual);
 			cudaCalcPartition16.SetArg(2, cudaSamples);
 			cudaCalcPartition16.SetArg(3, cudaBestResidualTasks);
 			cudaCalcPartition16.SetArg(4, max_porder);
 			openCLCQ.EnqueueNDRangeKernel(cudaCalcPartition16, 2, null, new int[] { groupSize, channels * frameCount }, new int[] { groupSize, 1 });
 		}
 		public void EnqueueCalcPartition(int channels)
 		{
 			cudaCalcPartition.SetArg(0, cudaPartitions);
 			cudaCalcPartition.SetArg(1, cudaResidual);
 			cudaCalcPartition.SetArg(2, cudaBestResidualTasks);
 			cudaCalcPartition.SetArg(3, max_porder);
 			cudaCalcPartition.SetArg(4, frameSize >> max_porder);
 			openCLCQ.EnqueueNDRangeKernel(cudaCalcPartition, 2, null, new int[] { groupSize * (1 << max_porder), channels * frameCount }, new int[] { groupSize, 1 });
 		}
 		public unsafe FLACCLSubframeTask* ResidualTasks
 		{
 			get
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -449,7 +449,7 @@ void cudaQuantizeLPC(
    }
 }
-#define DONT_BEACCURATE
+#define BEACCURATE
 __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaEstimateResidual(
@@ -481,24 +481,35 @@ void cudaEstimateResidual(
    if (tid < GROUP_SIZE / 16)
 	len[tid] = 0;
 #else
-    float res = 0.0f;
+    long res = 0;
 #endif
    data[tid] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    __local int4 * cptr = (__local int4 *)&task.coefs[0];
    int4 cptr0 = cptr[0];
 #if MAX_ORDER > 4
    int4 cptr1 = cptr[1];
 #if MAX_ORDER > 8
    int4 cptr2 = cptr[2];
 #endif
 #endif
    data[tid] = tid < bs ? samples[task.data.samplesOffs + tid] >> task.data.wbits : 0;
    for (int pos = 0; pos < bs; pos += GROUP_SIZE)
    {
 	// fetch samples
-	int nextData = pos + tid + GROUP_SIZE < bs ? samples[task.data.samplesOffs + pos + tid + GROUP_SIZE] >> task.data.wbits : 0;
+	int offs = pos + tid;
 	int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0;
 	data[tid + GROUP_SIZE] = nextData;
 	barrier(CLK_LOCAL_MEM_FENCE);
 	// compute residual
-	__local int4 * dptr = (__local int4 *)&data[tid];
+	__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
-	__local int4 * cptr = (__local int4 *)&task.coefs[0];
+	int4 sum = dptr[0] * cptr0
 	int4 sum = dptr[0] * cptr[0]
 #if MAX_ORDER > 4
-	    + dptr[1] * cptr[1]
+	    + dptr[1] * cptr1
 #if MAX_ORDER > 8
-	    + dptr[2] * cptr[2]
+	    + dptr[2] * cptr2
 #if MAX_ORDER > 12
 	    + dptr[3] * cptr[3]
 #if MAX_ORDER > 16
@@ -512,23 +523,23 @@ void cudaEstimateResidual(
 #endif
 	    ;
-	int t = select(0, data[tid + ro] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift), pos + tid + ro < bs);
+	int t = select(0, data[tid + GROUP_SIZE] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift), offs >= ro && offs < bs);
 #ifdef BEACCURATE
-	residual[tid] = min((t << 1) ^ (t >> 31), 0x7fffff);
+	t = clamp(t, -0x7fffff, 0x7fffff);
 	residual[tid] = (t << 1) ^ (t >> 31);
 #else
-	res += fabs(t);
+	res += (t << 1) ^ (t >> 31);
 #endif
-	barrier(CLK_LOCAL_MEM_FENCE);
+	barrier(CLK_GLOBAL_MEM_FENCE);
 #ifdef BEACCURATE
 	if (tid < GROUP_SIZE / 16)
 	{
-	    __local int4 * chunk = ((__local int4 *)residual) + tid * 4;
+	    __local int4 * chunk = ((__local int4 *)residual) + (tid << 2);
 	    int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3];
 	    int res = sum.x + sum.y + sum.z + sum.w;
-	    int k = clamp(clz(16) - clz(res), 0, 14);
+	    int k = clamp(27 - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
-	    len[tid] += 16 * k + (res >> k);
+	    len[tid] += (k << 4) + (res >> k);
 	    k = clamp(clz(16) - clz(res), 0, 14);
 	}
 #endif
@@ -557,7 +568,7 @@ void cudaEstimateResidual(
    if (tid == 0)
    {
 	int residualLen = (bs - ro);
-	float sum = residual[0] * 2;// + residualLen / 2;
+	float sum = residual[0];// + residualLen / 2;
 	//int k = clamp(convert_int_rtn(log2((sum + 0.000001f) / (residualLen + 0.000001f))), 0, 14);
 	int k;
 	frexp((sum + 0.000001f) / residualLen, &k);
@@ -608,7 +619,7 @@ void cudaChooseBestMethod(
 		min(obits * task.blocksize,
 		    task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + partLen :
 		    task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
-		    task.type == Constant ? obits * (1 + task.blocksize * (partLen != 0)) : 
+		    task.type == Constant ? obits * select(1, task.blocksize, partLen != task.blocksize - task.residualOrder) : 
 		    obits * task.blocksize);
 	}
@@ -721,21 +732,50 @@ void cudaEncodeResidual(
    int bs = task.data.blocksize;
    int ro = task.data.residualOrder;
-    data[tid] = tid < bs ? samples[task.data.samplesOffs + tid] >> task.data.wbits : 0;
+    if (tid < 32 && tid >= ro)
 	task.coefs[tid] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    __local int4 * cptr = (__local int4 *)&task.coefs[0];
    int4 cptr0 = cptr[0];
 #if MAX_ORDER > 4
    int4 cptr1 = cptr[1];
 #if MAX_ORDER > 8
    int4 cptr2 = cptr[2];
 #endif
 #endif
    data[tid] = 0;
    for (int pos = 0; pos < bs; pos += GROUP_SIZE)
    {
 	// fetch samples
-	float nextData = pos + tid + GROUP_SIZE < bs ? samples[task.data.samplesOffs + pos + tid + GROUP_SIZE] >> task.data.wbits : 0;
+	int off = pos + tid;
 	int nextData = off < bs ? samples[task.data.samplesOffs + off] >> task.data.wbits : 0;
 	data[tid + GROUP_SIZE] = nextData;
 	barrier(CLK_LOCAL_MEM_FENCE);
 	// compute residual
-	int sum = 0;
+	__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
-	for (int c = 0; c < ro; c++)
+	int4 sum = dptr[0] * cptr0
-	    sum += data[tid + c] * task.coefs[c];
+#if MAX_ORDER > 4
-	sum = data[tid + ro] - (sum >> task.data.shift);
+	    + dptr[1] * cptr1
-	if (pos + tid + ro < bs)
+#if MAX_ORDER > 8
-	    output[task.data.residualOffs + pos + tid + ro] = sum;
+	    + dptr[2] * cptr2
 #if MAX_ORDER > 12
 	    + dptr[3] * cptr[3]
 #if MAX_ORDER > 16
 	    + dptr[4] * cptr[4]
 	    + dptr[5] * cptr[5]
 	    + dptr[6] * cptr[6]
 	    + dptr[7] * cptr[7]
 #endif
 #endif
 #endif
 #endif
 	    ;
 	if (off >= ro && off < bs)
 	    output[task.data.residualOffs + off] = data[tid + GROUP_SIZE] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift);
 	barrier(CLK_LOCAL_MEM_FENCE);
 	data[tid] = nextData;
@@ -795,6 +835,98 @@ void cudaCalcPartition(
    }
 }
 // get_group_id(1) == task index
 __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaCalcPartition16(
    __global int *partition_lengths,
    __global int *residual,
    __global int *samples,
    __global FLACCLSubframeTask *tasks,
    int max_porder // <= 8
    )
 {
    __local FLACCLSubframeTask task;
    __local int data[GROUP_SIZE * 2];
    __local int res[GROUP_SIZE];
    const int tid = get_local_id(0);
    if (tid < sizeof(task) / sizeof(int))
 	((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
    barrier(CLK_LOCAL_MEM_FENCE);
    int bs = task.data.blocksize;
    int ro = task.data.residualOrder;
    if (tid >= ro && tid < 32)
 	task.coefs[tid] = 0;
    int k = tid % 16;
    int x = tid / 16;
    barrier(CLK_LOCAL_MEM_FENCE);
    __local int4 * cptr = (__local int4 *)&task.coefs[0];
    int4 cptr0 = cptr[0];
 #if MAX_ORDER > 4
    int4 cptr1 = cptr[1];
 #if MAX_ORDER > 8
    int4 cptr2 = cptr[2];
 #endif
 #endif
    data[tid] = 0;
    for (int pos = 0; pos < bs; pos += GROUP_SIZE)
    {
 	int offs = pos + tid;
 	// fetch samples
 	int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0;
 	data[tid + GROUP_SIZE] = nextData;
 	barrier(CLK_LOCAL_MEM_FENCE);
 	// compute residual
 	__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
 	int4 sum = dptr[0] * cptr0
 #if MAX_ORDER > 4
 	    + dptr[1] * cptr1
 #if MAX_ORDER > 8
 	    + dptr[2] * cptr2
 #if MAX_ORDER > 12
 	    + dptr[3] * cptr[3]
 #if MAX_ORDER > 16
 	    + dptr[4] * cptr[4]
 	    + dptr[5] * cptr[5]
 	    + dptr[6] * cptr[6]
 	    + dptr[7] * cptr[7]
 #endif
 #endif
 #endif
 #endif
 	    ;
 	int s = select(0, nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift), offs >= ro && offs < bs);
 	// output residual
 	if (offs < bs)
 	    residual[task.data.residualOffs + offs] = s;
 	//int s = select(0, residual[task.data.residualOffs + offs], offs >= ro && offs < bs);
 	s = clamp(s, -0x7fffff, 0x7fffff);
 	// convert to unsigned
 	res[tid] = (s << 1) ^ (s >> 31);
 	barrier(CLK_LOCAL_MEM_FENCE);
 	data[tid] = nextData;
 	// calc number of unary bits for each residual sample with each rice paramater
 	__local int4 * chunk = (__local int4 *)&res[x << 4];
 	sum = (chunk[0] >> k) + (chunk[1] >> k) + (chunk[2] >> k) + (chunk[3] >> k);
 	s = sum.x + sum.y + sum.z + sum.w;
 	const int lpos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1)) + offs / 16;
 	if (k <= 14)
 	    partition_lengths[lpos] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
    }    
 }
 // Sums partition lengths for a certain k == get_group_id(0)
 // Requires 128 threads
 // get_group_id(0) == k
@@ -949,36 +1081,5 @@ void cudaFindPartitionOrder(
 	if (offs + get_local_id(0) < (1 << porder))
 	    best_rice_parameters[(get_group_id(0) << max_porder) + offs + get_local_id(0)] = rice_parameters[pos - (2 << porder) + offs + get_local_id(0)];
    // FIXME: should be bytes?
 //   if (get_local_id(0) < (1 << porder))
 	//shared.tmp[get_local_id(0)] = rice_parameters[pos - (2 << porder) + get_local_id(0)];
 //   barrier(CLK_LOCAL_MEM_FENCE);
 //   if (get_local_id(0) < max(1, (1 << porder) >> 2))
 //   {
 	//char4 ch;
 	//ch.x = shared.tmp[(get_local_id(0) << 2)];
 	//ch.y = shared.tmp[(get_local_id(0) << 2) + 1];
 	//ch.z = shared.tmp[(get_local_id(0) << 2) + 2];
 	//ch.w = shared.tmp[(get_local_id(0) << 2) + 3];
 	//shared.ch[get_local_id(0)] = ch
 //   }	
 //   barrier(CLK_LOCAL_MEM_FENCE);
 //   if (get_local_id(0) < max(1, (1 << porder) >> 2))
 	//best_rice_parameters[(get_group_id(1) << max_porder) + get_local_id(0)] = shared.ch[get_local_id(0)];
 }
 //#endif
 //
 //#if 0
 //    if (get_local_id(0) < order)
 //    {
 //	for (int i = 0; i < order; i++)
 //	    if (get_local_id(0) >= i)
 //		sum[get_local_id(0) - i] += coefs[get_local_id(0)] * sample[order - i - 1];
 //	fot (int i = order; i < blocksize; i++)
 //	{
 //	    if (!get_local_id(0)) sample[order + i] = s = residual[order + i] + (sum[order + i] >> shift);
 //	    sum[get_local_id(0) + i + 1] += coefs[get_local_id(0)] * s;
 //	}
 //    }
 //#endif
 #endif
--- a/CUETools.FLACCL.cmd/Program.cs
+++ b/CUETools.FLACCL.cmd/Program.cs
@@ -83,6 +83,7 @@ namespace CUETools.FLACCL.cmd
 			bool do_seektable = true;
 			bool buffered = false;
 			bool ok = true;
 			int intarg;
 			for (int arg = 0; arg < args.Length; arg++)
 			{
@@ -108,12 +109,10 @@ namespace CUETools.FLACCL.cmd
 					ok = (++arg < args.Length) && int.TryParse(args[arg], out val);
 					settings.CPUThreads = val;
 				}
-				else if (args[arg] == "--group-size")
+				else if (args[arg] == "--group-size" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
-				{
+					settings.GroupSize = intarg;
-					int val = settings.GroupSize;
+				else if (args[arg] == "--define" && arg + 2 < args.Length)
-					ok = (++arg < args.Length) && int.TryParse(args[arg], out val);
+					settings.Defines += "#define " + args[++arg] + " " + args[++arg] + "\n";
 					settings.GroupSize = val;
 				}
 				else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
 					output_file = args[arg];
 				else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
@@ -167,7 +166,7 @@ namespace CUETools.FLACCL.cmd
 			}
 			if (!quiet)
 			{
-				Console.WriteLine("{0}, Copyright (C) 2009 Gregory S. Chudov.", FLACCLWriter.vendor_string);
+				Console.WriteLine("{0}, Copyright (C) 2010 Gregory S. Chudov.", FLACCLWriter.vendor_string);
 				Console.WriteLine("This is free software under the GNU GPLv3+ license; There is NO WARRANTY, to");
 				Console.WriteLine("the extent permitted by law. <http://www.gnu.org/licenses/> for details.");
 			}
@@ -317,19 +316,21 @@ namespace CUETools.FLACCL.cmd
 			if (debug)
 			{
 				Console.SetOut(stdout);
-				Console.Out.WriteLine("{0}\t{1}\t{2}\t{3}\t{4} ({5})\t{6} ({7})\t{8}..{9}\t{10}\t{11}",
+				Console.Out.WriteLine("{0}\t{1}\t{2}\t{3}\t{4} ({5})\t{6}/{7}+{12}{13}\t{8}..{9}\t{10}\t{11}",
 					encoder.TotalSize,
 					encoder.UserProcessorTime.TotalSeconds > 0 ? encoder.UserProcessorTime.TotalSeconds : totalElapsed.TotalSeconds,
 					encoder.StereoMethod.ToString().PadRight(15),
 					encoder.WindowFunction.ToString().PadRight(15),
 					encoder.MaxPartitionOrder,
 					settings.GPUOnly ? "GPU" : "CPU",
 					encoder.MaxLPCOrder,
 					encoder.OrdersPerWindow,
 					encoder.MaxLPCOrder,
 					encoder.MinPrecisionSearch,
 					encoder.MaxPrecisionSearch,
 					encoder.BlockSize,
-					encoder.VBRMode
+					encoder.VBRMode,
 					encoder.MaxFixedOrder - encoder.MinFixedOrder + 1,
 					encoder.DoConstant ? "c" : ""
 					);
 			}
 			return 0;