Intel OpenCL: 24-bit/multichannel support

2025-12-16 18:14:25 +00:00 · 2010-12-09 15:51:01 +00:00
parent 6585ea2001
commit 59d5bd13de
2 changed files with 71 additions and 37 deletions
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -88,7 +88,7 @@ namespace CUETools.Codecs.FLACCL
 		public OpenCLDeviceType DeviceType { get; set; }
 		int cpu_threads = 0;
-		[DefaultValue(1)]
+		[DefaultValue(0)]
 		[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
 		public int CPUThreads
 		{
@@ -2582,7 +2582,7 @@ namespace CUETools.Codecs.FLACCL
            int MAX_ORDER = this.writer.eparams.max_prediction_order;
 			int MAX_FRAMES = this.writer.framesPerTask;
-			int MAX_CHANNELSIZE = MAX_FRAMES * writer.eparams.block_size;
+			int MAX_CHANNELSIZE = MAX_FRAMES * ((writer.eparams.block_size + 3) & ~3);
 			residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES;
 			bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES;
 			int samplesBufferLen = writer.PCM.BlockAlign * MAX_CHANNELSIZE * channelsCount;
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -941,6 +941,8 @@ inline residual_t calc_residual(__global int *ptr, int * coefs, int ro)
 	default: ENCODE_N(ro, action) \
    }
 #define TEMPBLOCK1 TEMPBLOCK
 __kernel __attribute__(( vec_type_hint (int4))) __attribute__((reqd_work_group_size(1, 1, 1)))
 void clEstimateResidual(
    __global int*samples,
@@ -953,38 +955,70 @@ void clEstimateResidual(
    int ro = task.data.residualOrder;
    int bs = task.data.blocksize;
 #define EPO 6
-    int len[1 << EPO]; // blocksize / 64!!!!
+    float len[1 << EPO]; // blocksize / 64!!!!
    __global int *data = &samples[task.data.samplesOffs];
    for (int i = 0; i < 1 << EPO; i++)
 	len[i] = 0.0f;
 #if defined(AMD)
    for (int i = ro; i < 32; i++)
 	task.coefs[i] = 0;
    for (int i = 0; i < 1 << EPO; i++)
 	len[i] = 0;
-#if defined(AMD) || BITS_PER_SAMPLE > 16
+    SWITCH_N((len[pos >> (12 - EPO)] += fabs((float)t)))
    SWITCH_N((t = (t << 1) ^ (t >> 31), len[pos >> (12 - EPO)] += t & 0x7fffff))
 #else
-    int4 c0 = vload4(0, &task.coefs[0]);
+    float fcoef[32];
-    int4 c1 = vload4(1, &task.coefs[0]);
+    for (int tid = 0; tid < 32; tid++)
-    int4 c2 = vload4(2, &task.coefs[0]);
+	fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
-    for (int pos = ro; pos < bs; pos ++) 
+    float4 fc0 = vload4(0, &fcoef[0]);
    float4 fc1 = vload4(1, &fcoef[0]);
 #if MAX_ORDER > 8
    float4 fc2 = vload4(2, &fcoef[0]);
 #endif
    float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
    for (int pos = 0; pos < MAX_ORDER; pos++)
 	fdata[pos] = 0.0f;
    for (int pos = MAX_ORDER + TEMPBLOCK1; pos < MAX_ORDER + TEMPBLOCK1 + 32; pos++)
 	fdata[pos] = 0.0f;
    for (int bpos = 0; bpos < bs; bpos += TEMPBLOCK1)
    {
-	__global int * dptr = data + pos - ro;
+	int end = min(bpos + TEMPBLOCK1, bs);
-	int4 sum 
+
-	    = c0 * vload4(0, dptr)
+	for (int pos = max(bpos - ro, 0); pos < max(bpos, ro); pos++)
-	    + c1 * vload4(1, dptr)
+	    fdata[MAX_ORDER + pos - bpos] = (float)(data[pos] >> task.data.wbits);	
-	    + c2 * vload4(2, dptr);
+
-	int t = (data[pos] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift)) >> task.data.wbits;
+	for (int pos = max(bpos, ro); pos < end; pos ++)
-	t = (t << 1) ^ (t >> 31);
+	{
-	len[pos >> (12 - EPO)] += t & 0x7fffff;
+	    float next = (float)(data[pos] >> task.data.wbits);
-	//len[pos >> (12 - EPO)] += min(0x7ffffffU, (unsigned int)t);
+	    float * dptr = fdata + pos - bpos;
 	    dptr[MAX_ORDER] = next;
 	    float4 sum 
 		= fc0 * vload4(0, dptr)
 		+ fc1 * vload4(1, dptr)
 #if MAX_ORDER > 8
 		+ fc2 * vload4(2, dptr)
  #if MAX_ORDER > 12
 		+ vload4(3, &fcoef[0]) * vload4(3, dptr)
    #if MAX_ORDER > 16
 		+ vload4(4, &fcoef[0]) * vload4(4, dptr)
 		+ vload4(5, &fcoef[0]) * vload4(5, dptr)
 		+ vload4(6, &fcoef[0]) * vload4(6, dptr)
 		+ vload4(7, &fcoef[0]) * vload4(7, dptr)
    #endif
  #endif
 #endif
 		;
 	    next += sum.x + sum.y + sum.z + sum.w;
 	    len[pos >> (12 - EPO)] += fabs(next);
 	}
    }
 #endif
    int total = 0;
    for (int i = 0; i < 1 << EPO; i++)
    {
-	int res = len[i];
+	int res = convert_int_sat_rte(len[i] * 2);
 	int k = iclamp(31 - fastclz(res) - (12 - EPO), 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
 	total += (k << (12 - EPO)) + (res >> k);
    }
@@ -1008,10 +1042,10 @@ void clEstimateResidual(
 {
    __local float data[GROUP_SIZE * 2 + 32];
 #if !defined(AMD) || !defined(HAVE_ATOM)
-    __local volatile int idata[GROUP_SIZE + 16];
+    __local volatile uint idata[GROUP_SIZE + 16];
 #endif
    __local FLACCLSubframeTask task;
-    __local int psum[MAX_BLOCKSIZE >> ESTPARTLOG];
+    __local uint psum[MAX_BLOCKSIZE >> ESTPARTLOG];
    __local float fcoef[32];
    __local int selectedTask;
@@ -1075,16 +1109,16 @@ void clEstimateResidual(
 	    ;
 	float2 sum2 = sum4.s01 + sum4.s23;
-	int t = convert_int_rte(nextData + (sum2.s0 + sum2.s1));
+	int it = convert_int_sat_rte(nextData + (sum2.s0 + sum2.s1));
 //	int t = (int)(nextData + sum.x + sum.y + sum.z + sum.w);
 	barrier(CLK_LOCAL_MEM_FENCE);
 	data[tid] = nextData;
 	// ensure we're within frame bounds
 	t = select(0, t, offs >= ro);
 	// overflow protection
 	t = iclamp(t, -0x7fffff, 0x7fffff);
 	// convert to unsigned
-	t = (t << 1) ^ (t >> 31);
+	uint t = (it << 1) ^ (it >> 31);
 	// ensure we're within frame bounds
 	t = select(0U, t, offs >= ro);
 	// overflow protection
 	t = min(t, 0x7ffffffU);
 #if !defined(AMD) || !defined(HAVE_ATOM)
 	idata[tid] = t;
 	for (int l = 16; l > 1; l >>= 1)
@@ -1123,15 +1157,15 @@ void clEstimateResidual(
 #endif
 	    ;
-	int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w);
+	int it = convert_int_sat_rte(nextData + sum.x + sum.y + sum.z + sum.w);
 	barrier(CLK_LOCAL_MEM_FENCE);
 	data[tid] = nextData;
 	// ensure we're within frame bounds
 	t = select(0, t, offs >= ro && offs < bs);
 	// overflow protection
 	t = iclamp(t, -0x7fffff, 0x7fffff);
 	// convert to unsigned
-	t = (t << 1) ^ (t >> 31);
+	uint t = (it << 1) ^ (it >> 31);
 	// ensure we're within frame bounds
 	t = select(0U, t, offs >= ro && offs < bs);
 	// overflow protection
 	t = min(t, 0x7ffffffU);
 #if !defined(AMD) || !defined(HAVE_ATOM)
 	idata[tid] = t;
 	for (int l = 16; l > 1; l >>= 1)
@@ -1147,7 +1181,7 @@ void clEstimateResidual(
    // calculate rice partition bit length for every 32 samples
    barrier(CLK_LOCAL_MEM_FENCE);
    // Bug: if (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) > GROUP_SIZE
-    int pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? psum[tid * 2] + psum[tid * 2 + 1] : 0;
+    uint pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? psum[tid * 2] + psum[tid * 2 + 1] : 0;
    barrier(CLK_LOCAL_MEM_FENCE);
 //   for (int pos = 0; pos < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2; pos += GROUP_SIZE)
 //   {
@@ -1170,7 +1204,7 @@ void clEstimateResidual(
    }
    if (tid == 0)
    {
-	int pl = psum[0] + (bs - ro);
+	int pl = (int)psum[0] + (bs - ro);
 	int obits = task.data.obits - task.data.wbits;
 	int len = min(obits * task.data.blocksize,
 		task.data.type == Fixed ? task.data.residualOrder * obits + 6 + RICE_PARAM_BITS + pl :