Intel OpenCL: 24-bit/multichannel support

This commit is contained in:
chudov
2010-12-09 15:51:01 +00:00
parent 6585ea2001
commit 59d5bd13de
2 changed files with 71 additions and 37 deletions

View File

@@ -88,7 +88,7 @@ namespace CUETools.Codecs.FLACCL
public OpenCLDeviceType DeviceType { get; set; } public OpenCLDeviceType DeviceType { get; set; }
int cpu_threads = 0; int cpu_threads = 0;
[DefaultValue(1)] [DefaultValue(0)]
[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")] [SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
public int CPUThreads public int CPUThreads
{ {
@@ -2582,7 +2582,7 @@ namespace CUETools.Codecs.FLACCL
int MAX_ORDER = this.writer.eparams.max_prediction_order; int MAX_ORDER = this.writer.eparams.max_prediction_order;
int MAX_FRAMES = this.writer.framesPerTask; int MAX_FRAMES = this.writer.framesPerTask;
int MAX_CHANNELSIZE = MAX_FRAMES * writer.eparams.block_size; int MAX_CHANNELSIZE = MAX_FRAMES * ((writer.eparams.block_size + 3) & ~3);
residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES; residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES;
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES; bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES;
int samplesBufferLen = writer.PCM.BlockAlign * MAX_CHANNELSIZE * channelsCount; int samplesBufferLen = writer.PCM.BlockAlign * MAX_CHANNELSIZE * channelsCount;

View File

@@ -941,6 +941,8 @@ inline residual_t calc_residual(__global int *ptr, int * coefs, int ro)
default: ENCODE_N(ro, action) \ default: ENCODE_N(ro, action) \
} }
#define TEMPBLOCK1 TEMPBLOCK
__kernel __attribute__(( vec_type_hint (int4))) __attribute__((reqd_work_group_size(1, 1, 1))) __kernel __attribute__(( vec_type_hint (int4))) __attribute__((reqd_work_group_size(1, 1, 1)))
void clEstimateResidual( void clEstimateResidual(
__global int*samples, __global int*samples,
@@ -953,38 +955,70 @@ void clEstimateResidual(
int ro = task.data.residualOrder; int ro = task.data.residualOrder;
int bs = task.data.blocksize; int bs = task.data.blocksize;
#define EPO 6 #define EPO 6
int len[1 << EPO]; // blocksize / 64!!!! float len[1 << EPO]; // blocksize / 64!!!!
__global int *data = &samples[task.data.samplesOffs]; __global int *data = &samples[task.data.samplesOffs];
for (int i = 0; i < 1 << EPO; i++)
len[i] = 0.0f;
#if defined(AMD)
for (int i = ro; i < 32; i++) for (int i = ro; i < 32; i++)
task.coefs[i] = 0; task.coefs[i] = 0;
for (int i = 0; i < 1 << EPO; i++)
len[i] = 0;
#if defined(AMD) || BITS_PER_SAMPLE > 16 SWITCH_N((len[pos >> (12 - EPO)] += fabs((float)t)))
SWITCH_N((t = (t << 1) ^ (t >> 31), len[pos >> (12 - EPO)] += t & 0x7fffff))
#else #else
int4 c0 = vload4(0, &task.coefs[0]); float fcoef[32];
int4 c1 = vload4(1, &task.coefs[0]); for (int tid = 0; tid < 32; tid++)
int4 c2 = vload4(2, &task.coefs[0]); fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
for (int pos = ro; pos < bs; pos ++) float4 fc0 = vload4(0, &fcoef[0]);
float4 fc1 = vload4(1, &fcoef[0]);
#if MAX_ORDER > 8
float4 fc2 = vload4(2, &fcoef[0]);
#endif
float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
for (int pos = 0; pos < MAX_ORDER; pos++)
fdata[pos] = 0.0f;
for (int pos = MAX_ORDER + TEMPBLOCK1; pos < MAX_ORDER + TEMPBLOCK1 + 32; pos++)
fdata[pos] = 0.0f;
for (int bpos = 0; bpos < bs; bpos += TEMPBLOCK1)
{ {
__global int * dptr = data + pos - ro; int end = min(bpos + TEMPBLOCK1, bs);
int4 sum
= c0 * vload4(0, dptr) for (int pos = max(bpos - ro, 0); pos < max(bpos, ro); pos++)
+ c1 * vload4(1, dptr) fdata[MAX_ORDER + pos - bpos] = (float)(data[pos] >> task.data.wbits);
+ c2 * vload4(2, dptr);
int t = (data[pos] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift)) >> task.data.wbits; for (int pos = max(bpos, ro); pos < end; pos ++)
t = (t << 1) ^ (t >> 31); {
len[pos >> (12 - EPO)] += t & 0x7fffff; float next = (float)(data[pos] >> task.data.wbits);
//len[pos >> (12 - EPO)] += min(0x7ffffffU, (unsigned int)t); float * dptr = fdata + pos - bpos;
dptr[MAX_ORDER] = next;
float4 sum
= fc0 * vload4(0, dptr)
+ fc1 * vload4(1, dptr)
#if MAX_ORDER > 8
+ fc2 * vload4(2, dptr)
#if MAX_ORDER > 12
+ vload4(3, &fcoef[0]) * vload4(3, dptr)
#if MAX_ORDER > 16
+ vload4(4, &fcoef[0]) * vload4(4, dptr)
+ vload4(5, &fcoef[0]) * vload4(5, dptr)
+ vload4(6, &fcoef[0]) * vload4(6, dptr)
+ vload4(7, &fcoef[0]) * vload4(7, dptr)
#endif
#endif
#endif
;
next += sum.x + sum.y + sum.z + sum.w;
len[pos >> (12 - EPO)] += fabs(next);
}
} }
#endif #endif
int total = 0; int total = 0;
for (int i = 0; i < 1 << EPO; i++) for (int i = 0; i < 1 << EPO; i++)
{ {
int res = len[i]; int res = convert_int_sat_rte(len[i] * 2);
int k = iclamp(31 - fastclz(res) - (12 - EPO), 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64) int k = iclamp(31 - fastclz(res) - (12 - EPO), 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
total += (k << (12 - EPO)) + (res >> k); total += (k << (12 - EPO)) + (res >> k);
} }
@@ -1008,10 +1042,10 @@ void clEstimateResidual(
{ {
__local float data[GROUP_SIZE * 2 + 32]; __local float data[GROUP_SIZE * 2 + 32];
#if !defined(AMD) || !defined(HAVE_ATOM) #if !defined(AMD) || !defined(HAVE_ATOM)
__local volatile int idata[GROUP_SIZE + 16]; __local volatile uint idata[GROUP_SIZE + 16];
#endif #endif
__local FLACCLSubframeTask task; __local FLACCLSubframeTask task;
__local int psum[MAX_BLOCKSIZE >> ESTPARTLOG]; __local uint psum[MAX_BLOCKSIZE >> ESTPARTLOG];
__local float fcoef[32]; __local float fcoef[32];
__local int selectedTask; __local int selectedTask;
@@ -1075,16 +1109,16 @@ void clEstimateResidual(
; ;
float2 sum2 = sum4.s01 + sum4.s23; float2 sum2 = sum4.s01 + sum4.s23;
int t = convert_int_rte(nextData + (sum2.s0 + sum2.s1)); int it = convert_int_sat_rte(nextData + (sum2.s0 + sum2.s1));
// int t = (int)(nextData + sum.x + sum.y + sum.z + sum.w); // int t = (int)(nextData + sum.x + sum.y + sum.z + sum.w);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
data[tid] = nextData; data[tid] = nextData;
// ensure we're within frame bounds
t = select(0, t, offs >= ro);
// overflow protection
t = iclamp(t, -0x7fffff, 0x7fffff);
// convert to unsigned // convert to unsigned
t = (t << 1) ^ (t >> 31); uint t = (it << 1) ^ (it >> 31);
// ensure we're within frame bounds
t = select(0U, t, offs >= ro);
// overflow protection
t = min(t, 0x7ffffffU);
#if !defined(AMD) || !defined(HAVE_ATOM) #if !defined(AMD) || !defined(HAVE_ATOM)
idata[tid] = t; idata[tid] = t;
for (int l = 16; l > 1; l >>= 1) for (int l = 16; l > 1; l >>= 1)
@@ -1123,15 +1157,15 @@ void clEstimateResidual(
#endif #endif
; ;
int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w); int it = convert_int_sat_rte(nextData + sum.x + sum.y + sum.z + sum.w);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
data[tid] = nextData; data[tid] = nextData;
// ensure we're within frame bounds
t = select(0, t, offs >= ro && offs < bs);
// overflow protection
t = iclamp(t, -0x7fffff, 0x7fffff);
// convert to unsigned // convert to unsigned
t = (t << 1) ^ (t >> 31); uint t = (it << 1) ^ (it >> 31);
// ensure we're within frame bounds
t = select(0U, t, offs >= ro && offs < bs);
// overflow protection
t = min(t, 0x7ffffffU);
#if !defined(AMD) || !defined(HAVE_ATOM) #if !defined(AMD) || !defined(HAVE_ATOM)
idata[tid] = t; idata[tid] = t;
for (int l = 16; l > 1; l >>= 1) for (int l = 16; l > 1; l >>= 1)
@@ -1147,7 +1181,7 @@ void clEstimateResidual(
// calculate rice partition bit length for every 32 samples // calculate rice partition bit length for every 32 samples
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Bug: if (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) > GROUP_SIZE // Bug: if (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) > GROUP_SIZE
int pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? psum[tid * 2] + psum[tid * 2 + 1] : 0; uint pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? psum[tid * 2] + psum[tid * 2 + 1] : 0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// for (int pos = 0; pos < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2; pos += GROUP_SIZE) // for (int pos = 0; pos < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2; pos += GROUP_SIZE)
// { // {
@@ -1170,7 +1204,7 @@ void clEstimateResidual(
} }
if (tid == 0) if (tid == 0)
{ {
int pl = psum[0] + (bs - ro); int pl = (int)psum[0] + (bs - ro);
int obits = task.data.obits - task.data.wbits; int obits = task.data.obits - task.data.wbits;
int len = min(obits * task.data.blocksize, int len = min(obits * task.data.blocksize,
task.data.type == Fixed ? task.data.residualOrder * obits + 6 + RICE_PARAM_BITS + pl : task.data.type == Fixed ? task.data.residualOrder * obits + 6 + RICE_PARAM_BITS + pl :