mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
Intel OpenCL: 24-bit/multichannel support
This commit is contained in:
@@ -954,18 +954,18 @@ void clEstimateResidual(
|
|||||||
FLACCLSubframeTask task = tasks[selectedTask];
|
FLACCLSubframeTask task = tasks[selectedTask];
|
||||||
int ro = task.data.residualOrder;
|
int ro = task.data.residualOrder;
|
||||||
int bs = task.data.blocksize;
|
int bs = task.data.blocksize;
|
||||||
#define EPO 6
|
#define ERPARTS (MAX_BLOCKSIZE >> 6)
|
||||||
float len[1 << EPO]; // blocksize / 64!!!!
|
float len[ERPARTS]; // blocksize / 64!!!!
|
||||||
|
|
||||||
__global int *data = &samples[task.data.samplesOffs];
|
__global int *data = &samples[task.data.samplesOffs];
|
||||||
for (int i = 0; i < 1 << EPO; i++)
|
for (int i = 0; i < ERPARTS; i++)
|
||||||
len[i] = 0.0f;
|
len[i] = 0.0f;
|
||||||
|
|
||||||
#if defined(AMD)
|
#if defined(AMD)
|
||||||
for (int i = ro; i < 32; i++)
|
for (int i = ro; i < 32; i++)
|
||||||
task.coefs[i] = 0;
|
task.coefs[i] = 0;
|
||||||
|
|
||||||
SWITCH_N((len[pos >> (12 - EPO)] += fabs((float)t)))
|
SWITCH_N((len[pos >> 6] += fabs((float)t)))
|
||||||
#else
|
#else
|
||||||
float fcoef[32];
|
float fcoef[32];
|
||||||
for (int tid = 0; tid < 32; tid++)
|
for (int tid = 0; tid < 32; tid++)
|
||||||
@@ -977,6 +977,40 @@ void clEstimateResidual(
|
|||||||
float4 fc2 = vload4(2, &fcoef[0]);
|
float4 fc2 = vload4(2, &fcoef[0]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if MAX_ORDER == 8
|
||||||
|
float fdata[32];
|
||||||
|
for (int pos = 0; pos < MAX_ORDER + ro; pos++)
|
||||||
|
fdata[pos] = pos < MAX_ORDER ? 0.0f : (float)(data[pos - MAX_ORDER] >> task.data.wbits);
|
||||||
|
float4 fd0 = vload4(0, &fdata[ro]);
|
||||||
|
float4 fd1 = vload4(1, &fdata[ro]);
|
||||||
|
for (int pos = ro; pos < bs; pos ++)
|
||||||
|
{
|
||||||
|
float4 sum = fc0 * fd0 + fc1 * fd1;
|
||||||
|
fd0 = fd0.s1230;
|
||||||
|
fd1 = fd1.s1230;
|
||||||
|
fd0.s3 = fd1.s3;
|
||||||
|
fd1.s3 = (float)(data[pos] >> task.data.wbits);
|
||||||
|
len[pos >> 6] += fabs(fd1.s3 + (sum.x + sum.y + sum.z + sum.w));
|
||||||
|
}
|
||||||
|
#elif MAX_ORDER == 12
|
||||||
|
float fdata[32];
|
||||||
|
for (int pos = 0; pos < MAX_ORDER + ro; pos++)
|
||||||
|
fdata[pos] = pos < MAX_ORDER ? 0.0f : (float)(data[pos - MAX_ORDER] >> task.data.wbits);
|
||||||
|
float4 fd0 = vload4(0, &fdata[ro]);
|
||||||
|
float4 fd1 = vload4(1, &fdata[ro]);
|
||||||
|
float4 fd2 = vload4(2, &fdata[ro]);
|
||||||
|
for (int pos = ro; pos < bs; pos ++)
|
||||||
|
{
|
||||||
|
float4 sum = fc0 * fd0 + fc1 * fd1 + fc2 * fd2;
|
||||||
|
fd0 = fd0.s1230;
|
||||||
|
fd1 = fd1.s1230;
|
||||||
|
fd2 = fd2.s1230;
|
||||||
|
fd0.s3 = fd1.s3;
|
||||||
|
fd1.s3 = fd2.s3;
|
||||||
|
fd2.s3 = (float)(data[pos] >> task.data.wbits);
|
||||||
|
len[pos >> 6] += fabs(fd2.s3 + (sum.x + sum.y + sum.z + sum.w));
|
||||||
|
}
|
||||||
|
#else
|
||||||
float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
|
float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
|
||||||
for (int pos = 0; pos < MAX_ORDER; pos++)
|
for (int pos = 0; pos < MAX_ORDER; pos++)
|
||||||
fdata[pos] = 0.0f;
|
fdata[pos] = 0.0f;
|
||||||
@@ -1011,16 +1045,17 @@ void clEstimateResidual(
|
|||||||
#endif
|
#endif
|
||||||
;
|
;
|
||||||
next += sum.x + sum.y + sum.z + sum.w;
|
next += sum.x + sum.y + sum.z + sum.w;
|
||||||
len[pos >> (12 - EPO)] += fabs(next);
|
len[pos >> 6] += fabs(next);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
int total = 0;
|
int total = 0;
|
||||||
for (int i = 0; i < 1 << EPO; i++)
|
for (int i = 0; i < ERPARTS; i++)
|
||||||
{
|
{
|
||||||
int res = convert_int_sat_rte(len[i] * 2);
|
int res = convert_int_sat_rte(len[i] * 2);
|
||||||
int k = iclamp(31 - fastclz(res) - (12 - EPO), 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
|
int k = iclamp(31 - fastclz(res) - 6, 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
|
||||||
total += (k << (12 - EPO)) + (res >> k);
|
total += (k << 6) + (res >> k);
|
||||||
}
|
}
|
||||||
int partLen = min(0x7ffffff, total) + (bs - ro);
|
int partLen = min(0x7ffffff, total) + (bs - ro);
|
||||||
int obits = task.data.obits - task.data.wbits;
|
int obits = task.data.obits - task.data.wbits;
|
||||||
|
|||||||
Reference in New Issue
Block a user