Intel OpenCL

This commit is contained in:
chudov
2010-11-20 14:06:10 +00:00
parent ac35093c52
commit 72c1f0b5ce
2 changed files with 62 additions and 25 deletions

View File

@@ -1498,7 +1498,7 @@ namespace CUETools.Codecs.FLACCL
for (int ch = 0; ch < channels; ch++) for (int ch = 0; ch < channels; ch++)
{ {
short* res = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize + ch; short* res = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize + ch;
int* smp = r + ch * task.channelSize; int* smp = r + ch * Flake.MAX_BLOCKSIZE;
for (int i = task.frameSize; i > 0; i--) for (int i = task.frameSize; i > 0; i--)
{ {
//if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize)) //if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))

View File

@@ -229,7 +229,7 @@ void clFindWastedBits(
#endif #endif
#ifdef FLACCL_CPU #ifdef FLACCL_CPU
#define TEMPBLOCK 128 #define TEMPBLOCK 512
#define STORE_AC(ro, val) if (ro <= MAX_ORDER) pout[ro] = val; #define STORE_AC(ro, val) if (ro <= MAX_ORDER) pout[ro] = val;
#define STORE_AC4(ro, val) STORE_AC(ro*4+0, val##ro.x) STORE_AC(ro*4+1, val##ro.y) STORE_AC(ro*4+2, val##ro.z) STORE_AC(ro*4+3, val##ro.w) #define STORE_AC4(ro, val) STORE_AC(ro*4+0, val##ro.x) STORE_AC(ro*4+1, val##ro.y) STORE_AC(ro*4+2, val##ro.z) STORE_AC(ro*4+3, val##ro.w)
@@ -765,6 +765,19 @@ void clQuantizeLPC(
#endif #endif
#ifdef FLACCL_CPU #ifdef FLACCL_CPU
inline int fastclz(int iv)
{
unsigned int v = (unsigned int)iv;
int x = (0 != (v >> 16)) * 16;
x += (0 != (v >> (x + 8))) * 8;
x += (0 != (v >> (x + 4))) * 4;
x += (0 != (v >> (x + 2))) * 2;
x += (0 != (v >> (x + 1)));
x += (0 != (v >> x));
return 32 - x;
}
inline int calc_residual(__global int *ptr, int * coefs, int ro) inline int calc_residual(__global int *ptr, int * coefs, int ro)
{ {
int sum = 0; int sum = 0;
@@ -817,7 +830,7 @@ void clEstimateResidual(
len[i] = 0; len[i] = 0;
#ifdef AMD #ifdef AMD
SWITCH_N((t = clamp(t, -0x7fffff, 0x7fffff), len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31))) SWITCH_N((t = (t << 1) ^ (t >> 31), len[pos >> (12 - EPO)] += t & 0x7fffff))
#else #else
int4 c0 = vload4(0, &task.coefs[0]); int4 c0 = vload4(0, &task.coefs[0]);
int4 c1 = vload4(1, &task.coefs[0]); int4 c1 = vload4(1, &task.coefs[0]);
@@ -831,15 +844,16 @@ void clEstimateResidual(
+ c1 * vload4(1, dptr) + c1 * vload4(1, dptr)
+ c2 * vload4(2, dptr); + c2 * vload4(2, dptr);
int t = (data[pos] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift)) >> task.data.wbits; int t = (data[pos] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift)) >> task.data.wbits;
t = iclamp(t, -0x7fffff, 0x7fffff); t = (t << 1) ^ (t >> 31);
len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31); len[pos >> (12 - EPO)] += t & 0x7fffff;
//len[pos >> (12 - EPO)] += min(0x7ffffffU, (unsigned int)t);
} }
#endif #endif
int total = 0; int total = 0;
for (int i = 0; i < 1 << EPO; i++) for (int i = 0; i < 1 << EPO; i++)
{ {
int res = min(0x7fffff,len[i]); int res = min(0x7fffff,len[i]);
int k = clamp(clz(1 << (12 - EPO)) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16) int k = iclamp(31 - (12 - EPO) - fastclz(res), 0, 14); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
total += (k << (12 - EPO)) + (res >> k); total += (k << (12 - EPO)) + (res >> k);
} }
int partLen = min(0x7ffffff, total) + (bs - ro); int partLen = min(0x7ffffff, total) + (bs - ro);
@@ -1267,9 +1281,9 @@ void clCalcPartition16(
__global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_global_id(0); __global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_global_id(0);
for (int p = 0; p < (1 << max_porder); p++) for (int p = 0; p < (1 << max_porder); p++)
pl[p] = 0; pl[p] = 0;
//__global int *rptr = residual + task.data.residualOffs; __global int *rptr = residual + task.data.residualOffs;
//SWITCH_N((rptr[pos] = t, pl[pos >> 4] += (t << 1) ^ (t >> 31))); SWITCH_N((rptr[pos] = t, pl[pos >> 4] += (t << 1) ^ (t >> 31)));
SWITCH_N((residual[task.data.residualOffs + pos] = t, t = clamp(t, -0x7fffff, 0x7fffff), t = (t << 1) ^ (t >> 31), pl[pos >> 4] += t)); //SWITCH_N((residual[task.data.residualOffs + pos] = t, t = (t << 1) ^ (t >> 31), pl[pos >> 4] += t));
} }
#else #else
// get_group_id(0) == task index // get_group_id(0) == task index
@@ -1439,20 +1453,37 @@ void clFindRiceParameter(
//int psize = task->data.blocksize >> max_porder; //int psize = task->data.blocksize >> max_porder;
int bs = task->data.blocksize; int bs = task->data.blocksize;
int ro = task->data.residualOrder; int ro = task->data.residualOrder;
for (int offs = 0; offs < lim; offs ++) __global int* ppl = &partition_lengths[get_group_id(0) << (max_porder + 1)];
__global int* prp = &rice_parameters[get_group_id(0) << (max_porder + 2)];
__global int* pol = prp + (1 << (max_porder + 1));
for (int porder = max_porder; porder >= 0; porder--)
{ {
int pl = partition_lengths[(1 << (max_porder + 1)) * get_group_id(0) + offs]; int pos = (2 << max_porder) - (2 << porder);
int porder = 31 - clz(lim - offs); int fin = pos + (1 << porder);
int ps = (bs >> porder) - select(0, ro, offs == lim + 1 - (2 << porder));
//if (ps <= 0) int pl = ppl[pos];
// printf("max_porder == %d, porder == %d, ro == %d\n", max_porder, porder, ro); int ps = (bs >> porder) - ro;
int k = clamp(31 - clz(pl / max(1, ps)), 0, 14); int k = iclamp(31 - fastclz(pl / max(1, ps)), 0, 14);
int plk = ps * (k + 1) + (pl >> k); int plk = ps * (k + 1) + (pl >> k);
// output rice parameter // output rice parameter
rice_parameters[(get_group_id(0) << (max_porder + 2)) + offs] = k; prp[pos] = k;
// output length // output length
rice_parameters[(get_group_id(0) << (max_porder + 2)) + (1 << (max_porder + 1)) + offs] = plk; pol[pos] = plk;
ps = (bs >> porder);
for (int offs = pos + 1; offs < fin; offs++)
{
pl = ppl[offs];
k = iclamp(31 - fastclz(pl / ps), 0, 14);
plk = ps * (k + 1) + (pl >> k);
// output rice parameter
prp[offs] = k;
// output length
pol[offs] = plk;
}
} }
} }
#else #else
@@ -1503,12 +1534,12 @@ void clFindPartitionOrder(
partlen[p] = 0; partlen[p] = 0;
// fetch partition lengths // fetch partition lengths
const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder); const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder);
int lim = (2 << max_porder) - 1;
for (int offs = 0; offs < lim; offs ++) for (int porder = max_porder; porder >= 0; porder--)
{ {
int len = rice_parameters[pos + offs]; int start = (2 << max_porder) - (2 << porder);
int porder = 31 - clz(lim - offs); for (int offs = 0; offs < (1 << porder); offs ++)
partlen[porder] += len; partlen[porder] += rice_parameters[pos + start + offs];
} }
int best_length = partlen[0] + 4; int best_length = partlen[0] + 4;
@@ -1657,7 +1688,11 @@ inline void flush(BitWriter *bw)
inline int len_utf8(int n) inline int len_utf8(int n)
{ {
#ifdef FLACCL_CPU
int bts = 31 - fastclz(n);
#else
int bts = 31 - clz(n); int bts = 31 - clz(n);
#endif
if (bts < 7) if (bts < 7)
return 8; return 8;
return 8 * ((bts + 4) / 5); return 8 * ((bts + 4) / 5);
@@ -1855,6 +1890,8 @@ void clRiceEncoding(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
#endif #endif
//if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32)
// printf("Oops: %d\n", mypos[tid]);
mypos[tid] += start; mypos[tid] += start;
int start32 = start / 32; int start32 = start / 32;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);