mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
testing on Fermi
This commit is contained in:
@@ -474,13 +474,6 @@ void clEstimateResidual(
|
|||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
#ifdef AMD
|
|
||||||
float4 cptr0 = vload4(0, &fcoef[0]);
|
|
||||||
float4 cptr1 = vload4(1, &fcoef[0]);
|
|
||||||
#if MAX_ORDER > 8
|
|
||||||
float4 cptr2 = vload4(2, &fcoef[0]);
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
||||||
{
|
{
|
||||||
// fetch samples
|
// fetch samples
|
||||||
@@ -491,11 +484,10 @@ void clEstimateResidual(
|
|||||||
|
|
||||||
// compute residual
|
// compute residual
|
||||||
__local float* dptr = &data[tid + GROUP_SIZE - ro];
|
__local float* dptr = &data[tid + GROUP_SIZE - ro];
|
||||||
#ifdef AMD
|
float4 sum = vload4(0, &fcoef[0]) * vload4(0, dptr)
|
||||||
float4 sum = cptr0 * vload4(0, dptr)
|
+ vload4(1, &fcoef[0]) * vload4(1, dptr)
|
||||||
+ cptr1 * vload4(1, dptr)
|
|
||||||
#if MAX_ORDER > 8
|
#if MAX_ORDER > 8
|
||||||
+ cptr2 * vload4(2, dptr)
|
+ vload4(2, &fcoef[0]) * vload4(2, dptr)
|
||||||
#if MAX_ORDER > 12
|
#if MAX_ORDER > 12
|
||||||
+ vload4(3, &fcoef[0]) * vload4(3, dptr)
|
+ vload4(3, &fcoef[0]) * vload4(3, dptr)
|
||||||
#if MAX_ORDER > 16
|
#if MAX_ORDER > 16
|
||||||
@@ -509,25 +501,33 @@ void clEstimateResidual(
|
|||||||
;
|
;
|
||||||
|
|
||||||
int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w);
|
int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w);
|
||||||
#else
|
|
||||||
float sum =
|
|
||||||
fcoef[0] * dptr[0] + fcoef[1] * dptr[1] + fcoef[2] * dptr[2] + fcoef[3] * dptr[3] +
|
|
||||||
fcoef[4] * dptr[4] + fcoef[5] * dptr[5] + fcoef[6] * dptr[6] + fcoef[7] * dptr[7] +
|
|
||||||
fcoef[8] * dptr[8] + fcoef[9] * dptr[9] + fcoef[10] * dptr[10] + fcoef[11] * dptr[11] ;
|
|
||||||
int t = convert_int_rte(nextData + sum);
|
|
||||||
#endif
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
data[tid] = nextData;
|
|
||||||
// ensure we're within frame bounds
|
// ensure we're within frame bounds
|
||||||
t = select(0, t, offs >= ro && offs < bs);
|
t = select(0, t, offs >= ro && offs < bs);
|
||||||
// overflow protection
|
// overflow protection
|
||||||
t = iclamp(t, -0x7fffff, 0x7fffff);
|
t = iclamp(t, -0x7fffff, 0x7fffff);
|
||||||
// convert to unsigned
|
// convert to unsigned
|
||||||
//if (offs < bs)
|
#ifdef AMD
|
||||||
atom_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31));
|
data[tid] = nextData;
|
||||||
|
atom_add(&psum[min(63,offs >> partOrder)], (t << 1) ^ (t >> 31));
|
||||||
|
#else
|
||||||
|
data[tid] = (t << 1) ^ (t >> 31);
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
int ps = (1 << partOrder) - 1;
|
||||||
|
for (int l = 1 << (partOrder - 1); l > 0; l >>= 1)
|
||||||
|
{
|
||||||
|
if ((tid & ps) < l)
|
||||||
|
data[tid] += data[tid + l];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
if ((tid & ps) == 0)
|
||||||
|
psum[min(63,offs >> partOrder)] += data[tid];
|
||||||
|
data[tid] = nextData;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// calculate rice partition bit length for every (1 << partOrder) samples
|
// calculate rice partition bit length for every (1 << partOrder) samples
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
if (tid < 64)
|
if (tid < 64)
|
||||||
{
|
{
|
||||||
int k = iclamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
|
int k = iclamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
|
||||||
|
|||||||
@@ -131,8 +131,9 @@ void clFindWastedBits(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define TEMPBLOCK 64
|
#define TEMPBLOCK 128
|
||||||
|
|
||||||
|
#if 0
|
||||||
// get_num_groups(0) == number of tasks
|
// get_num_groups(0) == number of tasks
|
||||||
// get_num_groups(1) == number of windows
|
// get_num_groups(1) == number of windows
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
@@ -173,6 +174,78 @@ void clComputeAutocor(
|
|||||||
for (int i = 0; i <= MAX_ORDER; ++i)
|
for (int i = 0; i <= MAX_ORDER; ++i)
|
||||||
pout[i] = ac[i];
|
pout[i] = ac[i];
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
#define STORE_AC(ro, val) if (ro <= MAX_ORDER) pout[ro] = val;
|
||||||
|
#define STORE_AC4(ro, val) STORE_AC(ro*4+0, val##ro.x) STORE_AC(ro*4+1, val##ro.y) STORE_AC(ro*4+2, val##ro.z) STORE_AC(ro*4+3, val##ro.w)
|
||||||
|
|
||||||
|
// get_num_groups(0) == number of tasks
|
||||||
|
// get_num_groups(1) == number of windows
|
||||||
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
|
void clComputeAutocor(
|
||||||
|
__global float *output,
|
||||||
|
__global const int *samples,
|
||||||
|
__global const float *window,
|
||||||
|
__global FLACCLSubframeTask *tasks,
|
||||||
|
const int taskCount // tasks per block
|
||||||
|
)
|
||||||
|
{
|
||||||
|
FLACCLSubframeData task = tasks[get_group_id(0) * taskCount].data;
|
||||||
|
int len = task.blocksize;
|
||||||
|
int windowOffs = get_group_id(1) * len;
|
||||||
|
float data[TEMPBLOCK + MAX_ORDER + 3];
|
||||||
|
double4 ac0 = 0.0, ac1 = 0.0, ac2 = 0.0, ac3 = 0.0, ac4 = 0.0, ac5 = 0.0, ac6 = 0.0, ac7 = 0.0, ac8 = 0.0;
|
||||||
|
|
||||||
|
for (int pos = 0; pos < len; pos += TEMPBLOCK)
|
||||||
|
{
|
||||||
|
for (int tid = 0; tid < TEMPBLOCK + MAX_ORDER + 3; tid++)
|
||||||
|
data[tid] = tid < len - pos ? samples[task.samplesOffs + pos + tid] * window[windowOffs + pos + tid] : 0.0f;
|
||||||
|
|
||||||
|
for (int j = 0; j < TEMPBLOCK;)
|
||||||
|
{
|
||||||
|
float4 temp0 = 0.0f, temp1 = 0.0f, temp2 = 0.0f, temp3 = 0.0f, temp4 = 0.0f, temp5 = 0.0f, temp6 = 0.0f, temp7 = 0.0f, temp8 = 0.0f;
|
||||||
|
for (int k = 0; k < 32; k++)
|
||||||
|
{
|
||||||
|
float d0 = data[j];
|
||||||
|
temp0 += d0 * vload4(0, &data[j]);
|
||||||
|
temp1 += d0 * vload4(1, &data[j]);
|
||||||
|
#if MAX_ORDER >= 8
|
||||||
|
temp2 += d0 * vload4(2, &data[j]);
|
||||||
|
#if MAX_ORDER >= 12
|
||||||
|
temp3 += d0 * vload4(3, &data[j]);
|
||||||
|
#if MAX_ORDER >= 16
|
||||||
|
temp4 += d0 * vload4(4, &data[j]);
|
||||||
|
temp5 += d0 * vload4(5, &data[j]);
|
||||||
|
temp6 += d0 * vload4(6, &data[j]);
|
||||||
|
temp7 += d0 * vload4(7, &data[j]);
|
||||||
|
temp8 += d0 * vload4(8, &data[j]);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
ac0 += convert_double4(temp0);
|
||||||
|
ac1 += convert_double4(temp1);
|
||||||
|
#if MAX_ORDER >= 8
|
||||||
|
ac2 += convert_double4(temp2);
|
||||||
|
#if MAX_ORDER >= 12
|
||||||
|
ac3 += convert_double4(temp3);
|
||||||
|
#if MAX_ORDER >= 16
|
||||||
|
ac4 += convert_double4(temp4);
|
||||||
|
ac5 += convert_double4(temp5);
|
||||||
|
ac6 += convert_double4(temp6);
|
||||||
|
ac7 += convert_double4(temp7);
|
||||||
|
ac8 += convert_double4(temp8);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
__global float * pout = &output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1)];
|
||||||
|
STORE_AC4(0, ac) STORE_AC4(1, ac) STORE_AC4(2, ac) STORE_AC4(3, ac)
|
||||||
|
STORE_AC4(4, ac) STORE_AC4(5, ac) STORE_AC4(6, ac) STORE_AC4(7, ac)
|
||||||
|
STORE_AC4(8, ac)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
void clComputeLPC(
|
void clComputeLPC(
|
||||||
|
|||||||
Reference in New Issue
Block a user