mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
testing on Fermi
This commit is contained in:
@@ -799,7 +799,7 @@ void clEstimateResidual(
|
|||||||
__global FLACCLSubframeTask *tasks
|
__global FLACCLSubframeTask *tasks
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
__local float data[GROUP_SIZE * 2];
|
__local float data[GROUP_SIZE * 2 + 32];
|
||||||
__local FLACCLSubframeTask task;
|
__local FLACCLSubframeTask task;
|
||||||
__local int psum[64];
|
__local int psum[64];
|
||||||
__local float fcoef[32];
|
__local float fcoef[32];
|
||||||
@@ -818,11 +818,14 @@ void clEstimateResidual(
|
|||||||
int bs = task.data.blocksize;
|
int bs = task.data.blocksize;
|
||||||
|
|
||||||
if (tid < 32)
|
if (tid < 32)
|
||||||
fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
|
//fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
|
||||||
//fcoef[tid] = select(0.0f, - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift), tid + ro >= MAX_ORDER && tid < MAX_ORDER);
|
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
|
||||||
if (tid < 64)
|
if (tid < 64)
|
||||||
psum[tid] = 0;
|
psum[tid] = 0;
|
||||||
data[tid] = 0.0f;
|
data[tid] = 0.0f;
|
||||||
|
// need to initialize "extra" data, because NaNs can produce wierd results even when multipled by zero extra coefs
|
||||||
|
if (tid < 32)
|
||||||
|
data[GROUP_SIZE * 2 + tid] = 0.0f;
|
||||||
|
|
||||||
int partOrder = max(1, clz(64) - clz(bs - 1) + 1);
|
int partOrder = max(1, clz(64) - clz(bs - 1) + 1);
|
||||||
|
|
||||||
@@ -844,7 +847,7 @@ void clEstimateResidual(
|
|||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
// compute residual
|
// compute residual
|
||||||
__local float* dptr = &data[tid + GROUP_SIZE - ro];
|
__local float* dptr = &data[tid + GROUP_SIZE - MAX_ORDER];
|
||||||
float4 sum
|
float4 sum
|
||||||
#ifdef AMD
|
#ifdef AMD
|
||||||
= fc0 * vload4(0, dptr)
|
= fc0 * vload4(0, dptr)
|
||||||
|
|||||||
Reference in New Issue
Block a user