diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index 1d8511d..d3777a8 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -474,6 +474,13 @@ void clEstimateResidual( barrier(CLK_LOCAL_MEM_FENCE); +#ifdef AMD + float4 fc0 = vload4(0, &fcoef[0]); + float4 fc1 = vload4(1, &fcoef[0]); +#if MAX_ORDER > 8 + float4 fc2 = vload4(2, &fcoef[0]); +#endif +#endif for (int pos = 0; pos < bs; pos += GROUP_SIZE) { // fetch samples @@ -484,10 +491,20 @@ void clEstimateResidual( // compute residual __local float* dptr = &data[tid + GROUP_SIZE - ro]; - float4 sum = vload4(0, &fcoef[0]) * vload4(0, dptr) + float4 sum +#ifdef AMD + = fc0 * vload4(0, dptr) + + fc1 * vload4(1, dptr) +#else + = vload4(0, &fcoef[0]) * vload4(0, dptr) + vload4(1, &fcoef[0]) * vload4(1, dptr) +#endif #if MAX_ORDER > 8 +#ifdef AMD + + fc2 * vload4(2, dptr) +#else + vload4(2, &fcoef[0]) * vload4(2, dptr) +#endif #if MAX_ORDER > 12 + vload4(3, &fcoef[0]) * vload4(3, dptr) #if MAX_ORDER > 16 @@ -502,15 +519,20 @@ void clEstimateResidual( int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef AMD + data[tid] = nextData; // ensure we're within frame bounds t = select(0, t, offs >= ro && offs < bs); // overflow protection t = iclamp(t, -0x7fffff, 0x7fffff); // convert to unsigned -#ifdef AMD - data[tid] = nextData; atom_add(&psum[min(63,offs >> partOrder)], (t << 1) ^ (t >> 31)); #else + // ensure we're within frame bounds + t = select(0, t, offs >= ro && offs < bs); + // overflow protection + t = iclamp(t, -0x7fffff, 0x7fffff); + // convert to unsigned data[tid] = (t << 1) ^ (t >> 31); barrier(CLK_LOCAL_MEM_FENCE); int ps = (1 << partOrder) - 1;