diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index 63f2d44..1d8511d 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -474,13 +474,6 @@ void clEstimateResidual( barrier(CLK_LOCAL_MEM_FENCE); -#ifdef AMD - float4 cptr0 = vload4(0, &fcoef[0]); - float4 cptr1 = vload4(1, &fcoef[0]); -#if MAX_ORDER > 8 - float4 cptr2 = vload4(2, &fcoef[0]); -#endif -#endif for (int pos = 0; pos < bs; pos += GROUP_SIZE) { // fetch samples @@ -491,11 +484,10 @@ void clEstimateResidual( // compute residual __local float* dptr = &data[tid + GROUP_SIZE - ro]; -#ifdef AMD - float4 sum = cptr0 * vload4(0, dptr) - + cptr1 * vload4(1, dptr) + float4 sum = vload4(0, &fcoef[0]) * vload4(0, dptr) + + vload4(1, &fcoef[0]) * vload4(1, dptr) #if MAX_ORDER > 8 - + cptr2 * vload4(2, dptr) + + vload4(2, &fcoef[0]) * vload4(2, dptr) #if MAX_ORDER > 12 + vload4(3, &fcoef[0]) * vload4(3, dptr) #if MAX_ORDER > 16 @@ -509,25 +501,33 @@ void clEstimateResidual( ; int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w); -#else - float sum = - fcoef[0] * dptr[0] + fcoef[1] * dptr[1] + fcoef[2] * dptr[2] + fcoef[3] * dptr[3] + - fcoef[4] * dptr[4] + fcoef[5] * dptr[5] + fcoef[6] * dptr[6] + fcoef[7] * dptr[7] + - fcoef[8] * dptr[8] + fcoef[9] * dptr[9] + fcoef[10] * dptr[10] + fcoef[11] * dptr[11] ; - int t = convert_int_rte(nextData + sum); -#endif barrier(CLK_LOCAL_MEM_FENCE); - data[tid] = nextData; // ensure we're within frame bounds t = select(0, t, offs >= ro && offs < bs); // overflow protection t = iclamp(t, -0x7fffff, 0x7fffff); // convert to unsigned - //if (offs < bs) - atom_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31)); +#ifdef AMD + data[tid] = nextData; + atom_add(&psum[min(63,offs >> partOrder)], (t << 1) ^ (t >> 31)); +#else + data[tid] = (t << 1) ^ (t >> 31); + barrier(CLK_LOCAL_MEM_FENCE); + int ps = (1 << partOrder) - 1; + for (int l = 1 << (partOrder - 1); l > 0; l >>= 1) + { + if ((tid & ps) < l) + data[tid] += data[tid + l]; + barrier(CLK_LOCAL_MEM_FENCE); + } + if ((tid & ps) == 0) + psum[min(63,offs >> partOrder)] += data[tid]; + data[tid] = nextData; +#endif } // calculate rice partition bit length for every (1 << partOrder) samples + barrier(CLK_LOCAL_MEM_FENCE); if (tid < 64) { int k = iclamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16) diff --git a/CUETools.Codecs.FLACCL/flaccpu.cl b/CUETools.Codecs.FLACCL/flaccpu.cl index 0aa06a1..8b3f562 100644 --- a/CUETools.Codecs.FLACCL/flaccpu.cl +++ b/CUETools.Codecs.FLACCL/flaccpu.cl @@ -131,8 +131,9 @@ void clFindWastedBits( } } -#define TEMPBLOCK 64 +#define TEMPBLOCK 128 +#if 0 // get_num_groups(0) == number of tasks // get_num_groups(1) == number of windows __kernel __attribute__((reqd_work_group_size(1, 1, 1))) @@ -173,6 +174,78 @@ void clComputeAutocor( for (int i = 0; i <= MAX_ORDER; ++i) pout[i] = ac[i]; } +#else +#define STORE_AC(ro, val) if (ro <= MAX_ORDER) pout[ro] = val; +#define STORE_AC4(ro, val) STORE_AC(ro*4+0, val##ro.x) STORE_AC(ro*4+1, val##ro.y) STORE_AC(ro*4+2, val##ro.z) STORE_AC(ro*4+3, val##ro.w) + +// get_num_groups(0) == number of tasks +// get_num_groups(1) == number of windows +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clComputeAutocor( + __global float *output, + __global const int *samples, + __global const float *window, + __global FLACCLSubframeTask *tasks, + const int taskCount // tasks per block +) +{ + FLACCLSubframeData task = tasks[get_group_id(0) * taskCount].data; + int len = task.blocksize; + int windowOffs = get_group_id(1) * len; + float data[TEMPBLOCK + MAX_ORDER + 3]; + double4 ac0 = 0.0, ac1 = 0.0, ac2 = 0.0, ac3 = 0.0, ac4 = 0.0, ac5 = 0.0, ac6 = 0.0, ac7 = 0.0, ac8 = 0.0; + + for (int pos = 0; pos < len; pos += TEMPBLOCK) + { + for (int tid = 0; tid < TEMPBLOCK + MAX_ORDER + 3; tid++) + data[tid] = tid < len - pos ? samples[task.samplesOffs + pos + tid] * window[windowOffs + pos + tid] : 0.0f; + + for (int j = 0; j < TEMPBLOCK;) + { + float4 temp0 = 0.0f, temp1 = 0.0f, temp2 = 0.0f, temp3 = 0.0f, temp4 = 0.0f, temp5 = 0.0f, temp6 = 0.0f, temp7 = 0.0f, temp8 = 0.0f; + for (int k = 0; k < 32; k++) + { + float d0 = data[j]; + temp0 += d0 * vload4(0, &data[j]); + temp1 += d0 * vload4(1, &data[j]); +#if MAX_ORDER >= 8 + temp2 += d0 * vload4(2, &data[j]); +#if MAX_ORDER >= 12 + temp3 += d0 * vload4(3, &data[j]); +#if MAX_ORDER >= 16 + temp4 += d0 * vload4(4, &data[j]); + temp5 += d0 * vload4(5, &data[j]); + temp6 += d0 * vload4(6, &data[j]); + temp7 += d0 * vload4(7, &data[j]); + temp8 += d0 * vload4(8, &data[j]); +#endif +#endif +#endif + j++; + } + ac0 += convert_double4(temp0); + ac1 += convert_double4(temp1); + #if MAX_ORDER >= 8 + ac2 += convert_double4(temp2); + #if MAX_ORDER >= 12 + ac3 += convert_double4(temp3); + #if MAX_ORDER >= 16 + ac4 += convert_double4(temp4); + ac5 += convert_double4(temp5); + ac6 += convert_double4(temp6); + ac7 += convert_double4(temp7); + ac8 += convert_double4(temp8); + #endif + #endif + #endif + } + } + __global float * pout = &output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1)]; + STORE_AC4(0, ac) STORE_AC4(1, ac) STORE_AC4(2, ac) STORE_AC4(3, ac) + STORE_AC4(4, ac) STORE_AC4(5, ac) STORE_AC4(6, ac) STORE_AC4(7, ac) + STORE_AC4(8, ac) +} +#endif __kernel __attribute__((reqd_work_group_size(1, 1, 1))) void clComputeLPC(