diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index c6bd794..c839e17 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -1826,7 +1826,7 @@ namespace CUETools.Codecs.FLACCL public string Path { get { return _path; } } - public static readonly string vendor_string = "FLACCL#0.3"; + public static readonly string vendor_string = "FLACCL#0.4"; int select_blocksize(int samplerate, int time_ms) { diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index a7d3bd8..2cc5a3b 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -303,6 +303,7 @@ void clComputeAutocor( #else // get_num_groups(0) == number of tasks // get_num_groups(1) == number of windows +#if 0 __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) void clComputeAutocor( __global float *output, @@ -346,7 +347,6 @@ void clComputeAutocor( int lag = tid & (THREADS_FOR_ORDERS - 1); int tid1 = tid + GROUP_SIZE - lag; -//#if 1 #ifdef AMD float4 res = 0.0f; for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++) @@ -380,6 +380,114 @@ void clComputeAutocor( if (tid <= MAX_ORDER) output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = data[tid]; } +#else +__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) +void clComputeAutocor( + __global float *output, + __global const int *samples, + __global const float *window, + __global FLACCLSubframeTask *tasks, + const int taskCount // tasks per block +) +{ + __local float data[GROUP_SIZE * 2 + 32]; + __local FLACCLSubframeData task; + const int tid = get_local_id(0); + // fetch task data + if (tid < sizeof(task) / sizeof(int)) + ((__local int*)&task)[tid] = ((__global int*)(tasks + taskCount * get_group_id(0)))[tid]; + barrier(CLK_LOCAL_MEM_FENCE); + + int bs = task.blocksize; + data[tid] = 0.0f; + if (tid < 32) + data[GROUP_SIZE * 2 + tid] = 0.0f; + + const int THREADS_FOR_ORDERS = MAX_ORDER < 8 ? 8 : MAX_ORDER < 16 ? 16 : MAX_ORDER < 32 ? 32 : 64; + int lag = tid & (THREADS_FOR_ORDERS - 1); + int tid1 = tid + GROUP_SIZE - lag; + int pos = 0; + const __global float * wptr = &window[get_group_id(1) * bs]; +#ifdef AMD + float4 corr = 0.0f; +#else + float corr = 0.0f; +#endif + float corr1 = 0.0f; + for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE) + { + // fetch samples + int off = pos + tid; +// const __global int * sptr = &samples[task.samplesOffs]; + float nextData = samples[task.samplesOffs + off] * wptr[off]; + data[tid + GROUP_SIZE] = nextData; + barrier(CLK_LOCAL_MEM_FENCE); + +#ifdef AMD + for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++) + corr += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]); +#else + for (int i = 0; i < THREADS_FOR_ORDERS; i++) + corr += data[tid1 - lag + i] * data[tid1 + i]; +#endif + + if ((pos & (GROUP_SIZE * 15)) == 0) + { +#ifdef AMD + corr1 += (corr.x + corr.y) + (corr.w + corr.z); +#else + corr1 += corr; +#endif + corr = 0.0f; + } + + barrier(CLK_LOCAL_MEM_FENCE); + data[tid] = nextData; + } + if (pos < bs) + { + // fetch samples + int off = pos + tid; + float nextData = off < bs ? samples[task.samplesOffs + off] * wptr[off] : 0.0f; + data[tid + GROUP_SIZE] = nextData; + barrier(CLK_LOCAL_MEM_FENCE); + + int lag = tid & (THREADS_FOR_ORDERS - 1); + int tid1 = tid + GROUP_SIZE - lag; +//#if 1 +#ifdef AMD + float4 res = 0.0f; + for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++) + res += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]); + corr1 += res.x + res.y + res.w + res.z; +#else + for (int i = 0; i < THREADS_FOR_ORDERS; i++) + corr1 += data[tid1 - lag + i] * data[tid1 + i]; +#endif + + barrier(CLK_LOCAL_MEM_FENCE); + data[tid] = nextData; + } + +#ifdef AMD + corr1 += corr.x + corr.y + corr.w + corr.z; +#else + corr1 += corr; +#endif + + data[tid] = corr1; + barrier(CLK_LOCAL_MEM_FENCE); + for (int i = GROUP_SIZE / 2; i >= THREADS_FOR_ORDERS; i >>= 1) + { + if (tid < i) + data[tid] += data[tid + i]; + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (tid <= MAX_ORDER) + output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = data[tid]; +} +#endif #endif #ifdef FLACCL_CPU @@ -915,11 +1023,63 @@ void clEstimateResidual( #if MAX_ORDER > 8 float4 fc2 = vload4(2, &fcoef[0]); #endif - for (int pos = 0; pos < bs; pos += GROUP_SIZE) + __global int * rptr = &samples[task.data.samplesOffs]; + int wb = task.data.wbits; + int pos; + for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE) { // fetch samples int offs = pos + tid; - float nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0.0f; + float nextData = rptr[offs] >> wb; + data[tid + GROUP_SIZE] = nextData; + barrier(CLK_LOCAL_MEM_FENCE); + + // compute residual + __local float* dptr = &data[tid + GROUP_SIZE - MAX_ORDER]; + float4 sum4 + = fc0 * vload4(0, dptr) + + fc1 * vload4(1, dptr) +#if MAX_ORDER > 8 + + fc2 * vload4(2, dptr) + #if MAX_ORDER > 12 + + vload4(3, &fcoef[0]) * vload4(3, dptr) + #if MAX_ORDER > 16 + + vload4(4, &fcoef[0]) * vload4(4, dptr) + + vload4(5, &fcoef[0]) * vload4(5, dptr) + + vload4(6, &fcoef[0]) * vload4(6, dptr) + + vload4(7, &fcoef[0]) * vload4(7, dptr) + #endif + #endif +#endif + ; + + float2 sum2 = sum4.s01 + sum4.s23; + int t = convert_int_rte(nextData + (sum2.s0 + sum2.s1)); +// int t = (int)(nextData + sum.x + sum.y + sum.z + sum.w); + barrier(CLK_LOCAL_MEM_FENCE); + data[tid] = nextData; + // ensure we're within frame bounds + t = select(0, t, offs >= ro); + // overflow protection + t = iclamp(t, -0x7fffff, 0x7fffff); + // convert to unsigned + t = (t << 1) ^ (t >> 31); +#if !defined(AMD) || !defined(HAVE_ATOM) + idata[tid] = t; + for (int l = 16; l > 1; l >>= 1) + idata[tid] += idata[tid + l]; + if ((tid & 31) == 0) + psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1]; +#else + atom_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t); +#endif + } +#if 1 + if (pos < bs) + { + // fetch samples + int offs = pos + tid; + float nextData = offs < bs ? rptr[offs] >> wb : 0.0f; data[tid + GROUP_SIZE] = nextData; barrier(CLK_LOCAL_MEM_FENCE); @@ -961,6 +1121,7 @@ void clEstimateResidual( atom_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t); #endif } +#endif // calculate rice partition bit length for every 32 samples barrier(CLK_LOCAL_MEM_FENCE);