diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index 536c935..d8295d1 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -2217,6 +2217,7 @@ namespace CUETools.Codecs.FLACCL public Kernel clComputeLPC; //public Kernel cudaComputeLPCLattice; public Kernel clQuantizeLPC; + public Kernel clSelectStereoTasks; public Kernel clEstimateResidual; public Kernel clChooseBestMethod; public Kernel clCopyBestMethod; @@ -2372,6 +2373,7 @@ namespace CUETools.Codecs.FLACCL clComputeLPC = openCLProgram.CreateKernel("clComputeLPC"); clQuantizeLPC = openCLProgram.CreateKernel("clQuantizeLPC"); //cudaComputeLPCLattice = openCLProgram.CreateKernel("clComputeLPCLattice"); + clSelectStereoTasks = openCLProgram.CreateKernel("clSelectStereoTasks"); clEstimateResidual = openCLProgram.CreateKernel("clEstimateResidual"); clChooseBestMethod = openCLProgram.CreateKernel("clChooseBestMethod"); clCopyBestMethod = openCLProgram.CreateKernel("clCopyBestMethod"); @@ -2421,6 +2423,7 @@ namespace CUETools.Codecs.FLACCL clComputeLPC.Dispose(); clQuantizeLPC.Dispose(); //cudaComputeLPCLattice.Dispose(); + clSelectStereoTasks.Dispose(); clEstimateResidual.Dispose(); clChooseBestMethod.Dispose(); clCopyBestMethod.Dispose(); @@ -2524,9 +2527,9 @@ namespace CUETools.Codecs.FLACCL clChannelDecorr.SetArgs( clSamples, clSamplesBytes, - FLACCLWriter.MAX_BLOCKSIZE); + FLACCLWriter.MAX_BLOCKSIZE/4); - openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, frameSize * frameCount); + openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, frameSize * frameCount / 4); if (eparams.do_wasted) { @@ -2723,7 +2726,6 @@ namespace CUETools.Codecs.FLACCL } } -#if HJHKHJ public static class OpenCLExtensions { public static void SetArgs(this Kernel kernel, params object[] args) @@ -2754,5 +2756,4 @@ namespace CUETools.Codecs.FLACCL queue.EnqueueNDRangeKernel(kernel, 2, null, new long[] { localSizeX * globalSizeX, localSizeY * globalSizeY }, new long[] { localSizeX, localSizeY }); } } -#endif } diff --git a/CUETools.Codecs.FLACCL/flaccpu.cl b/CUETools.Codecs.FLACCL/flaccpu.cl index 8b3f562..676dcf1 100644 --- a/CUETools.Codecs.FLACCL/flaccpu.cl +++ b/CUETools.Codecs.FLACCL/flaccpu.cl @@ -52,7 +52,8 @@ typedef struct int wbits; int abits; int porder; - int reserved[2]; + int ignore; + int reserved; } FLACCLSubframeData; typedef struct @@ -62,34 +63,60 @@ typedef struct } FLACCLSubframeTask; __kernel void clStereoDecorr( - __global int *samples, - __global short2 *src, + __global int4 *samples, + __global int4 *src, int offset ) { int pos = get_global_id(0); if (pos < offset) { - short2 s = src[pos]; - samples[pos] = s.x; - samples[1 * offset + pos] = s.y; - samples[2 * offset + pos] = (s.x + s.y) >> 1; - samples[3 * offset + pos] = s.x - s.y; + int4 s = src[pos]; + int4 x = (s << 16) >> 16; + int4 y = s >> 16; + samples[pos] = x; + samples[1 * offset + pos] = y; + samples[2 * offset + pos] = (x + y) >> 1; + samples[3 * offset + pos] = x - y; } } +__kernel void clWindowRectangle(__global float* window, int windowOffset) +{ + window[get_global_id(0)] = 1.0f; +} + +__kernel void clWindowFlattop(__global float* window, int windowOffset) +{ + float p = M_PI * get_global_id(0) / (get_global_size(0) - 1); + window[get_global_id(0)] = 1.0f + - 1.93f * cos(2 * p) + + 1.29f * cos(4 * p) + - 0.388f * cos(6 * p) + + 0.0322f * cos(8 * p); +} + +__kernel void clWindowTukey(__global float* window, int windowOffset, float p) +{ + int Np = (int)(p / 2.0f * get_global_size(0)) - 1; + int n = select(max(Np, get_global_id(0) - (get_global_size(0) - Np - 1) + Np), get_global_id(0), get_global_id(0) <= Np); + window[get_global_id(0)] = 0.5f - 0.5f * cos(M_PI * n / Np); +} + __kernel void clChannelDecorr2( - __global int *samples, - __global short2 *src, + __global int4 *samples, + __global int4 *src, int offset ) { int pos = get_global_id(0); if (pos < offset) { - short2 s = src[pos]; - samples[pos] = s.x; - samples[1 * offset + pos] = s.y; + int4 s = src[pos]; + int4 x = (s << 16) >> 16; + int4 y = s >> 16; + samples[pos] = x; + samples[1 * offset + pos] = y; } } @@ -128,6 +155,8 @@ void clFindWastedBits( { ptask[i].data.wbits = w; ptask[i].data.abits = a; + ptask[i].data.ignore = 0;//i != 0; + ptask[i].data.size = ptask[i].data.obits * ptask[i].data.blocksize; } } @@ -413,6 +442,20 @@ inline int calc_residual(__global int *ptr, int * coefs, int ro) default: ENCODE_N(ro, action) \ } +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clSelectStereoTasks( + __global FLACCLSubframeTask *tasks, + int count + ) +{ + for (int i = 0; i < count; i++) + { + __global FLACCLSubframeTask* ptask = tasks + count * get_group_id(0) + i; + ptask->data.ignore = i != 0; + ptask->data.size = ptask->data.obits * ptask->data.blocksize; + } +} + __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(1, 1, 1))) void clEstimateResidual( __global int*samples, @@ -425,6 +468,12 @@ void clEstimateResidual( #define EPO 6 int len[1 << EPO]; // blocksize / 64!!!! + if (task.data.ignore) + { + tasks[get_group_id(0)].data.size = task.data.obits * bs; + return; + } + __global int *data = &samples[task.data.samplesOffs]; // for (int i = ro; i < 32; i++) //task.coefs[i] = 0; @@ -579,13 +628,15 @@ void clCalcPartition16( int max_porder // <= 8 ) { - FLACCLSubframeTask task = tasks[get_group_id(0)]; + FLACCLSubframeTask task = tasks[get_global_id(0)]; int bs = task.data.blocksize; int ro = task.data.residualOrder; __global int *data = &samples[task.data.samplesOffs]; - __global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(0); + __global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_global_id(0); for (int p = 0; p < (1 << max_porder); p++) pl[p] = 0; + //__global int *rptr = residual + task.data.residualOffs; + //SWITCH_N((rptr[pos] = t, pl[pos >> 4] += (t << 1) ^ (t >> 31))); SWITCH_N((residual[task.data.residualOffs + pos] = t, t = clamp(t, -0x7fffff, 0x7fffff), t = (t << 1) ^ (t >> 31), pl[pos >> 4] += t)); }