diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index c15f583..2970e40 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -354,6 +354,19 @@ namespace CUETools.Codecs.FLACCL if (inited) { _IO.Close(); + if (task2.frameCount > 0) + { + if (cpu_tasks != null) + { + for (int i = 0; i < cpu_tasks.Length; i++) + { + wait_for_cpu_task(); + oldest_cpu_task = (oldest_cpu_task + 1) % cpu_tasks.Length; + } + } + task2.openCLCQ.Finish(); // cuda.SynchronizeStream(task2.stream); + task2.frameCount = 0; + } task1.Dispose(); task2.Dispose(); if (cpu_tasks != null) @@ -685,6 +698,38 @@ namespace CUETools.Codecs.FLACCL } } + /// + /// Special case when (n >> pmax) == 32 + /// + /// + /// + /// + /// + /// + /// + static unsafe void calc_sums32(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums) + { + int parts = (1 << pmax); + uint* res = data + pred_order; + uint cnt = 32 - pred_order; + ulong sum = 0UL; + for (uint j = cnt; j > 0; j--) + sum += *(res++); + sums[0] = sum; + for (int i = 1; i < parts; i++) + { + sums[i] = 0UL + + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++); + } + } + /// /// Special case when (n >> pmax) == 18 /// @@ -728,7 +773,9 @@ namespace CUETools.Codecs.FLACCL udata[i] = (uint)((data[i] << 1) ^ (data[i] >> 31)); // sums for highest level - if ((n >> pmax) == 18) + if ((n >> pmax) == 32) + calc_sums32(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS); + else if ((n >> pmax) == 18) calc_sums18(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS); else if ((n >> pmax) == 16) calc_sums16(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS); @@ -1726,6 +1773,7 @@ namespace CUETools.Codecs.FLACCL OCLMan.Defines = "#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" + "#define GROUP_SIZE " + groupSize.ToString() + "\n" + + "#define GROUP_SIZE_LOG " + BitReader.log2i(groupSize).ToString() + "\n" + "#define FLACCL_VERSION \"" + Vendor + "\"\n" + (UseGPUOnly ? "#define DO_PARTITIONS\n" : "") + (UseGPURice ? "#define DO_RICE\n" : "") + @@ -2437,6 +2485,7 @@ namespace CUETools.Codecs.FLACCL public Kernel clEncodeResidual; public Kernel clCalcPartition; public Kernel clCalcPartition16; + public Kernel clCalcPartition32; public Kernel clSumPartition; public Kernel clFindRiceParameter; public Kernel clFindPartitionOrder; @@ -2530,7 +2579,8 @@ namespace CUETools.Codecs.FLACCL int MAX_CHANNELSIZE = MAX_FRAMES * ((writer.m_blockSize + 3) & ~3); residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES; bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES; - int samplesBufferLen = writer.Settings.PCM.BlockAlign * MAX_CHANNELSIZE * channelsCount; + int samplesBytesLen = writer.Settings.PCM.BlockAlign * MAX_CHANNELSIZE; + int samplesBufferLen = sizeof(int) * MAX_CHANNELSIZE * channelsCount; int residualBufferLen = sizeof(int) * MAX_CHANNELSIZE * channels; // need to adjust residualOffset? int partitionsLen = sizeof(int) * ((writer.Settings.PCM.BitsPerSample > 16 ? 31 : 15) * 2 << 8) * channels * MAX_FRAMES; int riceParamsLen = sizeof(int) * (4 << 8) * channels * MAX_FRAMES; @@ -2543,7 +2593,7 @@ namespace CUETools.Codecs.FLACCL if (!this.UseMappedMemory) { - clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen / 2); + clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBytesLen); clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualBufferLen); clBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen / 4); clResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualTasksLen); @@ -2552,7 +2602,7 @@ namespace CUETools.Codecs.FLACCL clSelectedTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen); clRiceOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceLen); - clSamplesBytesPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen / 2); + clSamplesBytesPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBytesLen); clResidualPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualBufferLen); clBestRiceParamsPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4); clResidualTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen); @@ -2561,7 +2611,7 @@ namespace CUETools.Codecs.FLACCL clSelectedTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen); clRiceOutputPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen); - clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytesPinned, true, MapFlags.READ_WRITE, 0, samplesBufferLen / 2); + clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytesPinned, true, MapFlags.READ_WRITE, 0, samplesBytesLen); clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidualPinned, true, MapFlags.READ_WRITE, 0, residualBufferLen); clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParamsPinned, true, MapFlags.READ_WRITE, 0, riceParamsLen / 4); clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasksPinned, true, MapFlags.READ_WRITE, 0, residualTasksLen); @@ -2572,7 +2622,7 @@ namespace CUETools.Codecs.FLACCL } else { - clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, (uint)samplesBufferLen / 2); + clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, (uint)samplesBytesLen); clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualBufferLen); clBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4); clResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen); @@ -2581,7 +2631,7 @@ namespace CUETools.Codecs.FLACCL clSelectedTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen); clRiceOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen); - clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.READ_WRITE, 0, samplesBufferLen / 2); + clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.READ_WRITE, 0, samplesBytesLen); clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidual, true, MapFlags.READ_WRITE, 0, residualBufferLen); clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParams, true, MapFlags.READ_WRITE, 0, riceParamsLen / 4); clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasks, true, MapFlags.READ_WRITE, 0, residualTasksLen); @@ -2630,6 +2680,7 @@ namespace CUETools.Codecs.FLACCL { clCalcPartition = openCLProgram.CreateKernel("clCalcPartition"); clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16"); + clCalcPartition32 = openCLProgram.CreateKernel("clCalcPartition32"); } clSumPartition = openCLProgram.CreateKernel("clSumPartition"); clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter"); @@ -2684,6 +2735,7 @@ namespace CUETools.Codecs.FLACCL { clCalcPartition.Dispose(); clCalcPartition16.Dispose(); + clCalcPartition32.Dispose(); } clSumPartition.Dispose(); clFindRiceParameter.Dispose(); @@ -2759,7 +2811,9 @@ namespace CUETools.Codecs.FLACCL clSelectedTasks.Dispose(); clRiceOutput.Dispose(); + openCLCQ.Finish(); openCLCQ.Dispose(); + openCLCQ = null; GC.SuppressFinalize(this); } @@ -2808,6 +2862,9 @@ namespace CUETools.Codecs.FLACCL } else { + // channelSize == blockSize * nFrames; + // clSamplesBytes length = blockSize * nFrames * blockalign + // clSamples length = 4 * blockSize * nFrames * channels clChannelDecorrX.SetArgs( clSamples, clSamplesBytes, @@ -2982,6 +3039,18 @@ namespace CUETools.Codecs.FLACCL clCalcPartition16, groupSize, channels * frameCount); } + else if (frameSize >> max_porder == 32) + { + clCalcPartition32.SetArgs( + clPartitions, + clResidual, + clBestResidualTasks, + max_porder); + + openCLCQ.EnqueueNDRangeKernel( + clCalcPartition32, + groupSize, channels * frameCount); + } else { clCalcPartition.SetArgs( diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index 80d23e5..2b6829e 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -1519,12 +1519,76 @@ void clCalcPartition16( for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16) { - int k1 = k0 + (tid >> 3), x1 = tid & 7; + int k1 = k0 + (tid >> (GROUP_SIZE_LOG - 4)), x1 = tid & ((1 << (GROUP_SIZE_LOG - 4)) - 1); if (k1 <= MAX_RICE_PARAM && (pos >> 4) + x1 < (1 << max_porder)) partition_lengths[((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1]; } } } +__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) +void clCalcPartition32( + __global unsigned int *partition_lengths, + __global int *residual, + __global FLACCLSubframeTask *tasks, + int max_porder // <= 8 + ) +{ + __local FLACCLSubframeData task; + __local unsigned int res[GROUP_SIZE]; + __local unsigned int pl[GROUP_SIZE >> 5][32]; + const int tid = get_local_id(0); + if (tid < sizeof(task) / sizeof(int)) + ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid]; + barrier(CLK_LOCAL_MEM_FENCE); + + int bs = task.blocksize; + int ro = task.residualOrder; + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int pos = 0; pos < bs; pos += GROUP_SIZE) + { + int offs = pos + tid; + // fetch residual + int s = (offs >= ro && offs < bs) ? residual[task.residualOffs + offs] : 0; + // convert to unsigned + res[tid] = (s << 1) ^ (s >> 31); + + barrier(CLK_LOCAL_MEM_FENCE); + + // we must ensure that psize * (t >> k) doesn't overflow; + uint4 lim = 0x07ffffffU; + int x = tid >> 5; + __local uint * chunk = &res[x << 5]; + // calc number of unary bits for each group of 32 residual samples + // with each rice parameter. + int k = tid & 31; + uint4 rsum + = min(lim, vload4(0,chunk) >> k) + + min(lim, vload4(1,chunk) >> k) + + min(lim, vload4(2,chunk) >> k) + + min(lim, vload4(3,chunk) >> k) + + min(lim, vload4(4,chunk) >> k) + + min(lim, vload4(5,chunk) >> k) + + min(lim, vload4(6,chunk) >> k) + + min(lim, vload4(7,chunk) >> k) + ; + uint rs = rsum.x + rsum.y + rsum.z + rsum.w; + + // We can safely limit length here to 0x007fffffU, not causing length + // mismatch, because any such length would cause Verbatim frame anyway. + // And this limit protects us from overflows when calculating larger + // partitions, as we can have a maximum of 2^8 partitions, resulting + // in maximum partition length of 0x7fffffffU + change. + if (k <= MAX_RICE_PARAM) pl[x][k] = min(0x007fffffU, rs) + (uint)(32 - select(0, ro, offs < 32)) * (k + 1); + + barrier(CLK_LOCAL_MEM_FENCE); + + int k1 = (tid >> (GROUP_SIZE_LOG - 5)), x1 = tid & ((1 << (GROUP_SIZE_LOG - 5)) - 1); + if (k1 <= MAX_RICE_PARAM && (pos >> 5) + x1 < (1 << max_porder)) + partition_lengths[((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 5) + x1] = pl[x1][k1]; + } +} #endif #ifdef FLACCL_CPU