From 349123ec19532712efd0ebeafa2f06d7e352333c Mon Sep 17 00:00:00 2001 From: chudov Date: Sun, 17 Oct 2010 05:35:11 +0000 Subject: [PATCH] opencl flac encoder --- CUETools.Codecs.FLACCL/FLACCLWriter.cs | 742 +++++++++++++------------ CUETools.Codecs.FLACCL/flac.cl | 82 +-- 2 files changed, 410 insertions(+), 414 deletions(-) diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index 0dc7f60..ce6ca00 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -98,7 +98,7 @@ namespace CUETools.Codecs.FLACCL // if 0, stream length is unknown int sample_count = -1; - FlakeEncodeParams eparams; + internal FlakeEncodeParams eparams; // maximum frame size in bytes // this can be used to allocate memory for output @@ -115,14 +115,12 @@ namespace CUETools.Codecs.FLACCL // allocated by flake_encode_init and freed by flake_encode_close byte[] header; - float[] windowBuffer; int samplesInBuffer = 0; int max_frames = 0; int _compressionLevel = 7; int _blocksize = 0; int _totalSize = 0; - int _windowsize = 0, _windowcount = 0; Crc8 crc8; Crc16 crc16; @@ -142,8 +140,6 @@ namespace CUETools.Codecs.FLACCL FLACCLTask[] cpu_tasks; int oldest_cpu_task = 0; - Mem cudaWindow; - AudioPCMConfig _pcm; public const int MAX_BLOCKSIZE = 4096 * 16; @@ -167,8 +163,6 @@ namespace CUETools.Codecs.FLACCL _path = path; _IO = IO; - windowBuffer = new float[FLACCLWriter.MAX_BLOCKSIZE * lpc.MAX_LPC_WINDOWS]; - eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly); eparams.padding_size = 8192; @@ -216,7 +210,7 @@ namespace CUETools.Codecs.FLACCL } } - FLACCLWriterSettings _settings = new FLACCLWriterSettings(); + internal FLACCLWriterSettings _settings = new FLACCLWriterSettings(); public object Settings { @@ -299,14 +293,13 @@ namespace CUETools.Codecs.FLACCL } _IO.Close(); - cudaWindow.Dispose(); task1.Dispose(); task2.Dispose(); if (cpu_tasks != null) foreach (FLACCLTask task in cpu_tasks) task.Dispose(); openCLProgram.Dispose(); - openCLContext.Dispose(); + OCLMan.Dispose(); inited = false; } } @@ -323,14 +316,13 @@ namespace CUETools.Codecs.FLACCL if (inited) { _IO.Close(); - cudaWindow.Dispose(); task1.Dispose(); task2.Dispose(); if (cpu_tasks != null) foreach (FLACCLTask task in cpu_tasks) task.Dispose(); openCLProgram.Dispose(); - openCLContext.Dispose(); + OCLMan.Dispose(); inited = false; } @@ -717,7 +709,7 @@ namespace CUETools.Codecs.FLACCL return opt_bits; } - static int get_max_p_order(int max_porder, int n, int order) + internal static int get_max_p_order(int max_porder, int n, int order) { int porder = Math.Min(max_porder, BitReader.log2i(n ^ (n - 1))); if (order > 0) @@ -847,6 +839,25 @@ namespace CUETools.Codecs.FLACCL return (uint)(sub.best.order * sub.obits + 9 + sub.best.order * sub.best.cbits + measure_residual(frame, sub)); } + unsafe uint + measure_subframe_fixed(FlacFrame frame, FlacSubframeInfo sub) + { + return (uint)(sub.best.order * sub.obits + measure_residual(frame, sub)); + } + + unsafe uint + measure_subframe(FlacFrame frame, FlacSubframeInfo sub) + { + switch (sub.best.type) + { + case SubframeType.Fixed: + return measure_subframe_fixed(frame, sub); + case SubframeType.LPC: + return measure_subframe_lpc(frame, sub); + } + throw new Exception("not supported subframe type"); + } + unsafe void output_subframe_lpc(FlacFrame frame, FlacSubframeInfo sub) { @@ -915,12 +926,12 @@ namespace CUETools.Codecs.FLACCL unsafe delegate void window_function(float* window, int size); - unsafe void calculate_window(float* window, window_function func, WindowFunction flag) + unsafe void calculate_window(FLACCLTask task, window_function func, WindowFunction flag) { - if ((eparams.window_function & flag) == 0 || _windowcount == lpc.MAX_LPC_WINDOWS) + if ((eparams.window_function & flag) == 0 || task.nWindowFunctions == lpc.MAX_LPC_WINDOWS) return; - func(window + _windowcount * _windowsize, _windowsize); + func(((float*)task.clWindowFunctions.HostPtr) + task.nWindowFunctions * task.frameSize, task.frameSize); //int sz = _windowsize; //float* pos = window + _windowcount * FLACCLWriter.MAX_BLOCKSIZE * 2; //do @@ -931,22 +942,35 @@ namespace CUETools.Codecs.FLACCL // pos += sz; // sz >>= 1; //} while (sz >= 32); - _windowcount++; + task.nWindowFunctions++; } unsafe void initializeSubframeTasks(int blocksize, int channelsCount, int nFrames, FLACCLTask task) { + task.frameSize = blocksize; + task.nWindowFunctions = 0; + if (task.frameSize > 4) + { + calculate_window(task, lpc.window_welch, WindowFunction.Welch); + calculate_window(task, lpc.window_flattop, WindowFunction.Flattop); + calculate_window(task, lpc.window_tukey, WindowFunction.Tukey); + calculate_window(task, lpc.window_hann, WindowFunction.Hann); + calculate_window(task, lpc.window_bartlett, WindowFunction.Bartlett); + if (task.nWindowFunctions == 0) + throw new Exception("invalid windowfunction"); + task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, true, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctions.HostPtr); + } + task.nResidualTasks = 0; task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window); - task.nResidualTasksPerChannel = _windowcount * task.nTasksPerWindow + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order; + task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order; //if (task.nResidualTasksPerChannel >= 4) // task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7; - task.nAutocorTasksPerChannel = _windowcount; for (int iFrame = 0; iFrame < nFrames; iFrame++) { for (int ch = 0; ch < channelsCount; ch++) { - for (int iWindow = 0; iWindow < _windowcount; iWindow++) + for (int iWindow = 0; iWindow < task.nWindowFunctions; iWindow++) { // LPC tasks for (int order = 0; order < task.nTasksPerWindow; order++) @@ -1032,10 +1056,8 @@ namespace CUETools.Codecs.FLACCL } if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen) throw new Exception("oops"); - task.openCLCQ.EnqueueWriteBuffer(task.cudaResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.residualTasksPtr.AddrOfPinnedObject()); - task.openCLCQ.EnqueueBarrier(); - task.frameSize = blocksize; + task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasks.HostPtr); } unsafe void encode_residual(FLACCLTask task) @@ -1075,7 +1097,7 @@ namespace CUETools.Codecs.FLACCL // check size if (_settings.GPUOnly) { - uint real_size = measure_subframe_lpc(task.frame, task.frame.subframes[ch]); + uint real_size = measure_subframe(task.frame, task.frame.subframes[ch]); if (real_size != task.frame.subframes[ch].best.size) throw new Exception("size reported incorrectly"); } @@ -1097,7 +1119,7 @@ namespace CUETools.Codecs.FLACCL task.frame.subframes[ch].best.rc = new RiceContext(); #endif task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order); - task.frame.subframes[ch].best.size = measure_subframe_lpc(task.frame, task.frame.subframes[ch]); + task.frame.subframes[ch].best.size = measure_subframe(task.frame, task.frame.subframes[ch]); #if KJHKJH // check size if (_settings.GPUOnly && oldsize > task.frame.subframes[ch].best.size) @@ -1145,7 +1167,7 @@ namespace CUETools.Codecs.FLACCL for (int ch = 0; ch < channels; ch++) { int index = ch + iFrame * channels; - frame.subframes[ch].best.residual = ((int*)task.residualBufferPtr.AddrOfPinnedObject()) + task.BestResidualTasks[index].residualOffs; + frame.subframes[ch].best.residual = ((int*)task.clResidual.HostPtr) + task.BestResidualTasks[index].residualOffs; frame.subframes[ch].best.type = SubframeType.Verbatim; frame.subframes[ch].best.size = (uint)(frame.subframes[ch].obits * frame.blocksize); frame.subframes[ch].wbits = 0; @@ -1166,11 +1188,14 @@ namespace CUETools.Codecs.FLACCL frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) { - int* riceParams = ((int*)task.bestRiceParamsPtr.AddrOfPinnedObject()) + (index << task.max_porder); + int* riceParams = ((int*)task.clBestRiceParams.HostPtr) + (index << task.max_porder); fixed (int* dstParams = frame.subframes[ch].best.rc.rparams) AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder)); //for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++) // frame.subframes[ch].best.rc.rparams[i] = riceParams[i]; + uint real_size = measure_subframe(frame, frame.subframes[ch]); + if (real_size != task.frame.subframes[ch].best.size) + throw new Exception("size reported incorrectly"); } } } @@ -1178,135 +1203,8 @@ namespace CUETools.Codecs.FLACCL unsafe void estimate_residual(FLACCLTask task, int channelsCount) { - if (task.frameSize <= 4) - return; - - int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order); - while ((task.frameSize >> max_porder) < 16 && max_porder > 0) - max_porder--; - - if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel - Kernel cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : null;// task.cudaChannelDecorr; - - cudaChannelDecorr.SetArg(0, task.cudaSamples); - cudaChannelDecorr.SetArg(1, task.cudaSamplesBytes); - cudaChannelDecorr.SetArg(2, (uint)MAX_BLOCKSIZE); - - task.cudaComputeLPC.SetArg(0, task.cudaResidualTasks); - task.cudaComputeLPC.SetArg(1, task.cudaAutocorOutput); - task.cudaComputeLPC.SetArg(2, task.cudaLPCData); - task.cudaComputeLPC.SetArg(3, task.nResidualTasksPerChannel); - task.cudaComputeLPC.SetArg(4, (uint)_windowcount); - - task.cudaQuantizeLPC.SetArg(0, task.cudaResidualTasks); - task.cudaQuantizeLPC.SetArg(1, task.cudaLPCData); - task.cudaQuantizeLPC.SetArg(2, task.nResidualTasksPerChannel); - task.cudaQuantizeLPC.SetArg(3, (uint)task.nTasksPerWindow); - task.cudaQuantizeLPC.SetArg(4, (uint)eparams.lpc_min_precision_search); - task.cudaQuantizeLPC.SetArg(5, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search)); - - task.cudaCopyBestMethod.SetArg(0, task.cudaBestResidualTasks); - task.cudaCopyBestMethod.SetArg(1, task.cudaResidualTasks); - task.cudaCopyBestMethod.SetArg(2, task.nResidualTasksPerChannel); - - task.cudaCopyBestMethodStereo.SetArg(0, task.cudaBestResidualTasks); - task.cudaCopyBestMethodStereo.SetArg(1, task.cudaResidualTasks); - task.cudaCopyBestMethodStereo.SetArg(2, task.nResidualTasksPerChannel); - - task.cudaEncodeResidual.SetArg(0, task.cudaResidual); - task.cudaEncodeResidual.SetArg(1, task.cudaSamples); - task.cudaEncodeResidual.SetArg(2, task.cudaBestResidualTasks); - - task.cudaSumPartition.SetArg(0, task.cudaPartitions); - task.cudaSumPartition.SetArg(1, max_porder); - - task.cudaFindRiceParameter.SetArg(0, task.cudaRiceParams); - task.cudaFindRiceParameter.SetArg(1, task.cudaPartitions); - task.cudaFindRiceParameter.SetArg(2, max_porder); - - task.cudaFindPartitionOrder.SetArg(0, task.cudaBestRiceParams); - task.cudaFindPartitionOrder.SetArg(1, task.cudaBestResidualTasks); - task.cudaFindPartitionOrder.SetArg(2, task.cudaRiceParams); - task.cudaFindPartitionOrder.SetArg(3, max_porder); - - // issue work to the GPU - task.openCLCQ.EnqueueBarrier(); - task.openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 1, null, new int[] { task.frameCount * task.frameSize }, null ); - //task.openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 1, null, new int[] { 64 * 128 }, new int[] { 128 }); - - if (eparams.do_wasted) - { - task.openCLCQ.EnqueueBarrier(); - task.EnqueueFindWasted(channelsCount); - } - - // geometry??? - task.openCLCQ.EnqueueBarrier(); - task.EnqueueComputeAutocor(channelsCount, cudaWindow, eparams.max_prediction_order); - - //float* autoc = stackalloc float[1024]; - //task.openCLCQ.EnqueueBarrier(); - //task.openCLCQ.EnqueueReadBuffer(task.cudaAutocorOutput, true, 0, sizeof(float) * 1024, (IntPtr)autoc); - - task.openCLCQ.EnqueueBarrier(); - task.openCLCQ.EnqueueNDRangeKernel(task.cudaComputeLPC, 2, null, new int[] { task.nAutocorTasksPerChannel * 32, channelsCount * task.frameCount }, new int[] { 32, 1 }); - - //float* lpcs = stackalloc float[1024]; - //task.openCLCQ.EnqueueBarrier(); - //task.openCLCQ.EnqueueReadBuffer(task.cudaLPCData, true, 0, sizeof(float) * 1024, (IntPtr)lpcs); - - task.openCLCQ.EnqueueBarrier(); - task.openCLCQ.EnqueueNDRangeKernel(task.cudaQuantizeLPC, 2, null, new int[] { task.nAutocorTasksPerChannel * 32, channelsCount * task.frameCount }, new int[] { 32, 1 }); - - task.openCLCQ.EnqueueBarrier(); - task.EnqueueEstimateResidual(channelsCount); - - //int* rr = stackalloc int[1024]; - //task.openCLCQ.EnqueueBarrier(); - //task.openCLCQ.EnqueueReadBuffer(task.cudaResidualOutput, true, 0, sizeof(int) * 1024, (IntPtr)rr); - - task.openCLCQ.EnqueueBarrier(); - task.EnqueueChooseBestMethod(channelsCount); - - task.openCLCQ.EnqueueBarrier(); - if (channels == 2 && channelsCount == 4) - task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethodStereo, 2, null, new int[] { 64, task.frameCount }, new int[] { 64, 1 }); - else - task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethod, 2, null, new int[] { 64, channels * task.frameCount }, new int[] { 64, 1 }); - if (_settings.GPUOnly) - { - task.max_porder = max_porder; - if (task.frameSize >> max_porder == 16) - { - task.openCLCQ.EnqueueBarrier(); - task.EnqueueCalcPartition16(channels); - } - else - { - task.openCLCQ.EnqueueBarrier(); - task.openCLCQ.EnqueueNDRangeKernel(task.cudaEncodeResidual, 1, null, new int[] { task.groupSize * channels * task.frameCount }, new int[] { task.groupSize }); - task.openCLCQ.EnqueueBarrier(); - task.EnqueueCalcPartition(channels); - } - if (max_porder > 0) - { - task.openCLCQ.EnqueueBarrier(); - task.openCLCQ.EnqueueNDRangeKernel(task.cudaSumPartition, 2, null, new int[] { 128 * (Flake.MAX_RICE_PARAM + 1), channels * task.frameCount }, new int[] { 128, 1 }); - } - task.openCLCQ.EnqueueBarrier(); - task.openCLCQ.EnqueueNDRangeKernel(task.cudaFindRiceParameter, 2, null, new int[] { Math.Max(task.groupSize, 8 * (2 << max_porder)), channels * task.frameCount }, new int[] { task.groupSize, 1 }); - //if (max_porder > 0) // need to run even if max_porder==0 just to calculate the final frame size - task.openCLCQ.EnqueueBarrier(); - task.openCLCQ.EnqueueNDRangeKernel(task.cudaFindPartitionOrder, 1, null, new int[] { task.groupSize * channels * task.frameCount }, new int[] { task.groupSize }); - task.openCLCQ.EnqueueBarrier(); - task.openCLCQ.EnqueueReadBuffer(task.cudaResidual, false, 0, sizeof(int) * MAX_BLOCKSIZE * channels, task.residualBufferPtr.AddrOfPinnedObject()); - task.openCLCQ.EnqueueReadBuffer(task.cudaBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * task.frameCount, task.bestRiceParamsPtr.AddrOfPinnedObject()); - } - task.openCLCQ.EnqueueBarrier(); - task.openCLCQ.EnqueueReadBuffer(task.cudaBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * task.frameCount, task.bestResidualTasksPtr.AddrOfPinnedObject()); - //task.openCLCQ.EnqueueBarrier(); - //task.openCLCQ.EnqueueReadBuffer(task.cudaResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.residualTasksPtr.AddrOfPinnedObject()); - //task.openCLCQ.EnqueueBarrier(); + if (task.frameSize >= 4) + task.EnqueueKernels(); } /// @@ -1317,7 +1215,7 @@ namespace CUETools.Codecs.FLACCL unsafe void unpack_samples(FLACCLTask task, int count) { int iFrame = task.frame.frame_number; - short* src = ((short*)task.samplesBytesPtr.AddrOfPinnedObject()) + iFrame * channels * task.frameSize; + short* src = ((short*)task.clSamplesBytes.HostPtr) + iFrame * channels * task.frameSize; switch (task.frame.ch_mode) { @@ -1404,7 +1302,7 @@ namespace CUETools.Codecs.FLACCL for (int ch = 0; ch < channelCount; ch++) task.frame.subframes[ch].Init( smp + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize, - ((int*)task.residualBufferPtr.AddrOfPinnedObject()) + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize, + ((int*)task.clResidual.HostPtr) + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize, _pcm.BitsPerSample + (doMidside && ch == 3 ? 1 : 0), 0); select_best_methods(task.frame, channelCount, iFrame, task); @@ -1437,8 +1335,9 @@ namespace CUETools.Codecs.FLACCL task.framePos = frame_pos; frame_count += nFrames; frame_pos += nFrames * blocksize; - task.openCLCQ.EnqueueWriteBuffer(task.cudaSamplesBytes, false, 0, sizeof(short) * channels * blocksize * nFrames, task.samplesBytesPtr.AddrOfPinnedObject()); - task.openCLCQ.EnqueueBarrier(); + task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, sizeof(short) * channels * blocksize * nFrames, task.clSamplesBytes.HostPtr); + //task.openCLCQ.EnqueueUnmapMemObject(task.cudaSamplesBytes, task.cudaSamplesBytes.HostPtr); + //task.openCLCQ.EnqueueMapBuffer(task.cudaSamplesBytes, true, MapFlags.WRITE, 0, task.samplesBufferLen / 2); } unsafe void run_GPU_task(FLACCLTask task) @@ -1446,21 +1345,6 @@ namespace CUETools.Codecs.FLACCL bool doMidside = channels == 2 && eparams.do_midside; int channelsCount = doMidside ? 2 * channels : channels; - if (task.frameSize != _windowsize && task.frameSize > 4) - fixed (float* window = windowBuffer) - { - _windowsize = task.frameSize; - _windowcount = 0; - calculate_window(window, lpc.window_welch, WindowFunction.Welch); - calculate_window(window, lpc.window_flattop, WindowFunction.Flattop); - calculate_window(window, lpc.window_tukey, WindowFunction.Tukey); - calculate_window(window, lpc.window_hann, WindowFunction.Hann); - calculate_window(window, lpc.window_bartlett, WindowFunction.Bartlett); - if (_windowcount == 0) - throw new Exception("invalid windowfunction"); - task.openCLCQ.EnqueueWriteBuffer(cudaWindow, true, 0, sizeof(float) * windowBuffer.Length, (IntPtr)window); - task.openCLCQ.EnqueueBarrier(); - } if (task.nResidualTasks == 0) initializeSubframeTasks(task.frameSize, channelsCount, max_frames, task); @@ -1493,7 +1377,7 @@ namespace CUETools.Codecs.FLACCL { for (int ch = 0; ch < channels; ch++) { - short* res = ((short*)task.samplesBytesPtr.AddrOfPinnedObject()) + iFrame * channels * task.frameSize + ch; + short* res = ((short*)task.clSamplesBytes.HostPtr) + iFrame * channels * task.frameSize + ch; int* smp = r + ch * Flake.MAX_BLOCKSIZE; for (int i = task.frameSize; i > 0; i--) { @@ -1637,16 +1521,14 @@ namespace CUETools.Codecs.FLACCL if (_IO.CanSeek) first_frame_offset = _IO.Position; - task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize); - task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize); + task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize); + task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize); if (_settings.CPUThreads > 0) { cpu_tasks = new FLACCLTask[_settings.CPUThreads]; for (int i = 0; i < cpu_tasks.Length; i++) - cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize); + cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize); } - cudaWindow = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY, sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS); - inited = true; } } @@ -1661,7 +1543,7 @@ namespace CUETools.Codecs.FLACCL int block = Math.Min(buff.Length - pos, eparams.block_size * max_frames - samplesInBuffer); fixed (byte* buf = buff.Bytes) - AudioSamples.MemCpy(((byte*)task1.samplesBytesPtr.AddrOfPinnedObject()) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign); + AudioSamples.MemCpy(((byte*)task1.clSamplesBytes.HostPtr) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign); samplesInBuffer += block; pos += block; @@ -1775,8 +1657,8 @@ namespace CUETools.Codecs.FLACCL samplesInBuffer -= bs; if (samplesInBuffer > 0) AudioSamples.MemCpy( - ((byte*)task2.samplesBytesPtr.AddrOfPinnedObject()), - ((byte*)task1.samplesBytesPtr.AddrOfPinnedObject()) + bs * _pcm.BlockAlign, + ((byte*)task2.clSamplesBytes.HostPtr), + ((byte*)task1.clSamplesBytes.HostPtr) + bs * _pcm.BlockAlign, samplesInBuffer * _pcm.BlockAlign); FLACCLTask tmp = task1; task1 = task2; @@ -1786,7 +1668,7 @@ namespace CUETools.Codecs.FLACCL public string Path { get { return _path; } } - public static readonly string vendor_string = "FLACCL#0.1"; + public static readonly string vendor_string = "FLACCL#0.2"; int select_blocksize(int samplerate, int time_ms) { @@ -2268,40 +2150,36 @@ namespace CUETools.Codecs.FLACCL { Program openCLProgram; public CommandQueue openCLCQ; - public Kernel cudaStereoDecorr; + public Kernel clStereoDecorr; //public Kernel cudaChannelDecorr; - public Kernel cudaChannelDecorr2; - public Kernel cudaFindWastedBits; - public Kernel cudaComputeAutocor; - public Kernel cudaComputeLPC; + public Kernel clChannelDecorr2; + public Kernel clFindWastedBits; + public Kernel clComputeAutocor; + public Kernel clComputeLPC; //public Kernel cudaComputeLPCLattice; - public Kernel cudaQuantizeLPC; - public Kernel cudaEstimateResidual; - public Kernel cudaChooseBestMethod; - public Kernel cudaCopyBestMethod; - public Kernel cudaCopyBestMethodStereo; - public Kernel cudaEncodeResidual; - public Kernel cudaCalcPartition; - public Kernel cudaCalcPartition16; - public Kernel cudaSumPartition; - public Kernel cudaFindRiceParameter; - public Kernel cudaFindPartitionOrder; - public Mem cudaSamplesBytes; - public Mem cudaSamples; - public Mem cudaLPCData; - public Mem cudaResidual; - public Mem cudaPartitions; - public Mem cudaRiceParams; - public Mem cudaBestRiceParams; - public Mem cudaAutocorOutput; - public Mem cudaResidualTasks; - public Mem cudaResidualOutput; - public Mem cudaBestResidualTasks; - public GCHandle samplesBytesPtr; - public GCHandle residualBufferPtr; - public GCHandle bestRiceParamsPtr; - public GCHandle residualTasksPtr; - public GCHandle bestResidualTasksPtr; + public Kernel clQuantizeLPC; + public Kernel clEstimateResidual; + public Kernel clChooseBestMethod; + public Kernel clCopyBestMethod; + public Kernel clCopyBestMethodStereo; + public Kernel clEncodeResidual; + public Kernel clCalcPartition; + public Kernel clCalcPartition16; + public Kernel clSumPartition; + public Kernel clFindRiceParameter; + public Kernel clFindPartitionOrder; + public Mem clSamplesBytes; + public Mem clSamples; + public Mem clLPCData; + public Mem clResidual; + public Mem clPartitions; + public Mem clRiceParams; + public Mem clBestRiceParams; + public Mem clAutocorOutput; + public Mem clResidualTasks; + public Mem clResidualOutput; + public Mem clBestResidualTasks; + public Mem clWindowFunctions; public int[] samplesBuffer; public byte[] outputBuffer; public int outputSize = 0; @@ -2316,7 +2194,7 @@ namespace CUETools.Codecs.FLACCL public int nResidualTasks = 0; public int nResidualTasksPerChannel = 0; public int nTasksPerWindow = 0; - public int nAutocorTasksPerChannel = 0; + public int nWindowFunctions = 0; public int max_porder = 0; public FlakeReader verify; @@ -2327,64 +2205,70 @@ namespace CUETools.Codecs.FLACCL public bool exit = false; public int groupSize = 128; + public int channels, channelsCount; + public FLACCLWriter writer; - unsafe public FLACCLTask(Program _openCLProgram, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify, int groupSize) + unsafe public FLACCLTask(Program _openCLProgram, int channelsCount, int channels, uint bits_per_sample, int max_frame_size, FLACCLWriter writer, int groupSize) { this.groupSize = groupSize; + this.channels = channels; + this.channelsCount = channelsCount; + this.writer = writer; openCLProgram = _openCLProgram; Device[] openCLDevices = openCLProgram.Context.Platform.QueryDevices(DeviceType.GPU); - openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], CommandQueueProperties.PROFILING_ENABLE); +#if DEBUG + var prop = CommandQueueProperties.PROFILING_ENABLE; +#else + var prop = CommandQueueProperties.NONE; +#endif + openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], prop); - residualTasksLen = sizeof(FLACCLSubframeTask) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames; - bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channelCount * FLACCLWriter.maxFrames; - samplesBufferLen = sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channelCount; - int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FLACCLWriter.maxFrames; - int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FLACCLWriter.maxFrames; - int lpcDataLen = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelCount * FLACCLWriter.maxFrames; + residualTasksLen = sizeof(FLACCLSubframeTask) * channelsCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames; + bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * FLACCLWriter.maxFrames; + samplesBufferLen = sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channelsCount; + int partitionsLen = sizeof(int) * (30 << 8) * channels * FLACCLWriter.maxFrames; + int riceParamsLen = sizeof(int) * (4 << 8) * channels * FLACCLWriter.maxFrames; + int lpcDataLen = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelsCount * FLACCLWriter.maxFrames; - cudaSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY | MemFlags.ALLOC_HOST_PTR, (uint)samplesBufferLen / 2); - cudaSamples = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen); - cudaResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen); - cudaLPCData = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, lpcDataLen); - cudaPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen); - cudaRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen); - cudaBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4); - cudaAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FLACCLWriter.maxFrames); - cudaResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen); - cudaBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen); - cudaResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FLACCLWriter.maxResidualParts*/ * FLACCLWriter.maxFrames); + clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, (uint)samplesBufferLen / 2); + //openCLCQ.EnqueueMapBuffer(cudaSamplesBytes, true, MapFlags.WRITE, 0, samplesBufferLen / 2); + clSamples = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen); + clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen); + clLPCData = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, lpcDataLen); + clPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen); + clRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen); + clBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4); + clAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelsCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FLACCLWriter.maxFrames); + clResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen); + clBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen); + clResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(int) * channelsCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FLACCLWriter.maxResidualParts*/ * FLACCLWriter.maxFrames); + clWindowFunctions = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY | MemFlags.ALLOC_HOST_PTR, sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE /** 2*/ * lpc.MAX_LPC_WINDOWS); - samplesBytesPtr = GCHandle.Alloc(new byte[samplesBufferLen / 2], GCHandleType.Pinned); - residualBufferPtr = GCHandle.Alloc(new byte[samplesBufferLen], GCHandleType.Pinned); - bestRiceParamsPtr = GCHandle.Alloc(new byte[riceParamsLen / 4], GCHandleType.Pinned); - residualTasksPtr = GCHandle.Alloc(new byte[residualTasksLen], GCHandleType.Pinned); - bestResidualTasksPtr = GCHandle.Alloc(new byte[bestResidualTasksLen], GCHandleType.Pinned); - - cudaComputeAutocor = openCLProgram.CreateKernel("cudaComputeAutocor"); - cudaStereoDecorr = openCLProgram.CreateKernel("cudaStereoDecorr"); + clComputeAutocor = openCLProgram.CreateKernel("cudaComputeAutocor"); + clStereoDecorr = openCLProgram.CreateKernel("cudaStereoDecorr"); //cudaChannelDecorr = openCLProgram.CreateKernel("cudaChannelDecorr"); - cudaChannelDecorr2 = openCLProgram.CreateKernel("cudaChannelDecorr2"); - cudaFindWastedBits = openCLProgram.CreateKernel("cudaFindWastedBits"); - cudaComputeLPC = openCLProgram.CreateKernel("cudaComputeLPC"); - cudaQuantizeLPC = openCLProgram.CreateKernel("cudaQuantizeLPC"); + clChannelDecorr2 = openCLProgram.CreateKernel("cudaChannelDecorr2"); + clFindWastedBits = openCLProgram.CreateKernel("cudaFindWastedBits"); + clComputeLPC = openCLProgram.CreateKernel("cudaComputeLPC"); + clQuantizeLPC = openCLProgram.CreateKernel("cudaQuantizeLPC"); //cudaComputeLPCLattice = openCLProgram.CreateKernel("cudaComputeLPCLattice"); - cudaEstimateResidual = openCLProgram.CreateKernel("cudaEstimateResidual"); - cudaChooseBestMethod = openCLProgram.CreateKernel("cudaChooseBestMethod"); - cudaCopyBestMethod = openCLProgram.CreateKernel("cudaCopyBestMethod"); - cudaCopyBestMethodStereo = openCLProgram.CreateKernel("cudaCopyBestMethodStereo"); - cudaEncodeResidual = openCLProgram.CreateKernel("cudaEncodeResidual"); - cudaCalcPartition = openCLProgram.CreateKernel("cudaCalcPartition"); - cudaCalcPartition16 = openCLProgram.CreateKernel("cudaCalcPartition16"); - cudaSumPartition = openCLProgram.CreateKernel("cudaSumPartition"); - cudaFindRiceParameter = openCLProgram.CreateKernel("cudaFindRiceParameter"); - cudaFindPartitionOrder = openCLProgram.CreateKernel("cudaFindPartitionOrder"); + clEstimateResidual = openCLProgram.CreateKernel("cudaEstimateResidual"); + clChooseBestMethod = openCLProgram.CreateKernel("cudaChooseBestMethod"); + clCopyBestMethod = openCLProgram.CreateKernel("cudaCopyBestMethod"); + clCopyBestMethodStereo = openCLProgram.CreateKernel("cudaCopyBestMethodStereo"); + clEncodeResidual = openCLProgram.CreateKernel("cudaEncodeResidual"); + clCalcPartition = openCLProgram.CreateKernel("cudaCalcPartition"); + clCalcPartition16 = openCLProgram.CreateKernel("cudaCalcPartition16"); + clSumPartition = openCLProgram.CreateKernel("cudaSumPartition"); + clFindRiceParameter = openCLProgram.CreateKernel("cudaFindRiceParameter"); + clFindPartitionOrder = openCLProgram.CreateKernel("cudaFindPartitionOrder"); - samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelCount]; + samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelsCount]; outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1]; - frame = new FlacFrame(channelCount); + frame = new FlacFrame(channelsCount); frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length); - if (do_verify) + if (writer._settings.DoVerify) { verify = new FlakeReader(new AudioPCMConfig((int)bits_per_sample, channels, 44100)); verify.DoCRC = false; @@ -2404,114 +2288,44 @@ namespace CUETools.Codecs.FLACCL workThread = null; } - cudaComputeAutocor.Dispose(); - cudaStereoDecorr.Dispose(); + clComputeAutocor.Dispose(); + clStereoDecorr.Dispose(); //cudaChannelDecorr.Dispose(); - cudaChannelDecorr2.Dispose(); - cudaFindWastedBits.Dispose(); - cudaComputeLPC.Dispose(); - cudaQuantizeLPC.Dispose(); + clChannelDecorr2.Dispose(); + clFindWastedBits.Dispose(); + clComputeLPC.Dispose(); + clQuantizeLPC.Dispose(); //cudaComputeLPCLattice.Dispose(); - cudaEstimateResidual.Dispose(); - cudaChooseBestMethod.Dispose(); - cudaCopyBestMethod.Dispose(); - cudaCopyBestMethodStereo.Dispose(); - cudaEncodeResidual.Dispose(); - cudaCalcPartition.Dispose(); - cudaCalcPartition16.Dispose(); - cudaSumPartition.Dispose(); - cudaFindRiceParameter.Dispose(); - cudaFindPartitionOrder.Dispose(); + clEstimateResidual.Dispose(); + clChooseBestMethod.Dispose(); + clCopyBestMethod.Dispose(); + clCopyBestMethodStereo.Dispose(); + clEncodeResidual.Dispose(); + clCalcPartition.Dispose(); + clCalcPartition16.Dispose(); + clSumPartition.Dispose(); + clFindRiceParameter.Dispose(); + clFindPartitionOrder.Dispose(); - cudaSamples.Dispose(); - cudaSamplesBytes.Dispose(); - cudaLPCData.Dispose(); - cudaResidual.Dispose(); - cudaPartitions.Dispose(); - cudaAutocorOutput.Dispose(); - cudaResidualTasks.Dispose(); - cudaResidualOutput.Dispose(); - cudaBestResidualTasks.Dispose(); - - samplesBytesPtr.Free(); - residualBufferPtr.Free(); - bestRiceParamsPtr.Free(); - residualTasksPtr.Free(); - bestResidualTasksPtr.Free(); + clSamples.Dispose(); + clSamplesBytes.Dispose(); + clLPCData.Dispose(); + clResidual.Dispose(); + clPartitions.Dispose(); + clAutocorOutput.Dispose(); + clResidualTasks.Dispose(); + clResidualOutput.Dispose(); + clBestResidualTasks.Dispose(); + clWindowFunctions.Dispose(); openCLCQ.Dispose(); } - public void EnqueueFindWasted(int channelsCount) - { - cudaFindWastedBits.SetArg(0, cudaResidualTasks); - cudaFindWastedBits.SetArg(1, cudaSamples); - cudaFindWastedBits.SetArg(2, nResidualTasksPerChannel); - - int grpX = frameCount * channelsCount; - openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * groupSize }, new int[] { groupSize }); - } - - public void EnqueueComputeAutocor(int channelsCount, Mem cudaWindow, int max_prediction_order) - { - cudaComputeAutocor.SetArg(0, cudaAutocorOutput); - cudaComputeAutocor.SetArg(1, cudaSamples); - cudaComputeAutocor.SetArg(2, cudaWindow); - cudaComputeAutocor.SetArg(3, cudaResidualTasks); - cudaComputeAutocor.SetArg(4, nAutocorTasksPerChannel - 1); - cudaComputeAutocor.SetArg(5, nResidualTasksPerChannel); - - int workX = max_prediction_order / 4 + 1; - int workY = nAutocorTasksPerChannel * channelsCount * frameCount; - openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * groupSize, workY }, new int[] { groupSize, 1 }); - } - - public void EnqueueEstimateResidual(int channelsCount) - { - cudaEstimateResidual.SetArg(0, cudaResidualOutput); - cudaEstimateResidual.SetArg(1, cudaSamples); - cudaEstimateResidual.SetArg(2, cudaResidualTasks); - - int work = nResidualTasksPerChannel * channelsCount * frameCount; - openCLCQ.EnqueueNDRangeKernel(cudaEstimateResidual, 1, null, new int[] { groupSize * work }, new int[] { groupSize }); - } - - public void EnqueueChooseBestMethod(int channelsCount) - { - cudaChooseBestMethod.SetArg(0, cudaResidualTasks); - cudaChooseBestMethod.SetArg(1, cudaResidualOutput); - cudaChooseBestMethod.SetArg(2, nResidualTasksPerChannel); - - openCLCQ.EnqueueNDRangeKernel(cudaChooseBestMethod, 2, null, new int[] { 32, channelsCount * frameCount }, new int[] { 32, 1 }); - } - - public void EnqueueCalcPartition16(int channels) - { - cudaCalcPartition16.SetArg(0, cudaPartitions); - cudaCalcPartition16.SetArg(1, cudaResidual); - cudaCalcPartition16.SetArg(2, cudaSamples); - cudaCalcPartition16.SetArg(3, cudaBestResidualTasks); - cudaCalcPartition16.SetArg(4, max_porder); - - openCLCQ.EnqueueNDRangeKernel(cudaCalcPartition16, 2, null, new int[] { groupSize, channels * frameCount }, new int[] { groupSize, 1 }); - } - - public void EnqueueCalcPartition(int channels) - { - cudaCalcPartition.SetArg(0, cudaPartitions); - cudaCalcPartition.SetArg(1, cudaResidual); - cudaCalcPartition.SetArg(2, cudaBestResidualTasks); - cudaCalcPartition.SetArg(3, max_porder); - cudaCalcPartition.SetArg(4, frameSize >> max_porder); - - openCLCQ.EnqueueNDRangeKernel(cudaCalcPartition, 2, null, new int[] { groupSize * (1 << max_porder), channels * frameCount }, new int[] { groupSize, 1 }); - } - public unsafe FLACCLSubframeTask* ResidualTasks { get { - return (FLACCLSubframeTask*)residualTasksPtr.AddrOfPinnedObject(); + return (FLACCLSubframeTask*)clResidualTasks.HostPtr; } } @@ -2519,8 +2333,216 @@ namespace CUETools.Codecs.FLACCL { get { - return (FLACCLSubframeTask*)bestResidualTasksPtr.AddrOfPinnedObject(); + return (FLACCLSubframeTask*)clBestResidualTasks.HostPtr; } } + + internal unsafe void EnqueueKernels() + { + FlakeEncodeParams eparams = writer.eparams; + + this.max_porder = FLACCLWriter.get_max_p_order(eparams.max_partition_order, frameSize, eparams.max_prediction_order); + while ((frameSize >> max_porder) < 16 && max_porder > 0) + this.max_porder--; + + if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel + Kernel cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? clStereoDecorr : clChannelDecorr2) : null;// cudaChannelDecorr; + + // openCLCQ.EnqueueMapBuffer(cudaSamplesBytes + //openCLCQ.EnqueueUnmapMemObject(cudaSamplesBytes, cudaSamplesBytes.HostPtr); + + // issue work to the GPU + cudaChannelDecorr.SetArgs( + clSamples, + clSamplesBytes, + FLACCLWriter.MAX_BLOCKSIZE); + + openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 0, frameSize * frameCount); + + if (eparams.do_wasted) + { + clFindWastedBits.SetArgs( + clResidualTasks, + clSamples, + nResidualTasksPerChannel); + + openCLCQ.EnqueueNDRangeKernel( + clFindWastedBits, + groupSize, + frameCount * channelsCount); + } + + clComputeAutocor.SetArgs( + clAutocorOutput, + clSamples, + clWindowFunctions, + clResidualTasks, + nWindowFunctions - 1, + nResidualTasksPerChannel); + + openCLCQ.EnqueueNDRangeKernel( + clComputeAutocor, + groupSize, 1, + eparams.max_prediction_order / 4 + 1, + nWindowFunctions * channelsCount * frameCount); + + clComputeLPC.SetArgs( + clResidualTasks, + clAutocorOutput, + clLPCData, + nResidualTasksPerChannel, + nWindowFunctions); + + openCLCQ.EnqueueNDRangeKernel( + clComputeLPC, + 32, 1, + nWindowFunctions, + channelsCount * frameCount); + + clQuantizeLPC.SetArgs( + clResidualTasks, + clLPCData, + nResidualTasksPerChannel, + nTasksPerWindow, + eparams.lpc_min_precision_search, + eparams.lpc_max_precision_search - eparams.lpc_min_precision_search); + + openCLCQ.EnqueueNDRangeKernel( + clQuantizeLPC, + 32, 1, + nWindowFunctions, + channelsCount * frameCount); + + clEstimateResidual.SetArgs( + clResidualOutput, + clSamples, + clResidualTasks); + + openCLCQ.EnqueueNDRangeKernel( + clEstimateResidual, + groupSize, + nResidualTasksPerChannel * channelsCount * frameCount); + + clChooseBestMethod.SetArgs( + clResidualTasks, + clResidualOutput, + nResidualTasksPerChannel); + + openCLCQ.EnqueueNDRangeKernel( + clChooseBestMethod, + 32, channelsCount * frameCount); + + if (channels == 2 && channelsCount == 4) + { + clCopyBestMethodStereo.SetArgs( + clBestResidualTasks, + clResidualTasks, + nResidualTasksPerChannel); + + openCLCQ.EnqueueNDRangeKernel( + clCopyBestMethodStereo, + 64, frameCount); + } + else + { + clCopyBestMethod.SetArgs( + clBestResidualTasks, + clResidualTasks, + nResidualTasksPerChannel); + + openCLCQ.EnqueueNDRangeKernel( + clCopyBestMethod, + 64, channels * frameCount); + } + + if (writer._settings.GPUOnly) + { + if (frameSize >> max_porder == 16) + { + clCalcPartition16.SetArgs( + clPartitions, + clResidual, + clSamples, + clBestResidualTasks, + max_porder); + + openCLCQ.EnqueueNDRangeKernel( + clCalcPartition16, + groupSize, channels * frameCount); + } + else + { + clEncodeResidual.SetArgs( + clResidual, + clSamples, + clBestResidualTasks); + + openCLCQ.EnqueueNDRangeKernel( + clEncodeResidual, + groupSize, channels * frameCount); + + clCalcPartition.SetArgs( + clPartitions, + clResidual, + clBestResidualTasks, + max_porder, + frameSize >> max_porder); + + openCLCQ.EnqueueNDRangeKernel( + clCalcPartition, + groupSize, 1, + 1 << max_porder, + channels * frameCount); + } + + if (max_porder > 0) + { + clSumPartition.SetArgs( + clPartitions, + max_porder); + + openCLCQ.EnqueueNDRangeKernel( + clSumPartition, + 128, 1, + (Flake.MAX_RICE_PARAM + 1), + channels * frameCount); + } + + clFindRiceParameter.SetArgs( + clRiceParams, + clPartitions, + max_porder); + + openCLCQ.EnqueueNDRangeKernel( + clFindRiceParameter, + groupSize, 1, + Math.Max(1, 8 * (2 << max_porder) / groupSize), + channels * frameCount); + + //if (max_porder > 0) // need to run even if max_porder==0 just to calculate the final frame size + clFindPartitionOrder.SetArgs( + clBestRiceParams, + clBestResidualTasks, + clRiceParams, + max_porder); + + openCLCQ.EnqueueNDRangeKernel( + clFindPartitionOrder, + groupSize, + channels * frameCount); + + //openCLCQ.EnqueueReadBuffer(cudaBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, cudaBestRiceParams.HostPtr); + //openCLCQ.EnqueueReadBuffer(cudaResidual, false, 0, sizeof(int) * MAX_BLOCKSIZE * channels, cudaResidual.HostPtr); + openCLCQ.EnqueueMapBuffer(clBestRiceParams, false, MapFlags.READ, 0, sizeof(int) * (1 << max_porder) * channels * frameCount); + openCLCQ.EnqueueUnmapMemObject(clBestRiceParams, clBestRiceParams.HostPtr); + openCLCQ.EnqueueMapBuffer(clResidual, false, MapFlags.READ, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels); + openCLCQ.EnqueueUnmapMemObject(clResidual, clResidual.HostPtr); + } + //openCLCQ.EnqueueReadBuffer(cudaBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * frameCount, cudaBestResidualTasks.HostPtr); + openCLCQ.EnqueueMapBuffer(clBestResidualTasks, false, MapFlags.READ, 0, sizeof(FLACCLSubframeTask) * channels * frameCount); + openCLCQ.EnqueueUnmapMemObject(clBestResidualTasks, clBestResidualTasks.HostPtr); + + //openCLCQ.EnqueueMapBuffer(cudaSamplesBytes, false, MapFlags.WRITE, 0, samplesBufferLen / 2); + } } } diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index b826560..586f0ad 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -449,8 +449,6 @@ void cudaQuantizeLPC( } } -#define BEACCURATE - __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) void cudaEstimateResidual( __global int*output, @@ -460,12 +458,8 @@ void cudaEstimateResidual( { __local int data[GROUP_SIZE * 2]; __local FLACCLSubframeTask task; -#ifdef BEACCURATE __local int residual[GROUP_SIZE]; __local int len[GROUP_SIZE / 16]; -#else - __local float residual[GROUP_SIZE]; -#endif const int tid = get_local_id(0); if (tid < sizeof(task)/sizeof(int)) @@ -477,12 +471,8 @@ void cudaEstimateResidual( if (tid < 32 && tid >= ro) task.coefs[tid] = 0; -#ifdef BEACCURATE if (tid < GROUP_SIZE / 16) len[tid] = 0; -#else - long res = 0; -#endif data[tid] = 0; barrier(CLK_LOCAL_MEM_FENCE); @@ -495,6 +485,7 @@ void cudaEstimateResidual( int4 cptr2 = cptr[2]; #endif #endif + for (int pos = 0; pos < bs; pos += GROUP_SIZE) { // fetch samples @@ -522,31 +513,34 @@ void cudaEstimateResidual( #endif #endif ; - - int t = select(0, data[tid + GROUP_SIZE] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift), offs >= ro && offs < bs); -#ifdef BEACCURATE + + int t = data[tid + GROUP_SIZE] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift); + // ensure we're within frame bounds + t = select(0, t, offs >= ro && offs < bs); + // overflow protection t = clamp(t, -0x7fffff, 0x7fffff); + // convert to unsigned residual[tid] = (t << 1) ^ (t >> 31); -#else - res += (t << 1) ^ (t >> 31); -#endif barrier(CLK_GLOBAL_MEM_FENCE); -#ifdef BEACCURATE + // calculate rice partition bit length for every 16 samples if (tid < GROUP_SIZE / 16) { __local int4 * chunk = ((__local int4 *)residual) + (tid << 2); int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3]; int res = sum.x + sum.y + sum.z + sum.w; int k = clamp(27 - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16) +#ifdef EXTRAMODE + sum = (chunk[0] >> k) + (chunk[1] >> k) + (chunk[2] >> k) + (chunk[3] >> k); + len[tid] += (k << 4) + sum.x + sum.y + sum.z + sum.w; +#else len[tid] += (k << 4) + (res >> k); - } #endif + } data[tid] = nextData; } -#ifdef BEACCURATE barrier(CLK_LOCAL_MEM_FENCE); for (int l = GROUP_SIZE / 32; l > 0; l >>= 1) { @@ -556,26 +550,6 @@ void cudaEstimateResidual( } if (tid == 0) output[get_group_id(0)] = len[0] + (bs - ro); -#else - residual[tid] = res; - barrier(CLK_LOCAL_MEM_FENCE); - for (int l = GROUP_SIZE / 2; l > 0; l >>= 1) - { - if (tid < l) - residual[tid] += residual[tid + l]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) - { - int residualLen = (bs - ro); - float sum = residual[0];// + residualLen / 2; - //int k = clamp(convert_int_rtn(log2((sum + 0.000001f) / (residualLen + 0.000001f))), 0, 14); - int k; - frexp((sum + 0.000001f) / residualLen, &k); - k = clamp(k - 1, 0, 14); - output[get_group_id(0)] = residualLen * (k + 1) + convert_int_rtn(min((float)0xffffff, sum / (1 << k))); - } -#endif } __kernel __attribute__((reqd_work_group_size(32, 1, 1))) @@ -598,14 +572,14 @@ void cudaChooseBestMethod( { // fetch task data if (tid < sizeof(task) / sizeof(int)) - ((__local int*)&task)[tid] = ((__global int*)(&tasks[taskNo + taskCount * get_group_id(1)].data))[tid]; + ((__local int*)&task)[tid] = ((__global int*)(&tasks[taskNo + taskCount * get_group_id(0)].data))[tid]; barrier(CLK_LOCAL_MEM_FENCE); if (tid == 0) { // fetch part sum - int partLen = residual[taskNo + taskCount * get_group_id(1)]; + int partLen = residual[taskNo + taskCount * get_group_id(0)]; //// calculate part size //int residualLen = task[get_local_id(1)].data.blocksize - task[get_local_id(1)].data.residualOrder; //residualLen = residualLen * (task[get_local_id(1)].data.type != Constant || psum != 0); @@ -626,10 +600,10 @@ void cudaChooseBestMethod( barrier(CLK_LOCAL_MEM_FENCE); } //shared.index[get_local_id(0)] = get_local_id(0); - //shared.length[get_local_id(0)] = (get_local_id(0) < taskCount) ? tasks[get_local_id(0) + taskCount * get_group_id(1)].size : 0x7fffffff; + //shared.length[get_local_id(0)] = (get_local_id(0) < taskCount) ? tasks[get_local_id(0) + taskCount * get_group_id(0)].size : 0x7fffffff; if (tid < taskCount) - tasks[tid + taskCount * get_group_id(1)].data.size = shared.length[tid]; + tasks[tid + taskCount * get_group_id(0)].data.size = shared.length[tid]; int l1 = shared.length[tid]; for (int l = 16; l > 0; l >>= 1) @@ -643,7 +617,7 @@ void cudaChooseBestMethod( barrier(CLK_LOCAL_MEM_FENCE); } if (tid == 0) - tasks[taskCount * get_group_id(1)].data.best_index = taskCount * get_group_id(1) + shared.index[0]; + tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + shared.index[0]; } __kernel __attribute__((reqd_work_group_size(64, 1, 1))) @@ -655,10 +629,10 @@ void cudaCopyBestMethod( { __local int best_index; if (get_local_id(0) == 0) - best_index = tasks[count * get_group_id(1)].data.best_index; + best_index = tasks[count * get_group_id(0)].data.best_index; barrier(CLK_LOCAL_MEM_FENCE); if (get_local_id(0) < sizeof(FLACCLSubframeTask)/sizeof(int)) - ((__global int*)(tasks_out + get_group_id(1)))[get_local_id(0)] = ((__global int*)(tasks + best_index))[get_local_id(0)]; + ((__global int*)(tasks_out + get_group_id(0)))[get_local_id(0)] = ((__global int*)(tasks + best_index))[get_local_id(0)]; } __kernel __attribute__((reqd_work_group_size(64, 1, 1))) @@ -674,7 +648,7 @@ void cudaCopyBestMethodStereo( int lr_index[2]; } shared; if (get_local_id(0) < 4) - shared.best_index[get_local_id(0)] = tasks[count * (get_group_id(1) * 4 + get_local_id(0))].data.best_index; + shared.best_index[get_local_id(0)] = tasks[count * (get_group_id(0) * 4 + get_local_id(0))].data.best_index; barrier(CLK_LOCAL_MEM_FENCE); if (get_local_id(0) < 4) shared.best_size[get_local_id(0)] = tasks[shared.best_index[get_local_id(0)]].data.size; @@ -705,13 +679,13 @@ void cudaCopyBestMethodStereo( } barrier(CLK_LOCAL_MEM_FENCE); if (get_local_id(0) < sizeof(FLACCLSubframeTask)/sizeof(int)) - ((__global int*)(tasks_out + 2 * get_group_id(1)))[get_local_id(0)] = ((__global int*)(tasks + shared.lr_index[0]))[get_local_id(0)]; + ((__global int*)(tasks_out + 2 * get_group_id(0)))[get_local_id(0)] = ((__global int*)(tasks + shared.lr_index[0]))[get_local_id(0)]; if (get_local_id(0) == 0) - tasks_out[2 * get_group_id(1)].data.residualOffs = tasks[shared.best_index[0]].data.residualOffs; + tasks_out[2 * get_group_id(0)].data.residualOffs = tasks[shared.best_index[0]].data.residualOffs; if (get_local_id(0) < sizeof(FLACCLSubframeTask)/sizeof(int)) - ((__global int*)(tasks_out + 2 * get_group_id(1) + 1))[get_local_id(0)] = ((__global int*)(tasks + shared.lr_index[1]))[get_local_id(0)]; + ((__global int*)(tasks_out + 2 * get_group_id(0) + 1))[get_local_id(0)] = ((__global int*)(tasks + shared.lr_index[1]))[get_local_id(0)]; if (get_local_id(0) == 0) - tasks_out[2 * get_group_id(1) + 1].data.residualOffs = tasks[shared.best_index[1]].data.residualOffs; + tasks_out[2 * get_group_id(0) + 1].data.residualOffs = tasks[shared.best_index[1]].data.residualOffs; } // get_group_id(0) == task index @@ -835,7 +809,7 @@ void cudaCalcPartition( } } -// get_group_id(1) == task index +// get_group_id(0) == task index __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) void cudaCalcPartition16( __global int *partition_lengths, @@ -851,7 +825,7 @@ void cudaCalcPartition16( const int tid = get_local_id(0); if (tid < sizeof(task) / sizeof(int)) - ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid]; + ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid]; barrier(CLK_LOCAL_MEM_FENCE); int bs = task.data.blocksize; @@ -921,7 +895,7 @@ void cudaCalcPartition16( sum = (chunk[0] >> k) + (chunk[1] >> k) + (chunk[2] >> k) + (chunk[3] >> k); s = sum.x + sum.y + sum.z + sum.w; - const int lpos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1)) + offs / 16; + const int lpos = (15 << (max_porder + 1)) * get_group_id(0) + (k << (max_porder + 1)) + offs / 16; if (k <= 14) partition_lengths[lpos] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1); }