opencl flac encoder

This commit is contained in:
chudov
2010-10-10 23:28:38 +00:00
parent facb0338c5
commit 04ca40e627
2 changed files with 364 additions and 314 deletions

View File

@@ -33,7 +33,7 @@ namespace CUETools.Codecs.FLACCL
{ {
public class FLACCLWriterSettings public class FLACCLWriterSettings
{ {
public FLACCLWriterSettings() { DoVerify = false; GPUOnly = false; DoMD5 = true; } public FLACCLWriterSettings() { DoVerify = false; GPUOnly = true; DoMD5 = true; GroupSize = 64; }
[DefaultValue(false)] [DefaultValue(false)]
[DisplayName("Verify")] [DisplayName("Verify")]
@@ -49,6 +49,10 @@ namespace CUETools.Codecs.FLACCL
[SRDescription(typeof(Properties.Resources), "DescriptionGPUOnly")] [SRDescription(typeof(Properties.Resources), "DescriptionGPUOnly")]
public bool GPUOnly { get; set; } public bool GPUOnly { get; set; }
[DefaultValue(64)]
[SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")]
public int GroupSize { get; set; }
int cpu_threads = 1; int cpu_threads = 1;
[DefaultValue(1)] [DefaultValue(1)]
[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")] [SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
@@ -1007,7 +1011,7 @@ namespace CUETools.Codecs.FLACCL
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
break; break;
case SubframeType.Fixed: case SubframeType.Fixed:
// if (!_settings.GPUOnly) if (!_settings.GPUOnly)
{ {
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples,
@@ -1025,7 +1029,7 @@ namespace CUETools.Codecs.FLACCL
ulong csum = 0; ulong csum = 0;
for (int i = task.frame.subframes[ch].best.order; i > 0; i--) for (int i = task.frame.subframes[ch].best.order; i > 0; i--)
csum += (ulong)Math.Abs(coefs[i - 1]); csum += (ulong)Math.Abs(coefs[i - 1]);
// if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly) if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly)
{ {
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32) if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32)
@@ -1098,14 +1102,14 @@ namespace CUETools.Codecs.FLACCL
frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder; frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++) for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
//if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
//{ {
// int* riceParams = ((int*)task.bestRiceParamsPtr.AddrOfPinnedObject()) + (index << task.max_porder); int* riceParams = ((int*)task.bestRiceParamsPtr.AddrOfPinnedObject()) + (index << task.max_porder);
// fixed (int* dstParams = frame.subframes[ch].best.rc.rparams) fixed (int* dstParams = frame.subframes[ch].best.rc.rparams)
// AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder)); AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder));
// //for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++) //for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++)
// // frame.subframes[ch].best.rc.rparams[i] = riceParams[i]; // frame.subframes[ch].best.rc.rparams[i] = riceParams[i];
//} }
} }
} }
} }
@@ -1122,11 +1126,9 @@ namespace CUETools.Codecs.FLACCL
calcPartitionPartSize <<= 1; calcPartitionPartSize <<= 1;
max_porder--; max_porder--;
} }
int calcPartitionPartCount = (calcPartitionPartSize >= 128) ? 1 : (256 / calcPartitionPartSize);
if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel
Kernel cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : null;// task.cudaChannelDecorr; Kernel cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : null;// task.cudaChannelDecorr;
//Kernel cudaCalcPartition = calcPartitionPartSize >= 128 ? task.cudaCalcLargePartition : calcPartitionPartSize == 16 && task.frameSize >= 256 ? task.cudaCalcPartition16 : task.cudaCalcPartition;
cudaChannelDecorr.SetArg(0, task.cudaSamples); cudaChannelDecorr.SetArg(0, task.cudaSamples);
cudaChannelDecorr.SetArg(1, task.cudaSamplesBytes); cudaChannelDecorr.SetArg(1, task.cudaSamplesBytes);
@@ -1138,14 +1140,6 @@ namespace CUETools.Codecs.FLACCL
task.cudaComputeLPC.SetArg(3, (uint)task.nResidualTasksPerChannel); task.cudaComputeLPC.SetArg(3, (uint)task.nResidualTasksPerChannel);
task.cudaComputeLPC.SetArg(4, (uint)_windowcount); task.cudaComputeLPC.SetArg(4, (uint)_windowcount);
//task.cudaComputeLPCLattice.SetArg(0, task.cudaResidualTasks);
//task.cudaComputeLPCLattice.SetArg(1, (uint)task.nResidualTasksPerChannel);
//task.cudaComputeLPCLattice.SetArg(2, task.cudaSamples);
//task.cudaComputeLPCLattice.SetArg(3, (uint)_windowcount);
//task.cudaComputeLPCLattice.SetArg(4, (uint)eparams.max_prediction_order);
//task.cudaComputeLPCLattice.SetArg(5, task.cudaLPCData);
//cuda.SetFunctionBlockShape(task.cudaComputeLPCLattice, 256, 1, 1);
task.cudaQuantizeLPC.SetArg(0, task.cudaResidualTasks); task.cudaQuantizeLPC.SetArg(0, task.cudaResidualTasks);
task.cudaQuantizeLPC.SetArg(1, task.cudaLPCData); task.cudaQuantizeLPC.SetArg(1, task.cudaLPCData);
task.cudaQuantizeLPC.SetArg(2, (uint)task.nResidualTasksPerChannel); task.cudaQuantizeLPC.SetArg(2, (uint)task.nResidualTasksPerChannel);
@@ -1159,44 +1153,34 @@ namespace CUETools.Codecs.FLACCL
task.cudaCopyBestMethodStereo.SetArg(0, task.cudaBestResidualTasks); task.cudaCopyBestMethodStereo.SetArg(0, task.cudaBestResidualTasks);
task.cudaCopyBestMethodStereo.SetArg(1, task.cudaResidualTasks); task.cudaCopyBestMethodStereo.SetArg(1, task.cudaResidualTasks);
task.cudaCopyBestMethodStereo.SetArg(2, (uint)task.nResidualTasksPerChannel); task.cudaCopyBestMethodStereo.SetArg(2, task.nResidualTasksPerChannel);
//task.cudaEncodeResidual.SetArg(0, task.cudaResidual); task.cudaEncodeResidual.SetArg(0, task.cudaResidual);
//task.cudaEncodeResidual.SetArg(1, task.cudaSamples); task.cudaEncodeResidual.SetArg(1, task.cudaSamples);
//task.cudaEncodeResidual.SetArg(2, task.cudaBestResidualTasks); task.cudaEncodeResidual.SetArg(2, task.cudaBestResidualTasks);
//cuda.SetFunctionBlockShape(task.cudaEncodeResidual, residualPartSize, 1, 1);
//cudaCalcPartition.SetArg(0, task.cudaPartitions); task.cudaCalcPartition.SetArg(0, task.cudaPartitions);
//cudaCalcPartition.SetArg(1, task.cudaResidual); task.cudaCalcPartition.SetArg(1, task.cudaResidual);
//cudaCalcPartition.SetArg(2, task.cudaSamples); task.cudaCalcPartition.SetArg(2, task.cudaBestResidualTasks);
//cudaCalcPartition.SetArg(3, task.cudaBestResidualTasks); task.cudaCalcPartition.SetArg(3, max_porder);
//cudaCalcPartition.SetArg(4, (uint)max_porder); task.cudaCalcPartition.SetArg(4, calcPartitionPartSize);
//cudaCalcPartition.SetArg(5, (uint)calcPartitionPartSize);
//cudaCalcPartition.SetArg(6, (uint)calcPartitionPartCount);
//cuda.SetFunctionBlockShape(cudaCalcPartition, 16, 16, 1);
//task.cudaSumPartition.SetArg(0, task.cudaPartitions); task.cudaSumPartition.SetArg(0, task.cudaPartitions);
//task.cudaSumPartition.SetArg(1, (uint)max_porder); task.cudaSumPartition.SetArg(1, max_porder);
//cuda.SetFunctionBlockShape(task.cudaSumPartition, Math.Max(32, 1 << (max_porder - 1)), 1, 1);
//task.cudaFindRiceParameter.SetArg(0, task.cudaRiceParams); task.cudaFindRiceParameter.SetArg(0, task.cudaRiceParams);
//task.cudaFindRiceParameter.SetArg(1, task.cudaPartitions); task.cudaFindRiceParameter.SetArg(1, task.cudaPartitions);
//task.cudaFindRiceParameter.SetArg(2, (uint)max_porder); task.cudaFindRiceParameter.SetArg(2, max_porder);
//cuda.SetFunctionBlockShape(task.cudaFindRiceParameter, 32, 8, 1);
//task.cudaFindPartitionOrder.SetArg(0, task.cudaBestRiceParams);
//task.cudaFindPartitionOrder.SetArg(1, task.cudaBestResidualTasks);
//task.cudaFindPartitionOrder.SetArg(2, task.cudaRiceParams);
//task.cudaFindPartitionOrder.SetArg(3, (uint)max_porder);
//cuda.SetFunctionBlockShape(task.cudaFindPartitionOrder, 256, 1, 1);
task.cudaFindPartitionOrder.SetArg(0, task.cudaBestRiceParams);
task.cudaFindPartitionOrder.SetArg(1, task.cudaBestResidualTasks);
task.cudaFindPartitionOrder.SetArg(2, task.cudaRiceParams);
task.cudaFindPartitionOrder.SetArg(3, max_porder);
// issue work to the GPU // issue work to the GPU
task.openCLCQ.EnqueueBarrier(); task.openCLCQ.EnqueueBarrier();
task.openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 1, null, new int[] { task.frameCount * task.frameSize }, null ); task.openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 1, null, new int[] { task.frameCount * task.frameSize }, null );
//task.openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 1, null, new int[] { 64 * 128 }, new int[] { 128 }); //task.openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 1, null, new int[] { 64 * 128 }, new int[] { 128 });
//cuda.SetFunctionBlockShape(cudaChannelDecorr, 256, 1, 1);
//cuda.LaunchAsync(cudaChannelDecorr, (task.frameCount * task.frameSize + 255) / 256, channels == 2 ? 1 : channels, task.stream);
if (eparams.do_wasted) if (eparams.do_wasted)
{ {
@@ -1214,7 +1198,6 @@ namespace CUETools.Codecs.FLACCL
task.openCLCQ.EnqueueBarrier(); task.openCLCQ.EnqueueBarrier();
task.openCLCQ.EnqueueNDRangeKernel(task.cudaComputeLPC, 2, null, new int[] { task.nAutocorTasksPerChannel * 32, channelsCount * task.frameCount }, new int[] { 32, 1 }); task.openCLCQ.EnqueueNDRangeKernel(task.cudaComputeLPC, 2, null, new int[] { task.nAutocorTasksPerChannel * 32, channelsCount * task.frameCount }, new int[] { 32, 1 });
//cuda.SetFunctionBlockShape(task.cudaComputeLPC, 32, 1, 1);
//float* lpcs = stackalloc float[1024]; //float* lpcs = stackalloc float[1024];
//task.openCLCQ.EnqueueBarrier(); //task.openCLCQ.EnqueueBarrier();
@@ -1222,10 +1205,9 @@ namespace CUETools.Codecs.FLACCL
task.openCLCQ.EnqueueBarrier(); task.openCLCQ.EnqueueBarrier();
task.openCLCQ.EnqueueNDRangeKernel(task.cudaQuantizeLPC, 2, null, new int[] { task.nAutocorTasksPerChannel * 32, channelsCount * task.frameCount }, new int[] { 32, 1 }); task.openCLCQ.EnqueueNDRangeKernel(task.cudaQuantizeLPC, 2, null, new int[] { task.nAutocorTasksPerChannel * 32, channelsCount * task.frameCount }, new int[] { 32, 1 });
//cuda.SetFunctionBlockShape(task.cudaQuantizeLPC, 32, 4, 1);
task.openCLCQ.EnqueueBarrier(); task.openCLCQ.EnqueueBarrier();
task.EnqueueEstimateResidual(channelsCount, eparams.max_prediction_order); task.EnqueueEstimateResidual(channelsCount);
//int* rr = stackalloc int[1024]; //int* rr = stackalloc int[1024];
//task.openCLCQ.EnqueueBarrier(); //task.openCLCQ.EnqueueBarrier();
@@ -1237,25 +1219,29 @@ namespace CUETools.Codecs.FLACCL
task.openCLCQ.EnqueueBarrier(); task.openCLCQ.EnqueueBarrier();
if (channels == 2 && channelsCount == 4) if (channels == 2 && channelsCount == 4)
task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethodStereo, 2, null, new int[] { 64, task.frameCount }, new int[] { 64, 1 }); task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethodStereo, 2, null, new int[] { 64, task.frameCount }, new int[] { 64, 1 });
//cuda.SetFunctionBlockShape(task.cudaCopyBestMethodStereo, 64, 1, 1);
else else
task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethod, 2, null, new int[] { 64, channels * task.frameCount }, new int[] { 64, 1 }); task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethod, 2, null, new int[] { 64, channels * task.frameCount }, new int[] { 64, 1 });
//cuda.SetFunctionBlockShape(task.cudaCopyBestMethod, 64, 1, 1); if (_settings.GPUOnly)
//if (_settings.GPUOnly) {
//{ task.openCLCQ.EnqueueBarrier();
// int bsz = calcPartitionPartCount * calcPartitionPartSize; task.openCLCQ.EnqueueNDRangeKernel(task.cudaEncodeResidual, 1, null, new int[] { task.groupSize * channels * task.frameCount }, new int[] { task.groupSize });
// if (cudaCalcPartition.Pointer == task.cudaCalcLargePartition.Pointer) task.openCLCQ.EnqueueBarrier();
// cuda.LaunchAsync(task.cudaEncodeResidual, residualPartCount, channels * task.frameCount, task.stream); task.openCLCQ.EnqueueNDRangeKernel(task.cudaCalcPartition, 2, null, new int[] { task.groupSize * (1 << max_porder), channels * task.frameCount }, new int[] { task.groupSize, 1 });
// cuda.LaunchAsync(cudaCalcPartition, (task.frameSize + bsz - 1) / bsz, channels * task.frameCount, task.stream); if (max_porder > 0)
// if (max_porder > 0) {
// cuda.LaunchAsync(task.cudaSumPartition, Flake.MAX_RICE_PARAM + 1, channels * task.frameCount, task.stream); task.openCLCQ.EnqueueBarrier();
// cuda.LaunchAsync(task.cudaFindRiceParameter, ((2 << max_porder) + 31) / 32, channels * task.frameCount, task.stream); task.openCLCQ.EnqueueNDRangeKernel(task.cudaSumPartition, 2, null, new int[] { 128 * (Flake.MAX_RICE_PARAM + 1), channels * task.frameCount }, new int[] { 128, 1 });
// //if (max_porder > 0) // need to run even if max_porder==0 just to calculate the final frame size }
// cuda.LaunchAsync(task.cudaFindPartitionOrder, 1, channels * task.frameCount, task.stream); task.openCLCQ.EnqueueBarrier();
// cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * MAX_BLOCKSIZE * channels), task.stream); task.openCLCQ.EnqueueNDRangeKernel(task.cudaFindRiceParameter, 2, null, new int[] { Math.Max(task.groupSize, 8 * (2 << max_porder)), channels * task.frameCount }, new int[] { task.groupSize, 1 });
// cuda.CopyDeviceToHostAsync(task.cudaBestRiceParams, task.bestRiceParamsPtr, (uint)(sizeof(int) * (1 << max_porder) * channels * task.frameCount), task.stream); //if (max_porder > 0) // need to run even if max_porder==0 just to calculate the final frame size
// task.max_porder = max_porder; task.openCLCQ.EnqueueBarrier();
//} task.openCLCQ.EnqueueNDRangeKernel(task.cudaFindPartitionOrder, 1, null, new int[] { task.groupSize * channels * task.frameCount }, new int[] { task.groupSize });
task.openCLCQ.EnqueueBarrier();
task.openCLCQ.EnqueueReadBuffer(task.cudaResidual, false, 0, sizeof(int) * MAX_BLOCKSIZE * channels, task.residualBufferPtr.AddrOfPinnedObject());
task.openCLCQ.EnqueueReadBuffer(task.cudaBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * task.frameCount, task.bestRiceParamsPtr.AddrOfPinnedObject());
task.max_porder = max_porder;
}
task.openCLCQ.EnqueueBarrier(); task.openCLCQ.EnqueueBarrier();
task.openCLCQ.EnqueueReadBuffer(task.cudaBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * task.frameCount, task.bestResidualTasksPtr.AddrOfPinnedObject()); task.openCLCQ.EnqueueReadBuffer(task.cudaBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * task.frameCount, task.bestResidualTasksPtr.AddrOfPinnedObject());
//task.openCLCQ.EnqueueBarrier(); //task.openCLCQ.EnqueueBarrier();
@@ -1514,7 +1500,7 @@ namespace CUETools.Codecs.FLACCL
if (OpenCL.NumberOfPlatforms < 1) if (OpenCL.NumberOfPlatforms < 1)
throw new Exception("no opencl platforms found"); throw new Exception("no opencl platforms found");
int groupSize = 64; int groupSize = _settings.GroupSize;
OCLMan = new OpenCLManager(); OCLMan = new OpenCLManager();
// Attempt to save binaries after compilation, as well as load precompiled binaries // Attempt to save binaries after compilation, as well as load precompiled binaries
// to avoid compilation. Usually you'll want this to be true. // to avoid compilation. Usually you'll want this to be true.
@@ -2223,13 +2209,11 @@ namespace CUETools.Codecs.FLACCL
public Kernel cudaChooseBestMethod; public Kernel cudaChooseBestMethod;
public Kernel cudaCopyBestMethod; public Kernel cudaCopyBestMethod;
public Kernel cudaCopyBestMethodStereo; public Kernel cudaCopyBestMethodStereo;
//public Kernel cudaEncodeResidual; public Kernel cudaEncodeResidual;
//public Kernel cudaCalcPartition; public Kernel cudaCalcPartition;
//public Kernel cudaCalcPartition16; public Kernel cudaSumPartition;
//public Kernel cudaCalcLargePartition; public Kernel cudaFindRiceParameter;
//public Kernel cudaSumPartition; public Kernel cudaFindPartitionOrder;
//public Kernel cudaFindRiceParameter;
//public Kernel cudaFindPartitionOrder;
public Mem cudaSamplesBytes; public Mem cudaSamplesBytes;
public Mem cudaSamples; public Mem cudaSamples;
public Mem cudaLPCData; public Mem cudaLPCData;
@@ -2261,7 +2245,7 @@ namespace CUETools.Codecs.FLACCL
public int nResidualTasksPerChannel = 0; public int nResidualTasksPerChannel = 0;
public int nTasksPerWindow = 0; public int nTasksPerWindow = 0;
public int nAutocorTasksPerChannel = 0; public int nAutocorTasksPerChannel = 0;
//public int max_porder = 0; public int max_porder = 0;
public FlakeReader verify; public FlakeReader verify;
@@ -2316,13 +2300,11 @@ namespace CUETools.Codecs.FLACCL
cudaChooseBestMethod = openCLProgram.CreateKernel("cudaChooseBestMethod"); cudaChooseBestMethod = openCLProgram.CreateKernel("cudaChooseBestMethod");
cudaCopyBestMethod = openCLProgram.CreateKernel("cudaCopyBestMethod"); cudaCopyBestMethod = openCLProgram.CreateKernel("cudaCopyBestMethod");
cudaCopyBestMethodStereo = openCLProgram.CreateKernel("cudaCopyBestMethodStereo"); cudaCopyBestMethodStereo = openCLProgram.CreateKernel("cudaCopyBestMethodStereo");
//cudaEncodeResidual = openCLProgram.CreateKernel("cudaEncodeResidual"); cudaEncodeResidual = openCLProgram.CreateKernel("cudaEncodeResidual");
//cudaCalcPartition = openCLProgram.CreateKernel("cudaCalcPartition"); cudaCalcPartition = openCLProgram.CreateKernel("cudaCalcPartition");
//cudaCalcPartition16 = openCLProgram.CreateKernel("cudaCalcPartition16"); cudaSumPartition = openCLProgram.CreateKernel("cudaSumPartition");
//cudaCalcLargePartition = openCLProgram.CreateKernel("cudaCalcLargePartition"); cudaFindRiceParameter = openCLProgram.CreateKernel("cudaFindRiceParameter");
//cudaSumPartition = openCLProgram.CreateKernel("cudaSumPartition"); cudaFindPartitionOrder = openCLProgram.CreateKernel("cudaFindPartitionOrder");
//cudaFindRiceParameter = openCLProgram.CreateKernel("cudaFindRiceParameter");
//cudaFindPartitionOrder = openCLProgram.CreateKernel("cudaFindPartitionOrder");
samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelCount]; samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelCount];
outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1]; outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1];
@@ -2361,13 +2343,11 @@ namespace CUETools.Codecs.FLACCL
cudaChooseBestMethod.Dispose(); cudaChooseBestMethod.Dispose();
cudaCopyBestMethod.Dispose(); cudaCopyBestMethod.Dispose();
cudaCopyBestMethodStereo.Dispose(); cudaCopyBestMethodStereo.Dispose();
//cudaEncodeResidual.Dispose(); cudaEncodeResidual.Dispose();
//cudaCalcPartition.Dispose(); cudaCalcPartition.Dispose();
//cudaCalcPartition16.Dispose(); cudaSumPartition.Dispose();
//cudaCalcLargePartition.Dispose(); cudaFindRiceParameter.Dispose();
//cudaSumPartition.Dispose(); cudaFindPartitionOrder.Dispose();
//cudaFindRiceParameter.Dispose();
//cudaFindPartitionOrder.Dispose();
cudaSamples.Dispose(); cudaSamples.Dispose();
cudaSamplesBytes.Dispose(); cudaSamplesBytes.Dispose();
@@ -2412,7 +2392,7 @@ namespace CUETools.Codecs.FLACCL
openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * groupSize, workY }, new int[] { groupSize, 1 }); openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * groupSize, workY }, new int[] { groupSize, 1 });
} }
public void EnqueueEstimateResidual(int channelsCount, int max_prediction_order) public void EnqueueEstimateResidual(int channelsCount)
{ {
cudaEstimateResidual.SetArg(0, cudaResidualOutput); cudaEstimateResidual.SetArg(0, cudaResidualOutput);
cudaEstimateResidual.SetArg(1, cudaSamples); cudaEstimateResidual.SetArg(1, cudaSamples);
@@ -2429,7 +2409,6 @@ namespace CUETools.Codecs.FLACCL
cudaChooseBestMethod.SetArg(2, (uint)nResidualTasksPerChannel); cudaChooseBestMethod.SetArg(2, (uint)nResidualTasksPerChannel);
openCLCQ.EnqueueNDRangeKernel(cudaChooseBestMethod, 2, null, new int[] { 32, channelsCount * frameCount }, new int[] { 32, 1 }); openCLCQ.EnqueueNDRangeKernel(cudaChooseBestMethod, 2, null, new int[] { 32, channelsCount * frameCount }, new int[] { 32, 1 });
//cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 8, 1);
} }
public unsafe FLACCLSubframeTask* ResidualTasks public unsafe FLACCLSubframeTask* ResidualTasks

View File

@@ -20,6 +20,8 @@
#ifndef _FLACCL_KERNEL_H_ #ifndef _FLACCL_KERNEL_H_
#define _FLACCL_KERNEL_H_ #define _FLACCL_KERNEL_H_
//#pragma OPENCL EXTENSION cl_amd_fp64 : enable
typedef enum typedef enum
{ {
Constant = 0, Constant = 0,
@@ -116,7 +118,7 @@ void cudaFindWastedBits(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int w = 0, a = 0; int w = 0, a = 0;
for (int pos = 0; pos < task.blocksize; pos += get_local_size(0)) for (int pos = 0; pos < task.blocksize; pos += GROUP_SIZE)
{ {
int smp = pos + tid < task.blocksize ? samples[task.samplesOffs + pos + tid] : 0; int smp = pos + tid < task.blocksize ? samples[task.samplesOffs + pos + tid] : 0;
w |= smp; w |= smp;
@@ -126,7 +128,7 @@ void cudaFindWastedBits(
abits[tid] = a; abits[tid] = a;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int s = get_local_size(0) / 2; s > 0; s >>= 1) for (int s = GROUP_SIZE / 2; s > 0; s >>= 1)
{ {
if (tid < s) if (tid < s)
{ {
@@ -200,6 +202,12 @@ void cudaComputeAutocor(
output[get_group_id(1) * (MAX_ORDER + 1) + tid + lag0] = product[tid * (GROUP_SIZE >> 2)]; output[get_group_id(1) * (MAX_ORDER + 1) + tid + lag0] = product[tid * (GROUP_SIZE >> 2)];
} }
//#define DEBUGPRINT
#ifdef DEBUGPRINT
#pragma OPENCL EXTENSION cl_amd_printf : enable
#endif
__kernel __attribute__((reqd_work_group_size(32, 1, 1))) __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
void cudaComputeLPC( void cudaComputeLPC(
__global FLACCLSubframeTask *tasks, __global FLACCLSubframeTask *tasks,
@@ -241,12 +249,20 @@ void cudaComputeLPC(
float gen0 = shared.gen1[get_local_id(0)] = shared.autoc[get_local_id(0)+1]; float gen0 = shared.gen1[get_local_id(0)] = shared.autoc[get_local_id(0)+1];
shared.ldr[get_local_id(0)] = 0.0f; shared.ldr[get_local_id(0)] = 0.0f;
float error = shared.autoc[0]; float error = shared.autoc[0];
#ifdef DEBUGPRINT
int magic = shared.autoc[0] == 177286873088.0f;
if (magic && get_local_id(0) <= MAX_ORDER)
printf("autoc[%d] == %f\n", get_local_id(0), shared.autoc[get_local_id(0)]);
#endif
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int order = 0; order < MAX_ORDER; order++) for (int order = 0; order < MAX_ORDER; order++)
{ {
// Schur recursion // Schur recursion
float reff = -shared.gen1[0] / error; float reff = -shared.gen1[0] / error;
error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff); error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
//error *= (1 - reff * reff);
float gen1; float gen1;
if (get_local_id(0) < MAX_ORDER - 1 - order) if (get_local_id(0) < MAX_ORDER - 1 - order)
{ {
@@ -256,6 +272,12 @@ void cudaComputeLPC(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < MAX_ORDER - 1 - order) if (get_local_id(0) < MAX_ORDER - 1 - order)
shared.gen1[get_local_id(0)] = gen1; shared.gen1[get_local_id(0)] = gen1;
#ifdef DEBUGPRINT
if (magic && get_local_id(0) == 0)
printf("order == %d, reff == %f, error = %f\n", order, reff, error);
if (magic && get_local_id(0) <= MAX_ORDER)
printf("gen[%d] == %f, %f\n", get_local_id(0), gen0, gen1);
#endif
// Store prediction error // Store prediction error
if (get_local_id(0) == 0) if (get_local_id(0) == 0)
@@ -272,6 +294,8 @@ void cudaComputeLPC(
// Output coeffs // Output coeffs
if (get_local_id(0) <= order) if (get_local_id(0) <= order)
lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)]; lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)];
//if (get_local_id(0) <= order + 1 && fabs(-shared.ldr[0]) > 3000)
// printf("coef[%d] == %f, autoc == %f, error == %f\n", get_local_id(0), -shared.ldr[order - get_local_id(0)], shared.autoc[get_local_id(0)], shared.error[get_local_id(0)]);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Output prediction error estimates // Output prediction error estimates
@@ -309,12 +333,12 @@ void cudaQuantizeLPC(
// Select best orders based on Akaike's Criteria // Select best orders based on Akaike's Criteria
shared.index[tid] = min(MAX_ORDER - 1, tid); shared.index[tid] = min(MAX_ORDER - 1, tid);
shared.error[tid] = shared.task.blocksize * 64 + tid; shared.error[tid] = shared.task.blocksize * 64 + tid;
shared.index[32 + tid] = min(MAX_ORDER - 1, tid); shared.index[32 + tid] = MAX_ORDER - 1;
shared.error[32 + tid] = shared.task.blocksize * 64 + tid; shared.error[32 + tid] = shared.task.blocksize * 64 + tid + 32;
// Load prediction error estimates // Load prediction error estimates
if (tid < MAX_ORDER) if (tid < MAX_ORDER)
shared.error[tid] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + tid]) + tid * 5.12f * log(shared.task.blocksize); shared.error[tid] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log(shared.task.blocksize);
//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize); //shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@@ -361,6 +385,9 @@ void cudaQuantizeLPC(
} }
} }
//shared.index[tid] = MAX_ORDER - 1;
//barrier(CLK_LOCAL_MEM_FENCE);
// Quantization // Quantization
for (int i = 0; i < taskCountLPC; i ++) for (int i = 0; i < taskCountLPC; i ++)
{ {
@@ -410,8 +437,6 @@ void cudaQuantizeLPC(
cbits = 1 + 32 - clz(shared.tmpi[0] | shared.tmpi[1]); cbits = 1 + 32 - clz(shared.tmpi[0] | shared.tmpi[1]);
// output shift, cbits and output coeffs // output shift, cbits and output coeffs
if (i < taskCountLPC)
{
int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i; int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
if (tid == 0) if (tid == 0)
tasks[taskNo].data.shift = shift; tasks[taskNo].data.shift = shift;
@@ -422,9 +447,10 @@ void cudaQuantizeLPC(
if (tid <= order) if (tid <= order)
tasks[taskNo].coefs[tid] = coef; tasks[taskNo].coefs[tid] = coef;
} }
}
} }
#define DONT_BEACCURATE
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void cudaEstimateResidual( void cudaEstimateResidual(
__global int*output, __global int*output,
@@ -432,10 +458,14 @@ void cudaEstimateResidual(
__global FLACCLSubframeTask *tasks __global FLACCLSubframeTask *tasks
) )
{ {
__local float data[GROUP_SIZE * 2]; __local int data[GROUP_SIZE * 2];
__local int residual[GROUP_SIZE];
__local FLACCLSubframeTask task; __local FLACCLSubframeTask task;
__local float4 coefsf4[8]; #ifdef BEACCURATE
__local int residual[GROUP_SIZE];
__local int len[GROUP_SIZE / 16];
#else
__local float residual[GROUP_SIZE];
#endif
const int tid = get_local_id(0); const int tid = get_local_id(0);
if (tid < sizeof(task)/sizeof(int)) if (tid < sizeof(task)/sizeof(int))
@@ -444,56 +474,79 @@ void cudaEstimateResidual(
int ro = task.data.residualOrder; int ro = task.data.residualOrder;
int bs = task.data.blocksize; int bs = task.data.blocksize;
float res = 0;
if (tid < 32) if (tid < 32 && tid >= ro)
((__local float *)&coefsf4[0])[tid] = select(0.0f, ((float)task.coefs[tid]) / (1 << task.data.shift), tid < ro); task.coefs[tid] = 0;
data[tid] = tid < bs ? (float)(samples[task.data.samplesOffs + tid] >> task.data.wbits) : 0.0f; #ifdef BEACCURATE
if (tid < GROUP_SIZE / 16)
len[tid] = 0;
#else
float res = 0.0f;
#endif
data[tid] = tid < bs ? samples[task.data.samplesOffs + tid] >> task.data.wbits : 0;
for (int pos = 0; pos < bs; pos += GROUP_SIZE) for (int pos = 0; pos < bs; pos += GROUP_SIZE)
{ {
// fetch samples // fetch samples
float nextData = pos + tid + GROUP_SIZE < bs ? (float)(samples[task.data.samplesOffs + pos + tid + GROUP_SIZE] >> task.data.wbits) : 0.0f; int nextData = pos + tid + GROUP_SIZE < bs ? samples[task.data.samplesOffs + pos + tid + GROUP_SIZE] >> task.data.wbits : 0;
data[tid + GROUP_SIZE] = nextData; data[tid + GROUP_SIZE] = nextData;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// compute residual // compute residual
__local float4 * dptr = (__local float4 *)&data[tid]; __local int4 * dptr = (__local int4 *)&data[tid];
float sumf = data[tid + ro] - __local int4 * cptr = (__local int4 *)&task.coefs[0];
( dot(dptr[0], coefsf4[0]) int4 sum = dptr[0] * cptr[0]
+ dot(dptr[1], coefsf4[1]) #if MAX_ORDER > 4
+ dptr[1] * cptr[1]
#if MAX_ORDER > 8 #if MAX_ORDER > 8
+ dot(dptr[2], coefsf4[2]) + dptr[2] * cptr[2]
#if MAX_ORDER > 12 #if MAX_ORDER > 12
+ dot(dptr[3], coefsf4[3]) + dptr[3] * cptr[3]
#if MAX_ORDER > 16 #if MAX_ORDER > 16
+ dot(dptr[4], coefsf4[4]) + dptr[4] * cptr[4]
+ dot(dptr[5], coefsf4[5]) + dptr[5] * cptr[5]
+ dot(dptr[6], coefsf4[6]) + dptr[6] * cptr[6]
+ dot(dptr[7], coefsf4[7]) + dptr[7] * cptr[7]
#endif #endif
#endif #endif
#endif #endif
); #endif
//residual[tid] = sum; ;
res += select(0.0f, min(fabs(sumf), (float)0x7fffff), pos + tid + ro < bs); int t = select(0, data[tid + ro] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift), pos + tid + ro < bs);
#ifdef BEACCURATE
residual[tid] = min((t << 1) ^ (t >> 31), 0x7fffff);
#else
res += fabs(t);
#endif
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
//int k = min(33 - clz(sum), 14); #ifdef BEACCURATE
//res += select(0, 1 + k, pos + tid + ro < bs); if (tid < GROUP_SIZE / 16)
{
//sum = residual[tid] + residual[tid + 1] + residual[tid + 2] + residual[tid + 3] __local int4 * chunk = ((__local int4 *)residual) + tid * 4;
// + residual[tid + 4] + residual[tid + 5] + residual[tid + 6] + residual[tid + 7]; int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3];
//int k = clamp(29 - clz(sum), 0, 14); int res = sum.x + sum.y + sum.z + sum.w;
//res += select(0, 8 * (k + 1) + (sum >> k), pos + tid + ro < bs && !(tid & 7)); int k = clamp(clz(16) - clz(res), 0, 14);
len[tid] += 16 * k + (res >> k);
k = clamp(clz(16) - clz(res), 0, 14);
}
#endif
data[tid] = nextData; data[tid] = nextData;
} }
int residualLen = (bs - ro) / GROUP_SIZE + select(0, 1, tid < (bs - ro) % GROUP_SIZE); #ifdef BEACCURATE
int k = clamp(convert_int_rtn(log2((res + 0.000001f) / (residualLen + 0.000001f))), 0, 14); barrier(CLK_LOCAL_MEM_FENCE);
residual[tid] = residualLen * (k + 1) + (convert_int_rtz(res) >> k); for (int l = GROUP_SIZE / 32; l > 0; l >>= 1)
{
if (tid < l)
len[tid] += len[tid + l];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0)
output[get_group_id(0)] = len[0] + (bs - ro);
#else
residual[tid] = res;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int l = GROUP_SIZE / 2; l > 0; l >>= 1) for (int l = GROUP_SIZE / 2; l > 0; l >>= 1)
{ {
@@ -502,7 +555,16 @@ void cudaEstimateResidual(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if (tid == 0) if (tid == 0)
output[get_group_id(0)] = residual[0]; {
int residualLen = (bs - ro);
float sum = residual[0] * 2;// + residualLen / 2;
//int k = clamp(convert_int_rtn(log2((sum + 0.000001f) / (residualLen + 0.000001f))), 0, 14);
int k;
frexp((sum + 0.000001f) / residualLen, &k);
k = clamp(k - 1, 0, 14);
output[get_group_id(0)] = residualLen * (k + 1) + convert_int_rtn(min((float)0xffffff, sum / (1 << k)));
}
#endif
} }
__kernel __attribute__((reqd_work_group_size(32, 1, 1))) __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
@@ -641,6 +703,7 @@ void cudaCopyBestMethodStereo(
tasks_out[2 * get_group_id(1) + 1].data.residualOffs = tasks[shared.best_index[1]].data.residualOffs; tasks_out[2 * get_group_id(1) + 1].data.residualOffs = tasks[shared.best_index[1]].data.residualOffs;
} }
// get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void cudaEncodeResidual( void cudaEncodeResidual(
__global int *output, __global int *output,
@@ -652,7 +715,7 @@ void cudaEncodeResidual(
__local int data[GROUP_SIZE * 2]; __local int data[GROUP_SIZE * 2];
const int tid = get_local_id(0); const int tid = get_local_id(0);
if (get_local_id(0) < sizeof(task) / sizeof(int)) if (get_local_id(0) < sizeof(task) / sizeof(int))
((__local int*)&task)[get_local_id(0)] = ((__global int*)(&tasks[get_group_id(1)]))[get_local_id(0)]; ((__local int*)&task)[get_local_id(0)] = ((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int bs = task.data.blocksize; int bs = task.data.blocksize;
@@ -679,6 +742,8 @@ void cudaEncodeResidual(
} }
} }
// get_group_id(0) == partition index
// get_group_id(1) == task index
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void cudaCalcPartition( void cudaCalcPartition(
__global int *partition_lengths, __global int *partition_lengths,
@@ -697,8 +762,8 @@ void cudaCalcPartition(
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid]; ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int k = tid % (GROUP_SIZE / 16); int k = tid % 16;
int x = tid / (GROUP_SIZE / 16); int x = tid / 16;
int sum = 0; int sum = 0;
for (int pos0 = 0; pos0 < psize; pos0 += GROUP_SIZE) for (int pos0 = 0; pos0 < psize; pos0 += GROUP_SIZE)
@@ -707,7 +772,7 @@ void cudaCalcPartition(
// fetch residual // fetch residual
int s = (offs >= task.residualOrder && pos0 + tid < psize) ? residual[task.residualOffs + offs] : 0; int s = (offs >= task.residualOrder && pos0 + tid < psize) ? residual[task.residualOffs + offs] : 0;
// convert to unsigned // convert to unsigned
data[tid] = min(0xfffff, (s << 1) ^ (s >> 31)); data[tid] = min(0x7fffff, (s << 1) ^ (s >> 31));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// calc number of unary bits for each residual sample with each rice paramater // calc number of unary bits for each residual sample with each rice paramater
@@ -716,7 +781,7 @@ void cudaCalcPartition(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
length[x][k] = min(0xfffff, sum); length[x][k] = min(0x7fffff, sum);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (x == 0) if (x == 0)
@@ -726,174 +791,180 @@ void cudaCalcPartition(
// output length // output length
const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1)); const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
if (k <= 14) if (k <= 14)
partition_lengths[pos + get_group_id(0)] = min(0xfffff,length[0][k]) + (psize - task.residualOrder * (get_group_id(0) == 0)) * (k + 1); partition_lengths[pos + get_group_id(0)] = min(0x7fffff,length[0][k]) + (psize - task.residualOrder * (get_group_id(0) == 0)) * (k + 1);
} }
} }
//// Sums partition lengths for a certain k == get_group_id(0) // Sums partition lengths for a certain k == get_group_id(0)
//// Requires 128 threads // Requires 128 threads
//__kernel void cudaSumPartition( // get_group_id(0) == k
// int* partition_lengths, // get_group_id(1) == task index
// int max_porder __kernel __attribute__((reqd_work_group_size(128, 1, 1)))
// ) void cudaSumPartition(
//{ __global int* partition_lengths,
// __local struct { int max_porder
// volatile int data[512+32]; // max_porder <= 8, data length <= 1 << 9. )
// } shared; {
// __local int data[512]; // max_porder <= 8, data length <= 1 << 9.
// const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1)); const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1));
//
// // fetch partition lengths // fetch partition lengths
// shared.data[get_local_id(0)] = get_local_id(0) < (1 << max_porder) ? partition_lengths[pos + get_local_id(0)] : 0; data[get_local_id(0)] = get_local_id(0) < (1 << max_porder) ? partition_lengths[pos + get_local_id(0)] : 0;
// shared.data[get_local_size(0) + get_local_id(0)] = get_local_size(0) + get_local_id(0) < (1 << max_porder) ? partition_lengths[pos + get_local_size(0) + get_local_id(0)] : 0; data[get_local_size(0) + get_local_id(0)] = get_local_size(0) + get_local_id(0) < (1 << max_porder) ? partition_lengths[pos + get_local_size(0) + get_local_id(0)] : 0;
// barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
//
// int in_pos = (get_local_id(0) << 1); int in_pos = (get_local_id(0) << 1);
// int out_pos = (1 << max_porder) + get_local_id(0); int out_pos = (1 << max_porder) + get_local_id(0);
// int bs; for (int bs = 1 << (max_porder - 1); bs > 0; bs >>= 1)
// for (bs = 1 << (max_porder - 1); bs > 32; bs >>= 1) {
// { if (get_local_id(0) < bs) data[out_pos] = data[in_pos] + data[in_pos + 1];
// if (get_local_id(0) < bs) shared.data[out_pos] = shared.data[in_pos] + shared.data[in_pos + 1]; in_pos += bs << 1;
// in_pos += bs << 1; out_pos += bs;
// out_pos += bs; barrier(CLK_LOCAL_MEM_FENCE);
// barrier(CLK_LOCAL_MEM_FENCE); }
// } if (get_local_id(0) < (1 << max_porder))
// if (get_local_id(0) < 32) partition_lengths[pos + (1 << max_porder) + get_local_id(0)] = data[(1 << max_porder) + get_local_id(0)];
// for (; bs > 0; bs >>= 1) if (get_local_size(0) + get_local_id(0) < (1 << max_porder))
// { partition_lengths[pos + (1 << max_porder) + get_local_size(0) + get_local_id(0)] = data[(1 << max_porder) + get_local_size(0) + get_local_id(0)];
// shared.data[out_pos] = shared.data[in_pos] + shared.data[in_pos + 1]; }
// in_pos += bs << 1;
// out_pos += bs; // Finds optimal rice parameter for several partitions at a time.
// } // get_group_id(0) == chunk index (chunk size is GROUP_SIZE / 8, so total task size is 8 * (2 << max_porder))
// barrier(CLK_LOCAL_MEM_FENCE); // get_group_id(1) == task index
// if (get_local_id(0) < (1 << max_porder)) __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
// partition_lengths[pos + (1 << max_porder) + get_local_id(0)] = shared.data[(1 << max_porder) + get_local_id(0)]; void cudaFindRiceParameter(
// if (get_local_size(0) + get_local_id(0) < (1 << max_porder)) __global int* rice_parameters,
// partition_lengths[pos + (1 << max_porder) + get_local_size(0) + get_local_id(0)] = shared.data[(1 << max_porder) + get_local_size(0) + get_local_id(0)]; __global int* partition_lengths,
//} int max_porder
// )
//// Finds optimal rice parameter for up to 16 partitions at a time. {
//// Requires 16x16 threads __local struct {
//__kernel void cudaFindRiceParameter( volatile int length[GROUP_SIZE];
// int* rice_parameters, volatile int index[GROUP_SIZE];
// int* partition_lengths, } shared;
// int max_porder const int tid = get_local_id(0);
// ) const int ws = GROUP_SIZE / 8;
//{ const int parts = min(ws, 2 << max_porder);
// __local struct { const int p = tid % ws;
// volatile int length[256]; const int k = tid / ws; // 0..7
// volatile int index[256]; const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
// } shared;
// const int tid = get_local_id(0) + (get_local_id(1) << 5); // read length for 32 partitions
// const int parts = min(32, 2 << max_porder); int l1 = (p < parts) ? partition_lengths[pos + get_group_id(0) * ws + p] : 0xffffff;
// const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_local_id(1) << (max_porder + 1)); int l2 = (k + 8 <= 14 && p < parts) ? partition_lengths[pos + (8 << (max_porder + 1)) + get_group_id(0) * ws + p] : 0xffffff;
// // find best rice parameter
// // read length for 32 partitions shared.index[tid] = k + ((l2 < l1) << 3);
// int l1 = (get_local_id(0) < parts) ? partition_lengths[pos + get_group_id(0) * 32 + get_local_id(0)] : 0xffffff; shared.length[tid] = l1 = min(l1, l2);
// int l2 = (get_local_id(1) + 8 <= 14 && get_local_id(0) < parts) ? partition_lengths[pos + (8 << (max_porder + 1)) + get_group_id(0) * 32 + get_local_id(0)] : 0xffffff; barrier(CLK_LOCAL_MEM_FENCE);
// // find best rice parameter
// shared.index[tid] = get_local_id(1) + ((l2 < l1) << 3);
// shared.length[tid] = l1 = min(l1, l2);
// barrier(CLK_LOCAL_MEM_FENCE);
//#pragma unroll 3 //#pragma unroll 3
// for (int sh = 7; sh >= 5; sh --) for (int lsh = GROUP_SIZE / 2; lsh >= ws; lsh >>= 1)
// { {
// if (tid < (1 << sh)) if (tid < lsh)
// { {
// l2 = shared.length[tid + (1 << sh)]; l2 = shared.length[tid + lsh];
// shared.index[tid] = shared.index[tid + ((l2 < l1) << sh)]; shared.index[tid] = shared.index[tid + (l2 < l1) * lsh];
// shared.length[tid] = l1 = min(l1, l2); shared.length[tid] = l1 = min(l1, l2);
// } }
// barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// } }
// if (tid < parts) if (tid < parts)
// { {
// // output rice parameter // output rice parameter
// rice_parameters[(get_group_id(1) << (max_porder + 2)) + get_group_id(0) * parts + tid] = shared.index[tid]; rice_parameters[(get_group_id(1) << (max_porder + 2)) + get_group_id(0) * parts + tid] = shared.index[tid];
// // output length // output length
// rice_parameters[(get_group_id(1) << (max_porder + 2)) + (1 << (max_porder + 1)) + get_group_id(0) * parts + tid] = shared.length[tid]; rice_parameters[(get_group_id(1) << (max_porder + 2)) + (1 << (max_porder + 1)) + get_group_id(0) * parts + tid] = shared.length[tid];
// } }
//} }
//
//__kernel void cudaFindPartitionOrder( // get_group_id(0) == task index
// int* best_rice_parameters, __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
// FLACCLSubframeTask *tasks, void cudaFindPartitionOrder(
// int* rice_parameters, __global int* best_rice_parameters,
// int max_porder __global FLACCLSubframeTask *tasks,
// ) __global int* rice_parameters,
//{ int max_porder
// __local struct { )
// int data[512]; {
// volatile int tmp[256]; __local struct {
// int length[32]; int length[32];
// int index[32]; int index[32];
// //char4 ch[64]; } shared;
// FLACCLSubframeTask task; __local int partlen[GROUP_SIZE];
// } shared; __local FLACCLSubframeData task;
// const int pos = (get_group_id(1) << (max_porder + 2)) + (2 << max_porder);
// if (get_local_id(0) < sizeof(shared.task) / sizeof(int)) const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder);
// ((int*)&shared.task)[get_local_id(0)] = ((int*)(&tasks[get_group_id(1)]))[get_local_id(0)]; if (get_local_id(0) < sizeof(task) / sizeof(int))
// // fetch partition lengths ((__local int*)&task)[get_local_id(0)] = ((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)];
// shared.data[get_local_id(0)] = get_local_id(0) < (2 << max_porder) ? rice_parameters[pos + get_local_id(0)] : 0; // fetch partition lengths
// shared.data[get_local_id(0) + 256] = get_local_id(0) + 256 < (2 << max_porder) ? rice_parameters[pos + 256 + get_local_id(0)] : 0; barrier(CLK_LOCAL_MEM_FENCE);
// barrier(CLK_LOCAL_MEM_FENCE);
// for (int porder = max_porder; porder >= 0; porder--)
// for (int porder = max_porder; porder >= 0; porder--) {
// { int len = 0;
// shared.tmp[get_local_id(0)] = (get_local_id(0) < (1 << porder)) * shared.data[(2 << max_porder) - (2 << porder) + get_local_id(0)]; for (int offs = 0; offs < (1 << porder); offs += GROUP_SIZE)
// barrier(CLK_LOCAL_MEM_FENCE); len += offs + get_local_id(0) < (1 << porder) ? rice_parameters[pos + (2 << max_porder) - (2 << porder) + offs + get_local_id(0)] : 0;
// SUM256(shared.tmp, get_local_id(0), +=); partlen[get_local_id(0)] = len;
// if (get_local_id(0) == 0) barrier(CLK_LOCAL_MEM_FENCE);
// shared.length[porder] = shared.tmp[0] + (4 << porder); for (int l = min(GROUP_SIZE, 1 << porder) / 2; l > 0; l >>= 1)
// barrier(CLK_LOCAL_MEM_FENCE); {
// } if (get_local_id(0) < l)
// partlen[get_local_id(0)] += partlen[get_local_id(0) + l];
// if (get_local_id(0) < 32) barrier(CLK_LOCAL_MEM_FENCE);
// { }
// shared.index[get_local_id(0)] = get_local_id(0); if (get_local_id(0) == 0)
// if (get_local_id(0) > max_porder) shared.length[porder] = partlen[0] + (4 << porder);
// shared.length[get_local_id(0)] = 0xfffffff; barrier(CLK_LOCAL_MEM_FENCE);
// int l1 = shared.length[get_local_id(0)]; }
// #pragma unroll 4
// for (int sh = 3; sh >= 0; sh --) if (get_local_id(0) < 32 && get_local_id(0) > max_porder)
// { shared.length[get_local_id(0)] = 0xfffffff;
// int l2 = shared.length[get_local_id(0) + (1 << sh)]; if (get_local_id(0) < 32)
// shared.index[get_local_id(0)] = shared.index[get_local_id(0) + ((l2 < l1) << sh)]; shared.index[get_local_id(0)] = get_local_id(0);
// shared.length[get_local_id(0)] = l1 = min(l1, l2); barrier(CLK_LOCAL_MEM_FENCE);
// } int l1 = get_local_id(0) <= max_porder ? shared.length[get_local_id(0)] : 0xfffffff;
// if (get_local_id(0) == 0) for (int sh = 3; sh >= 0; sh --)
// tasks[get_group_id(1)].data.porder = shared.index[0]; {
// if (get_local_id(0) == 0) if (get_local_id(0) < (1 << sh))
// { {
// int obits = shared.task.data.obits - shared.task.data.wbits; int l2 = shared.length[get_local_id(0) + (1 << sh)];
// tasks[get_group_id(1)].data.size = shared.index[get_local_id(0)] = shared.index[get_local_id(0) + ((l2 < l1) << sh)];
// shared.task.data.type == Fixed ? shared.task.data.residualOrder * obits + 6 + l1 : shared.length[get_local_id(0)] = l1 = min(l1, l2);
// shared.task.data.type == LPC ? shared.task.data.residualOrder * obits + 6 + l1 + 4 + 5 + shared.task.data.residualOrder * shared.task.data.cbits : }
// shared.task.data.type == Constant ? obits : obits * shared.task.data.blocksize; barrier(CLK_LOCAL_MEM_FENCE);
// } }
// } if (get_local_id(0) == 0)
// barrier(CLK_LOCAL_MEM_FENCE); tasks[get_group_id(0)].data.porder = shared.index[0];
// int porder = shared.index[0]; if (get_local_id(0) == 0)
// if (get_local_id(0) < (1 << porder)) {
// best_rice_parameters[(get_group_id(1) << max_porder) + get_local_id(0)] = rice_parameters[pos - (2 << porder) + get_local_id(0)]; int obits = task.obits - task.wbits;
// // FIXME: should be bytes? tasks[get_group_id(0)].data.size =
// // if (get_local_id(0) < (1 << porder)) task.type == Fixed ? task.residualOrder * obits + 6 + l1 :
// //shared.tmp[get_local_id(0)] = rice_parameters[pos - (2 << porder) + get_local_id(0)]; task.type == LPC ? task.residualOrder * obits + 6 + l1 + 4 + 5 + task.residualOrder * task.cbits :
// // barrier(CLK_LOCAL_MEM_FENCE); task.type == Constant ? obits : obits * task.blocksize;
// // if (get_local_id(0) < max(1, (1 << porder) >> 2)) }
// // { barrier(CLK_LOCAL_MEM_FENCE);
// //char4 ch; int porder = shared.index[0];
// //ch.x = shared.tmp[(get_local_id(0) << 2)]; for (int offs = 0; offs < (1 << porder); offs += GROUP_SIZE)
// //ch.y = shared.tmp[(get_local_id(0) << 2) + 1]; if (offs + get_local_id(0) < (1 << porder))
// //ch.z = shared.tmp[(get_local_id(0) << 2) + 2]; best_rice_parameters[(get_group_id(0) << max_porder) + offs + get_local_id(0)] = rice_parameters[pos - (2 << porder) + offs + get_local_id(0)];
// //ch.w = shared.tmp[(get_local_id(0) << 2) + 3]; // FIXME: should be bytes?
// //shared.ch[get_local_id(0)] = ch // if (get_local_id(0) < (1 << porder))
// // } //shared.tmp[get_local_id(0)] = rice_parameters[pos - (2 << porder) + get_local_id(0)];
// // barrier(CLK_LOCAL_MEM_FENCE); // barrier(CLK_LOCAL_MEM_FENCE);
// // if (get_local_id(0) < max(1, (1 << porder) >> 2)) // if (get_local_id(0) < max(1, (1 << porder) >> 2))
// //best_rice_parameters[(get_group_id(1) << max_porder) + get_local_id(0)] = shared.ch[get_local_id(0)]; // {
//} //char4 ch;
// //ch.x = shared.tmp[(get_local_id(0) << 2)];
//ch.y = shared.tmp[(get_local_id(0) << 2) + 1];
//ch.z = shared.tmp[(get_local_id(0) << 2) + 2];
//ch.w = shared.tmp[(get_local_id(0) << 2) + 3];
//shared.ch[get_local_id(0)] = ch
// }
// barrier(CLK_LOCAL_MEM_FENCE);
// if (get_local_id(0) < max(1, (1 << porder) >> 2))
//best_rice_parameters[(get_group_id(1) << max_porder) + get_local_id(0)] = shared.ch[get_local_id(0)];
}
//#endif //#endif
// //
//#if 0 //#if 0