mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
opencl flac encoder
This commit is contained in:
@@ -53,6 +53,9 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
[SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")]
|
[SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")]
|
||||||
public int GroupSize { get; set; }
|
public int GroupSize { get; set; }
|
||||||
|
|
||||||
|
[SRDescription(typeof(Properties.Resources), "DescriptionDefines")]
|
||||||
|
public string Defines { get; set; }
|
||||||
|
|
||||||
int cpu_threads = 1;
|
int cpu_threads = 1;
|
||||||
[DefaultValue(1)]
|
[DefaultValue(1)]
|
||||||
[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
|
[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
|
||||||
@@ -474,6 +477,12 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public bool DoConstant
|
||||||
|
{
|
||||||
|
get { return eparams.do_constant; }
|
||||||
|
set { eparams.do_constant = value; }
|
||||||
|
}
|
||||||
|
|
||||||
public int MinPartitionOrder
|
public int MinPartitionOrder
|
||||||
{
|
{
|
||||||
get { return eparams.min_partition_order; }
|
get { return eparams.min_partition_order; }
|
||||||
@@ -1173,12 +1182,8 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
|
int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
|
||||||
int calcPartitionPartSize = task.frameSize >> max_porder;
|
while ((task.frameSize >> max_porder) < 16 && max_porder > 0)
|
||||||
while (calcPartitionPartSize < 16 && max_porder > 0)
|
|
||||||
{
|
|
||||||
calcPartitionPartSize <<= 1;
|
|
||||||
max_porder--;
|
max_porder--;
|
||||||
}
|
|
||||||
|
|
||||||
if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel
|
if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel
|
||||||
Kernel cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : null;// task.cudaChannelDecorr;
|
Kernel cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : null;// task.cudaChannelDecorr;
|
||||||
@@ -1212,12 +1217,6 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
task.cudaEncodeResidual.SetArg(1, task.cudaSamples);
|
task.cudaEncodeResidual.SetArg(1, task.cudaSamples);
|
||||||
task.cudaEncodeResidual.SetArg(2, task.cudaBestResidualTasks);
|
task.cudaEncodeResidual.SetArg(2, task.cudaBestResidualTasks);
|
||||||
|
|
||||||
task.cudaCalcPartition.SetArg(0, task.cudaPartitions);
|
|
||||||
task.cudaCalcPartition.SetArg(1, task.cudaResidual);
|
|
||||||
task.cudaCalcPartition.SetArg(2, task.cudaBestResidualTasks);
|
|
||||||
task.cudaCalcPartition.SetArg(3, max_porder);
|
|
||||||
task.cudaCalcPartition.SetArg(4, calcPartitionPartSize);
|
|
||||||
|
|
||||||
task.cudaSumPartition.SetArg(0, task.cudaPartitions);
|
task.cudaSumPartition.SetArg(0, task.cudaPartitions);
|
||||||
task.cudaSumPartition.SetArg(1, max_porder);
|
task.cudaSumPartition.SetArg(1, max_porder);
|
||||||
|
|
||||||
@@ -1275,11 +1274,20 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
else
|
else
|
||||||
task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethod, 2, null, new int[] { 64, channels * task.frameCount }, new int[] { 64, 1 });
|
task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethod, 2, null, new int[] { 64, channels * task.frameCount }, new int[] { 64, 1 });
|
||||||
if (_settings.GPUOnly)
|
if (_settings.GPUOnly)
|
||||||
|
{
|
||||||
|
task.max_porder = max_porder;
|
||||||
|
if (task.frameSize >> max_porder == 16)
|
||||||
|
{
|
||||||
|
task.openCLCQ.EnqueueBarrier();
|
||||||
|
task.EnqueueCalcPartition16(channels);
|
||||||
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
task.openCLCQ.EnqueueBarrier();
|
task.openCLCQ.EnqueueBarrier();
|
||||||
task.openCLCQ.EnqueueNDRangeKernel(task.cudaEncodeResidual, 1, null, new int[] { task.groupSize * channels * task.frameCount }, new int[] { task.groupSize });
|
task.openCLCQ.EnqueueNDRangeKernel(task.cudaEncodeResidual, 1, null, new int[] { task.groupSize * channels * task.frameCount }, new int[] { task.groupSize });
|
||||||
task.openCLCQ.EnqueueBarrier();
|
task.openCLCQ.EnqueueBarrier();
|
||||||
task.openCLCQ.EnqueueNDRangeKernel(task.cudaCalcPartition, 2, null, new int[] { task.groupSize * (1 << max_porder), channels * task.frameCount }, new int[] { task.groupSize, 1 });
|
task.EnqueueCalcPartition(channels);
|
||||||
|
}
|
||||||
if (max_porder > 0)
|
if (max_porder > 0)
|
||||||
{
|
{
|
||||||
task.openCLCQ.EnqueueBarrier();
|
task.openCLCQ.EnqueueBarrier();
|
||||||
@@ -1293,7 +1301,6 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
task.openCLCQ.EnqueueBarrier();
|
task.openCLCQ.EnqueueBarrier();
|
||||||
task.openCLCQ.EnqueueReadBuffer(task.cudaResidual, false, 0, sizeof(int) * MAX_BLOCKSIZE * channels, task.residualBufferPtr.AddrOfPinnedObject());
|
task.openCLCQ.EnqueueReadBuffer(task.cudaResidual, false, 0, sizeof(int) * MAX_BLOCKSIZE * channels, task.residualBufferPtr.AddrOfPinnedObject());
|
||||||
task.openCLCQ.EnqueueReadBuffer(task.cudaBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * task.frameCount, task.bestRiceParamsPtr.AddrOfPinnedObject());
|
task.openCLCQ.EnqueueReadBuffer(task.cudaBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * task.frameCount, task.bestRiceParamsPtr.AddrOfPinnedObject());
|
||||||
task.max_porder = max_porder;
|
|
||||||
}
|
}
|
||||||
task.openCLCQ.EnqueueBarrier();
|
task.openCLCQ.EnqueueBarrier();
|
||||||
task.openCLCQ.EnqueueReadBuffer(task.cudaBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * task.frameCount, task.bestResidualTasksPtr.AddrOfPinnedObject());
|
task.openCLCQ.EnqueueReadBuffer(task.cudaBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * task.frameCount, task.bestResidualTasksPtr.AddrOfPinnedObject());
|
||||||
@@ -1575,11 +1582,12 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
// and serve as a convenient way to pass configuration information to the compilation process
|
// and serve as a convenient way to pass configuration information to the compilation process
|
||||||
OCLMan.Defines =
|
OCLMan.Defines =
|
||||||
"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
|
"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
|
||||||
"#define GROUP_SIZE " + groupSize.ToString() + "\n";
|
"#define GROUP_SIZE " + groupSize.ToString() + "\n" +
|
||||||
|
_settings.Defines + "\n";
|
||||||
// The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
|
// The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
|
||||||
OCLMan.BuildOptions = "";
|
OCLMan.BuildOptions = "";
|
||||||
OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location);
|
OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location);
|
||||||
//OCLMan.BinaryPath = ;
|
OCLMan.BinaryPath = System.IO.Path.Combine(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "CUE Tools"), "OpenCL");
|
||||||
OCLMan.CreateDefaultContext(0, DeviceType.GPU);
|
OCLMan.CreateDefaultContext(0, DeviceType.GPU);
|
||||||
|
|
||||||
openCLContext = OCLMan.Context;
|
openCLContext = OCLMan.Context;
|
||||||
@@ -1778,7 +1786,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
|
|
||||||
public string Path { get { return _path; } }
|
public string Path { get { return _path; } }
|
||||||
|
|
||||||
public static readonly string vendor_string = "FLACCL#.91";
|
public static readonly string vendor_string = "FLACCL#0.1";
|
||||||
|
|
||||||
int select_blocksize(int samplerate, int time_ms)
|
int select_blocksize(int samplerate, int time_ms)
|
||||||
{
|
{
|
||||||
@@ -2142,41 +2150,48 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
do_constant = false;
|
do_constant = false;
|
||||||
do_wasted = false;
|
do_wasted = false;
|
||||||
do_midside = false;
|
do_midside = false;
|
||||||
|
window_function = WindowFunction.Bartlett;
|
||||||
orders_per_window = 1;
|
orders_per_window = 1;
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
max_prediction_order = 7;
|
max_prediction_order = 7;
|
||||||
min_fixed_order = 2;
|
min_fixed_order = 3;
|
||||||
max_fixed_order = 2;
|
max_fixed_order = 2;
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
|
do_constant = false;
|
||||||
do_wasted = false;
|
do_wasted = false;
|
||||||
do_midside = false;
|
do_midside = false;
|
||||||
window_function = WindowFunction.Bartlett;
|
window_function = WindowFunction.Bartlett;
|
||||||
orders_per_window = 1;
|
orders_per_window = 1;
|
||||||
max_prediction_order = 12;
|
min_fixed_order = 2;
|
||||||
|
max_fixed_order = 2;
|
||||||
|
max_prediction_order = 7;
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
do_constant = false;
|
do_constant = false;
|
||||||
|
do_midside = false;
|
||||||
window_function = WindowFunction.Bartlett;
|
window_function = WindowFunction.Bartlett;
|
||||||
min_fixed_order = 3;
|
min_fixed_order = 2;
|
||||||
max_fixed_order = 2;
|
max_fixed_order = 2;
|
||||||
orders_per_window = 1;
|
orders_per_window = 1;
|
||||||
max_prediction_order = 7;
|
max_prediction_order = 8;
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
window_function = WindowFunction.Bartlett;
|
window_function = WindowFunction.Bartlett;
|
||||||
|
do_constant = false;
|
||||||
min_fixed_order = 2;
|
min_fixed_order = 2;
|
||||||
max_fixed_order = 2;
|
max_fixed_order = 2;
|
||||||
orders_per_window = 6;
|
orders_per_window = 1;
|
||||||
max_prediction_order = 7;
|
max_prediction_order = 8;
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
|
do_constant = false;
|
||||||
min_fixed_order = 2;
|
min_fixed_order = 2;
|
||||||
max_fixed_order = 2;
|
max_fixed_order = 2;
|
||||||
orders_per_window = 3;
|
orders_per_window = 1;
|
||||||
max_prediction_order = 8;
|
max_prediction_order = 8;
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
break;
|
break;
|
||||||
@@ -2184,18 +2199,21 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
do_constant = false;
|
do_constant = false;
|
||||||
min_fixed_order = 2;
|
min_fixed_order = 2;
|
||||||
max_fixed_order = 2;
|
max_fixed_order = 2;
|
||||||
orders_per_window = 1;
|
orders_per_window = 2;
|
||||||
|
max_prediction_order = 8;
|
||||||
break;
|
break;
|
||||||
case 6:
|
case 6:
|
||||||
|
do_constant = false;
|
||||||
|
min_fixed_order = 2;
|
||||||
|
max_fixed_order = 2;
|
||||||
|
orders_per_window = 1;
|
||||||
|
break;
|
||||||
|
case 7:
|
||||||
|
do_constant = false;
|
||||||
min_fixed_order = 2;
|
min_fixed_order = 2;
|
||||||
max_fixed_order = 2;
|
max_fixed_order = 2;
|
||||||
orders_per_window = 3;
|
orders_per_window = 3;
|
||||||
break;
|
break;
|
||||||
case 7:
|
|
||||||
min_fixed_order = 2;
|
|
||||||
max_fixed_order = 2;
|
|
||||||
orders_per_window = 7;
|
|
||||||
break;
|
|
||||||
case 8:
|
case 8:
|
||||||
orders_per_window = 12;
|
orders_per_window = 12;
|
||||||
break;
|
break;
|
||||||
@@ -2264,6 +2282,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
public Kernel cudaCopyBestMethodStereo;
|
public Kernel cudaCopyBestMethodStereo;
|
||||||
public Kernel cudaEncodeResidual;
|
public Kernel cudaEncodeResidual;
|
||||||
public Kernel cudaCalcPartition;
|
public Kernel cudaCalcPartition;
|
||||||
|
public Kernel cudaCalcPartition16;
|
||||||
public Kernel cudaSumPartition;
|
public Kernel cudaSumPartition;
|
||||||
public Kernel cudaFindRiceParameter;
|
public Kernel cudaFindRiceParameter;
|
||||||
public Kernel cudaFindPartitionOrder;
|
public Kernel cudaFindPartitionOrder;
|
||||||
@@ -2355,6 +2374,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
cudaCopyBestMethodStereo = openCLProgram.CreateKernel("cudaCopyBestMethodStereo");
|
cudaCopyBestMethodStereo = openCLProgram.CreateKernel("cudaCopyBestMethodStereo");
|
||||||
cudaEncodeResidual = openCLProgram.CreateKernel("cudaEncodeResidual");
|
cudaEncodeResidual = openCLProgram.CreateKernel("cudaEncodeResidual");
|
||||||
cudaCalcPartition = openCLProgram.CreateKernel("cudaCalcPartition");
|
cudaCalcPartition = openCLProgram.CreateKernel("cudaCalcPartition");
|
||||||
|
cudaCalcPartition16 = openCLProgram.CreateKernel("cudaCalcPartition16");
|
||||||
cudaSumPartition = openCLProgram.CreateKernel("cudaSumPartition");
|
cudaSumPartition = openCLProgram.CreateKernel("cudaSumPartition");
|
||||||
cudaFindRiceParameter = openCLProgram.CreateKernel("cudaFindRiceParameter");
|
cudaFindRiceParameter = openCLProgram.CreateKernel("cudaFindRiceParameter");
|
||||||
cudaFindPartitionOrder = openCLProgram.CreateKernel("cudaFindPartitionOrder");
|
cudaFindPartitionOrder = openCLProgram.CreateKernel("cudaFindPartitionOrder");
|
||||||
@@ -2398,6 +2418,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
cudaCopyBestMethodStereo.Dispose();
|
cudaCopyBestMethodStereo.Dispose();
|
||||||
cudaEncodeResidual.Dispose();
|
cudaEncodeResidual.Dispose();
|
||||||
cudaCalcPartition.Dispose();
|
cudaCalcPartition.Dispose();
|
||||||
|
cudaCalcPartition16.Dispose();
|
||||||
cudaSumPartition.Dispose();
|
cudaSumPartition.Dispose();
|
||||||
cudaFindRiceParameter.Dispose();
|
cudaFindRiceParameter.Dispose();
|
||||||
cudaFindPartitionOrder.Dispose();
|
cudaFindPartitionOrder.Dispose();
|
||||||
@@ -2464,6 +2485,28 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
openCLCQ.EnqueueNDRangeKernel(cudaChooseBestMethod, 2, null, new int[] { 32, channelsCount * frameCount }, new int[] { 32, 1 });
|
openCLCQ.EnqueueNDRangeKernel(cudaChooseBestMethod, 2, null, new int[] { 32, channelsCount * frameCount }, new int[] { 32, 1 });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void EnqueueCalcPartition16(int channels)
|
||||||
|
{
|
||||||
|
cudaCalcPartition16.SetArg(0, cudaPartitions);
|
||||||
|
cudaCalcPartition16.SetArg(1, cudaResidual);
|
||||||
|
cudaCalcPartition16.SetArg(2, cudaSamples);
|
||||||
|
cudaCalcPartition16.SetArg(3, cudaBestResidualTasks);
|
||||||
|
cudaCalcPartition16.SetArg(4, max_porder);
|
||||||
|
|
||||||
|
openCLCQ.EnqueueNDRangeKernel(cudaCalcPartition16, 2, null, new int[] { groupSize, channels * frameCount }, new int[] { groupSize, 1 });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void EnqueueCalcPartition(int channels)
|
||||||
|
{
|
||||||
|
cudaCalcPartition.SetArg(0, cudaPartitions);
|
||||||
|
cudaCalcPartition.SetArg(1, cudaResidual);
|
||||||
|
cudaCalcPartition.SetArg(2, cudaBestResidualTasks);
|
||||||
|
cudaCalcPartition.SetArg(3, max_porder);
|
||||||
|
cudaCalcPartition.SetArg(4, frameSize >> max_porder);
|
||||||
|
|
||||||
|
openCLCQ.EnqueueNDRangeKernel(cudaCalcPartition, 2, null, new int[] { groupSize * (1 << max_porder), channels * frameCount }, new int[] { groupSize, 1 });
|
||||||
|
}
|
||||||
|
|
||||||
public unsafe FLACCLSubframeTask* ResidualTasks
|
public unsafe FLACCLSubframeTask* ResidualTasks
|
||||||
{
|
{
|
||||||
get
|
get
|
||||||
|
|||||||
@@ -449,7 +449,7 @@ void cudaQuantizeLPC(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define DONT_BEACCURATE
|
#define BEACCURATE
|
||||||
|
|
||||||
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||||
void cudaEstimateResidual(
|
void cudaEstimateResidual(
|
||||||
@@ -481,24 +481,35 @@ void cudaEstimateResidual(
|
|||||||
if (tid < GROUP_SIZE / 16)
|
if (tid < GROUP_SIZE / 16)
|
||||||
len[tid] = 0;
|
len[tid] = 0;
|
||||||
#else
|
#else
|
||||||
float res = 0.0f;
|
long res = 0;
|
||||||
|
#endif
|
||||||
|
data[tid] = 0;
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
__local int4 * cptr = (__local int4 *)&task.coefs[0];
|
||||||
|
int4 cptr0 = cptr[0];
|
||||||
|
#if MAX_ORDER > 4
|
||||||
|
int4 cptr1 = cptr[1];
|
||||||
|
#if MAX_ORDER > 8
|
||||||
|
int4 cptr2 = cptr[2];
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
data[tid] = tid < bs ? samples[task.data.samplesOffs + tid] >> task.data.wbits : 0;
|
|
||||||
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
||||||
{
|
{
|
||||||
// fetch samples
|
// fetch samples
|
||||||
int nextData = pos + tid + GROUP_SIZE < bs ? samples[task.data.samplesOffs + pos + tid + GROUP_SIZE] >> task.data.wbits : 0;
|
int offs = pos + tid;
|
||||||
|
int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0;
|
||||||
data[tid + GROUP_SIZE] = nextData;
|
data[tid + GROUP_SIZE] = nextData;
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
// compute residual
|
// compute residual
|
||||||
__local int4 * dptr = (__local int4 *)&data[tid];
|
__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
|
||||||
__local int4 * cptr = (__local int4 *)&task.coefs[0];
|
int4 sum = dptr[0] * cptr0
|
||||||
int4 sum = dptr[0] * cptr[0]
|
|
||||||
#if MAX_ORDER > 4
|
#if MAX_ORDER > 4
|
||||||
+ dptr[1] * cptr[1]
|
+ dptr[1] * cptr1
|
||||||
#if MAX_ORDER > 8
|
#if MAX_ORDER > 8
|
||||||
+ dptr[2] * cptr[2]
|
+ dptr[2] * cptr2
|
||||||
#if MAX_ORDER > 12
|
#if MAX_ORDER > 12
|
||||||
+ dptr[3] * cptr[3]
|
+ dptr[3] * cptr[3]
|
||||||
#if MAX_ORDER > 16
|
#if MAX_ORDER > 16
|
||||||
@@ -512,23 +523,23 @@ void cudaEstimateResidual(
|
|||||||
#endif
|
#endif
|
||||||
;
|
;
|
||||||
|
|
||||||
int t = select(0, data[tid + ro] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift), pos + tid + ro < bs);
|
int t = select(0, data[tid + GROUP_SIZE] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift), offs >= ro && offs < bs);
|
||||||
#ifdef BEACCURATE
|
#ifdef BEACCURATE
|
||||||
residual[tid] = min((t << 1) ^ (t >> 31), 0x7fffff);
|
t = clamp(t, -0x7fffff, 0x7fffff);
|
||||||
|
residual[tid] = (t << 1) ^ (t >> 31);
|
||||||
#else
|
#else
|
||||||
res += fabs(t);
|
res += (t << 1) ^ (t >> 31);
|
||||||
#endif
|
#endif
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||||
|
|
||||||
#ifdef BEACCURATE
|
#ifdef BEACCURATE
|
||||||
if (tid < GROUP_SIZE / 16)
|
if (tid < GROUP_SIZE / 16)
|
||||||
{
|
{
|
||||||
__local int4 * chunk = ((__local int4 *)residual) + tid * 4;
|
__local int4 * chunk = ((__local int4 *)residual) + (tid << 2);
|
||||||
int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3];
|
int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3];
|
||||||
int res = sum.x + sum.y + sum.z + sum.w;
|
int res = sum.x + sum.y + sum.z + sum.w;
|
||||||
int k = clamp(clz(16) - clz(res), 0, 14);
|
int k = clamp(27 - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
|
||||||
len[tid] += 16 * k + (res >> k);
|
len[tid] += (k << 4) + (res >> k);
|
||||||
k = clamp(clz(16) - clz(res), 0, 14);
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -557,7 +568,7 @@ void cudaEstimateResidual(
|
|||||||
if (tid == 0)
|
if (tid == 0)
|
||||||
{
|
{
|
||||||
int residualLen = (bs - ro);
|
int residualLen = (bs - ro);
|
||||||
float sum = residual[0] * 2;// + residualLen / 2;
|
float sum = residual[0];// + residualLen / 2;
|
||||||
//int k = clamp(convert_int_rtn(log2((sum + 0.000001f) / (residualLen + 0.000001f))), 0, 14);
|
//int k = clamp(convert_int_rtn(log2((sum + 0.000001f) / (residualLen + 0.000001f))), 0, 14);
|
||||||
int k;
|
int k;
|
||||||
frexp((sum + 0.000001f) / residualLen, &k);
|
frexp((sum + 0.000001f) / residualLen, &k);
|
||||||
@@ -608,7 +619,7 @@ void cudaChooseBestMethod(
|
|||||||
min(obits * task.blocksize,
|
min(obits * task.blocksize,
|
||||||
task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + partLen :
|
task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + partLen :
|
||||||
task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
|
task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
|
||||||
task.type == Constant ? obits * (1 + task.blocksize * (partLen != 0)) :
|
task.type == Constant ? obits * select(1, task.blocksize, partLen != task.blocksize - task.residualOrder) :
|
||||||
obits * task.blocksize);
|
obits * task.blocksize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -721,21 +732,50 @@ void cudaEncodeResidual(
|
|||||||
int bs = task.data.blocksize;
|
int bs = task.data.blocksize;
|
||||||
int ro = task.data.residualOrder;
|
int ro = task.data.residualOrder;
|
||||||
|
|
||||||
data[tid] = tid < bs ? samples[task.data.samplesOffs + tid] >> task.data.wbits : 0;
|
if (tid < 32 && tid >= ro)
|
||||||
|
task.coefs[tid] = 0;
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
__local int4 * cptr = (__local int4 *)&task.coefs[0];
|
||||||
|
int4 cptr0 = cptr[0];
|
||||||
|
#if MAX_ORDER > 4
|
||||||
|
int4 cptr1 = cptr[1];
|
||||||
|
#if MAX_ORDER > 8
|
||||||
|
int4 cptr2 = cptr[2];
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
data[tid] = 0;
|
||||||
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
||||||
{
|
{
|
||||||
// fetch samples
|
// fetch samples
|
||||||
float nextData = pos + tid + GROUP_SIZE < bs ? samples[task.data.samplesOffs + pos + tid + GROUP_SIZE] >> task.data.wbits : 0;
|
int off = pos + tid;
|
||||||
|
int nextData = off < bs ? samples[task.data.samplesOffs + off] >> task.data.wbits : 0;
|
||||||
data[tid + GROUP_SIZE] = nextData;
|
data[tid + GROUP_SIZE] = nextData;
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
// compute residual
|
// compute residual
|
||||||
int sum = 0;
|
__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
|
||||||
for (int c = 0; c < ro; c++)
|
int4 sum = dptr[0] * cptr0
|
||||||
sum += data[tid + c] * task.coefs[c];
|
#if MAX_ORDER > 4
|
||||||
sum = data[tid + ro] - (sum >> task.data.shift);
|
+ dptr[1] * cptr1
|
||||||
if (pos + tid + ro < bs)
|
#if MAX_ORDER > 8
|
||||||
output[task.data.residualOffs + pos + tid + ro] = sum;
|
+ dptr[2] * cptr2
|
||||||
|
#if MAX_ORDER > 12
|
||||||
|
+ dptr[3] * cptr[3]
|
||||||
|
#if MAX_ORDER > 16
|
||||||
|
+ dptr[4] * cptr[4]
|
||||||
|
+ dptr[5] * cptr[5]
|
||||||
|
+ dptr[6] * cptr[6]
|
||||||
|
+ dptr[7] * cptr[7]
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
if (off >= ro && off < bs)
|
||||||
|
output[task.data.residualOffs + off] = data[tid + GROUP_SIZE] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift);
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
data[tid] = nextData;
|
data[tid] = nextData;
|
||||||
@@ -795,6 +835,98 @@ void cudaCalcPartition(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// get_group_id(1) == task index
|
||||||
|
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||||
|
void cudaCalcPartition16(
|
||||||
|
__global int *partition_lengths,
|
||||||
|
__global int *residual,
|
||||||
|
__global int *samples,
|
||||||
|
__global FLACCLSubframeTask *tasks,
|
||||||
|
int max_porder // <= 8
|
||||||
|
)
|
||||||
|
{
|
||||||
|
__local FLACCLSubframeTask task;
|
||||||
|
__local int data[GROUP_SIZE * 2];
|
||||||
|
__local int res[GROUP_SIZE];
|
||||||
|
|
||||||
|
const int tid = get_local_id(0);
|
||||||
|
if (tid < sizeof(task) / sizeof(int))
|
||||||
|
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
int bs = task.data.blocksize;
|
||||||
|
int ro = task.data.residualOrder;
|
||||||
|
|
||||||
|
if (tid >= ro && tid < 32)
|
||||||
|
task.coefs[tid] = 0;
|
||||||
|
|
||||||
|
int k = tid % 16;
|
||||||
|
int x = tid / 16;
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
__local int4 * cptr = (__local int4 *)&task.coefs[0];
|
||||||
|
int4 cptr0 = cptr[0];
|
||||||
|
#if MAX_ORDER > 4
|
||||||
|
int4 cptr1 = cptr[1];
|
||||||
|
#if MAX_ORDER > 8
|
||||||
|
int4 cptr2 = cptr[2];
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
data[tid] = 0;
|
||||||
|
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
||||||
|
{
|
||||||
|
int offs = pos + tid;
|
||||||
|
// fetch samples
|
||||||
|
int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0;
|
||||||
|
data[tid + GROUP_SIZE] = nextData;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
// compute residual
|
||||||
|
__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
|
||||||
|
int4 sum = dptr[0] * cptr0
|
||||||
|
#if MAX_ORDER > 4
|
||||||
|
+ dptr[1] * cptr1
|
||||||
|
#if MAX_ORDER > 8
|
||||||
|
+ dptr[2] * cptr2
|
||||||
|
#if MAX_ORDER > 12
|
||||||
|
+ dptr[3] * cptr[3]
|
||||||
|
#if MAX_ORDER > 16
|
||||||
|
+ dptr[4] * cptr[4]
|
||||||
|
+ dptr[5] * cptr[5]
|
||||||
|
+ dptr[6] * cptr[6]
|
||||||
|
+ dptr[7] * cptr[7]
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
int s = select(0, nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift), offs >= ro && offs < bs);
|
||||||
|
|
||||||
|
// output residual
|
||||||
|
if (offs < bs)
|
||||||
|
residual[task.data.residualOffs + offs] = s;
|
||||||
|
|
||||||
|
//int s = select(0, residual[task.data.residualOffs + offs], offs >= ro && offs < bs);
|
||||||
|
|
||||||
|
s = clamp(s, -0x7fffff, 0x7fffff);
|
||||||
|
// convert to unsigned
|
||||||
|
res[tid] = (s << 1) ^ (s >> 31);
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
data[tid] = nextData;
|
||||||
|
|
||||||
|
// calc number of unary bits for each residual sample with each rice paramater
|
||||||
|
__local int4 * chunk = (__local int4 *)&res[x << 4];
|
||||||
|
sum = (chunk[0] >> k) + (chunk[1] >> k) + (chunk[2] >> k) + (chunk[3] >> k);
|
||||||
|
s = sum.x + sum.y + sum.z + sum.w;
|
||||||
|
|
||||||
|
const int lpos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1)) + offs / 16;
|
||||||
|
if (k <= 14)
|
||||||
|
partition_lengths[lpos] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Sums partition lengths for a certain k == get_group_id(0)
|
// Sums partition lengths for a certain k == get_group_id(0)
|
||||||
// Requires 128 threads
|
// Requires 128 threads
|
||||||
// get_group_id(0) == k
|
// get_group_id(0) == k
|
||||||
@@ -949,36 +1081,5 @@ void cudaFindPartitionOrder(
|
|||||||
if (offs + get_local_id(0) < (1 << porder))
|
if (offs + get_local_id(0) < (1 << porder))
|
||||||
best_rice_parameters[(get_group_id(0) << max_porder) + offs + get_local_id(0)] = rice_parameters[pos - (2 << porder) + offs + get_local_id(0)];
|
best_rice_parameters[(get_group_id(0) << max_porder) + offs + get_local_id(0)] = rice_parameters[pos - (2 << porder) + offs + get_local_id(0)];
|
||||||
// FIXME: should be bytes?
|
// FIXME: should be bytes?
|
||||||
// if (get_local_id(0) < (1 << porder))
|
|
||||||
//shared.tmp[get_local_id(0)] = rice_parameters[pos - (2 << porder) + get_local_id(0)];
|
|
||||||
// barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
// if (get_local_id(0) < max(1, (1 << porder) >> 2))
|
|
||||||
// {
|
|
||||||
//char4 ch;
|
|
||||||
//ch.x = shared.tmp[(get_local_id(0) << 2)];
|
|
||||||
//ch.y = shared.tmp[(get_local_id(0) << 2) + 1];
|
|
||||||
//ch.z = shared.tmp[(get_local_id(0) << 2) + 2];
|
|
||||||
//ch.w = shared.tmp[(get_local_id(0) << 2) + 3];
|
|
||||||
//shared.ch[get_local_id(0)] = ch
|
|
||||||
// }
|
|
||||||
// barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
// if (get_local_id(0) < max(1, (1 << porder) >> 2))
|
|
||||||
//best_rice_parameters[(get_group_id(1) << max_porder) + get_local_id(0)] = shared.ch[get_local_id(0)];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//#endif
|
|
||||||
//
|
|
||||||
//#if 0
|
|
||||||
// if (get_local_id(0) < order)
|
|
||||||
// {
|
|
||||||
// for (int i = 0; i < order; i++)
|
|
||||||
// if (get_local_id(0) >= i)
|
|
||||||
// sum[get_local_id(0) - i] += coefs[get_local_id(0)] * sample[order - i - 1];
|
|
||||||
// fot (int i = order; i < blocksize; i++)
|
|
||||||
// {
|
|
||||||
// if (!get_local_id(0)) sample[order + i] = s = residual[order + i] + (sum[order + i] >> shift);
|
|
||||||
// sum[get_local_id(0) + i + 1] += coefs[get_local_id(0)] * s;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
//#endif
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -83,6 +83,7 @@ namespace CUETools.FLACCL.cmd
|
|||||||
bool do_seektable = true;
|
bool do_seektable = true;
|
||||||
bool buffered = false;
|
bool buffered = false;
|
||||||
bool ok = true;
|
bool ok = true;
|
||||||
|
int intarg;
|
||||||
|
|
||||||
for (int arg = 0; arg < args.Length; arg++)
|
for (int arg = 0; arg < args.Length; arg++)
|
||||||
{
|
{
|
||||||
@@ -108,12 +109,10 @@ namespace CUETools.FLACCL.cmd
|
|||||||
ok = (++arg < args.Length) && int.TryParse(args[arg], out val);
|
ok = (++arg < args.Length) && int.TryParse(args[arg], out val);
|
||||||
settings.CPUThreads = val;
|
settings.CPUThreads = val;
|
||||||
}
|
}
|
||||||
else if (args[arg] == "--group-size")
|
else if (args[arg] == "--group-size" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
|
||||||
{
|
settings.GroupSize = intarg;
|
||||||
int val = settings.GroupSize;
|
else if (args[arg] == "--define" && arg + 2 < args.Length)
|
||||||
ok = (++arg < args.Length) && int.TryParse(args[arg], out val);
|
settings.Defines += "#define " + args[++arg] + " " + args[++arg] + "\n";
|
||||||
settings.GroupSize = val;
|
|
||||||
}
|
|
||||||
else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
|
else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
|
||||||
output_file = args[arg];
|
output_file = args[arg];
|
||||||
else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
|
else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
|
||||||
@@ -167,7 +166,7 @@ namespace CUETools.FLACCL.cmd
|
|||||||
}
|
}
|
||||||
if (!quiet)
|
if (!quiet)
|
||||||
{
|
{
|
||||||
Console.WriteLine("{0}, Copyright (C) 2009 Gregory S. Chudov.", FLACCLWriter.vendor_string);
|
Console.WriteLine("{0}, Copyright (C) 2010 Gregory S. Chudov.", FLACCLWriter.vendor_string);
|
||||||
Console.WriteLine("This is free software under the GNU GPLv3+ license; There is NO WARRANTY, to");
|
Console.WriteLine("This is free software under the GNU GPLv3+ license; There is NO WARRANTY, to");
|
||||||
Console.WriteLine("the extent permitted by law. <http://www.gnu.org/licenses/> for details.");
|
Console.WriteLine("the extent permitted by law. <http://www.gnu.org/licenses/> for details.");
|
||||||
}
|
}
|
||||||
@@ -317,19 +316,21 @@ namespace CUETools.FLACCL.cmd
|
|||||||
if (debug)
|
if (debug)
|
||||||
{
|
{
|
||||||
Console.SetOut(stdout);
|
Console.SetOut(stdout);
|
||||||
Console.Out.WriteLine("{0}\t{1}\t{2}\t{3}\t{4} ({5})\t{6} ({7})\t{8}..{9}\t{10}\t{11}",
|
Console.Out.WriteLine("{0}\t{1}\t{2}\t{3}\t{4} ({5})\t{6}/{7}+{12}{13}\t{8}..{9}\t{10}\t{11}",
|
||||||
encoder.TotalSize,
|
encoder.TotalSize,
|
||||||
encoder.UserProcessorTime.TotalSeconds > 0 ? encoder.UserProcessorTime.TotalSeconds : totalElapsed.TotalSeconds,
|
encoder.UserProcessorTime.TotalSeconds > 0 ? encoder.UserProcessorTime.TotalSeconds : totalElapsed.TotalSeconds,
|
||||||
encoder.StereoMethod.ToString().PadRight(15),
|
encoder.StereoMethod.ToString().PadRight(15),
|
||||||
encoder.WindowFunction.ToString().PadRight(15),
|
encoder.WindowFunction.ToString().PadRight(15),
|
||||||
encoder.MaxPartitionOrder,
|
encoder.MaxPartitionOrder,
|
||||||
settings.GPUOnly ? "GPU" : "CPU",
|
settings.GPUOnly ? "GPU" : "CPU",
|
||||||
encoder.MaxLPCOrder,
|
|
||||||
encoder.OrdersPerWindow,
|
encoder.OrdersPerWindow,
|
||||||
|
encoder.MaxLPCOrder,
|
||||||
encoder.MinPrecisionSearch,
|
encoder.MinPrecisionSearch,
|
||||||
encoder.MaxPrecisionSearch,
|
encoder.MaxPrecisionSearch,
|
||||||
encoder.BlockSize,
|
encoder.BlockSize,
|
||||||
encoder.VBRMode
|
encoder.VBRMode,
|
||||||
|
encoder.MaxFixedOrder - encoder.MinFixedOrder + 1,
|
||||||
|
encoder.DoConstant ? "c" : ""
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|||||||
Reference in New Issue
Block a user