mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
optimizations
This commit is contained in:
@@ -65,8 +65,6 @@
|
|||||||
</ProjectReference>
|
</ProjectReference>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<None Include="flac.cu">
|
|
||||||
</None>
|
|
||||||
<Content Include="flac.cl">
|
<Content Include="flac.cl">
|
||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
</Content>
|
</Content>
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
this.MappedMemory = false;
|
this.MappedMemory = false;
|
||||||
this.DoMD5 = true;
|
this.DoMD5 = true;
|
||||||
this.GroupSize = 128;
|
this.GroupSize = 128;
|
||||||
this.TaskSize = 32;
|
this.TaskSize = 8;
|
||||||
this.DeviceType = OpenCLDeviceType.GPU;
|
this.DeviceType = OpenCLDeviceType.GPU;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -67,17 +67,19 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
[SRDescription(typeof(Properties.Resources), "DescriptionMappedMemory")]
|
[SRDescription(typeof(Properties.Resources), "DescriptionMappedMemory")]
|
||||||
public bool MappedMemory { get; set; }
|
public bool MappedMemory { get; set; }
|
||||||
|
|
||||||
|
[TypeConverter(typeof(FLACCLWriterSettingsGroupSizeConverter))]
|
||||||
[DefaultValue(128)]
|
[DefaultValue(128)]
|
||||||
[SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")]
|
[SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")]
|
||||||
public int GroupSize { get; set; }
|
public int GroupSize { get; set; }
|
||||||
|
|
||||||
[DefaultValue(32)]
|
[DefaultValue(8)]
|
||||||
[SRDescription(typeof(Properties.Resources), "DescriptionTaskSize")]
|
[SRDescription(typeof(Properties.Resources), "DescriptionTaskSize")]
|
||||||
public int TaskSize { get; set; }
|
public int TaskSize { get; set; }
|
||||||
|
|
||||||
[SRDescription(typeof(Properties.Resources), "DescriptionDefines")]
|
[SRDescription(typeof(Properties.Resources), "DescriptionDefines")]
|
||||||
public string Defines { get; set; }
|
public string Defines { get; set; }
|
||||||
|
|
||||||
|
[TypeConverter(typeof(FLACCLWriterSettingsPlatformConverter))]
|
||||||
[SRDescription(typeof(Properties.Resources), "DescriptionPlatform")]
|
[SRDescription(typeof(Properties.Resources), "DescriptionPlatform")]
|
||||||
public string Platform { get; set; }
|
public string Platform { get; set; }
|
||||||
|
|
||||||
@@ -103,6 +105,35 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public class FLACCLWriterSettingsPlatformConverter : TypeConverter
|
||||||
|
{
|
||||||
|
public override bool GetStandardValuesSupported(ITypeDescriptorContext context)
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public override StandardValuesCollection GetStandardValues(ITypeDescriptorContext context)
|
||||||
|
{
|
||||||
|
var res = new List<string>();
|
||||||
|
foreach (var p in OpenCL.GetPlatforms())
|
||||||
|
res.Add(p.Name);
|
||||||
|
return new StandardValuesCollection(res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public class FLACCLWriterSettingsGroupSizeConverter : TypeConverter
|
||||||
|
{
|
||||||
|
public override bool GetStandardValuesSupported(ITypeDescriptorContext context)
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public override StandardValuesCollection GetStandardValues(ITypeDescriptorContext context)
|
||||||
|
{
|
||||||
|
return new StandardValuesCollection(new int[] { 64, 128, 256 });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public enum OpenCLDeviceType : ulong
|
public enum OpenCLDeviceType : ulong
|
||||||
{
|
{
|
||||||
CPU = DeviceType.CPU,
|
CPU = DeviceType.CPU,
|
||||||
@@ -173,6 +204,8 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
FLACCLTask[] cpu_tasks;
|
FLACCLTask[] cpu_tasks;
|
||||||
int oldest_cpu_task = 0;
|
int oldest_cpu_task = 0;
|
||||||
|
|
||||||
|
internal int framesPerTask;
|
||||||
|
|
||||||
AudioPCMConfig _pcm;
|
AudioPCMConfig _pcm;
|
||||||
|
|
||||||
public const int MAX_BLOCKSIZE = 65536;
|
public const int MAX_BLOCKSIZE = 65536;
|
||||||
@@ -1037,7 +1070,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
if (task.nWindowFunctions == 0)
|
if (task.nWindowFunctions == 0)
|
||||||
throw new Exception("invalid windowfunction");
|
throw new Exception("invalid windowfunction");
|
||||||
if (!_settings.MappedMemory)
|
if (!_settings.MappedMemory)
|
||||||
task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, true, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr);
|
task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, false, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr);
|
||||||
}
|
}
|
||||||
|
|
||||||
task.nResidualTasks = 0;
|
task.nResidualTasks = 0;
|
||||||
@@ -1163,9 +1196,9 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
throw new Exception("oops");
|
throw new Exception("oops");
|
||||||
|
|
||||||
if (!_settings.MappedMemory)
|
if (!_settings.MappedMemory)
|
||||||
task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr);
|
task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr);
|
||||||
if (!_settings.MappedMemory)
|
if (!_settings.MappedMemory)
|
||||||
task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, true, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr);
|
task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, false, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe void encode_residual(FLACCLTask task)
|
unsafe void encode_residual(FLACCLTask task)
|
||||||
@@ -1471,7 +1504,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
int channelsCount = doMidside ? 2 * channels : channels;
|
int channelsCount = doMidside ? 2 * channels : channels;
|
||||||
|
|
||||||
if (task.nResidualTasks == 0)
|
if (task.nResidualTasks == 0)
|
||||||
initializeSubframeTasks(task.frameSize, channelsCount, _settings.TaskSize, task);
|
initializeSubframeTasks(task.frameSize, channelsCount, framesPerTask, task);
|
||||||
|
|
||||||
estimate_residual(task, channelsCount);
|
estimate_residual(task, channelsCount);
|
||||||
}
|
}
|
||||||
@@ -1611,10 +1644,9 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
|
OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
|
||||||
|
|
||||||
bool haveAtom = false;
|
this.framesPerTask = (int)OCLMan.Context.Devices[0].MaxComputeUnits * _settings.TaskSize;
|
||||||
if (OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics"))
|
|
||||||
haveAtom = true;
|
if (!OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics"))
|
||||||
else
|
|
||||||
_settings.GPUOnly = false;
|
_settings.GPUOnly = false;
|
||||||
|
|
||||||
// The Defines string gets prepended to any and all sources that are compiled
|
// The Defines string gets prepended to any and all sources that are compiled
|
||||||
@@ -1625,13 +1657,20 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
"#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
|
"#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
|
||||||
(_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") +
|
(_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") +
|
||||||
(_settings.DoRice ? "#define DO_RICE\n" : "") +
|
(_settings.DoRice ? "#define DO_RICE\n" : "") +
|
||||||
(haveAtom ? "#define HAVE_ATOM\n" : "") +
|
|
||||||
#if DEBUG
|
#if DEBUG
|
||||||
"#define DEBUG\n" +
|
"#define DEBUG\n" +
|
||||||
#endif
|
#endif
|
||||||
(_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") +
|
(_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") +
|
||||||
_settings.Defines + "\n";
|
_settings.Defines + "\n";
|
||||||
|
|
||||||
|
var exts = new string[] { "cl_khr_local_int32_base_atomics", "cl_khr_local_int32_extended_atomics", "cl_khr_fp64", "cl_amd_fp64" };
|
||||||
|
foreach (string extension in exts)
|
||||||
|
if (OCLMan.Context.Devices[0].Extensions.Contains(extension))
|
||||||
|
{
|
||||||
|
OCLMan.Defines += "#pragma OPENCL EXTENSION " + extension + ": enable\n";
|
||||||
|
OCLMan.Defines += "#define HAVE_" + extension + "\n";
|
||||||
|
}
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
openCLProgram = OCLMan.CompileFile("flac.cl");
|
openCLProgram = OCLMan.CompileFile("flac.cl");
|
||||||
@@ -1698,7 +1737,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
int pos = 0;
|
int pos = 0;
|
||||||
while (pos < buff.Length)
|
while (pos < buff.Length)
|
||||||
{
|
{
|
||||||
int block = Math.Min(buff.Length - pos, eparams.block_size * _settings.TaskSize - samplesInBuffer);
|
int block = Math.Min(buff.Length - pos, eparams.block_size * framesPerTask - samplesInBuffer);
|
||||||
|
|
||||||
fixed (byte* buf = buff.Bytes)
|
fixed (byte* buf = buff.Bytes)
|
||||||
AudioSamples.MemCpy(((byte*)task1.clSamplesBytesPtr) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign);
|
AudioSamples.MemCpy(((byte*)task1.clSamplesBytesPtr) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign);
|
||||||
@@ -1707,7 +1746,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
pos += block;
|
pos += block;
|
||||||
|
|
||||||
int nFrames = samplesInBuffer / eparams.block_size;
|
int nFrames = samplesInBuffer / eparams.block_size;
|
||||||
if (nFrames >= _settings.TaskSize)
|
if (nFrames >= framesPerTask)
|
||||||
do_output_frames(nFrames);
|
do_output_frames(nFrames);
|
||||||
}
|
}
|
||||||
if (md5 != null)
|
if (md5 != null)
|
||||||
@@ -2405,7 +2444,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLProgram.Context.Devices[0], prop);
|
openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLProgram.Context.Devices[0], prop);
|
||||||
|
|
||||||
int MAX_ORDER = this.writer.eparams.max_prediction_order;
|
int MAX_ORDER = this.writer.eparams.max_prediction_order;
|
||||||
int MAX_FRAMES = this.writer._settings.TaskSize;
|
int MAX_FRAMES = this.writer.framesPerTask;
|
||||||
int MAX_CHANNELSIZE = MAX_FRAMES * writer.eparams.block_size;
|
int MAX_CHANNELSIZE = MAX_FRAMES * writer.eparams.block_size;
|
||||||
residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES;
|
residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES;
|
||||||
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES;
|
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES;
|
||||||
@@ -2440,14 +2479,14 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
clSelectedTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen);
|
clSelectedTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen);
|
||||||
clRiceOutputPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen);
|
clRiceOutputPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen);
|
||||||
|
|
||||||
clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytesPinned, true, MapFlags.WRITE, 0, samplesBufferLen / 2);
|
clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytesPinned, true, MapFlags.READ_WRITE, 0, samplesBufferLen / 2);
|
||||||
clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidualPinned, true, MapFlags.WRITE, 0, residualBufferLen);
|
clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidualPinned, true, MapFlags.READ_WRITE, 0, residualBufferLen);
|
||||||
clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParamsPinned, true, MapFlags.WRITE, 0, riceParamsLen / 4);
|
clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParamsPinned, true, MapFlags.READ_WRITE, 0, riceParamsLen / 4);
|
||||||
clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasksPinned, true, MapFlags.WRITE, 0, residualTasksLen);
|
clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasksPinned, true, MapFlags.READ_WRITE, 0, residualTasksLen);
|
||||||
clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasksPinned, true, MapFlags.WRITE, 0, bestResidualTasksLen);
|
clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasksPinned, true, MapFlags.READ_WRITE, 0, bestResidualTasksLen);
|
||||||
clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctionsPinned, true, MapFlags.WRITE, 0, wndLen);
|
clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctionsPinned, true, MapFlags.READ_WRITE, 0, wndLen);
|
||||||
clSelectedTasksPtr = openCLCQ.EnqueueMapBuffer(clSelectedTasksPinned, true, MapFlags.WRITE, 0, selectedLen);
|
clSelectedTasksPtr = openCLCQ.EnqueueMapBuffer(clSelectedTasksPinned, true, MapFlags.READ_WRITE, 0, selectedLen);
|
||||||
clRiceOutputPtr = openCLCQ.EnqueueMapBuffer(clRiceOutputPinned, true, MapFlags.WRITE, 0, riceLen);
|
clRiceOutputPtr = openCLCQ.EnqueueMapBuffer(clRiceOutputPinned, true, MapFlags.READ_WRITE, 0, riceLen);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -2904,7 +2943,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
|
|
||||||
openCLCQ.EnqueueNDRangeKernel(
|
openCLCQ.EnqueueNDRangeKernel(
|
||||||
clCalcOutputOffsets,
|
clCalcOutputOffsets,
|
||||||
groupSize,
|
openCLCQ.Device.DeviceType == DeviceType.CPU ? groupSize : 32,
|
||||||
1);
|
1);
|
||||||
|
|
||||||
clRiceEncoding.SetArgs(
|
clRiceEncoding.SetArgs(
|
||||||
|
|||||||
@@ -124,7 +124,7 @@ namespace CUETools.Codecs.FLACCL.Properties {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Looks up a localized string similar to OpenCL platform to use (ATI Stream, NVIDIA OpenCL, Intel OpenCL, etc).
|
/// Looks up a localized string similar to OpenCL platform to use.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
internal static string DescriptionPlatform {
|
internal static string DescriptionPlatform {
|
||||||
get {
|
get {
|
||||||
@@ -133,7 +133,7 @@ namespace CUETools.Codecs.FLACCL.Properties {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Looks up a localized string similar to Number of frames processed simultaniously (32, 64).
|
/// Looks up a localized string similar to Number of frames processed per one multiprocessor.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
internal static string DescriptionTaskSize {
|
internal static string DescriptionTaskSize {
|
||||||
get {
|
get {
|
||||||
|
|||||||
@@ -139,10 +139,10 @@
|
|||||||
<value>Device uses host memory (Don't use)</value>
|
<value>Device uses host memory (Don't use)</value>
|
||||||
</data>
|
</data>
|
||||||
<data name="DescriptionPlatform" xml:space="preserve">
|
<data name="DescriptionPlatform" xml:space="preserve">
|
||||||
<value>OpenCL platform to use (ATI Stream, NVIDIA OpenCL, Intel OpenCL, etc)</value>
|
<value>OpenCL platform to use</value>
|
||||||
</data>
|
</data>
|
||||||
<data name="DescriptionTaskSize" xml:space="preserve">
|
<data name="DescriptionTaskSize" xml:space="preserve">
|
||||||
<value>Number of frames processed simultaniously (32, 64)</value>
|
<value>Number of frames processed per one multiprocessor</value>
|
||||||
</data>
|
</data>
|
||||||
<data name="DoMD5Description" xml:space="preserve">
|
<data name="DoMD5Description" xml:space="preserve">
|
||||||
<value>Calculate MD5 hash for audio stream</value>
|
<value>Calculate MD5 hash for audio stream</value>
|
||||||
|
|||||||
@@ -28,10 +28,30 @@
|
|||||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __CPU__
|
#if defined(HAVE_cl_khr_local_int32_base_atomics) && defined(HAVE_cl_khr_local_int32_extended_atomics)
|
||||||
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
|
#define HAVE_ATOM
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(HAVE_cl_khr_fp64) || defined(HAVE_cl_amd_fp64)
|
||||||
|
#define HAVE_DOUBLE
|
||||||
|
#define ZEROD 0.0
|
||||||
|
//#define FAST_DOUBLE
|
||||||
|
#else
|
||||||
|
#define double float
|
||||||
|
#define double4 float4
|
||||||
|
#define ZEROD 0.0f
|
||||||
|
#endif
|
||||||
|
#if defined(HAVE_DOUBLE) && defined(FAST_DOUBLE)
|
||||||
|
#define fastdouble double
|
||||||
|
#define fastdouble4 double4
|
||||||
|
#define ZEROFD 0.0
|
||||||
|
#else
|
||||||
|
#define fastdouble float
|
||||||
|
#define fastdouble4 float4
|
||||||
|
#define ZEROFD 0.0f
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
//#if __OPENCL_VERSION__ == 110
|
//#if __OPENCL_VERSION__ == 110
|
||||||
#ifdef AMD
|
#ifdef AMD
|
||||||
#define iclamp(a,b,c) clamp(a,b,c)
|
#define iclamp(a,b,c) clamp(a,b,c)
|
||||||
@@ -45,11 +65,6 @@
|
|||||||
|
|
||||||
#define WARP_SIZE 32
|
#define WARP_SIZE 32
|
||||||
|
|
||||||
#ifdef HAVE_ATOM
|
|
||||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
|
|
||||||
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef enum
|
typedef enum
|
||||||
{
|
{
|
||||||
Constant = 0,
|
Constant = 0,
|
||||||
@@ -303,7 +318,6 @@ void clComputeAutocor(
|
|||||||
#else
|
#else
|
||||||
// get_num_groups(0) == number of tasks
|
// get_num_groups(0) == number of tasks
|
||||||
// get_num_groups(1) == number of windows
|
// get_num_groups(1) == number of windows
|
||||||
#if 0
|
|
||||||
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||||
void clComputeAutocor(
|
void clComputeAutocor(
|
||||||
__global float *output,
|
__global float *output,
|
||||||
@@ -313,7 +327,7 @@ void clComputeAutocor(
|
|||||||
const int taskCount // tasks per block
|
const int taskCount // tasks per block
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
__local float data[GROUP_SIZE * 2];
|
__local fastdouble data[GROUP_SIZE * 2];
|
||||||
__local FLACCLSubframeData task;
|
__local FLACCLSubframeData task;
|
||||||
const int tid = get_local_id(0);
|
const int tid = get_local_id(0);
|
||||||
// fetch task data
|
// fetch task data
|
||||||
@@ -322,160 +336,50 @@ void clComputeAutocor(
|
|||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
int bs = task.blocksize;
|
int bs = task.blocksize;
|
||||||
int windowOffs = get_group_id(1) * bs;
|
data[tid] = ZEROFD;
|
||||||
|
|
||||||
// if (tid < GROUP_SIZE / 4)
|
|
||||||
// {
|
|
||||||
//float4 dd = 0.0f;
|
|
||||||
//if (tid * 4 < bs)
|
|
||||||
// dd = vload4(tid, window + windowOffs) * convert_float4(vload4(tid, samples + task.samplesOffs));
|
|
||||||
//vstore4(dd, tid, &data[0]);
|
|
||||||
// }
|
|
||||||
data[tid] = 0.0f;
|
|
||||||
// This simpler code doesn't work somehow!!!
|
|
||||||
//data[tid] = tid < bs ? samples[task.samplesOffs + tid] * window[windowOffs + tid] : 0.0f;
|
|
||||||
|
|
||||||
const int THREADS_FOR_ORDERS = MAX_ORDER < 8 ? 8 : MAX_ORDER < 16 ? 16 : MAX_ORDER < 32 ? 32 : 64;
|
|
||||||
float corr = 0.0f;
|
|
||||||
float corr1 = 0.0f;
|
|
||||||
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
|
||||||
{
|
|
||||||
// fetch samples
|
|
||||||
float nextData = pos + tid < bs ? samples[task.samplesOffs + pos + tid] * window[windowOffs + pos + tid] : 0.0f;
|
|
||||||
data[tid + GROUP_SIZE] = nextData;
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
int lag = tid & (THREADS_FOR_ORDERS - 1);
|
|
||||||
int tid1 = tid + GROUP_SIZE - lag;
|
|
||||||
#ifdef AMD
|
|
||||||
float4 res = 0.0f;
|
|
||||||
for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
|
|
||||||
res += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
|
|
||||||
corr += res.x + res.y + res.w + res.z;
|
|
||||||
#else
|
|
||||||
float res = 0.0f;
|
|
||||||
for (int i = 0; i < THREADS_FOR_ORDERS; i++)
|
|
||||||
res += data[tid1 - lag + i] * data[tid1 + i];
|
|
||||||
corr += res;
|
|
||||||
#endif
|
|
||||||
if ((pos & (GROUP_SIZE * 15)) == 0)
|
|
||||||
{
|
|
||||||
corr1 += corr;
|
|
||||||
corr = 0.0f;
|
|
||||||
}
|
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
data[tid] = nextData;
|
|
||||||
}
|
|
||||||
|
|
||||||
data[tid] = corr + corr1;
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
for (int i = GROUP_SIZE / 2; i >= THREADS_FOR_ORDERS; i >>= 1)
|
|
||||||
{
|
|
||||||
if (tid < i)
|
|
||||||
data[tid] += data[tid + i];
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (tid <= MAX_ORDER)
|
|
||||||
output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = data[tid];
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
|
||||||
void clComputeAutocor(
|
|
||||||
__global float *output,
|
|
||||||
__global const int *samples,
|
|
||||||
__global const float *window,
|
|
||||||
__global FLACCLSubframeTask *tasks,
|
|
||||||
const int taskCount // tasks per block
|
|
||||||
)
|
|
||||||
{
|
|
||||||
__local float data[GROUP_SIZE * 2 + 32];
|
|
||||||
__local FLACCLSubframeData task;
|
|
||||||
const int tid = get_local_id(0);
|
|
||||||
// fetch task data
|
|
||||||
if (tid < sizeof(task) / sizeof(int))
|
|
||||||
((__local int*)&task)[tid] = ((__global int*)(tasks + taskCount * get_group_id(0)))[tid];
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
int bs = task.blocksize;
|
|
||||||
data[tid] = 0.0f;
|
|
||||||
if (tid < 32)
|
|
||||||
data[GROUP_SIZE * 2 + tid] = 0.0f;
|
|
||||||
|
|
||||||
const int THREADS_FOR_ORDERS = MAX_ORDER < 8 ? 8 : MAX_ORDER < 16 ? 16 : MAX_ORDER < 32 ? 32 : 64;
|
const int THREADS_FOR_ORDERS = MAX_ORDER < 8 ? 8 : MAX_ORDER < 16 ? 16 : MAX_ORDER < 32 ? 32 : 64;
|
||||||
int lag = tid & (THREADS_FOR_ORDERS - 1);
|
int lag = tid & (THREADS_FOR_ORDERS - 1);
|
||||||
int tid1 = tid + GROUP_SIZE - lag;
|
int tid1 = tid + GROUP_SIZE - lag;
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
const __global float * wptr = &window[get_group_id(1) * bs];
|
const __global float * wptr = &window[get_group_id(1) * bs];
|
||||||
#ifdef AMD
|
// const __global int * sptr = &samples[task.samplesOffs];
|
||||||
float4 corr = 0.0f;
|
double corr = ZEROD;
|
||||||
#else
|
|
||||||
float corr = 0.0f;
|
|
||||||
#endif
|
|
||||||
float corr1 = 0.0f;
|
|
||||||
for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE)
|
for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE)
|
||||||
{
|
{
|
||||||
// fetch samples
|
|
||||||
int off = pos + tid;
|
int off = pos + tid;
|
||||||
// const __global int * sptr = &samples[task.samplesOffs];
|
// fetch samples
|
||||||
float nextData = samples[task.samplesOffs + off] * wptr[off];
|
fastdouble nextData = samples[task.samplesOffs + off] * wptr[off];
|
||||||
data[tid + GROUP_SIZE] = nextData;
|
data[tid + GROUP_SIZE] = nextData;
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
#ifdef AMD
|
fastdouble4 tmp = ZEROFD;
|
||||||
for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
|
for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
|
||||||
corr += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
|
tmp += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
|
||||||
#else
|
corr += (tmp.x + tmp.y) + (tmp.w + tmp.z);
|
||||||
for (int i = 0; i < THREADS_FOR_ORDERS; i++)
|
|
||||||
corr += data[tid1 - lag + i] * data[tid1 + i];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if ((pos & (GROUP_SIZE * 15)) == 0)
|
|
||||||
{
|
|
||||||
#ifdef AMD
|
|
||||||
corr1 += (corr.x + corr.y) + (corr.w + corr.z);
|
|
||||||
#else
|
|
||||||
corr1 += corr;
|
|
||||||
#endif
|
|
||||||
corr = 0.0f;
|
|
||||||
}
|
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
data[tid] = nextData;
|
data[tid] = nextData;
|
||||||
}
|
}
|
||||||
if (pos < bs)
|
if (pos < bs)
|
||||||
{
|
{
|
||||||
// fetch samples
|
|
||||||
int off = pos + tid;
|
int off = pos + tid;
|
||||||
float nextData = off < bs ? samples[task.samplesOffs + off] * wptr[off] : 0.0f;
|
// fetch samples
|
||||||
|
double nextData = off < bs ? samples[task.samplesOffs + off] * wptr[off] : ZEROD;
|
||||||
data[tid + GROUP_SIZE] = nextData;
|
data[tid + GROUP_SIZE] = nextData;
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
int lag = tid & (THREADS_FOR_ORDERS - 1);
|
fastdouble4 tmp = ZEROFD;
|
||||||
int tid1 = tid + GROUP_SIZE - lag;
|
|
||||||
//#if 1
|
|
||||||
#ifdef AMD
|
|
||||||
float4 res = 0.0f;
|
|
||||||
for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
|
for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
|
||||||
res += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
|
tmp += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
|
||||||
corr1 += res.x + res.y + res.w + res.z;
|
corr += (tmp.x + tmp.y) + (tmp.w + tmp.z);
|
||||||
#else
|
|
||||||
for (int i = 0; i < THREADS_FOR_ORDERS; i++)
|
|
||||||
corr1 += data[tid1 - lag + i] * data[tid1 + i];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
data[tid] = nextData;
|
data[tid] = nextData;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef AMD
|
data[tid] = corr;
|
||||||
corr1 += corr.x + corr.y + corr.w + corr.z;
|
|
||||||
#else
|
|
||||||
corr1 += corr;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
data[tid] = corr1;
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
for (int i = GROUP_SIZE / 2; i >= THREADS_FOR_ORDERS; i >>= 1)
|
for (int i = GROUP_SIZE / 2; i >= THREADS_FOR_ORDERS; i >>= 1)
|
||||||
{
|
{
|
||||||
@@ -488,7 +392,6 @@ void clComputeAutocor(
|
|||||||
output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = data[tid];
|
output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = data[tid];
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef FLACCL_CPU
|
#ifdef FLACCL_CPU
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
@@ -558,8 +461,8 @@ void clComputeLPC(
|
|||||||
)
|
)
|
||||||
{
|
{
|
||||||
__local struct {
|
__local struct {
|
||||||
volatile float ldr[32];
|
volatile double ldr[32];
|
||||||
volatile float gen1[32];
|
volatile double gen1[32];
|
||||||
volatile float error[32];
|
volatile float error[32];
|
||||||
volatile float autoc[33];
|
volatile float autoc[33];
|
||||||
} shared;
|
} shared;
|
||||||
@@ -575,9 +478,9 @@ void clComputeLPC(
|
|||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
// Compute LPC using Schur and Levinson-Durbin recursion
|
// Compute LPC using Schur and Levinson-Durbin recursion
|
||||||
float gen0 = shared.gen1[get_local_id(0)] = shared.autoc[get_local_id(0)+1];
|
double gen0 = shared.gen1[get_local_id(0)] = shared.autoc[get_local_id(0)+1];
|
||||||
shared.ldr[get_local_id(0)] = 0.0f;
|
shared.ldr[get_local_id(0)] = ZEROD;
|
||||||
float error = shared.autoc[0];
|
double error = shared.autoc[0];
|
||||||
|
|
||||||
#ifdef DEBUGPRINT1
|
#ifdef DEBUGPRINT1
|
||||||
int magic = shared.autoc[0] == 177286873088.0f;
|
int magic = shared.autoc[0] == 177286873088.0f;
|
||||||
@@ -589,10 +492,10 @@ void clComputeLPC(
|
|||||||
for (int order = 0; order < MAX_ORDER; order++)
|
for (int order = 0; order < MAX_ORDER; order++)
|
||||||
{
|
{
|
||||||
// Schur recursion
|
// Schur recursion
|
||||||
float reff = -shared.gen1[0] / error;
|
double reff = -shared.gen1[0] / error;
|
||||||
//error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
|
//error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
|
||||||
error *= (1 - reff * reff);
|
error *= (1 - reff * reff);
|
||||||
float gen1;
|
double gen1;
|
||||||
if (get_local_id(0) < MAX_ORDER - 1 - order)
|
if (get_local_id(0) < MAX_ORDER - 1 - order)
|
||||||
{
|
{
|
||||||
gen1 = shared.gen1[get_local_id(0) + 1] + reff * gen0;
|
gen1 = shared.gen1[get_local_id(0) + 1] + reff * gen0;
|
||||||
@@ -613,7 +516,7 @@ void clComputeLPC(
|
|||||||
shared.error[order] = error;
|
shared.error[order] = error;
|
||||||
|
|
||||||
// Levinson-Durbin recursion
|
// Levinson-Durbin recursion
|
||||||
float ldr = shared.ldr[get_local_id(0)];
|
double ldr = shared.ldr[get_local_id(0)];
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
if (get_local_id(0) < order)
|
if (get_local_id(0) < order)
|
||||||
shared.ldr[order - 1 - get_local_id(0)] += reff * ldr;
|
shared.ldr[order - 1 - get_local_id(0)] += reff * ldr;
|
||||||
@@ -651,13 +554,21 @@ void clQuantizeLPC(
|
|||||||
float error[MAX_ORDER];
|
float error[MAX_ORDER];
|
||||||
int best_orders[MAX_ORDER];
|
int best_orders[MAX_ORDER];
|
||||||
|
|
||||||
|
int best8 = 0;
|
||||||
// Load prediction error estimates based on Akaike's Criteria
|
// Load prediction error estimates based on Akaike's Criteria
|
||||||
for (int tid = 0; tid < MAX_ORDER; tid++)
|
for (int tid = 0; tid < MAX_ORDER; tid++)
|
||||||
{
|
{
|
||||||
error[tid] = bs * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)bs);
|
error[tid] = bs * log(1.0f + lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)bs);
|
||||||
best_orders[tid] = tid;
|
best_orders[tid] = tid;
|
||||||
|
if (tid < 8 && error[tid] < error[best8])
|
||||||
|
best8 = tid;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
for (int i = best8 + 1; i < MAX_ORDER; i++)
|
||||||
|
error[i] += 20.5f * log((float)bs);
|
||||||
|
#endif
|
||||||
|
|
||||||
// Select best orders
|
// Select best orders
|
||||||
for (int i = 0; i < MAX_ORDER && i < taskCountLPC; i++)
|
for (int i = 0; i < MAX_ORDER && i < taskCountLPC; i++)
|
||||||
{
|
{
|
||||||
@@ -730,6 +641,7 @@ void clQuantizeLPC(
|
|||||||
#ifndef HAVE_ATOM
|
#ifndef HAVE_ATOM
|
||||||
volatile int tmp[32];
|
volatile int tmp[32];
|
||||||
#endif
|
#endif
|
||||||
|
// volatile int best8;
|
||||||
} shared;
|
} shared;
|
||||||
|
|
||||||
const int tid = get_local_id(0);
|
const int tid = get_local_id(0);
|
||||||
@@ -752,6 +664,17 @@ void clQuantizeLPC(
|
|||||||
if (tid < MAX_ORDER)
|
if (tid < MAX_ORDER)
|
||||||
shared.error[tid] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)shared.task.blocksize);
|
shared.error[tid] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)shared.task.blocksize);
|
||||||
//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
|
//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
|
||||||
|
#if 0
|
||||||
|
if (tid == 0)
|
||||||
|
{
|
||||||
|
int b8 = 0;
|
||||||
|
for (int i = 1; i < 8; i++)
|
||||||
|
if (shared.error[i] < shared.error[b8])
|
||||||
|
b8 = i;
|
||||||
|
shared.best8 = b8;
|
||||||
|
}
|
||||||
|
shared.error[tid] += select(0.0f, 20.5f * log((float)shared.task.blocksize), tid > shared.best8);
|
||||||
|
#endif
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
// Sort using bitonic sort
|
// Sort using bitonic sort
|
||||||
@@ -1452,6 +1375,7 @@ void clCalcPartition16(
|
|||||||
__local FLACCLSubframeTask task;
|
__local FLACCLSubframeTask task;
|
||||||
__local int data[GROUP_SIZE * 2];
|
__local int data[GROUP_SIZE * 2];
|
||||||
__local int res[GROUP_SIZE];
|
__local int res[GROUP_SIZE];
|
||||||
|
__local int pl[GROUP_SIZE >> 4][15];
|
||||||
|
|
||||||
const int tid = get_local_id(0);
|
const int tid = get_local_id(0);
|
||||||
if (tid < sizeof(task) / sizeof(int))
|
if (tid < sizeof(task) / sizeof(int))
|
||||||
@@ -1524,8 +1448,16 @@ void clCalcPartition16(
|
|||||||
sum = (vload4(0,chunk) >> k) + (vload4(1,chunk) >> k) + (vload4(2,chunk) >> k) + (vload4(3,chunk) >> k);
|
sum = (vload4(0,chunk) >> k) + (vload4(1,chunk) >> k) + (vload4(2,chunk) >> k) + (vload4(3,chunk) >> k);
|
||||||
s = sum.x + sum.y + sum.z + sum.w;
|
s = sum.x + sum.y + sum.z + sum.w;
|
||||||
|
|
||||||
|
#if 0
|
||||||
if (k <= 14 && offs < bs)
|
if (k <= 14 && offs < bs)
|
||||||
plptr[offs >> 4] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
|
plptr[offs >> 4] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
|
||||||
|
#else
|
||||||
|
if (k <= 14) pl[x][k] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
int k1 = tid >> 3, x1 = tid & 7;
|
||||||
|
if (k1 <= 14 && (pos >> 4) + x1 < (1 << max_porder))
|
||||||
|
partition_lengths[(15 << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1];
|
||||||
|
#endif
|
||||||
|
|
||||||
// if (task.data.blocksize == 16 && x == 0 && k <= 14)
|
// if (task.data.blocksize == 16 && x == 0 && k <= 14)
|
||||||
// printf("[%d] = %d = s:%d + %d * (k:%d + 1), ro=%d, offs=%d, lpos=%d\n", k, partition_lengths[lpos], s, (16 - select(0, ro, offs < 16)), k, ro, offs, lpos);
|
// printf("[%d] = %d = s:%d + %d * (k:%d + 1), ro=%d, offs=%d, lpos=%d\n", k, partition_lengths[lpos], s, (16 - select(0, ro, offs < 16)), k, ro, offs, lpos);
|
||||||
@@ -1849,13 +1781,12 @@ inline int len_utf8(int n)
|
|||||||
#else
|
#else
|
||||||
int bts = 31 - clz(n);
|
int bts = 31 - clz(n);
|
||||||
#endif
|
#endif
|
||||||
if (bts < 7)
|
return select(8, 8 * ((bts + 4) / 5), bts > 6);
|
||||||
return 8;
|
|
||||||
return 8 * ((bts + 4) / 5);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FLACCL_CPU
|
||||||
// get_global_id(0) * channels == task index
|
// get_global_id(0) * channels == task index
|
||||||
__kernel
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
void clCalcOutputOffsets(
|
void clCalcOutputOffsets(
|
||||||
__global int *residual,
|
__global int *residual,
|
||||||
__global int *samples,
|
__global int *samples,
|
||||||
@@ -1875,8 +1806,7 @@ void clCalcOutputOffsets(
|
|||||||
;
|
;
|
||||||
int bs = tasks[iFrame * channels].data.blocksize;
|
int bs = tasks[iFrame * channels].data.blocksize;
|
||||||
//public static readonly int[] flac_blocksizes = new int[15] { 0, 192, 576, 1152, 2304, 4608, 0, 0, 256, 512, 1024, 2048, 4096, 8192, 16384 };
|
//public static readonly int[] flac_blocksizes = new int[15] { 0, 192, 576, 1152, 2304, 4608, 0, 0, 256, 512, 1024, 2048, 4096, 8192, 16384 };
|
||||||
if (bs != 4096 && bs != 4608) // TODO: check all other standard sizes
|
offset += select(0, select(8, 16, bs >= 256), bs != 4096 && bs != 4608); // TODO: check all other standard sizes
|
||||||
offset += select(8, 16, bs >= 256);
|
|
||||||
|
|
||||||
// assert (offset % 8) == 0
|
// assert (offset % 8) == 0
|
||||||
offset += 8;
|
offset += 8;
|
||||||
@@ -1893,6 +1823,56 @@ void clCalcOutputOffsets(
|
|||||||
offset += 16;
|
offset += 16;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
// get_global_id(0) * channels == task index
|
||||||
|
__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
|
||||||
|
void clCalcOutputOffsets(
|
||||||
|
__global int *residual,
|
||||||
|
__global int *samples,
|
||||||
|
__global FLACCLSubframeTask *tasks,
|
||||||
|
int channels1,
|
||||||
|
int frameCount,
|
||||||
|
int firstFrame
|
||||||
|
)
|
||||||
|
{
|
||||||
|
const int channels = 2;
|
||||||
|
__local FLACCLSubframeData ltasks[2];
|
||||||
|
__local volatile int mypos[2];
|
||||||
|
int offset = 0;
|
||||||
|
for (int iFrame = 0; iFrame < frameCount; iFrame++)
|
||||||
|
{
|
||||||
|
if (get_local_id(0) < sizeof(ltasks[0]) / sizeof(int))
|
||||||
|
for (int ch = 0; ch < channels; ch++)
|
||||||
|
((__local int*)<asks[ch])[get_local_id(0)] = ((__global int*)(&tasks[iFrame * channels + ch]))[get_local_id(0)];
|
||||||
|
|
||||||
|
//printf("len_utf8(%d) == %d\n", firstFrame + iFrame, len_utf8(firstFrame + iFrame));
|
||||||
|
offset += 15 + 1 + 4 + 4 + 4 + 3 + 1 + len_utf8(firstFrame + iFrame)
|
||||||
|
// + 8-16 // custom block size
|
||||||
|
// + 8-16 // custom sample rate
|
||||||
|
;
|
||||||
|
int bs = ltasks[0].blocksize;
|
||||||
|
//public static readonly int[] flac_blocksizes = new int[15] { 0, 192, 576, 1152, 2304, 4608, 0, 0, 256, 512, 1024, 2048, 4096, 8192, 16384 };
|
||||||
|
offset += select(0, select(8, 16, bs >= 256), bs != 4096 && bs != 4608); // TODO: check all other standard sizes
|
||||||
|
|
||||||
|
// assert (offset % 8) == 0
|
||||||
|
offset += 8;
|
||||||
|
if (get_local_id(0) < channels)
|
||||||
|
{
|
||||||
|
int ch = get_local_id(0);
|
||||||
|
// Add 64 bits to separate frames if header is too small so they can intersect
|
||||||
|
int mylen = 8 + ltasks[ch].wbits + 64 + ltasks[ch].size;
|
||||||
|
mypos[ch] = mylen;
|
||||||
|
for (int offset = 1; offset < WARP_SIZE && offset < channels; offset <<= 1)
|
||||||
|
if (ch >= offset) mypos[ch] += mypos[ch - offset];
|
||||||
|
mypos[ch] += offset;
|
||||||
|
tasks[iFrame * channels + ch].data.encodingOffset = mypos[ch] - ltasks[ch].size + ltasks[ch].headerLen;
|
||||||
|
}
|
||||||
|
offset = mypos[channels - 1];
|
||||||
|
offset = (offset + 7) & ~7;
|
||||||
|
offset += 16;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// get_group_id(0) == task index
|
// get_group_id(0) == task index
|
||||||
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||||
@@ -2000,7 +1980,9 @@ void clRiceEncoding(
|
|||||||
#else
|
#else
|
||||||
__local unsigned int data[GROUP_SIZE];
|
__local unsigned int data[GROUP_SIZE];
|
||||||
__local volatile int mypos[GROUP_SIZE+1];
|
__local volatile int mypos[GROUP_SIZE+1];
|
||||||
//__local int brp[256];
|
#if 0
|
||||||
|
__local int brp[256];
|
||||||
|
#endif
|
||||||
__local volatile int warppos[WARP_SIZE];
|
__local volatile int warppos[WARP_SIZE];
|
||||||
__local FLACCLSubframeData task;
|
__local FLACCLSubframeData task;
|
||||||
|
|
||||||
@@ -2014,8 +1996,10 @@ void clRiceEncoding(
|
|||||||
mypos[GROUP_SIZE] = 0;
|
mypos[GROUP_SIZE] = 0;
|
||||||
if (tid < WARP_SIZE)
|
if (tid < WARP_SIZE)
|
||||||
warppos[tid] = 0;
|
warppos[tid] = 0;
|
||||||
// for (int offs = tid; offs < (1 << task.porder); offs ++)
|
#if 0
|
||||||
//brp[offs] = best_rice_parameters[(get_group_id(0) << max_porder) + offs];
|
for (int offs = tid; offs < (1 << task.porder); offs ++)
|
||||||
|
brp[offs] = best_rice_parameters[(get_group_id(0) << max_porder) + offs];
|
||||||
|
#endif
|
||||||
data[tid] = 0;
|
data[tid] = 0;
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
const int bs = task.blocksize;
|
const int bs = task.blocksize;
|
||||||
@@ -2023,19 +2007,23 @@ void clRiceEncoding(
|
|||||||
int plen = bs >> task.porder;
|
int plen = bs >> task.porder;
|
||||||
//int plenoffs = 12 - task.porder;
|
//int plenoffs = 12 - task.porder;
|
||||||
unsigned int remainder = 0U;
|
unsigned int remainder = 0U;
|
||||||
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
int pos;
|
||||||
|
for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE)
|
||||||
{
|
{
|
||||||
int offs = pos + tid;
|
int offs = pos + tid;
|
||||||
int v = offs < bs ? residual[task.residualOffs + offs] : 0;
|
int v = residual[task.residualOffs + offs];
|
||||||
int part = offs / plen; // >> plenoffs;
|
int part = offs / plen; // >> plenoffs;
|
||||||
//int k = brp[min(255, part)];
|
#if 0
|
||||||
int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0;
|
int k = brp[part];
|
||||||
|
#else
|
||||||
|
int k = best_rice_parameters[(get_group_id(0) << max_porder) + part];
|
||||||
|
#endif
|
||||||
int pstart = offs == task.residualOrder || offs == part * plen;
|
int pstart = offs == task.residualOrder || offs == part * plen;
|
||||||
v = (v << 1) ^ (v >> 31);
|
v = (v << 1) ^ (v >> 31);
|
||||||
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
|
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
|
||||||
mypos[tid] = mylen;
|
mypos[tid] = mylen;
|
||||||
|
|
||||||
// Inclusive scan(+)
|
// Inclusive scan(+)
|
||||||
#if 1
|
|
||||||
int lane = (tid & (WARP_SIZE - 1));
|
int lane = (tid & (WARP_SIZE - 1));
|
||||||
for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
|
for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
|
||||||
mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)];
|
mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)];
|
||||||
@@ -2052,19 +2040,68 @@ void clRiceEncoding(
|
|||||||
mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0);
|
mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0);
|
||||||
int start32 = start >> 5;
|
int start32 = start >> 5;
|
||||||
start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2];
|
start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2];
|
||||||
#else
|
|
||||||
|
//if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32)
|
||||||
|
// printf("Oops: %d\n", mypos[tid]);
|
||||||
|
data[tid] = select(0U, remainder, tid == 0);
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
for (int offset = 1; offset < GROUP_SIZE; offset <<= 1)
|
if (mylen)
|
||||||
{
|
{
|
||||||
int t = tid >= offset ? mypos[tid - offset] : 0;
|
if (pstart)
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
{
|
||||||
mypos[tid] += t;
|
int kpos = mp - mylen;
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
int kpos0 = (kpos >> 5) - start32;
|
||||||
|
int kpos1 = kpos & 31;
|
||||||
|
unsigned int kval = (unsigned int)k << 28;
|
||||||
|
unsigned int kval0 = kval >> kpos1;
|
||||||
|
unsigned int kval1 = kval << (32 - kpos1);
|
||||||
|
if (kval0) atom_or(&data[kpos0], kval0);
|
||||||
|
if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1);
|
||||||
}
|
}
|
||||||
int mp = start + mypos[tid];
|
int qpos = mp - k - 1;
|
||||||
int start32 = start / 32;
|
int qpos0 = (qpos >> 5) - start32;
|
||||||
start += mypos[GROUP_SIZE - 1];
|
int qpos1 = qpos & 31;
|
||||||
#endif
|
unsigned int qval = (1U << 31) | ((unsigned int)v << (31 - k));
|
||||||
|
unsigned int qval0 = qval >> qpos1;
|
||||||
|
unsigned int qval1= qval << (32 - qpos1);
|
||||||
|
if (qval0) atom_or(&data[qpos0], qval0);
|
||||||
|
if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1);
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if ((start32 + tid) * 32 <= start)
|
||||||
|
output[start32 + tid] = as_int(as_char4(data[tid]).wzyx);
|
||||||
|
remainder = data[start / 32 - start32];
|
||||||
|
}
|
||||||
|
if (pos < bs)
|
||||||
|
{
|
||||||
|
int offs = pos + tid;
|
||||||
|
int v = offs < bs ? residual[task.residualOffs + offs] : 0;
|
||||||
|
int part = offs / plen; // >> plenoffs;
|
||||||
|
//int k = brp[min(255, part)];
|
||||||
|
int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0;
|
||||||
|
int pstart = offs == task.residualOrder || offs == part * plen;
|
||||||
|
v = (v << 1) ^ (v >> 31);
|
||||||
|
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
|
||||||
|
mypos[tid] = mylen;
|
||||||
|
|
||||||
|
// Inclusive scan(+)
|
||||||
|
int lane = (tid & (WARP_SIZE - 1));
|
||||||
|
for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
|
||||||
|
mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)];
|
||||||
|
int mp = mypos[tid];
|
||||||
|
if ((tid & (WARP_SIZE - 1)) == WARP_SIZE - 1)
|
||||||
|
warppos[tid/WARP_SIZE] = mp;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (tid < GROUP_SIZE/WARP_SIZE)
|
||||||
|
{
|
||||||
|
for (int offset = 1; offset < GROUP_SIZE/WARP_SIZE; offset <<= 1)
|
||||||
|
warppos[tid] += warppos[select(GROUP_SIZE/WARP_SIZE, tid - offset, tid >= offset)];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0);
|
||||||
|
int start32 = start >> 5;
|
||||||
|
start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2];
|
||||||
|
|
||||||
//if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32)
|
//if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32)
|
||||||
// printf("Oops: %d\n", mypos[tid]);
|
// printf("Oops: %d\n", mypos[tid]);
|
||||||
data[tid] = select(0U, remainder, tid == 0);
|
data[tid] = select(0U, remainder, tid == 0);
|
||||||
|
|||||||
@@ -32,6 +32,7 @@
|
|||||||
<DefineConstants>TRACE</DefineConstants>
|
<DefineConstants>TRACE</DefineConstants>
|
||||||
<ErrorReport>prompt</ErrorReport>
|
<ErrorReport>prompt</ErrorReport>
|
||||||
<WarningLevel>4</WarningLevel>
|
<WarningLevel>4</WarningLevel>
|
||||||
|
<PlatformTarget>AnyCPU</PlatformTarget>
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<Reference Include="System" />
|
<Reference Include="System" />
|
||||||
|
|||||||
@@ -47,17 +47,16 @@ namespace CUETools.FLACCL.cmd
|
|||||||
Console.WriteLine("OpenCL Options:");
|
Console.WriteLine("OpenCL Options:");
|
||||||
Console.WriteLine();
|
Console.WriteLine();
|
||||||
Console.WriteLine(" --opencl-type <X> CPU or GPU, default GPU");
|
Console.WriteLine(" --opencl-type <X> CPU or GPU, default GPU");
|
||||||
Console.WriteLine(" --opencl-platform '' 'ATI Stream', 'NVIDIA Cuda', 'Intel OpenCL' etc");
|
Console.WriteLine(" --opencl-platform 'ATI Stream', 'NVIDIA CUDA', 'Intel OpenCL' etc");
|
||||||
Console.WriteLine(" --group-size # Set GPU workgroup size (64,128,256)");
|
Console.WriteLine(" --group-size # Set GPU workgroup size (64,128,256)");
|
||||||
Console.WriteLine(" --task-size # Set number of frames per GPU call, default 32");
|
Console.WriteLine(" --task-size # Set number of frames per multiprocessor, default 8");
|
||||||
Console.WriteLine(" --slow-gpu Some encoding stages are done on CPU");
|
Console.WriteLine(" --slow-gpu Some encoding stages are done on CPU");
|
||||||
Console.WriteLine(" --do-rice Experimental mode, not recommended");
|
Console.WriteLine(" --fast-gpu Experimental mode, not recommended");
|
||||||
Console.WriteLine(" --define <X> <Y> OpenCL preprocessor definition");
|
Console.WriteLine(" --define <X> <Y> OpenCL preprocessor definition");
|
||||||
Console.WriteLine();
|
Console.WriteLine();
|
||||||
Console.WriteLine("Advanced Options:");
|
Console.WriteLine("Advanced Options:");
|
||||||
Console.WriteLine();
|
Console.WriteLine();
|
||||||
Console.WriteLine(" -b # Block size");
|
Console.WriteLine(" -b # Block size");
|
||||||
Console.WriteLine(" -v # Variable block size mode (0,4)");
|
|
||||||
Console.WriteLine(" -s <method> Stereo decorrelation (independent,search)");
|
Console.WriteLine(" -s <method> Stereo decorrelation (independent,search)");
|
||||||
Console.WriteLine(" -r #[,#] Rice partition order {max} or {min},{max} (0..8)");
|
Console.WriteLine(" -r #[,#] Rice partition order {max} or {min},{max} (0..8)");
|
||||||
Console.WriteLine();
|
Console.WriteLine();
|
||||||
@@ -88,9 +87,7 @@ namespace CUETools.FLACCL.cmd
|
|||||||
min_precision = -1, max_precision = -1,
|
min_precision = -1, max_precision = -1,
|
||||||
orders_per_window = -1, orders_per_channel = -1,
|
orders_per_window = -1, orders_per_channel = -1,
|
||||||
blocksize = -1;
|
blocksize = -1;
|
||||||
#if DEBUG
|
|
||||||
int input_len = 4096, input_val = 0;
|
int input_len = 4096, input_val = 0;
|
||||||
#endif
|
|
||||||
int level = -1, padding = -1, vbr_mode = -1;
|
int level = -1, padding = -1, vbr_mode = -1;
|
||||||
bool do_seektable = true;
|
bool do_seektable = true;
|
||||||
bool buffered = false;
|
bool buffered = false;
|
||||||
@@ -111,7 +108,7 @@ namespace CUETools.FLACCL.cmd
|
|||||||
do_seektable = false;
|
do_seektable = false;
|
||||||
else if (args[arg] == "--slow-gpu")
|
else if (args[arg] == "--slow-gpu")
|
||||||
settings.GPUOnly = false;
|
settings.GPUOnly = false;
|
||||||
else if (args[arg] == "--do-rice")
|
else if (args[arg] == "--fast-gpu")
|
||||||
settings.DoRice = true;
|
settings.DoRice = true;
|
||||||
else if (args[arg] == "--no-md5")
|
else if (args[arg] == "--no-md5")
|
||||||
settings.DoMD5 = false;
|
settings.DoMD5 = false;
|
||||||
@@ -135,12 +132,10 @@ namespace CUETools.FLACCL.cmd
|
|||||||
settings.MappedMemory = true;
|
settings.MappedMemory = true;
|
||||||
else if (args[arg] == "--opencl-type" && ++arg < args.Length)
|
else if (args[arg] == "--opencl-type" && ++arg < args.Length)
|
||||||
device_type = args[arg];
|
device_type = args[arg];
|
||||||
#if DEBUG
|
|
||||||
else if (args[arg] == "--input-length" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
|
else if (args[arg] == "--input-length" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
|
||||||
input_len = intarg;
|
input_len = intarg;
|
||||||
else if (args[arg] == "--input-value" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
|
else if (args[arg] == "--input-value" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
|
||||||
input_val = intarg;
|
input_val = intarg;
|
||||||
#endif
|
|
||||||
else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
|
else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
|
||||||
output_file = args[arg];
|
output_file = args[arg];
|
||||||
else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
|
else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
|
||||||
@@ -218,10 +213,8 @@ namespace CUETools.FLACCL.cmd
|
|||||||
IAudioSource audioSource;
|
IAudioSource audioSource;
|
||||||
if (input_file == "-")
|
if (input_file == "-")
|
||||||
audioSource = new WAVReader("", Console.OpenStandardInput());
|
audioSource = new WAVReader("", Console.OpenStandardInput());
|
||||||
#if DEBUG
|
|
||||||
else if (input_file == "nul")
|
else if (input_file == "nul")
|
||||||
audioSource = new SilenceGenerator(input_len, input_val);
|
audioSource = new SilenceGenerator(input_len, input_val);
|
||||||
#endif
|
|
||||||
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".wav")
|
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".wav")
|
||||||
audioSource = new WAVReader(input_file, null);
|
audioSource = new WAVReader(input_file, null);
|
||||||
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".flac")
|
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".flac")
|
||||||
@@ -326,6 +319,9 @@ namespace CUETools.FLACCL.cmd
|
|||||||
Console.Error.Write("\r \r");
|
Console.Error.Write("\r \r");
|
||||||
Console.WriteLine("Error : {0}", ex.Message);
|
Console.WriteLine("Error : {0}", ex.Message);
|
||||||
Console.WriteLine("{0}", ex.BuildLogs[0]);
|
Console.WriteLine("{0}", ex.BuildLogs[0]);
|
||||||
|
if (debug)
|
||||||
|
using (StreamWriter sw = new StreamWriter("debug.txt", true))
|
||||||
|
sw.WriteLine("{0}\n{1}\n{2}", ex.Message, ex.StackTrace, ex.BuildLogs[0]);
|
||||||
audioDest.Delete();
|
audioDest.Delete();
|
||||||
audioSource.Close();
|
audioSource.Close();
|
||||||
return 4;
|
return 4;
|
||||||
@@ -335,6 +331,9 @@ namespace CUETools.FLACCL.cmd
|
|||||||
{
|
{
|
||||||
Console.Error.Write("\r \r");
|
Console.Error.Write("\r \r");
|
||||||
Console.WriteLine("Error : {0}", ex.Message);
|
Console.WriteLine("Error : {0}", ex.Message);
|
||||||
|
if (debug)
|
||||||
|
using (StreamWriter sw = new StreamWriter("debug.txt", true))
|
||||||
|
sw.WriteLine("{0}\n{1}", ex.Message, ex.StackTrace);
|
||||||
audioDest.Delete();
|
audioDest.Delete();
|
||||||
audioSource.Close();
|
audioSource.Close();
|
||||||
return 4;
|
return 4;
|
||||||
|
|||||||
Reference in New Issue
Block a user