mirror of https://github.com/claunia/cuetools.net.git

opencl flac encoder
@@ -141,7 +141,6 @@ namespace CUETools.Codecs.FLACCL
 
 public const int MAX_BLOCKSIZE = 4096 * 16;
 internal const int maxFrames = 128;
-internal const int maxAutocorParts = (MAX_BLOCKSIZE + 255) / 256;
 
 public FLACCLWriter(string path, Stream IO, AudioPCMConfig pcm)
 {
@@ -1116,12 +1115,6 @@ namespace CUETools.Codecs.FLACCL
 if (task.frameSize <= 4)
 return;
 
-//int autocorPartSize = (2 * 256 - eparams.max_prediction_order) & ~15;
-int autocorPartSize = 32 * 7;
-int autocorPartCount = (task.frameSize + autocorPartSize - 1) / autocorPartSize;
-if (autocorPartCount > maxAutocorParts)
-throw new Exception("internal error");
-
 int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
 int calcPartitionPartSize = task.frameSize >> max_porder;
 while (calcPartitionPartSize < 16 && max_porder > 0)
@@ -1140,12 +1133,10 @@ namespace CUETools.Codecs.FLACCL
 cudaChannelDecorr.SetArg(2, (uint)MAX_BLOCKSIZE);
 
 task.cudaComputeLPC.SetArg(0, task.cudaResidualTasks);
-task.cudaComputeLPC.SetArg(1, (uint)task.nResidualTasksPerChannel);
-task.cudaComputeLPC.SetArg(2, task.cudaAutocorOutput);
-task.cudaComputeLPC.SetArg(3, (uint)eparams.max_prediction_order);
-task.cudaComputeLPC.SetArg(4, task.cudaLPCData);
-task.cudaComputeLPC.SetArg(5, (uint)_windowcount);
-task.cudaComputeLPC.SetArg(6, (uint)autocorPartCount);
+task.cudaComputeLPC.SetArg(1, task.cudaAutocorOutput);
+task.cudaComputeLPC.SetArg(2, task.cudaLPCData);
+task.cudaComputeLPC.SetArg(3, (uint)task.nResidualTasksPerChannel);
+task.cudaComputeLPC.SetArg(4, (uint)_windowcount);
 
 //task.cudaComputeLPCLattice.SetArg(0, task.cudaResidualTasks);
 //task.cudaComputeLPCLattice.SetArg(1, (uint)task.nResidualTasksPerChannel);
@@ -1156,12 +1147,11 @@ namespace CUETools.Codecs.FLACCL
 //cuda.SetFunctionBlockShape(task.cudaComputeLPCLattice, 256, 1, 1);
 
 task.cudaQuantizeLPC.SetArg(0, task.cudaResidualTasks);
-task.cudaQuantizeLPC.SetArg(1, (uint)task.nResidualTasksPerChannel);
-task.cudaQuantizeLPC.SetArg(2, (uint)task.nTasksPerWindow);
-task.cudaQuantizeLPC.SetArg(3, task.cudaLPCData);
-task.cudaQuantizeLPC.SetArg(4, (uint)eparams.max_prediction_order);
-task.cudaQuantizeLPC.SetArg(5, (uint)eparams.lpc_min_precision_search);
-task.cudaQuantizeLPC.SetArg(6, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search));
+task.cudaQuantizeLPC.SetArg(1, task.cudaLPCData);
+task.cudaQuantizeLPC.SetArg(2, (uint)task.nResidualTasksPerChannel);
+task.cudaQuantizeLPC.SetArg(3, (uint)task.nTasksPerWindow);
+task.cudaQuantizeLPC.SetArg(4, (uint)eparams.lpc_min_precision_search);
+task.cudaQuantizeLPC.SetArg(5, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search));
 
 task.cudaCopyBestMethod.SetArg(0, task.cudaBestResidualTasks);
 task.cudaCopyBestMethod.SetArg(1, task.cudaResidualTasks);
@@ -1216,7 +1206,7 @@ namespace CUETools.Codecs.FLACCL
 
 // geometry???
 task.openCLCQ.EnqueueBarrier();
-task.EnqueueComputeAutocor(autocorPartCount, channelsCount, cudaWindow, eparams.max_prediction_order);
+task.EnqueueComputeAutocor(channelsCount, cudaWindow, eparams.max_prediction_order);
 
 //float* autoc = stackalloc float[1024];
 //task.openCLCQ.EnqueueBarrier();
@@ -1524,6 +1514,7 @@ namespace CUETools.Codecs.FLACCL
 if (OpenCL.NumberOfPlatforms < 1)
 throw new Exception("no opencl platforms found");
 
+int groupSize = 64;
 OCLMan = new OpenCLManager();
 // Attempt to save binaries after compilation, as well as load precompiled binaries
 // to avoid compilation. Usually you'll want this to be true.
@@ -1543,7 +1534,9 @@ namespace CUETools.Codecs.FLACCL
 OCLMan.RequireImageSupport = false;
 // The Defines string gets prepended to any and all sources that are compiled
 // and serve as a convenient way to pass configuration information to the compilation process
-OCLMan.Defines = "#define MAX_ORDER " + eparams.max_prediction_order.ToString();
+OCLMan.Defines =
+"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
+"#define GROUP_SIZE " + groupSize.ToString() + "\n";
 // The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
 OCLMan.BuildOptions = "";
 
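Note on the Defines mechanism: the string assigned to OCLMan.Defines is prepended verbatim to the kernel source, so MAX_ORDER and GROUP_SIZE become compile-time constants inside the kernels. A minimal host-side sketch of the same idea using the raw OpenCL C API instead of the OpenCLNet wrapper (function name is illustrative):

    /* Sketch: pass MAX_ORDER and GROUP_SIZE to the kernel compiler as
       preprocessor defines via clBuildProgram options rather than by
       prepending "#define" lines to the source. Error handling omitted. */
    #include <stdio.h>
    #include <CL/cl.h>

    static cl_int build_with_defines(cl_program program, cl_device_id device,
                                     int max_order, int group_size)
    {
        char options[128];
        snprintf(options, sizeof(options),
                 "-D MAX_ORDER=%d -D GROUP_SIZE=%d", max_order, group_size);
        return clBuildProgram(program, 1, &device, options, NULL, NULL);
    }

Either route lets the compiler unroll loops and size __local arrays from constants, which is the point of baking GROUP_SIZE in at build time.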
@@ -1596,13 +1589,13 @@ namespace CUETools.Codecs.FLACCL
 if (_IO.CanSeek)
 first_frame_offset = _IO.Position;
 
-task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
-task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
+task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize);
+task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize);
 if (_settings.CPUThreads > 0)
 {
 cpu_tasks = new FLACCLTask[_settings.CPUThreads];
 for (int i = 0; i < cpu_tasks.Length; i++)
-cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
+cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize);
 }
 cudaWindow = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY, sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
 
@@ -2276,8 +2269,11 @@ namespace CUETools.Codecs.FLACCL
 public bool done = false;
 public bool exit = false;
 
-unsafe public FLACCLTask(Program _openCLProgram, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify)
+public int groupSize = 128;
+
+unsafe public FLACCLTask(Program _openCLProgram, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify, int groupSize)
 {
+this.groupSize = groupSize;
 openCLProgram = _openCLProgram;
 Device[] openCLDevices = openCLProgram.Context.Platform.QueryDevices(DeviceType.GPU);
 openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], CommandQueueProperties.PROFILING_ENABLE);
@@ -2296,7 +2292,7 @@ namespace CUETools.Codecs.FLACCL
 cudaPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen);
 cudaRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen);
 cudaBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4);
-cudaAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FLACCLWriter.maxAutocorParts + FLACCLWriter.maxFrames));
+cudaAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FLACCLWriter.maxFrames);
 cudaResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen);
 cudaBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen);
 cudaResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FLACCLWriter.maxResidualParts*/ * FLACCLWriter.maxFrames);
@@ -2397,36 +2393,22 @@ namespace CUETools.Codecs.FLACCL
 cudaFindWastedBits.SetArg(1, cudaSamples);
 cudaFindWastedBits.SetArg(2, nResidualTasksPerChannel);
 
-int workX = 128; // 256
 int grpX = frameCount * channelsCount;
-//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 128 }, new int[] { 128 });
-//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 128 }, null);
-openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * workX }, new int[] { workX });
-
-//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 256 * 128 }, new int[] { 128 });
-//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * workX }, null);
-//cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1);
-//cuda.LaunchAsync(task.cudaFindWastedBits, channelsCount * task.frameCount, 1, task.stream);
+openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * groupSize }, new int[] { groupSize });
 }
 
-public void EnqueueComputeAutocor(int autocorPartCount, int channelsCount, Mem cudaWindow, int max_prediction_order)
+public void EnqueueComputeAutocor(int channelsCount, Mem cudaWindow, int max_prediction_order)
 {
 cudaComputeAutocor.SetArg(0, cudaAutocorOutput);
 cudaComputeAutocor.SetArg(1, cudaSamples);
 cudaComputeAutocor.SetArg(2, cudaWindow);
 cudaComputeAutocor.SetArg(3, cudaResidualTasks);
-cudaComputeAutocor.SetArg(4, max_prediction_order);
-cudaComputeAutocor.SetArg(5, (uint)nAutocorTasksPerChannel - 1);
-cudaComputeAutocor.SetArg(6, (uint)nResidualTasksPerChannel);
-int workX = autocorPartCount;
+cudaComputeAutocor.SetArg(4, (uint)nAutocorTasksPerChannel - 1);
+cudaComputeAutocor.SetArg(5, (uint)nResidualTasksPerChannel);
+int workX = max_prediction_order / 4 + 1;
 int workY = nAutocorTasksPerChannel * channelsCount * frameCount;
-int ws = 32;
-int wy = 4;
-openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * ws, workY * wy }, new int[] { ws, wy });
-//openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 3, null, new int[] { workX * ws, workY, max_prediction_order + 1 }, new int[] { ws, 1, 1 });
+openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * groupSize, workY }, new int[] { groupSize, 1 });
 
-//cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1);
-//cuda.LaunchAsync(task.cudaComputeAutocor, autocorPartCount, task.nAutocorTasksPerChannel * channelsCount * task.frameCount, task.stream);
 }
 
 public void EnqueueEstimateResidual(int channelsCount, int max_prediction_order)
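Note on launch geometry: after this change every kernel is enqueued with a global size that is an exact multiple of groupSize, so get_group_id() selects the frame/channel/task and get_local_id(0) indexes the work within the group. A raw-API sketch of the 1-D case used for cudaFindWastedBits and cudaEstimateResidual (names are illustrative):

    /* Sketch: one work-group of group_size work-items per task. */
    #include <CL/cl.h>

    static cl_int enqueue_per_task(cl_command_queue queue, cl_kernel kernel,
                                   size_t task_count, size_t group_size)
    {
        size_t global = task_count * group_size; /* multiple of the local size */
        size_t local = group_size;
        return clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                      &global, &local, 0, NULL, NULL);
    }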
@@ -2435,11 +2417,8 @@ namespace CUETools.Codecs.FLACCL
 cudaEstimateResidual.SetArg(1, cudaSamples);
 cudaEstimateResidual.SetArg(2, cudaResidualTasks);
 
-int threads_x = 128;
-int workX = threads_x;
-int workY = nResidualTasksPerChannel * channelsCount * frameCount;
-
-openCLCQ.EnqueueNDRangeKernel(cudaEstimateResidual, 2, null, new int[] { workX, workY }, new int[] { threads_x, 1 });
+int work = nResidualTasksPerChannel * channelsCount * frameCount;
+openCLCQ.EnqueueNDRangeKernel(cudaEstimateResidual, 1, null, new int[] { groupSize * work }, new int[] { groupSize });
 }
 
 public void EnqueueChooseBestMethod(int channelsCount)
@@ -50,11 +50,7 @@ typedef struct
 typedef struct
 {
 FLACCLSubframeData data;
-union
-{
-int coefs[32]; // fixme: should be short?
-int4 coefs4[8];
-};
+int coefs[32]; // fixme: should be short?
 } FLACCLSubframeTask;
 
 __kernel void cudaStereoDecorr(
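Note: the union removed above aliased the coefficient array with a vector view of the same 128 bytes (32 * sizeof(int) == 8 * sizeof(int4)); dropping it leaves only the scalar view, and the kernels now build a float4 view explicitly in __local memory instead. The old layout, for reference (a sketch; anonymous-union support varies across OpenCL compilers, which is presumably why it went away):

    /* Both members describe the same 128 bytes, so the quantized LPC
       coefficients could be addressed one int at a time or as int4s. */
    typedef struct
    {
        FLACCLSubframeData data;
        union
        {
            int coefs[32];  /* scalar view */
            int4 coefs4[8]; /* vector view: 8 * 4 ints */
        };
    } FLACCLSubframeTaskOld;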
@@ -103,15 +99,15 @@ __kernel void cudaChannelDecorr2(
 #define __ffs(a) (32 - clz(a & (-a)))
 //#define __ffs(a) (33 - clz(~a & (a - 1)))
 
-__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
+__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaFindWastedBits(
 __global FLACCLSubframeTask *tasks,
 __global int *samples,
 int tasksPerChannel
 )
 {
-__local volatile int wbits[128];
-__local volatile int abits[128];
+__local int abits[GROUP_SIZE];
+__local int wbits[GROUP_SIZE];
 __local FLACCLSubframeData task;
 
 int tid = get_local_id(0);
@@ -129,12 +125,6 @@ void cudaFindWastedBits(
 wbits[tid] = w;
 abits[tid] = a;
 barrier(CLK_LOCAL_MEM_FENCE);
-//atom_or(shared.wbits, shared.wbits[tid]);
-//atom_or(shared.abits, shared.abits[tid]);
-//SUM256(shared.wbits, tid, |=);
-//SUM256(shared.abits, tid, |=);
-//SUM128(wbits, tid, |=);
-//SUM128(abits, tid, |=);
 
 for (int s = get_local_size(0) / 2; s > 0; s >>= 1)
 {
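Note on the wasted-bits computation: the reduction above ORs the per-work-item accumulators together, after which __ffs of the combined value gives the number of low bits that are zero in every sample (FLAC's "wasted bits") and the position of the highest set bit gives the effective sample width. A scalar sketch of the idea (simplified to a single accumulator; the kernel keeps separate ones for the two quantities, filled earlier in code not shown in this hunk):

    /* Scalar reference: wasted bits = trailing zeros common to all
       samples; abits = significant bits actually in use above them.
       Uses GCC/Clang builtins in place of the kernel's __ffs/clz. */
    static void find_wasted_bits(const int *samples, int n,
                                 int *wbits, int *abits)
    {
        int w = 0;
        for (int i = 0; i < n; i++)
            w |= samples[i];
        *wbits = w == 0 ? 0 : __builtin_ctz(w); /* == __ffs(w) - 1 */
        *abits = w == 0 ? 0 : 32 - __builtin_clz(w) - *wbits;
    }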
@@ -146,160 +136,81 @@ void cudaFindWastedBits(
 barrier(CLK_LOCAL_MEM_FENCE);
 }
 
-if (tid == 0)
-task.wbits = max(0,__ffs(wbits[0]) - 1);
-if (tid == 0)
-task.abits = 32 - clz(abits[0]) - task.wbits;
-// if (tid == 0)
-//task.wbits = get_num_groups(0);
-// if (tid == 0)
-//task.abits = get_local_size(0);
-
-barrier(CLK_LOCAL_MEM_FENCE);
+w = max(0,__ffs(wbits[0]) - 1);
+a = 32 - clz(abits[0]) - w;
 
 if (tid < tasksPerChannel)
-tasks[get_group_id(0) * tasksPerChannel + tid].data.wbits = task.wbits;
+tasks[get_group_id(0) * tasksPerChannel + tid].data.wbits = w;
 if (tid < tasksPerChannel)
-tasks[get_group_id(0) * tasksPerChannel + tid].data.abits = task.abits;
+tasks[get_group_id(0) * tasksPerChannel + tid].data.abits = a;
 }
 
-//__kernel __attribute__((reqd_work_group_size(32, 4, 1)))
-//void cudaComputeAutocor(
-// __global float *output,
-// __global const int *samples,
-// __global const float *window,
-// __global FLACCLSubframeTask *tasks,
-// const int max_order, // should be <= 32
-// const int windowCount, // windows (log2: 0,1)
-// const int taskCount // tasks per block
-//)
-//{
-// __local struct {
-// float data[256];
-// volatile float product[128];
-// FLACCLSubframeData task;
-// volatile int dataPos;
-// volatile int dataLen;
-// } shared;
-// const int tid = get_local_id(0) + get_local_id(1) * 32;
-// // fetch task data
-// if (tid < sizeof(shared.task) / sizeof(int))
-// ((__local int*)&shared.task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
-// if (tid == 0)
-// {
-// shared.dataPos = get_group_id(0) * 7 * 32;
-// shared.dataLen = min(shared.task.blocksize - shared.dataPos, 7 * 32 + max_order);
-// }
-// barrier(CLK_LOCAL_MEM_FENCE);
-//
-// // fetch samples
-// shared.data[tid] = tid < shared.dataLen ? samples[tid] * window[tid]: 0.0f;
-// int tid2 = tid + 128;
-// shared.data[tid2] = tid2 < shared.dataLen ? samples[tid2] * window[tid2]: 0.0f;
-// barrier(CLK_LOCAL_MEM_FENCE);
-//
-// for (int lag = 0; lag <= max_order; lag ++)
-// {
-// if (lag <= 12)
-// shared.product[tid] = 0.0f;
-// barrier(CLK_LOCAL_MEM_FENCE);
-// }
-// barrier(CLK_LOCAL_MEM_FENCE);
-// if (tid <= max_order)
-// output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.product[tid];
-//}
-
-__kernel __attribute__((reqd_work_group_size(32, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaComputeAutocor(
 __global float *output,
 __global const int *samples,
 __global const float *window,
 __global FLACCLSubframeTask *tasks,
-const int max_order, // should be <= 32
 const int windowCount, // windows (log2: 0,1)
 const int taskCount // tasks per block
 )
 {
-__local struct {
-float data[256];
-volatile float product[128];
-FLACCLSubframeData task;
-volatile float result[33];
-volatile int dataPos;
-volatile int dataLen;
-volatile int windowOffs;
-volatile int samplesOffs;
-//volatile int resultOffs;
-} shared;
-const int tid = get_local_id(0) + get_local_id(1) * 32;
+__local float data[GROUP_SIZE * 2];
+__local float product[GROUP_SIZE];
+__local FLACCLSubframeData task;
+const int tid = get_local_id(0);
 // fetch task data
-if (tid < sizeof(shared.task) / sizeof(int))
-((__local int*)&shared.task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
-if (tid == 0)
-{
-shared.dataPos = get_group_id(0) * 7 * 32;
-shared.windowOffs = (get_group_id(1) & ((1 << windowCount)-1)) * shared.task.blocksize + shared.dataPos;
-shared.samplesOffs = shared.task.samplesOffs + shared.dataPos;
-shared.dataLen = min(shared.task.blocksize - shared.dataPos, 7 * 32 + max_order);
-}
-//if (tid == 32)
-//shared.resultOffs = __mul24(get_group_id(0) + __mul24(get_group_id(1), get_num_groups(0)), max_order + 1);
+if (tid < sizeof(task) / sizeof(int))
+((__local int*)&task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
 barrier(CLK_LOCAL_MEM_FENCE);
 
-// fetch samples
-shared.data[tid] = tid < shared.dataLen ? samples[shared.samplesOffs + tid] * window[shared.windowOffs + tid]: 0.0f;
-int tid2 = tid + 128;
-shared.data[tid2] = tid2 < shared.dataLen ? samples[shared.samplesOffs + tid2] * window[shared.windowOffs + tid2]: 0.0f;
-barrier(CLK_LOCAL_MEM_FENCE);
-
-const int ptr = get_local_id(0) * 7;
-//if (get_local_id(1) == 0) for (int lag = 0; lag <= max_order; lag ++)
-//for (int lag = get_local_id(1); lag <= max_order; lag += get_local_size(1))
-for (int lag0 = 0; lag0 <= max_order; lag0 += get_local_size(1))
+int bs = task.blocksize;
+int windowOffs = (get_group_id(1) & ((1 << windowCount)-1)) * bs;
+
+data[tid] = tid < bs ? samples[task.samplesOffs + tid] * window[windowOffs + tid] : 0.0f;
+int tid0 = tid % (GROUP_SIZE >> 2);
+int tid1 = tid / (GROUP_SIZE >> 2);
+int lag0 = get_group_id(0) * 4;
+__local float4 * dptr = ((__local float4 *)&data[0]) + tid0;
+__local float4 * dptr1 = ((__local float4 *)&data[lag0 + tid1]) + tid0;
+
+float prod = 0.0f;
+for (int pos = 0; pos < bs; pos += GROUP_SIZE)
 {
-////const int productLen = min(shared.task.blocksize - get_group_id(0) * partSize - lag, partSize);
-const int lag = lag0 + get_local_id(1);
-const int ptr2 = ptr + lag;
-shared.product[tid] =
-shared.data[ptr + 0] * shared.data[ptr2 + 0] +
-shared.data[ptr + 1] * shared.data[ptr2 + 1] +
-shared.data[ptr + 2] * shared.data[ptr2 + 2] +
-shared.data[ptr + 3] * shared.data[ptr2 + 3] +
-shared.data[ptr + 4] * shared.data[ptr2 + 4] +
-shared.data[ptr + 5] * shared.data[ptr2 + 5] +
-shared.data[ptr + 6] * shared.data[ptr2 + 6];
+// fetch samples
+float nextData = pos + tid + GROUP_SIZE < bs ? samples[task.samplesOffs + pos + tid + GROUP_SIZE] * window[windowOffs + pos + tid + GROUP_SIZE] : 0.0f;
+data[tid + GROUP_SIZE] = nextData;
 barrier(CLK_LOCAL_MEM_FENCE);
-for (int l = 16; l > 1; l >>= 1)
-{
-if (get_local_id(0) < l)
-shared.product[tid] = shared.product[tid] + shared.product[tid + l];
-barrier(CLK_LOCAL_MEM_FENCE);
-}
-// return results
-if (get_local_id(0) == 0 && lag <= max_order)
-shared.result[lag] = shared.product[tid] + shared.product[tid + 1];
+prod += dot(*dptr, *dptr1);
+barrier(CLK_LOCAL_MEM_FENCE);
+data[tid] = nextData;
+}
+product[tid] = prod;
+barrier(CLK_LOCAL_MEM_FENCE);
+for (int l = (GROUP_SIZE >> 3); l > 0; l >>= 1)
+{
+if (tid0 < l)
+product[tid] = product[tid] + product[tid + l];
 barrier(CLK_LOCAL_MEM_FENCE);
 }
-if (tid <= max_order)
-output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.result[tid];
-//output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.product[tid];
-//output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.windowOffs;
+if (tid < 4 && tid + lag0 <= MAX_ORDER)
+output[get_group_id(1) * (MAX_ORDER + 1) + tid + lag0] = product[tid * (GROUP_SIZE >> 2)];
 }
 
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
 void cudaComputeLPC(
 __global FLACCLSubframeTask *tasks,
-int taskCount, // tasks per block
-__global float*autoc,
-int max_order, // should be <= 32
+__global float *autoc,
 __global float *lpcs,
-int windowCount,
-int partCount
+int taskCount, // tasks per block
+int windowCount
 )
 {
 __local struct {
 FLACCLSubframeData task;
-volatile float parts[32];
 volatile float ldr[32];
 volatile float gen1[32];
 volatile float error[32];
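Note: the rewritten cudaComputeAutocor assigns each work-group one (frame, window) pair and four consecutive lags (lag0 = get_group_id(0) * 4), streaming the block through a 2 * GROUP_SIZE local buffer and accumulating float4 dot products. The quantity it produces is the plain windowed autocorrelation; a scalar reference (sketch):

    /* Scalar reference for what cudaComputeAutocor computes per window:
       autocorrelation of d[i] = samples[i] * window[i] at lags
       0..max_order, with zero padding past the end of the block. */
    static void autocorrelation(const int *samples, const float *window,
                                int blocksize, int max_order, float *autoc)
    {
        for (int lag = 0; lag <= max_order; lag++)
        {
            float sum = 0.0f;
            for (int i = 0; i + lag < blocksize; i++)
                sum += (samples[i] * window[i])
                     * (samples[i + lag] * window[i + lag]);
            autoc[lag] = sum;
        }
    }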
@@ -311,40 +222,19 @@ void cudaComputeLPC(
 
 // fetch task data
 if (tid < sizeof(shared.task) / sizeof(int))
-((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
+((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1)))[tid];
 if (tid == 0)
 {
-shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (max_order + 1) * 32;
-shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) * partCount;
+shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
+shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
 }
 barrier(CLK_LOCAL_MEM_FENCE);
 
-// add up autocorrelation parts
-// for (int order = get_local_id(0); order <= max_order; order += 32)
-// {
-//float sum = 0.0f;
-//for (int pos = 0; pos < partCount; pos++)
-// sum += autoc[shared.autocOffs + pos * (max_order + 1) + order];
-//shared.autoc[order] = sum;
-// }
-
-for (int order = 0; order <= max_order; order ++)
-{
-float part = 0.0f;
-for (int pos = get_local_id(0); pos < partCount; pos += get_local_size(0))
-part += autoc[shared.autocOffs + pos * (max_order + 1) + order];
-shared.parts[tid] = part;
-barrier(CLK_LOCAL_MEM_FENCE);
-for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
-{
-if (get_local_id(0) < l)
-shared.parts[tid] += shared.parts[tid + l];
-barrier(CLK_LOCAL_MEM_FENCE);
-}
-if (get_local_id(0) == 0)
-shared.autoc[order] = shared.parts[tid] + shared.parts[tid + 1];
-}
+if (get_local_id(0) <= MAX_ORDER)
+shared.autoc[get_local_id(0)] = autoc[shared.autocOffs + get_local_id(0)];
+if (get_local_id(0) + get_local_size(0) <= MAX_ORDER)
+shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[shared.autocOffs + get_local_id(0) + get_local_size(0)];
 
 barrier(CLK_LOCAL_MEM_FENCE);
 
 // Compute LPC using Schur and Levinson-Durbin recursion
@@ -352,19 +242,19 @@ void cudaComputeLPC(
 shared.ldr[get_local_id(0)] = 0.0f;
 float error = shared.autoc[0];
 barrier(CLK_LOCAL_MEM_FENCE);
-for (int order = 0; order < max_order; order++)
+for (int order = 0; order < MAX_ORDER; order++)
 {
 // Schur recursion
 float reff = -shared.gen1[0] / error;
 error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
 float gen1;
-if (get_local_id(0) < max_order - 1 - order)
+if (get_local_id(0) < MAX_ORDER - 1 - order)
 {
 gen1 = shared.gen1[get_local_id(0) + 1] + reff * gen0;
 gen0 += shared.gen1[get_local_id(0) + 1] * reff;
 }
 barrier(CLK_LOCAL_MEM_FENCE);
-if (get_local_id(0) < max_order - 1 - order)
+if (get_local_id(0) < MAX_ORDER - 1 - order)
 shared.gen1[get_local_id(0)] = gen1;
 
 // Store prediction error
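Note: the loop above is the Schur recursion; each pass extracts one reflection coefficient reff from the generator array and updates the prediction error in place (error += gen1[0] * reff is algebraically error *= 1 - reff * reff). A scalar sketch of the Schur half, assuming autoc[0..max_order] holds the autocorrelation; the Levinson-Durbin half that turns reff into direct-form coefficients (the ldr array) is omitted:

    /* Scalar sketch of the Schur recursion used by cudaComputeLPC. */
    static void schur(const float *autoc, int max_order,
                      float *reflection, float *prediction_error)
    {
        float gen0[32], gen1[32];
        for (int i = 0; i < max_order; i++)
            gen0[i] = gen1[i] = autoc[i + 1];

        float error = autoc[0];
        for (int order = 0; order < max_order; order++)
        {
            float reff = -gen1[0] / error;
            error += gen1[0] * reff; /* error *= 1 - reff * reff */
            reflection[order] = reff;
            for (int i = 0; i < max_order - 1 - order; i++)
            {
                float g1 = gen1[i + 1] + reff * gen0[i];
                gen0[i] += gen1[i + 1] * reff;
                gen1[i] = g1;
            }
            prediction_error[order] = error;
        }
    }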
@@ -385,118 +275,16 @@ void cudaComputeLPC(
 }
 barrier(CLK_LOCAL_MEM_FENCE);
 // Output prediction error estimates
-if (get_local_id(0) < max_order)
-lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
+if (get_local_id(0) < MAX_ORDER)
+lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
 }
 
-//__kernel void cudaComputeLPCLattice(
-// FLACCLSubframeTask *tasks,
-// const int taskCount, // tasks per block
-// const int *samples,
-// const int windowCount,
-// const int max_order, // should be <= 12
-// float*lpcs
-//)
-//{
-// __local struct {
-// volatile FLACCLSubframeData task;
-// volatile float F[512];
-// volatile float arp[32];
-// volatile float tmp[256];
-// volatile float error[32];
-// volatile int lpcOffs;
-// } shared;
-//
-// // fetch task data
-// if (get_local_id(0) < sizeof(shared.task) / sizeof(int))
-// ((int*)&shared.task)[get_local_id(0)] = ((int*)(tasks + taskCount * get_group_id(1)))[get_local_id(0)];
-// if (get_local_id(0) == 0)
-// shared.lpcOffs = __mul24(__mul24(get_group_id(1) + 1, windowCount) - 1, max_order + 1) * 32;
-// barrier(CLK_LOCAL_MEM_FENCE);
-//
-// // F = samples; B = samples
-// float s1 = get_local_id(0) < shared.task.blocksize ? (samples[shared.task.samplesOffs + get_local_id(0)]) / 32768.0f : 0.0f;
-// float s2 = get_local_id(0) + 256 < shared.task.blocksize ? (samples[shared.task.samplesOffs + get_local_id(0) + 256]) / 32768.0f : 0.0f;
-// shared.F[get_local_id(0)] = s1;
-// shared.F[get_local_id(0) + 256] = s2;
-// barrier(CLK_LOCAL_MEM_FENCE);
-//
-// shared.tmp[get_local_id(0)] = FSQR(s1) + FSQR(s2);
-// barrier(CLK_LOCAL_MEM_FENCE);
-// SUM256(shared.tmp, get_local_id(0), +=);
-// barrier(CLK_LOCAL_MEM_FENCE);
-// float DEN = shared.tmp[0];
-// barrier(CLK_LOCAL_MEM_FENCE);
-//
-// for (int order = 0; order < max_order; order++)
-// {
-// // reff = F(order+1:frameSize) * B(1:frameSize-order)' / DEN
-// int idxF = get_local_id(0) + order + 1;
-// int idxF2 = idxF + 256;
-//
-// shared.tmp[get_local_id(0)] = idxF < shared.task.blocksize ? shared.F[idxF] * s1 : 0.0f;
-// shared.tmp[get_local_id(0)] += idxF2 < shared.task.blocksize ? shared.F[idxF2] * s2 : 0.0f;
-// barrier(CLK_LOCAL_MEM_FENCE);
-// SUM256(shared.tmp, get_local_id(0), +=);
-// barrier(CLK_LOCAL_MEM_FENCE);
-// float reff = shared.tmp[0] / DEN;
-// barrier(CLK_LOCAL_MEM_FENCE);
-//
-// // arp(order) = rc(order) = reff
-// if (get_local_id(0) == 0)
-// shared.arp[order] = reff;
-// //shared.rc[order - 1] = shared.lpc[order - 1][order - 1] = reff;
-//
-// // Levinson-Durbin recursion
-// // arp(1:order-1) = arp(1:order-1) - reff * arp(order-1:-1:1)
-// if (get_local_id(0) < order)
-// shared.arp[get_local_id(0)] = shared.arp[get_local_id(0)] - reff * shared.arp[order - 1 - get_local_id(0)];
-//
-// // Output coeffs
-// if (get_local_id(0) <= order)
-// lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = shared.arp[order - get_local_id(0)];
-//
-// // F1 = F(order+1:frameSize) - reff * B(1:frameSize-order)
-// // B(1:frameSize-order) = B(1:frameSize-order) - reff * F(order+1:frameSize)
-// // F(order+1:frameSize) = F1
-// if (idxF < shared.task.blocksize)
-// {
-// float f1 = shared.F[idxF];
-// shared.F[idxF] -= reff * s1;
-// s1 -= reff * f1;
-// }
-// if (idxF2 < shared.task.blocksize)
-// {
-// float f2 = shared.F[idxF2];
-// shared.F[idxF2] -= reff * s2;
-// s2 -= reff * f2;
-// }
-//
-// // DEN = F(order+1:frameSize) * F(order+1:frameSize)' + B(1:frameSize-order) * B(1:frameSize-order)' (BURG)
-// shared.tmp[get_local_id(0)] = (idxF + 1 < shared.task.blocksize ? FSQR(shared.F[idxF]) + FSQR(s1) : 0);
-// shared.tmp[get_local_id(0)] += (idxF2 + 1 < shared.task.blocksize ? FSQR(shared.F[idxF2]) + FSQR(s2) : 0);
-// barrier(CLK_LOCAL_MEM_FENCE);
-// SUM256(shared.tmp, get_local_id(0), +=);
-// barrier(CLK_LOCAL_MEM_FENCE);
-// DEN = shared.tmp[0] / 2;
-// // shared.PE[order-1] = shared.tmp[0] / 2 / (frameSize - order + 1);
-// if (get_local_id(0) == 0)
-// shared.error[order] = DEN / (shared.task.blocksize - order);
-// barrier(CLK_LOCAL_MEM_FENCE);
-// }
-//
-// // Output prediction error estimates
-// if (get_local_id(0) < max_order)
-// lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
-//}
-
 __kernel __attribute__((reqd_work_group_size(32, 4, 1)))
 void cudaQuantizeLPC(
 __global FLACCLSubframeTask *tasks,
+__global float*lpcs,
 int taskCount, // tasks per block
 int taskCountLPC, // tasks per set of coeffs (<= 32)
-__global float*lpcs,
-int max_order, // should be <= 32
 int minprecision,
 int precisions
 )
@@ -515,32 +303,30 @@ void cudaQuantizeLPC(
 if (tid < sizeof(shared.task) / sizeof(int))
 ((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
 if (tid == 0)
-shared.lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) * 32;
+shared.lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
 barrier(CLK_LOCAL_MEM_FENCE);
 
+// Select best orders based on Akaike's Criteria
 if (get_local_id(1) == 0)
 {
-shared.index[get_local_id(0)] = min(max_order - 1, get_local_id(0));
-shared.error[get_local_id(0)] = shared.task.blocksize * 64 + get_local_id(0);
-shared.index[32 + get_local_id(0)] = min(max_order - 1, get_local_id(0));
-shared.error[32 + get_local_id(0)] = shared.task.blocksize * 64 + get_local_id(0);
-
-// Select best orders based on Akaike's Criteria
-
+shared.index[tid] = min(MAX_ORDER - 1, tid);
+shared.error[tid] = shared.task.blocksize * 64 + tid;
+shared.index[32 + tid] = min(MAX_ORDER - 1, tid);
+shared.error[32 + tid] = shared.task.blocksize * 64 + tid;
 // Load prediction error estimates
-if (get_local_id(0) < max_order)
-shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)]) + get_local_id(0) * 5.12f * log(shared.task.blocksize);
-//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
+if (tid < MAX_ORDER)
+shared.error[tid] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + tid]) + tid * 5.12f * log(shared.task.blocksize);
+//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
 }
 
 barrier(CLK_LOCAL_MEM_FENCE);
 
 // Sort using bitonic sort
 for(int size = 2; size < 64; size <<= 1){
 //Bitonic merge
-int ddd = (get_local_id(0) & (size / 2)) == 0;
+int ddd = (tid & (size / 2)) == 0;
 for(int stride = size / 2; stride > 0; stride >>= 1){
-int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+int pos = 2 * tid - (tid & (stride - 1));
 float e0, e1;
 int i0, i1;
 if (get_local_id(1) == 0)
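Note: the sort above is a standard bitonic network over 64 (error, index) pairs, run by 32 work-items with one compare-exchange each per stage; the loop in the next hunk (the stride-32 pass) is the final ascending merge. A serial sketch of the full network:

    /* Serial sketch of the bitonic sort used by cudaQuantizeLPC to
       order 64 (error, index) pairs by ascending error. */
    static void bitonic_sort64(float error[64], int index[64])
    {
        for (int size = 2; size <= 64; size <<= 1)
            for (int stride = size / 2; stride > 0; stride >>= 1)
                for (int tid = 0; tid < 32; tid++)
                {
                    int pos = 2 * tid - (tid & (stride - 1));
                    int ddd = (size == 64) || ((tid & (size / 2)) == 0);
                    if ((error[pos] >= error[pos + stride]) == ddd)
                    {
                        float e = error[pos]; int i = index[pos];
                        error[pos] = error[pos + stride];
                        index[pos] = index[pos + stride];
                        error[pos + stride] = e;
                        index[pos + stride] = i;
                    }
                }
    }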
@@ -551,7 +337,7 @@ void cudaQuantizeLPC(
 i1 = shared.index[pos + stride];
 }
 barrier(CLK_LOCAL_MEM_FENCE);
-if ((e0 >= e1) == ddd && get_local_id(1) == 0)
+if (get_local_id(1) == 0 && (e0 >= e1) == ddd)
 {
 shared.error[pos] = e1;
 shared.error[pos + stride] = e0;
@@ -566,7 +352,7 @@ void cudaQuantizeLPC(
 {
 for(int stride = 32; stride > 0; stride >>= 1){
 //barrier(CLK_LOCAL_MEM_FENCE);
-int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+int pos = 2 * tid - (tid & (stride - 1));
 float e0, e1;
 int i0, i1;
 if (get_local_id(1) == 0)
@@ -577,7 +363,7 @@ void cudaQuantizeLPC(
 i1 = shared.index[pos + stride];
 }
 barrier(CLK_LOCAL_MEM_FENCE);
-if (e0 >= e1 && get_local_id(1) == 0)
+if (get_local_id(1) == 0 && e0 >= e1)
 {
 shared.error[pos] = e1;
 shared.error[pos + stride] = e0;
@@ -653,21 +439,21 @@ void cudaQuantizeLPC(
 }
 }
 
-__kernel __attribute__(( vec_type_hint (int4)))
+__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaEstimateResidual(
 __global int*output,
 __global int*samples,
 __global FLACCLSubframeTask *tasks
 )
 {
-__local float data[128 * 2];
-__local int residual[128];
+__local float data[GROUP_SIZE * 2];
+__local int residual[GROUP_SIZE];
 __local FLACCLSubframeTask task;
 __local float4 coefsf4[8];
 
 const int tid = get_local_id(0);
 if (tid < sizeof(task)/sizeof(int))
-((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
+((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
 barrier(CLK_GLOBAL_MEM_FENCE);
 
 int ro = task.data.residualOrder;
@@ -677,11 +463,11 @@ void cudaEstimateResidual(
 if (tid < 32)
 ((__local float *)&coefsf4[0])[tid] = select(0.0f, ((float)task.coefs[tid]) / (1 << task.data.shift), tid < ro);
 data[tid] = tid < bs ? (float)(samples[task.data.samplesOffs + tid] >> task.data.wbits) : 0.0f;
-for (int pos = 0; pos < bs; pos += get_local_size(0))
+for (int pos = 0; pos < bs; pos += GROUP_SIZE)
 {
 // fetch samples
-float nextData = pos + tid + get_local_size(0) < bs ? (float)(samples[task.data.samplesOffs + pos + tid + get_local_size(0)] >> task.data.wbits) : 0.0f;
-data[tid + get_local_size(0)] = nextData;
+float nextData = pos + tid + GROUP_SIZE < bs ? (float)(samples[task.data.samplesOffs + pos + tid + GROUP_SIZE] >> task.data.wbits) : 0.0f;
+data[tid + GROUP_SIZE] = nextData;
 barrier(CLK_LOCAL_MEM_FENCE);
 
 // compute residual
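Note: data[GROUP_SIZE * 2] implements a sliding window. Each iteration prefetches the next GROUP_SIZE samples into the upper half, computes residuals against the lower half (which still holds the history samples the predictor needs), then shifts the upper half down. The per-sample quantity being accumulated is the ordinary FIR prediction residual; a scalar sketch:

    /* Scalar sketch of the residual cudaEstimateResidual accumulates:
       e(i) = x(i) - sum_j coefs[j] * x(i - 1 - j), with the coefficients
       already divided by 2^shift as in the coefsf4 setup above. */
    static float residual_at(const float *x, const float *coefs,
                             int ro, int i)
    {
        float pred = 0.0f;
        for (int j = 0; j < ro; j++)
            pred += coefs[j] * x[i - 1 - j];
        return x[i] - pred;
    }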
@@ -718,19 +504,19 @@ void cudaEstimateResidual(
 data[tid] = nextData;
 }
 
-int residualLen = (bs - ro) / get_local_size(0) + select(0, 1, tid < (bs - ro) % get_local_size(0));
+int residualLen = (bs - ro) / GROUP_SIZE + select(0, 1, tid < (bs - ro) % GROUP_SIZE);
 int k = clamp(convert_int_rtn(log2((res + 0.000001f) / (residualLen + 0.000001f))), 0, 14);
 residual[tid] = residualLen * (k + 1) + (convert_int_rtz(res) >> k);
 
 barrier(CLK_LOCAL_MEM_FENCE);
-for (int l = get_local_size(0) / 2; l > 0; l >>= 1)
+for (int l = GROUP_SIZE / 2; l > 0; l >>= 1)
 {
 if (tid < l)
 residual[tid] += residual[tid + l];
 barrier(CLK_LOCAL_MEM_FENCE);
 }
 if (tid == 0)
-output[get_group_id(1)] = residual[0];
+output[get_group_id(0)] = residual[0];
 }
 
 __kernel void cudaChooseBestMethod(
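Note: the residual[] reduction above implements a cheap Rice-cost model: k is chosen near log2 of the mean absolute residual, each sample then costs k + 1 bits (unary stop bit plus k raw bits), and the surviving high-order bits of the running total are added on top. A scalar restatement (sketch; convert_int_rtn rounds toward negative infinity, i.e. floor):

    #include <math.h>

    /* Estimate the Rice-coded size in bits of n residuals whose absolute
       values sum to abs_sum, as in cudaEstimateResidual. */
    static int rice_cost_estimate(float abs_sum, int n)
    {
        int k = (int)floorf(log2f((abs_sum + 0.000001f) / (n + 0.000001f)));
        if (k < 0) k = 0;
        if (k > 14) k = 14;
        return n * (k + 1) + ((int)abs_sum >> k);
    }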