opencl flac encoder

chudov committed 2010-09-25 19:53:48 +00:00
parent c4c344e88f
commit c70ced945b
2 changed files with 116 additions and 351 deletions

View File

@@ -141,7 +141,6 @@ namespace CUETools.Codecs.FLACCL
 	public const int MAX_BLOCKSIZE = 4096 * 16;
 	internal const int maxFrames = 128;
-	internal const int maxAutocorParts = (MAX_BLOCKSIZE + 255) / 256;

 	public FLACCLWriter(string path, Stream IO, AudioPCMConfig pcm)
 	{
@@ -1116,12 +1115,6 @@ namespace CUETools.Codecs.FLACCL
 	if (task.frameSize <= 4)
 		return;

-	//int autocorPartSize = (2 * 256 - eparams.max_prediction_order) & ~15;
-	int autocorPartSize = 32 * 7;
-	int autocorPartCount = (task.frameSize + autocorPartSize - 1) / autocorPartSize;
-	if (autocorPartCount > maxAutocorParts)
-		throw new Exception("internal error");

 	int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
 	int calcPartitionPartSize = task.frameSize >> max_porder;
 	while (calcPartitionPartSize < 16 && max_porder > 0)
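For context on what the deleted lines computed: the frame used to be split into fixed 32 * 7 = 224-sample parts, one autocorrelation work-group per part, with the partial sums added up later by cudaComputeLPC. Below is a plain-C sketch of that bookkeeping (the frame size is illustrative); the rewritten kernel loops over the whole block itself, so the part count and its overflow check go away.

    #include <stdio.h>

    int main(void)
    {
        int frameSize = 4608;             /* a typical FLAC block size */
        int autocorPartSize = 32 * 7;     /* 224 samples per work-group */
        int autocorPartCount = (frameSize + autocorPartSize - 1) / autocorPartSize;
        printf("%d parts of %d samples\n", autocorPartCount, autocorPartSize);
        return 0;
    }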
@@ -1140,12 +1133,10 @@ namespace CUETools.Codecs.FLACCL
 	cudaChannelDecorr.SetArg(2, (uint)MAX_BLOCKSIZE);

 	task.cudaComputeLPC.SetArg(0, task.cudaResidualTasks);
-	task.cudaComputeLPC.SetArg(1, (uint)task.nResidualTasksPerChannel);
-	task.cudaComputeLPC.SetArg(2, task.cudaAutocorOutput);
-	task.cudaComputeLPC.SetArg(3, (uint)eparams.max_prediction_order);
-	task.cudaComputeLPC.SetArg(4, task.cudaLPCData);
-	task.cudaComputeLPC.SetArg(5, (uint)_windowcount);
-	task.cudaComputeLPC.SetArg(6, (uint)autocorPartCount);
+	task.cudaComputeLPC.SetArg(1, task.cudaAutocorOutput);
+	task.cudaComputeLPC.SetArg(2, task.cudaLPCData);
+	task.cudaComputeLPC.SetArg(3, (uint)task.nResidualTasksPerChannel);
+	task.cudaComputeLPC.SetArg(4, (uint)_windowcount);

 	//task.cudaComputeLPCLattice.SetArg(0, task.cudaResidualTasks);
 	//task.cudaComputeLPCLattice.SetArg(1, (uint)task.nResidualTasksPerChannel);
@@ -1156,12 +1147,11 @@ namespace CUETools.Codecs.FLACCL
 	//cuda.SetFunctionBlockShape(task.cudaComputeLPCLattice, 256, 1, 1);

 	task.cudaQuantizeLPC.SetArg(0, task.cudaResidualTasks);
-	task.cudaQuantizeLPC.SetArg(1, (uint)task.nResidualTasksPerChannel);
-	task.cudaQuantizeLPC.SetArg(2, (uint)task.nTasksPerWindow);
-	task.cudaQuantizeLPC.SetArg(3, task.cudaLPCData);
-	task.cudaQuantizeLPC.SetArg(4, (uint)eparams.max_prediction_order);
-	task.cudaQuantizeLPC.SetArg(5, (uint)eparams.lpc_min_precision_search);
-	task.cudaQuantizeLPC.SetArg(6, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search));
+	task.cudaQuantizeLPC.SetArg(1, task.cudaLPCData);
+	task.cudaQuantizeLPC.SetArg(2, (uint)task.nResidualTasksPerChannel);
+	task.cudaQuantizeLPC.SetArg(3, (uint)task.nTasksPerWindow);
+	task.cudaQuantizeLPC.SetArg(4, (uint)eparams.lpc_min_precision_search);
+	task.cudaQuantizeLPC.SetArg(5, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search));

 	task.cudaCopyBestMethod.SetArg(0, task.cudaBestResidualTasks);
 	task.cudaCopyBestMethod.SetArg(1, task.cudaResidualTasks);
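Both of these hunks renumber positional kernel arguments because parameters were removed from the kernel signatures (max_order is now the compile-time MAX_ORDER define). OpenCL binds arguments strictly by index, so host and kernel must stay in lockstep. A hedged sketch of the same binding through the raw OpenCL C API rather than the project's C# wrapper; the function, kernel and buffer names below are placeholders:

    #include <CL/cl.h>

    /* Mirror of the new cudaComputeLPC layout: dropping one kernel
     * parameter shifts every later argument index down by one. */
    static void set_compute_lpc_args(cl_kernel k, cl_mem tasks, cl_mem autoc,
                                     cl_mem lpcs, cl_uint taskCount, cl_uint windowCount)
    {
        clSetKernelArg(k, 0, sizeof(cl_mem),  &tasks);       /* residual tasks    */
        clSetKernelArg(k, 1, sizeof(cl_mem),  &autoc);       /* autocorrelations  */
        clSetKernelArg(k, 2, sizeof(cl_mem),  &lpcs);        /* LPC output        */
        clSetKernelArg(k, 3, sizeof(cl_uint), &taskCount);   /* tasks per channel */
        clSetKernelArg(k, 4, sizeof(cl_uint), &windowCount); /* window functions  */
    }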
@@ -1216,7 +1206,7 @@ namespace CUETools.Codecs.FLACCL
 	// geometry???
 	task.openCLCQ.EnqueueBarrier();
-	task.EnqueueComputeAutocor(autocorPartCount, channelsCount, cudaWindow, eparams.max_prediction_order);
+	task.EnqueueComputeAutocor(channelsCount, cudaWindow, eparams.max_prediction_order);
 	//float* autoc = stackalloc float[1024];
 	//task.openCLCQ.EnqueueBarrier();
@@ -1524,6 +1514,7 @@ namespace CUETools.Codecs.FLACCL
 	if (OpenCL.NumberOfPlatforms < 1)
 		throw new Exception("no opencl platforms found");

+	int groupSize = 64;
 	OCLMan = new OpenCLManager();
 	// Attempt to save binaries after compilation, as well as load precompiled binaries
 	// to avoid compilation. Usually you'll want this to be true.
@@ -1543,7 +1534,9 @@ namespace CUETools.Codecs.FLACCL
 	OCLMan.RequireImageSupport = false;
 	// The Defines string gets prepended to any and all sources that are compiled
 	// and serve as a convenient way to pass configuration information to the compilation process
-	OCLMan.Defines = "#define MAX_ORDER " + eparams.max_prediction_order.ToString();
+	OCLMan.Defines =
+		"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
+		"#define GROUP_SIZE " + groupSize.ToString() + "\n";
 	// The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
 	OCLMan.BuildOptions = "";
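Passing MAX_ORDER and GROUP_SIZE as #define text prepended to the source bakes them in as compile-time constants, which lets the OpenCL compiler size __local arrays statically and unroll loops. A sketch of the equivalent using build options instead of prepended source, assumed interchangeable with what OpenCLManager.Defines does:

    #include <CL/cl.h>
    #include <stdio.h>

    static cl_int build_with_defines(cl_program program, cl_device_id dev,
                                     int max_order, int group_size)
    {
        char options[128];
        /* -D name=value is the standard OpenCL build-option form */
        snprintf(options, sizeof options,
                 "-D MAX_ORDER=%d -D GROUP_SIZE=%d", max_order, group_size);
        return clBuildProgram(program, 1, &dev, options, NULL, NULL);
    }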
@@ -1596,13 +1589,13 @@ namespace CUETools.Codecs.FLACCL
 	if (_IO.CanSeek)
 		first_frame_offset = _IO.Position;

-	task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
-	task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
+	task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize);
+	task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize);
 	if (_settings.CPUThreads > 0)
 	{
 		cpu_tasks = new FLACCLTask[_settings.CPUThreads];
 		for (int i = 0; i < cpu_tasks.Length; i++)
-			cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
+			cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize);
 	}
 	cudaWindow = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY, sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
@@ -2276,8 +2269,11 @@ namespace CUETools.Codecs.FLACCL
 	public bool done = false;
 	public bool exit = false;

-	unsafe public FLACCLTask(Program _openCLProgram, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify)
+	public int groupSize = 128;
+
+	unsafe public FLACCLTask(Program _openCLProgram, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify, int groupSize)
 	{
+		this.groupSize = groupSize;
 		openCLProgram = _openCLProgram;
 		Device[] openCLDevices = openCLProgram.Context.Platform.QueryDevices(DeviceType.GPU);
 		openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], CommandQueueProperties.PROFILING_ENABLE);
@@ -2296,7 +2292,7 @@ namespace CUETools.Codecs.FLACCL
 		cudaPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen);
 		cudaRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen);
 		cudaBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4);
-		cudaAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FLACCLWriter.maxAutocorParts + FLACCLWriter.maxFrames));
+		cudaAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FLACCLWriter.maxFrames);
 		cudaResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen);
 		cudaBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen);
 		cudaResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FLACCLWriter.maxResidualParts*/ * FLACCLWriter.maxFrames);
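The autocorrelation buffer shrinks because per-part partial sums no longer live in global memory; only one (order + 1)-float vector per frame, window and channel remains. Rough arithmetic in plain C, with assumed values for the lpc constants (MAX_LPC_ORDER = 32, two windows):

    #include <stdio.h>

    int main(void)
    {
        int channelCount = 2, maxWindows = 2, maxOrder = 32;
        int maxFrames = 128, maxAutocorParts = (4096 * 16 + 255) / 256;
        size_t oldSize = sizeof(float) * channelCount * maxWindows * (maxOrder + 1)
                       * (size_t)(maxAutocorParts + maxFrames);
        size_t newSize = sizeof(float) * channelCount * maxWindows * (maxOrder + 1)
                       * (size_t)maxFrames;
        printf("old: %zu bytes, new: %zu bytes\n", oldSize, newSize);
        return 0;
    }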
@@ -2397,36 +2393,22 @@ namespace CUETools.Codecs.FLACCL
 		cudaFindWastedBits.SetArg(1, cudaSamples);
 		cudaFindWastedBits.SetArg(2, nResidualTasksPerChannel);

-		int workX = 128; // 256
 		int grpX = frameCount * channelsCount;
-		//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 128 }, new int[] { 128 });
-		//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 128 }, null);
-		openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * workX }, new int[] { workX });
-		//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 256 * 128 }, new int[] { 128 });
-		//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * workX }, null);
-		//cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1);
-		//cuda.LaunchAsync(task.cudaFindWastedBits, channelsCount * task.frameCount, 1, task.stream);
+		openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * groupSize }, new int[] { groupSize });
 	}
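A note on the launch geometry this commit standardizes on: every enqueue now passes a global size equal to the number of work-groups times GROUP_SIZE, and a local size of GROUP_SIZE, so get_group_id(0) enumerates frame/channel slots. A minimal sketch in raw OpenCL C API terms (the project itself goes through the OpenCLNet wrapper, so names here are illustrative):

    #include <CL/cl.h>

    static cl_int enqueue_1d(cl_command_queue q, cl_kernel k,
                             size_t numGroups, size_t groupSize)
    {
        size_t global = numGroups * groupSize;  /* must be a multiple of local */
        size_t local  = groupSize;
        return clEnqueueNDRangeKernel(q, k, 1, NULL, &global, &local,
                                      0, NULL, NULL);
    }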
-	public void EnqueueComputeAutocor(int autocorPartCount, int channelsCount, Mem cudaWindow, int max_prediction_order)
+	public void EnqueueComputeAutocor(int channelsCount, Mem cudaWindow, int max_prediction_order)
 	{
 		cudaComputeAutocor.SetArg(0, cudaAutocorOutput);
 		cudaComputeAutocor.SetArg(1, cudaSamples);
 		cudaComputeAutocor.SetArg(2, cudaWindow);
 		cudaComputeAutocor.SetArg(3, cudaResidualTasks);
-		cudaComputeAutocor.SetArg(4, max_prediction_order);
-		cudaComputeAutocor.SetArg(5, (uint)nAutocorTasksPerChannel - 1);
-		cudaComputeAutocor.SetArg(6, (uint)nResidualTasksPerChannel);
+		cudaComputeAutocor.SetArg(4, (uint)nAutocorTasksPerChannel - 1);
+		cudaComputeAutocor.SetArg(5, (uint)nResidualTasksPerChannel);

-		int workX = autocorPartCount;
-		int workY = nAutocorTasksPerChannel * channelsCount * frameCount;
-		int ws = 32;
-		int wy = 4;
-		openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * ws, workY * wy }, new int[] { ws, wy });
-		//openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 3, null, new int[] { workX * ws, workY, max_prediction_order + 1 }, new int[] { ws, 1, 1 });
-		//cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1);
-		//cuda.LaunchAsync(task.cudaComputeAutocor, autocorPartCount, task.nAutocorTasksPerChannel * channelsCount * task.frameCount, task.stream);
+		int workX = max_prediction_order / 4 + 1;
+		int workY = nAutocorTasksPerChannel * channelsCount * frameCount;
+		openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * groupSize, workY }, new int[] { groupSize, 1 });
 	}

 	public void EnqueueEstimateResidual(int channelsCount, int max_prediction_order)
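The new 2-D geometry for cudaComputeAutocor gives each X work-group four consecutive lags (matching the float4 dot product in the kernel), so max_prediction_order / 4 + 1 groups cover lags 0..max_order. A plain-C sketch of that mapping, with an assumed order of 32:

    #include <stdio.h>

    int main(void)
    {
        int max_order = 32;
        int groups = max_order / 4 + 1;      /* X dimension of the launch */
        for (int g = 0; g < groups; g++) {
            int lo = g * 4;
            int hi = lo + 3 > max_order ? max_order : lo + 3;
            printf("group %d computes lags %d..%d\n", g, lo, hi);
        }
        return 0;
    }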
@@ -2435,11 +2417,8 @@ namespace CUETools.Codecs.FLACCL
 		cudaEstimateResidual.SetArg(1, cudaSamples);
 		cudaEstimateResidual.SetArg(2, cudaResidualTasks);

-		int threads_x = 128;
-		int workX = threads_x;
-		int workY = nResidualTasksPerChannel * channelsCount * frameCount;
-		openCLCQ.EnqueueNDRangeKernel(cudaEstimateResidual, 2, null, new int[] { workX, workY }, new int[] { threads_x, 1 });
+		int work = nResidualTasksPerChannel * channelsCount * frameCount;
+		openCLCQ.EnqueueNDRangeKernel(cudaEstimateResidual, 1, null, new int[] { groupSize * work }, new int[] { groupSize });
 	}

 	public void EnqueueChooseBestMethod(int channelsCount)

View File

@@ -50,11 +50,7 @@ typedef struct
 typedef struct
 {
 	FLACCLSubframeData data;
-	union
-	{
-		int coefs[32]; // fixme: should be short?
-		int4 coefs4[8];
-	};
+	int coefs[32]; // fixme: should be short?
 } FLACCLSubframeTask;

 __kernel void cudaStereoDecorr(
@@ -103,15 +99,15 @@ __kernel void cudaChannelDecorr2(
 #define __ffs(a) (32 - clz(a & (-a)))
 //#define __ffs(a) (33 - clz(~a & (a - 1)))

-__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
+__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaFindWastedBits(
 	__global FLACCLSubframeTask *tasks,
 	__global int *samples,
 	int tasksPerChannel
 )
 {
-	__local volatile int wbits[128];
-	__local volatile int abits[128];
+	__local int abits[GROUP_SIZE];
+	__local int wbits[GROUP_SIZE];
 	__local FLACCLSubframeData task;
 	int tid = get_local_id(0);
@@ -129,12 +125,6 @@ void cudaFindWastedBits(
 	wbits[tid] = w;
 	abits[tid] = a;
 	barrier(CLK_LOCAL_MEM_FENCE);
-	//atom_or(shared.wbits, shared.wbits[tid]);
-	//atom_or(shared.abits, shared.abits[tid]);
-	//SUM256(shared.wbits, tid, |=);
-	//SUM256(shared.abits, tid, |=);
-	//SUM128(wbits, tid, |=);
-	//SUM128(abits, tid, |=);
 	for (int s = get_local_size(0) / 2; s > 0; s >>= 1)
 	{
@@ -146,160 +136,81 @@ void cudaFindWastedBits(
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
-	if (tid == 0)
-		task.wbits = max(0,__ffs(wbits[0]) - 1);
-	if (tid == 0)
-		task.abits = 32 - clz(abits[0]) - task.wbits;
-	//	if (tid == 0)
-	//task.wbits = get_num_groups(0);
-	//	if (tid == 0)
-	//task.abits = get_local_size(0);
-	barrier(CLK_LOCAL_MEM_FENCE);
+	w = max(0,__ffs(wbits[0]) - 1);
+	a = 32 - clz(abits[0]) - w;
 	if (tid < tasksPerChannel)
-		tasks[get_group_id(0) * tasksPerChannel + tid].data.wbits = task.wbits;
+		tasks[get_group_id(0) * tasksPerChannel + tid].data.wbits = w;
 	if (tid < tasksPerChannel)
-		tasks[get_group_id(0) * tasksPerChannel + tid].data.abits = task.abits;
+		tasks[get_group_id(0) * tasksPerChannel + tid].data.abits = a;
 }
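What cudaFindWastedBits derives, in scalar form: OR all samples of the block together; the trailing-zero count of that OR is the number of low bits every sample "wastes", and the position of the highest set bit bounds the bits actually used. A plain-C reference using a GCC-style builtin:

    #include <stdio.h>

    int main(void)
    {
        int samples[] = { 8, 24, -16, 64, 40 };   /* all divisible by 8 */
        int n = sizeof(samples) / sizeof(samples[0]);
        unsigned or_all = 0;
        for (int i = 0; i < n; i++)
            or_all |= (unsigned)samples[i];
        int wbits = or_all ? __builtin_ctz(or_all) : 0;  /* trailing zeros */
        printf("wasted bits: %d\n", wbits);               /* prints 3 */
        return 0;
    }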
-//__kernel __attribute__((reqd_work_group_size(32, 4, 1)))
-//void cudaComputeAutocor(
-//	__global float *output,
-//	__global const int *samples,
-//	__global const float *window,
-//	__global FLACCLSubframeTask *tasks,
-//	const int max_order, // should be <= 32
-//	const int windowCount, // windows (log2: 0,1)
-//	const int taskCount // tasks per block
-//)
-//{
-//	__local struct {
-//		float data[256];
-//		volatile float product[128];
-//		FLACCLSubframeData task;
-//		volatile int dataPos;
-//		volatile int dataLen;
-//	} shared;
-//	const int tid = get_local_id(0) + get_local_id(1) * 32;
-//	// fetch task data
-//	if (tid < sizeof(shared.task) / sizeof(int))
-//		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
-//	if (tid == 0)
-//	{
-//		shared.dataPos = get_group_id(0) * 7 * 32;
-//		shared.dataLen = min(shared.task.blocksize - shared.dataPos, 7 * 32 + max_order);
-//	}
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	// fetch samples
-//	shared.data[tid] = tid < shared.dataLen ? samples[tid] * window[tid]: 0.0f;
-//	int tid2 = tid + 128;
-//	shared.data[tid2] = tid2 < shared.dataLen ? samples[tid2] * window[tid2]: 0.0f;
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	for (int lag = 0; lag <= max_order; lag ++)
-//	{
-//		if (lag <= 12)
-//			shared.product[tid] = 0.0f;
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//	}
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	if (tid <= max_order)
-//		output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.product[tid];
-//}
-__kernel __attribute__((reqd_work_group_size(32, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaComputeAutocor(
 	__global float *output,
 	__global const int *samples,
 	__global const float *window,
 	__global FLACCLSubframeTask *tasks,
-	const int max_order, // should be <= 32
 	const int windowCount, // windows (log2: 0,1)
 	const int taskCount // tasks per block
 )
 {
-	__local struct {
-		float data[256];
-		volatile float product[128];
-		FLACCLSubframeData task;
-		volatile float result[33];
-		volatile int dataPos;
-		volatile int dataLen;
-		volatile int windowOffs;
-		volatile int samplesOffs;
-		//volatile int resultOffs;
-	} shared;
-	const int tid = get_local_id(0) + get_local_id(1) * 32;
+	__local float data[GROUP_SIZE * 2];
+	__local float product[GROUP_SIZE];
+	__local FLACCLSubframeData task;
+	const int tid = get_local_id(0);
 	// fetch task data
-	if (tid < sizeof(shared.task) / sizeof(int))
-		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
-	if (tid == 0)
-	{
-		shared.dataPos = get_group_id(0) * 7 * 32;
-		shared.windowOffs = (get_group_id(1) & ((1 << windowCount)-1)) * shared.task.blocksize + shared.dataPos;
-		shared.samplesOffs = shared.task.samplesOffs + shared.dataPos;
-		shared.dataLen = min(shared.task.blocksize - shared.dataPos, 7 * 32 + max_order);
-	}
-	//if (tid == 32)
-	//shared.resultOffs = __mul24(get_group_id(0) + __mul24(get_group_id(1), get_num_groups(0)), max_order + 1);
+	if (tid < sizeof(task) / sizeof(int))
+		((__local int*)&task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
 	barrier(CLK_LOCAL_MEM_FENCE);

-	// fetch samples
-	shared.data[tid] = tid < shared.dataLen ? samples[shared.samplesOffs + tid] * window[shared.windowOffs + tid]: 0.0f;
-	int tid2 = tid + 128;
-	shared.data[tid2] = tid2 < shared.dataLen ? samples[shared.samplesOffs + tid2] * window[shared.windowOffs + tid2]: 0.0f;
-	barrier(CLK_LOCAL_MEM_FENCE);
+	int bs = task.blocksize;
+	int windowOffs = (get_group_id(1) & ((1 << windowCount)-1)) * bs;

-	const int ptr = get_local_id(0) * 7;
-	//if (get_local_id(1) == 0)
-	//for (int lag = get_local_id(1); lag <= max_order; lag += get_local_size(1))
-	for (int lag0 = 0; lag0 <= max_order; lag0 += get_local_size(1))
+	data[tid] = tid < bs ? samples[task.samplesOffs + tid] * window[windowOffs + tid] : 0.0f;
+	int tid0 = tid % (GROUP_SIZE >> 2);
+	int tid1 = tid / (GROUP_SIZE >> 2);
+	int lag0 = get_group_id(0) * 4;
+	__local float4 * dptr = ((__local float4 *)&data[0]) + tid0;
+	__local float4 * dptr1 = ((__local float4 *)&data[lag0 + tid1]) + tid0;
+	float prod = 0.0f;
+	for (int pos = 0; pos < bs; pos += GROUP_SIZE)
 	{
-		////const int productLen = min(shared.task.blocksize - get_group_id(0) * partSize - lag, partSize);
-		const int lag = lag0 + get_local_id(1);
-		const int ptr2 = ptr + lag;
-		shared.product[tid] =
-			shared.data[ptr + 0] * shared.data[ptr2 + 0] +
-			shared.data[ptr + 1] * shared.data[ptr2 + 1] +
-			shared.data[ptr + 2] * shared.data[ptr2 + 2] +
-			shared.data[ptr + 3] * shared.data[ptr2 + 3] +
-			shared.data[ptr + 4] * shared.data[ptr2 + 4] +
-			shared.data[ptr + 5] * shared.data[ptr2 + 5] +
-			shared.data[ptr + 6] * shared.data[ptr2 + 6];
+		// fetch samples
+		float nextData = pos + tid + GROUP_SIZE < bs ? samples[task.samplesOffs + pos + tid + GROUP_SIZE] * window[windowOffs + pos + tid + GROUP_SIZE] : 0.0f;
+		data[tid + GROUP_SIZE] = nextData;
 		barrier(CLK_LOCAL_MEM_FENCE);
-		for (int l = 16; l > 1; l >>= 1)
-		{
-			if (get_local_id(0) < l)
-				shared.product[tid] = shared.product[tid] + shared.product[tid + l];
-			barrier(CLK_LOCAL_MEM_FENCE);
-		}
-		// return results
-		if (get_local_id(0) == 0 && lag <= max_order)
-			shared.result[lag] = shared.product[tid] + shared.product[tid + 1];
+
+		prod += dot(*dptr, *dptr1);
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		data[tid] = nextData;
+	}
+	product[tid] = prod;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	for (int l = (GROUP_SIZE >> 3); l > 0; l >>= 1)
+	{
+		if (tid0 < l)
+			product[tid] = product[tid] + product[tid + l];
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
-	if (tid <= max_order)
-		output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.result[tid];
-	//output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.product[tid];
-	//output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.windowOffs;
+	if (tid < 4 && tid + lag0 <= MAX_ORDER)
+		output[get_group_id(1) * (MAX_ORDER + 1) + tid + lag0] = product[tid * (GROUP_SIZE >> 2)];
 }
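A scalar reference for what the rewritten kernel produces per window: autoc[lag] = sum over i of x[i] * x[i + lag] on the windowed samples, which the kernel evaluates four lags at a time with float4 dot products. This plain-C version is the ground truth a unit test could compare against (toy data):

    #include <stdio.h>

    int main(void)
    {
        float x[8] = { 1, 2, 3, 4, 3, 2, 1, 0 };  /* windowed samples */
        int n = 8, max_order = 4;
        for (int lag = 0; lag <= max_order; lag++) {
            float sum = 0.0f;
            for (int i = 0; i + lag < n; i++)
                sum += x[i] * x[i + lag];
            printf("autoc[%d] = %g\n", lag, sum);
        }
        return 0;
    }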
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
 void cudaComputeLPC(
 	__global FLACCLSubframeTask *tasks,
-	int taskCount, // tasks per block
-	__global float*autoc,
-	int max_order, // should be <= 32
+	__global float *autoc,
 	__global float *lpcs,
-	int windowCount,
-	int partCount
+	int taskCount, // tasks per block
+	int windowCount
 )
 {
 	__local struct {
 		FLACCLSubframeData task;
-		volatile float parts[32];
 		volatile float ldr[32];
 		volatile float gen1[32];
 		volatile float error[32];
@@ -311,40 +222,19 @@ void cudaComputeLPC(
 	// fetch task data
 	if (tid < sizeof(shared.task) / sizeof(int))
-		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
+		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1)))[tid];
 	if (tid == 0)
 	{
-		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (max_order + 1) * 32;
-		shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) * partCount;
+		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
+		shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);

-	// add up autocorrelation parts
-//	for (int order = get_local_id(0); order <= max_order; order += 32)
-//	{
-		//float sum = 0.0f;
-		//for (int pos = 0; pos < partCount; pos++)
-		//	sum += autoc[shared.autocOffs + pos * (max_order + 1) + order];
-		//shared.autoc[order] = sum;
-//	}
-	for (int order = 0; order <= max_order; order ++)
-	{
-		float part = 0.0f;
-		for (int pos = get_local_id(0); pos < partCount; pos += get_local_size(0))
-			part += autoc[shared.autocOffs + pos * (max_order + 1) + order];
-		shared.parts[tid] = part;
-		barrier(CLK_LOCAL_MEM_FENCE);
-		for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
-		{
-			if (get_local_id(0) < l)
-				shared.parts[tid] += shared.parts[tid + l];
-			barrier(CLK_LOCAL_MEM_FENCE);
-		}
-		if (get_local_id(0) == 0)
-			shared.autoc[order] = shared.parts[tid] + shared.parts[tid + 1];
-	}
+	if (get_local_id(0) <= MAX_ORDER)
+		shared.autoc[get_local_id(0)] = autoc[shared.autocOffs + get_local_id(0)];
+	if (get_local_id(0) + get_local_size(0) <= MAX_ORDER)
+		shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[shared.autocOffs + get_local_id(0) + get_local_size(0)];
 	barrier(CLK_LOCAL_MEM_FENCE);

 	// Compute LPC using Schur and Levinson-Durbin recursion
@@ -352,19 +242,19 @@ void cudaComputeLPC(
 	shared.ldr[get_local_id(0)] = 0.0f;
 	float error = shared.autoc[0];
 	barrier(CLK_LOCAL_MEM_FENCE);
-	for (int order = 0; order < max_order; order++)
+	for (int order = 0; order < MAX_ORDER; order++)
 	{
 		// Schur recursion
 		float reff = -shared.gen1[0] / error;
 		error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);

 		float gen1;
-		if (get_local_id(0) < max_order - 1 - order)
+		if (get_local_id(0) < MAX_ORDER - 1 - order)
 		{
 			gen1 = shared.gen1[get_local_id(0) + 1] + reff * gen0;
 			gen0 += shared.gen1[get_local_id(0) + 1] * reff;
 		}
 		barrier(CLK_LOCAL_MEM_FENCE);
-		if (get_local_id(0) < max_order - 1 - order)
+		if (get_local_id(0) < MAX_ORDER - 1 - order)
 			shared.gen1[get_local_id(0)] = gen1;

 		// Store prediction error
@@ -385,118 +275,16 @@ void cudaComputeLPC(
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);

 	// Output prediction error estimates
-	if (get_local_id(0) < max_order)
-		lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
+	if (get_local_id(0) < MAX_ORDER)
+		lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
 }
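The kernel's Schur/Levinson-Durbin recursion spread across 32 work-items corresponds to this compact scalar form in plain C: the autocorrelation vector is turned into LPC coefficients while the prediction error shrinks by (1 - reff^2) each order. Toy numbers, illustrative only:

    #include <stdio.h>

    int main(void)
    {
        double autoc[4] = { 10.0, 6.0, 2.0, -1.0 };  /* toy autocorrelation */
        int max_order = 3;
        double lpc[3] = { 0 }, err = autoc[0];
        for (int order = 0; order < max_order; order++) {
            double acc = autoc[order + 1];
            for (int i = 0; i < order; i++)
                acc -= lpc[i] * autoc[order - i];
            double reff = acc / err;              /* reflection coefficient */
            lpc[order] = reff;
            for (int i = 0; i < order / 2; i++) { /* symmetric coefficient update */
                double tmp = lpc[i];
                lpc[i] = tmp - reff * lpc[order - 1 - i];
                lpc[order - 1 - i] -= reff * tmp;
            }
            if (order & 1)
                lpc[order / 2] -= reff * lpc[order / 2];
            err *= 1.0 - reff * reff;             /* prediction error decays */
            printf("order %d: error %g\n", order + 1, err);
        }
        return 0;
    }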
-//__kernel void cudaComputeLPCLattice(
-//	FLACCLSubframeTask *tasks,
-//	const int taskCount, // tasks per block
-//	const int *samples,
-//	const int windowCount,
-//	const int max_order, // should be <= 12
-//	float*lpcs
-//)
-//{
-//	__local struct {
-//		volatile FLACCLSubframeData task;
-//		volatile float F[512];
-//		volatile float arp[32];
-//		volatile float tmp[256];
-//		volatile float error[32];
-//		volatile int lpcOffs;
-//	} shared;
-//
-//	// fetch task data
-//	if (get_local_id(0) < sizeof(shared.task) / sizeof(int))
-//		((int*)&shared.task)[get_local_id(0)] = ((int*)(tasks + taskCount * get_group_id(1)))[get_local_id(0)];
-//	if (get_local_id(0) == 0)
-//		shared.lpcOffs = __mul24(__mul24(get_group_id(1) + 1, windowCount) - 1, max_order + 1) * 32;
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	// F = samples; B = samples
-//	float s1 = get_local_id(0) < shared.task.blocksize ? (samples[shared.task.samplesOffs + get_local_id(0)]) / 32768.0f : 0.0f;
-//	float s2 = get_local_id(0) + 256 < shared.task.blocksize ? (samples[shared.task.samplesOffs + get_local_id(0) + 256]) / 32768.0f : 0.0f;
-//	shared.F[get_local_id(0)] = s1;
-//	shared.F[get_local_id(0) + 256] = s2;
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	shared.tmp[get_local_id(0)] = FSQR(s1) + FSQR(s2);
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	SUM256(shared.tmp, get_local_id(0), +=);
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	float DEN = shared.tmp[0];
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	for (int order = 0; order < max_order; order++)
-//	{
-//		// reff = F(order+1:frameSize) * B(1:frameSize-order)' / DEN
-//		int idxF = get_local_id(0) + order + 1;
-//		int idxF2 = idxF + 256;
-//
-//		shared.tmp[get_local_id(0)] = idxF < shared.task.blocksize ? shared.F[idxF] * s1 : 0.0f;
-//		shared.tmp[get_local_id(0)] += idxF2 < shared.task.blocksize ? shared.F[idxF2] * s2 : 0.0f;
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//		SUM256(shared.tmp, get_local_id(0), +=);
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//		float reff = shared.tmp[0] / DEN;
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//
-//		// arp(order) = rc(order) = reff
-//		if (get_local_id(0) == 0)
-//			shared.arp[order] = reff;
-//		//shared.rc[order - 1] = shared.lpc[order - 1][order - 1] = reff;
-//
-//		// Levinson-Durbin recursion
-//		// arp(1:order-1) = arp(1:order-1) - reff * arp(order-1:-1:1)
-//		if (get_local_id(0) < order)
-//			shared.arp[get_local_id(0)] = shared.arp[get_local_id(0)] - reff * shared.arp[order - 1 - get_local_id(0)];
-//
-//		// Output coeffs
-//		if (get_local_id(0) <= order)
-//			lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = shared.arp[order - get_local_id(0)];
-//
-//		// F1 = F(order+1:frameSize) - reff * B(1:frameSize-order)
-//		// B(1:frameSize-order) = B(1:frameSize-order) - reff * F(order+1:frameSize)
-//		// F(order+1:frameSize) = F1
-//		if (idxF < shared.task.blocksize)
-//		{
-//			float f1 = shared.F[idxF];
-//			shared.F[idxF] -= reff * s1;
-//			s1 -= reff * f1;
-//		}
-//		if (idxF2 < shared.task.blocksize)
-//		{
-//			float f2 = shared.F[idxF2];
-//			shared.F[idxF2] -= reff * s2;
-//			s2 -= reff * f2;
-//		}
-//
-//		// DEN = F(order+1:frameSize) * F(order+1:frameSize)' + B(1:frameSize-order) * B(1:frameSize-order)' (BURG)
-//		shared.tmp[get_local_id(0)] = (idxF + 1 < shared.task.blocksize ? FSQR(shared.F[idxF]) + FSQR(s1) : 0);
-//		shared.tmp[get_local_id(0)] += (idxF2 + 1 < shared.task.blocksize ? FSQR(shared.F[idxF2]) + FSQR(s2) : 0);
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//		SUM256(shared.tmp, get_local_id(0), +=);
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//		DEN = shared.tmp[0] / 2;
-//		// shared.PE[order-1] = shared.tmp[0] / 2 / (frameSize - order + 1);
-//		if (get_local_id(0) == 0)
-//			shared.error[order] = DEN / (shared.task.blocksize - order);
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//	}
-//
-//	// Output prediction error estimates
-//	if (get_local_id(0) < max_order)
-//		lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
-//}
 __kernel __attribute__((reqd_work_group_size(32, 4, 1)))
 void cudaQuantizeLPC(
 	__global FLACCLSubframeTask *tasks,
+	__global float*lpcs,
 	int taskCount, // tasks per block
 	int taskCountLPC, // tasks per set of coeffs (<= 32)
-	__global float*lpcs,
-	int max_order, // should be <= 32
 	int minprecision,
 	int precisions
 )
@@ -515,32 +303,30 @@ void cudaQuantizeLPC(
 	if (tid < sizeof(shared.task) / sizeof(int))
 		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
 	if (tid == 0)
-		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) * 32;
+		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
 	barrier(CLK_LOCAL_MEM_FENCE);

-	// Select best orders based on Akaike's Criteria
 	if (get_local_id(1) == 0)
 	{
-		shared.index[get_local_id(0)] = min(max_order - 1, get_local_id(0));
-		shared.error[get_local_id(0)] = shared.task.blocksize * 64 + get_local_id(0);
-		shared.index[32 + get_local_id(0)] = min(max_order - 1, get_local_id(0));
-		shared.error[32 + get_local_id(0)] = shared.task.blocksize * 64 + get_local_id(0);
+		shared.index[tid] = min(MAX_ORDER - 1, tid);
+		shared.error[tid] = shared.task.blocksize * 64 + tid;
+		shared.index[32 + tid] = min(MAX_ORDER - 1, tid);
+		shared.error[32 + tid] = shared.task.blocksize * 64 + tid;

+		// Select best orders based on Akaike's Criteria
 		// Load prediction error estimates
-		if (get_local_id(0) < max_order)
-			shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)]) + get_local_id(0) * 5.12f * log(shared.task.blocksize);
-		//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
+		if (tid < MAX_ORDER)
+			shared.error[tid] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + tid]) + tid * 5.12f * log(shared.task.blocksize);
+		//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);

 	// Sort using bitonic sort
 	for(int size = 2; size < 64; size <<= 1){
 		//Bitonic merge
-		int ddd = (get_local_id(0) & (size / 2)) == 0;
+		int ddd = (tid & (size / 2)) == 0;
 		for(int stride = size / 2; stride > 0; stride >>= 1){
-			int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+			int pos = 2 * tid - (tid & (stride - 1));
 			float e0, e1;
 			int i0, i1;
 			if (get_local_id(1) == 0)
@@ -551,7 +337,7 @@ void cudaQuantizeLPC(
 				i1 = shared.index[pos + stride];
 			}
 			barrier(CLK_LOCAL_MEM_FENCE);
-			if ((e0 >= e1) == ddd && get_local_id(1) == 0)
+			if (get_local_id(1) == 0 && (e0 >= e1) == ddd)
 			{
 				shared.error[pos] = e1;
 				shared.error[pos + stride] = e0;
@@ -566,7 +352,7 @@ void cudaQuantizeLPC(
 	{
 		for(int stride = 32; stride > 0; stride >>= 1){
 			//barrier(CLK_LOCAL_MEM_FENCE);
-			int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+			int pos = 2 * tid - (tid & (stride - 1));
 			float e0, e1;
 			int i0, i1;
 			if (get_local_id(1) == 0)
@@ -577,7 +363,7 @@ void cudaQuantizeLPC(
 				i1 = shared.index[pos + stride];
 			}
 			barrier(CLK_LOCAL_MEM_FENCE);
-			if (e0 >= e1 && get_local_id(1) == 0)
+			if (get_local_id(1) == 0 && e0 >= e1)
 			{
 				shared.error[pos] = e1;
 				shared.error[pos + stride] = e0;
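The surrounding sort is a textbook bitonic network over the 64 error/index pairs, just guarded so only row 0 of the work-group performs the exchanges. A serial plain-C rendition of the same size/stride loops (N shrunk to 8 for readability):

    #include <stdio.h>

    #define N 8   /* power of two, like the kernel's 64-entry network */

    int main(void)
    {
        float err[N] = { 5, 3, 7, 1, 9, 2, 8, 4 };
        int idx[N] = { 0, 1, 2, 3, 4, 5, 6, 7 };
        for (int size = 2; size <= N; size <<= 1)
            for (int stride = size / 2; stride > 0; stride >>= 1)
                for (int i = 0; i < N / 2; i++) {
                    int pos = 2 * i - (i & (stride - 1));
                    int up = (i & (size / 2)) == 0;  /* merge direction */
                    if ((err[pos] >= err[pos + stride]) == up) {
                        float e = err[pos]; err[pos] = err[pos + stride]; err[pos + stride] = e;
                        int t = idx[pos]; idx[pos] = idx[pos + stride]; idx[pos + stride] = t;
                    }
                }
        for (int i = 0; i < N; i++)
            printf("%d: err=%g idx=%d\n", i, err[i], idx[i]);
        return 0;
    }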
@@ -653,21 +439,21 @@ void cudaQuantizeLPC(
 		}
 	}

-__kernel __attribute__(( vec_type_hint (int4)))
+__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaEstimateResidual(
 	__global int*output,
 	__global int*samples,
 	__global FLACCLSubframeTask *tasks
 )
 {
-	__local float data[128 * 2];
-	__local int residual[128];
+	__local float data[GROUP_SIZE * 2];
+	__local int residual[GROUP_SIZE];
 	__local FLACCLSubframeTask task;
 	__local float4 coefsf4[8];
 	const int tid = get_local_id(0);
 	if (tid < sizeof(task)/sizeof(int))
-		((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
+		((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
 	barrier(CLK_GLOBAL_MEM_FENCE);

 	int ro = task.data.residualOrder;
@@ -677,11 +463,11 @@ void cudaEstimateResidual(
 	if (tid < 32)
 		((__local float *)&coefsf4[0])[tid] = select(0.0f, ((float)task.coefs[tid]) / (1 << task.data.shift), tid < ro);
 	data[tid] = tid < bs ? (float)(samples[task.data.samplesOffs + tid] >> task.data.wbits) : 0.0f;
-	for (int pos = 0; pos < bs; pos += get_local_size(0))
+	for (int pos = 0; pos < bs; pos += GROUP_SIZE)
 	{
 		// fetch samples
-		float nextData = pos + tid + get_local_size(0) < bs ? (float)(samples[task.data.samplesOffs + pos + tid + get_local_size(0)] >> task.data.wbits) : 0.0f;
-		data[tid + get_local_size(0)] = nextData;
+		float nextData = pos + tid + GROUP_SIZE < bs ? (float)(samples[task.data.samplesOffs + pos + tid + GROUP_SIZE] >> task.data.wbits) : 0.0f;
+		data[tid + GROUP_SIZE] = nextData;
 		barrier(CLK_LOCAL_MEM_FENCE);

 		// compute residual
@@ -718,19 +504,19 @@ void cudaEstimateResidual(
 		data[tid] = nextData;
 	}

-	int residualLen = (bs - ro) / get_local_size(0) + select(0, 1, tid < (bs - ro) % get_local_size(0));
+	int residualLen = (bs - ro) / GROUP_SIZE + select(0, 1, tid < (bs - ro) % GROUP_SIZE);
 	int k = clamp(convert_int_rtn(log2((res + 0.000001f) / (residualLen + 0.000001f))), 0, 14);
 	residual[tid] = residualLen * (k + 1) + (convert_int_rtz(res) >> k);
 	barrier(CLK_LOCAL_MEM_FENCE);
-	for (int l = get_local_size(0) / 2; l > 0; l >>= 1)
+	for (int l = GROUP_SIZE / 2; l > 0; l >>= 1)
 	{
 		if (tid < l)
 			residual[tid] += residual[tid + l];
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
 	if (tid == 0)
-		output[get_group_id(1)] = residual[0];
+		output[get_group_id(0)] = residual[0];
 }

 __kernel void cudaChooseBestMethod(
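The tail of cudaEstimateResidual prices each candidate predictor in Rice-coded bits: choose k near log2 of the mean absolute residual, then the cost is roughly n * (k + 1) + (sum >> k). A plain-C version of that estimate with assumed inputs:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float sum_abs = 1000.0f;   /* sum of |residual| over n samples */
        int n = 200;
        int k = (int)floorf(log2f((sum_abs + 1e-6f) / (n + 1e-6f)));
        if (k < 0) k = 0;          /* same clamp to 0..14 as the kernel */
        if (k > 14) k = 14;
        int bits = n * (k + 1) + ((int)sum_abs >> k);
        printf("k = %d, estimated bits = %d\n", k, bits);
        return 0;
    }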