mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
optimizations
This commit is contained in:
@@ -1964,10 +1964,10 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
|
|
||||||
public unsafe void do_output_frames(int nFrames)
|
public unsafe void do_output_frames(int nFrames)
|
||||||
{
|
{
|
||||||
if (task2.frameCount > 0)
|
|
||||||
task2.openCLCQ.Finish();
|
|
||||||
send_to_GPU(task1, nFrames, eparams.block_size);
|
send_to_GPU(task1, nFrames, eparams.block_size);
|
||||||
run_GPU_task(task1);
|
run_GPU_task(task1);
|
||||||
|
if (task2.frameCount > 0)
|
||||||
|
task2.openCLCQ.Finish();
|
||||||
if (task2.frameCount > 0)
|
if (task2.frameCount > 0)
|
||||||
{
|
{
|
||||||
if (cpu_tasks != null)
|
if (cpu_tasks != null)
|
||||||
@@ -2681,9 +2681,11 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
if (UseGPUOnly)
|
if (UseGPUOnly)
|
||||||
{
|
{
|
||||||
clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
|
clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
|
||||||
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
|
|
||||||
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
|
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
|
||||||
|
{
|
||||||
|
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
|
||||||
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
|
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
|
||||||
|
}
|
||||||
clSumPartition = openCLProgram.CreateKernel("clSumPartition");
|
clSumPartition = openCLProgram.CreateKernel("clSumPartition");
|
||||||
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
|
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
|
||||||
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
|
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
|
||||||
@@ -2736,9 +2738,11 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
if (UseGPUOnly)
|
if (UseGPUOnly)
|
||||||
{
|
{
|
||||||
clEncodeResidual.Dispose();
|
clEncodeResidual.Dispose();
|
||||||
clCalcPartition.Dispose();
|
|
||||||
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
|
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
|
||||||
|
{
|
||||||
|
clCalcPartition.Dispose();
|
||||||
clCalcPartition16.Dispose();
|
clCalcPartition16.Dispose();
|
||||||
|
}
|
||||||
clSumPartition.Dispose();
|
clSumPartition.Dispose();
|
||||||
clFindRiceParameter.Dispose();
|
clFindRiceParameter.Dispose();
|
||||||
clFindPartitionOrder.Dispose();
|
clFindPartitionOrder.Dispose();
|
||||||
@@ -2942,11 +2946,19 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
groupSize,
|
groupSize,
|
||||||
nEstimateTasksPerChannel * channelsCount * frameCount); // 1 per channel, 4 channels
|
nEstimateTasksPerChannel * channelsCount * frameCount); // 1 per channel, 4 channels
|
||||||
|
|
||||||
|
int tasksToSecondEstimate = nResidualTasksPerChannel - nEstimateTasksPerChannel;
|
||||||
|
|
||||||
|
//if (nEstimateTasksPerChannel < nTasksPerWindow * nWindowFunctions)
|
||||||
|
//tasksToSecondEstimate -= (nEstimateTasksPerChannel / nWindowFunctions) * (nWindowFunctions - 1);
|
||||||
|
|
||||||
clSelectStereoTasks.SetArgs(
|
clSelectStereoTasks.SetArgs(
|
||||||
clResidualTasks,
|
clResidualTasks,
|
||||||
clSelectedTasks,
|
clSelectedTasks,
|
||||||
clSelectedTasksSecondEstimate,
|
clSelectedTasksSecondEstimate,
|
||||||
clSelectedTasksBestMethod,
|
clSelectedTasksBestMethod,
|
||||||
|
nTasksPerWindow,
|
||||||
|
nWindowFunctions,
|
||||||
|
tasksToSecondEstimate,
|
||||||
nResidualTasksPerChannel,
|
nResidualTasksPerChannel,
|
||||||
nEstimateTasksPerChannel);
|
nEstimateTasksPerChannel);
|
||||||
|
|
||||||
@@ -2954,7 +2966,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
clSelectStereoTasks,
|
clSelectStereoTasks,
|
||||||
0, frameCount);
|
0, frameCount);
|
||||||
|
|
||||||
if (nEstimateTasksPerChannel < nResidualTasksPerChannel)
|
if (tasksToSecondEstimate > 0)
|
||||||
{
|
{
|
||||||
clEstimateResidual.SetArgs(
|
clEstimateResidual.SetArgs(
|
||||||
clSamples,
|
clSamples,
|
||||||
@@ -2964,7 +2976,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
openCLCQ.EnqueueNDRangeKernel(
|
openCLCQ.EnqueueNDRangeKernel(
|
||||||
clEstimateResidual,
|
clEstimateResidual,
|
||||||
groupSize,
|
groupSize,
|
||||||
(nResidualTasksPerChannel - nEstimateTasksPerChannel) * channels * frameCount);
|
tasksToSecondEstimate * channels * frameCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
clChooseBestMethod.SetArgs(
|
clChooseBestMethod.SetArgs(
|
||||||
@@ -3003,15 +3015,20 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
if (UseGPUOnly)
|
if (UseGPUOnly)
|
||||||
{
|
{
|
||||||
clEncodeResidual.SetArgs(
|
clEncodeResidual.SetArgs(
|
||||||
|
clPartitions,
|
||||||
clResidual,
|
clResidual,
|
||||||
clSamples,
|
clSamples,
|
||||||
clBestResidualTasks);
|
clBestResidualTasks,
|
||||||
|
max_porder,
|
||||||
|
frameSize >> max_porder);
|
||||||
|
|
||||||
openCLCQ.EnqueueNDRangeKernel(
|
openCLCQ.EnqueueNDRangeKernel(
|
||||||
clEncodeResidual,
|
clEncodeResidual,
|
||||||
groupSize, channels * frameCount);
|
groupSize, channels * frameCount);
|
||||||
|
|
||||||
if ((frameSize >> max_porder == 16) && openCLCQ.Device.DeviceType != DeviceType.CPU)
|
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
|
||||||
|
{
|
||||||
|
if (frameSize >> max_porder == 16)
|
||||||
{
|
{
|
||||||
clCalcPartition16.SetArgs(
|
clCalcPartition16.SetArgs(
|
||||||
clPartitions,
|
clPartitions,
|
||||||
@@ -3032,19 +3049,13 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
max_porder,
|
max_porder,
|
||||||
frameSize >> max_porder);
|
frameSize >> max_porder);
|
||||||
|
|
||||||
if (openCLCQ.Device.DeviceType == DeviceType.CPU)
|
|
||||||
openCLCQ.EnqueueNDRangeKernel(
|
|
||||||
clCalcPartition,
|
|
||||||
groupSize, 1,
|
|
||||||
1,
|
|
||||||
channels * frameCount);
|
|
||||||
else
|
|
||||||
openCLCQ.EnqueueNDRangeKernel(
|
openCLCQ.EnqueueNDRangeKernel(
|
||||||
clCalcPartition,
|
clCalcPartition,
|
||||||
groupSize, 1,
|
groupSize, 1,
|
||||||
1 + ((1 << max_porder) - 1) / (groupSize / 16),
|
1 + ((1 << max_porder) - 1) / (groupSize / 16),
|
||||||
channels * frameCount);
|
channels * frameCount);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (max_porder > 0)
|
if (max_porder > 0)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -896,13 +896,13 @@ inline int fastclz64(long iv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if BITS_PER_SAMPLE > 16
|
#if BITS_PER_SAMPLE > 16
|
||||||
typedef long residual_t;
|
#define residual_t long
|
||||||
#define residual_log(s) (63 - fastclz64(s))
|
#define residual_log(s) (63 - fastclz64(s))
|
||||||
#define convert_bps4 convert_long4
|
#define convert_bps4 convert_long4
|
||||||
#define convert_bps_sat convert_int_sat
|
#define convert_bps_sat convert_int_sat
|
||||||
#define bpsint4 long4
|
#define bpsint4 long4
|
||||||
#else
|
#else
|
||||||
typedef int residual_t;
|
#define residual_t int
|
||||||
#define residual_log(s) (31 - fastclz(s))
|
#define residual_log(s) (31 - fastclz(s))
|
||||||
#define convert_bps4
|
#define convert_bps4
|
||||||
#define convert_bps_sat
|
#define convert_bps_sat
|
||||||
@@ -967,50 +967,82 @@ void clEstimateResidual(
|
|||||||
|
|
||||||
SWITCH_N((len[pos >> 6] += fabs((float)t)))
|
SWITCH_N((len[pos >> 6] += fabs((float)t)))
|
||||||
#else
|
#else
|
||||||
float fcoef[32];
|
if (ro <= 4)
|
||||||
for (int tid = 0; tid < 32; tid++)
|
{
|
||||||
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
|
float fcoef[4];
|
||||||
|
for (int tid = 0; tid < 4; tid++)
|
||||||
|
fcoef[tid] = tid + ro - 4 < 0 ? 0.0f : - ((float) task.coefs[tid + ro - 4]) / (1 << task.data.shift);
|
||||||
|
float4 fc0 = vload4(0, &fcoef[0]);
|
||||||
|
float fdata[4];
|
||||||
|
for (int pos = 0; pos < 4; pos++)
|
||||||
|
fdata[pos] = pos + ro - 4 < 0 ? 0.0f : (float)(data[pos + ro - 4] >> task.data.wbits);
|
||||||
|
float4 fd0 = vload4(0, &fdata[0]);
|
||||||
|
for (int pos = ro; pos < bs; pos ++)
|
||||||
|
{
|
||||||
|
float4 sum4 = fc0 * fd0;
|
||||||
|
float2 sum2 = sum4.s01 + sum4.s23;
|
||||||
|
fd0 = fd0.s1230;
|
||||||
|
fd0.s3 = (float)(data[pos] >> task.data.wbits);
|
||||||
|
len[pos >> 6] += fabs(fd0.s3 + (sum2.x + sum2.y));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (ro <= 8)
|
||||||
|
{
|
||||||
|
float fcoef[8];
|
||||||
|
for (int tid = 0; tid < 8; tid++)
|
||||||
|
fcoef[tid] = tid + ro - 8 < 0 ? 0.0f : - ((float) task.coefs[tid + ro - 8]) / (1 << task.data.shift);
|
||||||
|
float8 fc0 = vload8(0, &fcoef[0]);
|
||||||
|
float fdata[8];
|
||||||
|
for (int pos = 0; pos < 8; pos++)
|
||||||
|
fdata[pos] = pos + ro - 8 < 0 ? 0.0f : (float)(data[pos + ro - 8] >> task.data.wbits);
|
||||||
|
float8 fd0 = vload8(0, &fdata[0]);
|
||||||
|
for (int pos = ro; pos < bs; pos ++)
|
||||||
|
{
|
||||||
|
float8 sum8 = fc0 * fd0;
|
||||||
|
float4 sum4 = sum8.s0123 + sum8.s4567;
|
||||||
|
float2 sum2 = sum4.s01 + sum4.s23;
|
||||||
|
fd0 = fd0.s12345670;
|
||||||
|
fd0.s7 = (float)(data[pos] >> task.data.wbits);
|
||||||
|
len[pos >> 6] += fabs(fd0.s7 + (sum2.x + sum2.y));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (ro <= 12)
|
||||||
|
{
|
||||||
|
float fcoef[12];
|
||||||
|
for (int tid = 0; tid < 12; tid++)
|
||||||
|
fcoef[tid] = tid + ro - 12 >= 0 ? - ((float) task.coefs[tid + ro - 12]) / (1 << task.data.shift) : 0.0f;
|
||||||
float4 fc0 = vload4(0, &fcoef[0]);
|
float4 fc0 = vload4(0, &fcoef[0]);
|
||||||
float4 fc1 = vload4(1, &fcoef[0]);
|
float4 fc1 = vload4(1, &fcoef[0]);
|
||||||
#if MAX_ORDER > 8
|
|
||||||
float4 fc2 = vload4(2, &fcoef[0]);
|
float4 fc2 = vload4(2, &fcoef[0]);
|
||||||
#endif
|
float fdata[12];
|
||||||
|
for (int pos = 0; pos < 12; pos++)
|
||||||
#if MAX_ORDER == 8
|
fdata[pos] = pos + ro - 12 < 0 ? 0.0f : (float)(data[pos + ro - 12] >> task.data.wbits);
|
||||||
float fdata[32];
|
float4 fd0 = vload4(0, &fdata[0]);
|
||||||
for (int pos = 0; pos < MAX_ORDER + ro; pos++)
|
float4 fd1 = vload4(1, &fdata[0]);
|
||||||
fdata[pos] = pos < MAX_ORDER ? 0.0f : (float)(data[pos - MAX_ORDER] >> task.data.wbits);
|
float4 fd2 = vload4(2, &fdata[0]);
|
||||||
float4 fd0 = vload4(0, &fdata[ro]);
|
|
||||||
float4 fd1 = vload4(1, &fdata[ro]);
|
|
||||||
for (int pos = ro; pos < bs; pos ++)
|
for (int pos = ro; pos < bs; pos ++)
|
||||||
{
|
{
|
||||||
float4 sum = fc0 * fd0 + fc1 * fd1;
|
float4 sum4 = fc0 * fd0 + fc1 * fd1 + fc2 * fd2;
|
||||||
fd0 = fd0.s1230;
|
float2 sum2 = sum4.s01 + sum4.s23;
|
||||||
fd1 = fd1.s1230;
|
|
||||||
fd0.s3 = fd1.s3;
|
|
||||||
fd1.s3 = (float)(data[pos] >> task.data.wbits);
|
|
||||||
len[pos >> 6] += fabs(fd1.s3 + (sum.x + sum.y + sum.z + sum.w));
|
|
||||||
}
|
|
||||||
#elif MAX_ORDER == 12
|
|
||||||
float fdata[32];
|
|
||||||
for (int pos = 0; pos < MAX_ORDER + ro; pos++)
|
|
||||||
fdata[pos] = pos < MAX_ORDER ? 0.0f : (float)(data[pos - MAX_ORDER] >> task.data.wbits);
|
|
||||||
float4 fd0 = vload4(0, &fdata[ro]);
|
|
||||||
float4 fd1 = vload4(1, &fdata[ro]);
|
|
||||||
float4 fd2 = vload4(2, &fdata[ro]);
|
|
||||||
for (int pos = ro; pos < bs; pos ++)
|
|
||||||
{
|
|
||||||
float4 sum = fc0 * fd0 + fc1 * fd1 + fc2 * fd2;
|
|
||||||
fd0 = fd0.s1230;
|
fd0 = fd0.s1230;
|
||||||
fd1 = fd1.s1230;
|
fd1 = fd1.s1230;
|
||||||
fd2 = fd2.s1230;
|
fd2 = fd2.s1230;
|
||||||
fd0.s3 = fd1.s3;
|
fd0.s3 = fd1.s3;
|
||||||
fd1.s3 = fd2.s3;
|
fd1.s3 = fd2.s3;
|
||||||
fd2.s3 = (float)(data[pos] >> task.data.wbits);
|
fd2.s3 = (float)(data[pos] >> task.data.wbits);
|
||||||
len[pos >> 6] += fabs(fd2.s3 + (sum.x + sum.y + sum.z + sum.w));
|
len[pos >> 6] += fabs(fd2.s3 + (sum2.x + sum2.y));
|
||||||
}
|
}
|
||||||
#else
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
float fcoef[32];
|
||||||
|
for (int tid = 0; tid < 32; tid++)
|
||||||
|
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
|
||||||
|
|
||||||
|
float4 fc0 = vload4(0, &fcoef[0]);
|
||||||
|
float4 fc1 = vload4(1, &fcoef[0]);
|
||||||
|
float4 fc2 = vload4(2, &fcoef[0]);
|
||||||
|
|
||||||
float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
|
float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
|
||||||
for (int pos = 0; pos < MAX_ORDER; pos++)
|
for (int pos = 0; pos < MAX_ORDER; pos++)
|
||||||
fdata[pos] = 0.0f;
|
fdata[pos] = 0.0f;
|
||||||
@@ -1048,7 +1080,7 @@ void clEstimateResidual(
|
|||||||
len[pos >> 6] += fabs(next);
|
len[pos >> 6] += fabs(next);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
}
|
||||||
#endif
|
#endif
|
||||||
int total = 0;
|
int total = 0;
|
||||||
for (int i = 0; i < ERPARTS; i++)
|
for (int i = 0; i < ERPARTS; i++)
|
||||||
@@ -1257,22 +1289,31 @@ void clSelectStereoTasks(
|
|||||||
__global int*selectedTasks,
|
__global int*selectedTasks,
|
||||||
__global int*selectedTasksSecondEstimate,
|
__global int*selectedTasksSecondEstimate,
|
||||||
__global int*selectedTasksBestMethod,
|
__global int*selectedTasksBestMethod,
|
||||||
|
int tasksWindow,
|
||||||
|
int windowCount,
|
||||||
|
int tasksToSecondEstimate,
|
||||||
int taskCount,
|
int taskCount,
|
||||||
int selectedCount
|
int selectedCount
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
int best_size[4];
|
int best_size[4];
|
||||||
|
int best_wind[4];
|
||||||
for (int ch = 0; ch < 4; ch++)
|
for (int ch = 0; ch < 4; ch++)
|
||||||
{
|
{
|
||||||
int first_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount];
|
int first_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount];
|
||||||
int best_len = tasks[first_no].data.size;
|
int best_len = tasks[first_no].data.size;
|
||||||
|
int best_wnd = 0;
|
||||||
for (int i = 1; i < selectedCount; i++)
|
for (int i = 1; i < selectedCount; i++)
|
||||||
{
|
{
|
||||||
int task_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + i];
|
int task_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + i];
|
||||||
int task_len = tasks[task_no].data.size;
|
int task_len = tasks[task_no].data.size;
|
||||||
|
int task_wnd = (task_no - first_no) / tasksWindow;
|
||||||
|
task_wnd = select(0, task_wnd, task_wnd < windowCount);
|
||||||
|
best_wnd = select(best_wnd, task_wnd, task_len < best_len);
|
||||||
best_len = min(task_len, best_len);
|
best_len = min(task_len, best_len);
|
||||||
}
|
}
|
||||||
best_size[ch] = best_len;
|
best_size[ch] = best_len;
|
||||||
|
best_wind[ch] = best_wnd;
|
||||||
}
|
}
|
||||||
|
|
||||||
int bitsBest = best_size[2] + best_size[3]; // MidSide
|
int bitsBest = best_size[2] + best_size[3]; // MidSide
|
||||||
@@ -1291,16 +1332,17 @@ void clSelectStereoTasks(
|
|||||||
int ch = select(chMask & 3, chMask >> 2, ich > 0);
|
int ch = select(chMask & 3, chMask >> 2, ich > 0);
|
||||||
int roffs = tasks[(get_global_id(0) * 4 + ich) * taskCount].data.samplesOffs;
|
int roffs = tasks[(get_global_id(0) * 4 + ich) * taskCount].data.samplesOffs;
|
||||||
int nonSelectedNo = 0;
|
int nonSelectedNo = 0;
|
||||||
for (int i = 0; i < taskCount; i++)
|
for (int j = taskCount - 1; j >= 0; j--)
|
||||||
{
|
{
|
||||||
|
int i = select(j, (j % windowCount) * tasksWindow + (j / windowCount), j < windowCount * tasksWindow);
|
||||||
int no = (get_global_id(0) * 4 + ch) * taskCount + i;
|
int no = (get_global_id(0) * 4 + ch) * taskCount + i;
|
||||||
selectedTasksBestMethod[(get_global_id(0) * 2 + ich) * taskCount + i] = no;
|
selectedTasksBestMethod[(get_global_id(0) * 2 + ich) * taskCount + i] = no;
|
||||||
tasks[no].data.residualOffs = roffs;
|
tasks[no].data.residualOffs = roffs;
|
||||||
int selectedFound = 0;
|
if (j >= selectedCount)
|
||||||
for(int selectedNo = 0; selectedNo < selectedCount; selectedNo++)
|
tasks[no].data.size = 0x7fffffff;
|
||||||
selectedFound |= (selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + selectedNo] == no);
|
if (nonSelectedNo < tasksToSecondEstimate)
|
||||||
if (!selectedFound)
|
if (tasksToSecondEstimate == taskCount - selectedCount || best_wind[ch] == i / tasksWindow || i >= windowCount * tasksWindow)
|
||||||
selectedTasksSecondEstimate[(get_global_id(0) * 2 + ich) * (taskCount - selectedCount) + nonSelectedNo++] = no;
|
selectedTasksSecondEstimate[(get_global_id(0) * 2 + ich) * tasksToSecondEstimate + nonSelectedNo++] = no;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1330,24 +1372,42 @@ void clChooseBestMethod(
|
|||||||
// get_group_id(0) == task index
|
// get_group_id(0) == task index
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
void clEncodeResidual(
|
void clEncodeResidual(
|
||||||
|
__global ulong *partition_lengths,
|
||||||
__global int *residual,
|
__global int *residual,
|
||||||
__global int *samples,
|
__global int *samples,
|
||||||
__global FLACCLSubframeTask *tasks
|
__global FLACCLSubframeTask *tasks,
|
||||||
|
int max_porder, // <= 8
|
||||||
|
int psize // == task.blocksize >> max_porder?
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
FLACCLSubframeTask task = tasks[get_group_id(0)];
|
FLACCLSubframeTask task = tasks[get_group_id(0)];
|
||||||
int bs = task.data.blocksize;
|
int bs = task.data.blocksize;
|
||||||
int ro = task.data.residualOrder;
|
int ro = task.data.residualOrder;
|
||||||
__global int *data = &samples[task.data.samplesOffs];
|
__global int *data = &samples[task.data.samplesOffs];
|
||||||
SWITCH_N(residual[task.data.residualOffs + pos] = convert_bps_sat(t));
|
__global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(0);
|
||||||
|
int r;
|
||||||
|
for (int p = 0; p < (1 << max_porder); p++)
|
||||||
|
pl[p] = 0UL;
|
||||||
|
__global int *rptr = residual + task.data.residualOffs;
|
||||||
|
if (psize == 16)
|
||||||
|
{
|
||||||
|
SWITCH_N((rptr[pos] = r = convert_bps_sat(t), pl[pos >> 4] += (uint)((r << 1) ^ (r >> 31))));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
SWITCH_N((rptr[pos] = r = convert_bps_sat(t), pl[pos / psize] += (uint)((r << 1) ^ (r >> 31))));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
// get_group_id(0) == task index
|
// get_group_id(0) == task index
|
||||||
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||||
void clEncodeResidual(
|
void clEncodeResidual(
|
||||||
|
__global int *partition_lengths,
|
||||||
__global int *output,
|
__global int *output,
|
||||||
__global int *samples,
|
__global int *samples,
|
||||||
__global FLACCLSubframeTask *tasks
|
__global FLACCLSubframeTask *tasks,
|
||||||
|
int max_porder, // <= 8
|
||||||
|
int psize // == task.blocksize >> max_porder?
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
__local FLACCLSubframeTask task;
|
__local FLACCLSubframeTask task;
|
||||||
@@ -1407,34 +1467,7 @@ void clEncodeResidual(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef FLACCL_CPU
|
#ifndef FLACCL_CPU
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
|
||||||
void clCalcPartition(
|
|
||||||
__global ulong *partition_lengths,
|
|
||||||
__global int *residual,
|
|
||||||
__global FLACCLSubframeTask *tasks,
|
|
||||||
int max_porder, // <= 8
|
|
||||||
int psize // == task.blocksize >> max_porder?
|
|
||||||
)
|
|
||||||
{
|
|
||||||
FLACCLSubframeTask task = tasks[get_group_id(1)];
|
|
||||||
int bs = task.data.blocksize;
|
|
||||||
int ro = task.data.residualOrder;
|
|
||||||
//int psize = bs >> max_porder;
|
|
||||||
__global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
|
|
||||||
|
|
||||||
for (int p = 0; p < (1 << max_porder); p++)
|
|
||||||
pl[p] = 0UL;
|
|
||||||
|
|
||||||
for (int pos = ro; pos < bs; pos ++)
|
|
||||||
{
|
|
||||||
int s = residual[task.data.residualOffs + pos];
|
|
||||||
// convert to unsigned
|
|
||||||
uint t = (s << 1) ^ (s >> 31);
|
|
||||||
pl[pos / psize] += t;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
// get_group_id(0) == partition index / (GROUP_SIZE / 16)
|
// get_group_id(0) == partition index / (GROUP_SIZE / 16)
|
||||||
// get_group_id(1) == task index
|
// get_group_id(1) == task index
|
||||||
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||||
|
|||||||
Reference in New Issue
Block a user