optimizations

This commit is contained in:
chudov
2010-12-10 05:19:39 +00:00
parent c96f27b0de
commit aef331476c
2 changed files with 192 additions and 148 deletions

View File

@@ -1964,10 +1964,10 @@ namespace CUETools.Codecs.FLACCL
public unsafe void do_output_frames(int nFrames) public unsafe void do_output_frames(int nFrames)
{ {
if (task2.frameCount > 0)
task2.openCLCQ.Finish();
send_to_GPU(task1, nFrames, eparams.block_size); send_to_GPU(task1, nFrames, eparams.block_size);
run_GPU_task(task1); run_GPU_task(task1);
if (task2.frameCount > 0)
task2.openCLCQ.Finish();
if (task2.frameCount > 0) if (task2.frameCount > 0)
{ {
if (cpu_tasks != null) if (cpu_tasks != null)
@@ -2681,9 +2681,11 @@ namespace CUETools.Codecs.FLACCL
if (UseGPUOnly) if (UseGPUOnly)
{ {
clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual"); clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
if (openCLCQ.Device.DeviceType != DeviceType.CPU) if (openCLCQ.Device.DeviceType != DeviceType.CPU)
{
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16"); clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
}
clSumPartition = openCLProgram.CreateKernel("clSumPartition"); clSumPartition = openCLProgram.CreateKernel("clSumPartition");
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter"); clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder"); clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
@@ -2736,9 +2738,11 @@ namespace CUETools.Codecs.FLACCL
if (UseGPUOnly) if (UseGPUOnly)
{ {
clEncodeResidual.Dispose(); clEncodeResidual.Dispose();
clCalcPartition.Dispose();
if (openCLCQ.Device.DeviceType != DeviceType.CPU) if (openCLCQ.Device.DeviceType != DeviceType.CPU)
{
clCalcPartition.Dispose();
clCalcPartition16.Dispose(); clCalcPartition16.Dispose();
}
clSumPartition.Dispose(); clSumPartition.Dispose();
clFindRiceParameter.Dispose(); clFindRiceParameter.Dispose();
clFindPartitionOrder.Dispose(); clFindPartitionOrder.Dispose();
@@ -2942,11 +2946,19 @@ namespace CUETools.Codecs.FLACCL
groupSize, groupSize,
nEstimateTasksPerChannel * channelsCount * frameCount); // 1 per channel, 4 channels nEstimateTasksPerChannel * channelsCount * frameCount); // 1 per channel, 4 channels
int tasksToSecondEstimate = nResidualTasksPerChannel - nEstimateTasksPerChannel;
//if (nEstimateTasksPerChannel < nTasksPerWindow * nWindowFunctions)
//tasksToSecondEstimate -= (nEstimateTasksPerChannel / nWindowFunctions) * (nWindowFunctions - 1);
clSelectStereoTasks.SetArgs( clSelectStereoTasks.SetArgs(
clResidualTasks, clResidualTasks,
clSelectedTasks, clSelectedTasks,
clSelectedTasksSecondEstimate, clSelectedTasksSecondEstimate,
clSelectedTasksBestMethod, clSelectedTasksBestMethod,
nTasksPerWindow,
nWindowFunctions,
tasksToSecondEstimate,
nResidualTasksPerChannel, nResidualTasksPerChannel,
nEstimateTasksPerChannel); nEstimateTasksPerChannel);
@@ -2954,7 +2966,7 @@ namespace CUETools.Codecs.FLACCL
clSelectStereoTasks, clSelectStereoTasks,
0, frameCount); 0, frameCount);
if (nEstimateTasksPerChannel < nResidualTasksPerChannel) if (tasksToSecondEstimate > 0)
{ {
clEstimateResidual.SetArgs( clEstimateResidual.SetArgs(
clSamples, clSamples,
@@ -2964,7 +2976,7 @@ namespace CUETools.Codecs.FLACCL
openCLCQ.EnqueueNDRangeKernel( openCLCQ.EnqueueNDRangeKernel(
clEstimateResidual, clEstimateResidual,
groupSize, groupSize,
(nResidualTasksPerChannel - nEstimateTasksPerChannel) * channels * frameCount); tasksToSecondEstimate * channels * frameCount);
} }
clChooseBestMethod.SetArgs( clChooseBestMethod.SetArgs(
@@ -3003,15 +3015,20 @@ namespace CUETools.Codecs.FLACCL
if (UseGPUOnly) if (UseGPUOnly)
{ {
clEncodeResidual.SetArgs( clEncodeResidual.SetArgs(
clPartitions,
clResidual, clResidual,
clSamples, clSamples,
clBestResidualTasks); clBestResidualTasks,
max_porder,
frameSize >> max_porder);
openCLCQ.EnqueueNDRangeKernel( openCLCQ.EnqueueNDRangeKernel(
clEncodeResidual, clEncodeResidual,
groupSize, channels * frameCount); groupSize, channels * frameCount);
if ((frameSize >> max_porder == 16) && openCLCQ.Device.DeviceType != DeviceType.CPU) if (openCLCQ.Device.DeviceType != DeviceType.CPU)
{
if (frameSize >> max_porder == 16)
{ {
clCalcPartition16.SetArgs( clCalcPartition16.SetArgs(
clPartitions, clPartitions,
@@ -3032,19 +3049,13 @@ namespace CUETools.Codecs.FLACCL
max_porder, max_porder,
frameSize >> max_porder); frameSize >> max_porder);
if (openCLCQ.Device.DeviceType == DeviceType.CPU)
openCLCQ.EnqueueNDRangeKernel(
clCalcPartition,
groupSize, 1,
1,
channels * frameCount);
else
openCLCQ.EnqueueNDRangeKernel( openCLCQ.EnqueueNDRangeKernel(
clCalcPartition, clCalcPartition,
groupSize, 1, groupSize, 1,
1 + ((1 << max_porder) - 1) / (groupSize / 16), 1 + ((1 << max_porder) - 1) / (groupSize / 16),
channels * frameCount); channels * frameCount);
} }
}
if (max_porder > 0) if (max_porder > 0)
{ {

View File

@@ -896,13 +896,13 @@ inline int fastclz64(long iv)
} }
#if BITS_PER_SAMPLE > 16 #if BITS_PER_SAMPLE > 16
typedef long residual_t; #define residual_t long
#define residual_log(s) (63 - fastclz64(s)) #define residual_log(s) (63 - fastclz64(s))
#define convert_bps4 convert_long4 #define convert_bps4 convert_long4
#define convert_bps_sat convert_int_sat #define convert_bps_sat convert_int_sat
#define bpsint4 long4 #define bpsint4 long4
#else #else
typedef int residual_t; #define residual_t int
#define residual_log(s) (31 - fastclz(s)) #define residual_log(s) (31 - fastclz(s))
#define convert_bps4 #define convert_bps4
#define convert_bps_sat #define convert_bps_sat
@@ -967,50 +967,82 @@ void clEstimateResidual(
SWITCH_N((len[pos >> 6] += fabs((float)t))) SWITCH_N((len[pos >> 6] += fabs((float)t)))
#else #else
float fcoef[32]; if (ro <= 4)
for (int tid = 0; tid < 32; tid++) {
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f; float fcoef[4];
for (int tid = 0; tid < 4; tid++)
fcoef[tid] = tid + ro - 4 < 0 ? 0.0f : - ((float) task.coefs[tid + ro - 4]) / (1 << task.data.shift);
float4 fc0 = vload4(0, &fcoef[0]);
float fdata[4];
for (int pos = 0; pos < 4; pos++)
fdata[pos] = pos + ro - 4 < 0 ? 0.0f : (float)(data[pos + ro - 4] >> task.data.wbits);
float4 fd0 = vload4(0, &fdata[0]);
for (int pos = ro; pos < bs; pos ++)
{
float4 sum4 = fc0 * fd0;
float2 sum2 = sum4.s01 + sum4.s23;
fd0 = fd0.s1230;
fd0.s3 = (float)(data[pos] >> task.data.wbits);
len[pos >> 6] += fabs(fd0.s3 + (sum2.x + sum2.y));
}
}
else if (ro <= 8)
{
float fcoef[8];
for (int tid = 0; tid < 8; tid++)
fcoef[tid] = tid + ro - 8 < 0 ? 0.0f : - ((float) task.coefs[tid + ro - 8]) / (1 << task.data.shift);
float8 fc0 = vload8(0, &fcoef[0]);
float fdata[8];
for (int pos = 0; pos < 8; pos++)
fdata[pos] = pos + ro - 8 < 0 ? 0.0f : (float)(data[pos + ro - 8] >> task.data.wbits);
float8 fd0 = vload8(0, &fdata[0]);
for (int pos = ro; pos < bs; pos ++)
{
float8 sum8 = fc0 * fd0;
float4 sum4 = sum8.s0123 + sum8.s4567;
float2 sum2 = sum4.s01 + sum4.s23;
fd0 = fd0.s12345670;
fd0.s7 = (float)(data[pos] >> task.data.wbits);
len[pos >> 6] += fabs(fd0.s7 + (sum2.x + sum2.y));
}
}
else if (ro <= 12)
{
float fcoef[12];
for (int tid = 0; tid < 12; tid++)
fcoef[tid] = tid + ro - 12 >= 0 ? - ((float) task.coefs[tid + ro - 12]) / (1 << task.data.shift) : 0.0f;
float4 fc0 = vload4(0, &fcoef[0]); float4 fc0 = vload4(0, &fcoef[0]);
float4 fc1 = vload4(1, &fcoef[0]); float4 fc1 = vload4(1, &fcoef[0]);
#if MAX_ORDER > 8
float4 fc2 = vload4(2, &fcoef[0]); float4 fc2 = vload4(2, &fcoef[0]);
#endif float fdata[12];
for (int pos = 0; pos < 12; pos++)
#if MAX_ORDER == 8 fdata[pos] = pos + ro - 12 < 0 ? 0.0f : (float)(data[pos + ro - 12] >> task.data.wbits);
float fdata[32]; float4 fd0 = vload4(0, &fdata[0]);
for (int pos = 0; pos < MAX_ORDER + ro; pos++) float4 fd1 = vload4(1, &fdata[0]);
fdata[pos] = pos < MAX_ORDER ? 0.0f : (float)(data[pos - MAX_ORDER] >> task.data.wbits); float4 fd2 = vload4(2, &fdata[0]);
float4 fd0 = vload4(0, &fdata[ro]);
float4 fd1 = vload4(1, &fdata[ro]);
for (int pos = ro; pos < bs; pos ++) for (int pos = ro; pos < bs; pos ++)
{ {
float4 sum = fc0 * fd0 + fc1 * fd1; float4 sum4 = fc0 * fd0 + fc1 * fd1 + fc2 * fd2;
fd0 = fd0.s1230; float2 sum2 = sum4.s01 + sum4.s23;
fd1 = fd1.s1230;
fd0.s3 = fd1.s3;
fd1.s3 = (float)(data[pos] >> task.data.wbits);
len[pos >> 6] += fabs(fd1.s3 + (sum.x + sum.y + sum.z + sum.w));
}
#elif MAX_ORDER == 12
float fdata[32];
for (int pos = 0; pos < MAX_ORDER + ro; pos++)
fdata[pos] = pos < MAX_ORDER ? 0.0f : (float)(data[pos - MAX_ORDER] >> task.data.wbits);
float4 fd0 = vload4(0, &fdata[ro]);
float4 fd1 = vload4(1, &fdata[ro]);
float4 fd2 = vload4(2, &fdata[ro]);
for (int pos = ro; pos < bs; pos ++)
{
float4 sum = fc0 * fd0 + fc1 * fd1 + fc2 * fd2;
fd0 = fd0.s1230; fd0 = fd0.s1230;
fd1 = fd1.s1230; fd1 = fd1.s1230;
fd2 = fd2.s1230; fd2 = fd2.s1230;
fd0.s3 = fd1.s3; fd0.s3 = fd1.s3;
fd1.s3 = fd2.s3; fd1.s3 = fd2.s3;
fd2.s3 = (float)(data[pos] >> task.data.wbits); fd2.s3 = (float)(data[pos] >> task.data.wbits);
len[pos >> 6] += fabs(fd2.s3 + (sum.x + sum.y + sum.z + sum.w)); len[pos >> 6] += fabs(fd2.s3 + (sum2.x + sum2.y));
} }
#else }
else
{
float fcoef[32];
for (int tid = 0; tid < 32; tid++)
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
float4 fc0 = vload4(0, &fcoef[0]);
float4 fc1 = vload4(1, &fcoef[0]);
float4 fc2 = vload4(2, &fcoef[0]);
float fdata[MAX_ORDER + TEMPBLOCK1 + 32]; float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
for (int pos = 0; pos < MAX_ORDER; pos++) for (int pos = 0; pos < MAX_ORDER; pos++)
fdata[pos] = 0.0f; fdata[pos] = 0.0f;
@@ -1048,7 +1080,7 @@ void clEstimateResidual(
len[pos >> 6] += fabs(next); len[pos >> 6] += fabs(next);
} }
} }
#endif }
#endif #endif
int total = 0; int total = 0;
for (int i = 0; i < ERPARTS; i++) for (int i = 0; i < ERPARTS; i++)
@@ -1257,22 +1289,31 @@ void clSelectStereoTasks(
__global int*selectedTasks, __global int*selectedTasks,
__global int*selectedTasksSecondEstimate, __global int*selectedTasksSecondEstimate,
__global int*selectedTasksBestMethod, __global int*selectedTasksBestMethod,
int tasksWindow,
int windowCount,
int tasksToSecondEstimate,
int taskCount, int taskCount,
int selectedCount int selectedCount
) )
{ {
int best_size[4]; int best_size[4];
int best_wind[4];
for (int ch = 0; ch < 4; ch++) for (int ch = 0; ch < 4; ch++)
{ {
int first_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount]; int first_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount];
int best_len = tasks[first_no].data.size; int best_len = tasks[first_no].data.size;
int best_wnd = 0;
for (int i = 1; i < selectedCount; i++) for (int i = 1; i < selectedCount; i++)
{ {
int task_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + i]; int task_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + i];
int task_len = tasks[task_no].data.size; int task_len = tasks[task_no].data.size;
int task_wnd = (task_no - first_no) / tasksWindow;
task_wnd = select(0, task_wnd, task_wnd < windowCount);
best_wnd = select(best_wnd, task_wnd, task_len < best_len);
best_len = min(task_len, best_len); best_len = min(task_len, best_len);
} }
best_size[ch] = best_len; best_size[ch] = best_len;
best_wind[ch] = best_wnd;
} }
int bitsBest = best_size[2] + best_size[3]; // MidSide int bitsBest = best_size[2] + best_size[3]; // MidSide
@@ -1291,16 +1332,17 @@ void clSelectStereoTasks(
int ch = select(chMask & 3, chMask >> 2, ich > 0); int ch = select(chMask & 3, chMask >> 2, ich > 0);
int roffs = tasks[(get_global_id(0) * 4 + ich) * taskCount].data.samplesOffs; int roffs = tasks[(get_global_id(0) * 4 + ich) * taskCount].data.samplesOffs;
int nonSelectedNo = 0; int nonSelectedNo = 0;
for (int i = 0; i < taskCount; i++) for (int j = taskCount - 1; j >= 0; j--)
{ {
int i = select(j, (j % windowCount) * tasksWindow + (j / windowCount), j < windowCount * tasksWindow);
int no = (get_global_id(0) * 4 + ch) * taskCount + i; int no = (get_global_id(0) * 4 + ch) * taskCount + i;
selectedTasksBestMethod[(get_global_id(0) * 2 + ich) * taskCount + i] = no; selectedTasksBestMethod[(get_global_id(0) * 2 + ich) * taskCount + i] = no;
tasks[no].data.residualOffs = roffs; tasks[no].data.residualOffs = roffs;
int selectedFound = 0; if (j >= selectedCount)
for(int selectedNo = 0; selectedNo < selectedCount; selectedNo++) tasks[no].data.size = 0x7fffffff;
selectedFound |= (selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + selectedNo] == no); if (nonSelectedNo < tasksToSecondEstimate)
if (!selectedFound) if (tasksToSecondEstimate == taskCount - selectedCount || best_wind[ch] == i / tasksWindow || i >= windowCount * tasksWindow)
selectedTasksSecondEstimate[(get_global_id(0) * 2 + ich) * (taskCount - selectedCount) + nonSelectedNo++] = no; selectedTasksSecondEstimate[(get_global_id(0) * 2 + ich) * tasksToSecondEstimate + nonSelectedNo++] = no;
} }
} }
} }
@@ -1330,24 +1372,42 @@ void clChooseBestMethod(
// get_group_id(0) == task index // get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(1, 1, 1))) __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clEncodeResidual( void clEncodeResidual(
__global ulong *partition_lengths,
__global int *residual, __global int *residual,
__global int *samples, __global int *samples,
__global FLACCLSubframeTask *tasks __global FLACCLSubframeTask *tasks,
int max_porder, // <= 8
int psize // == task.blocksize >> max_porder?
) )
{ {
FLACCLSubframeTask task = tasks[get_group_id(0)]; FLACCLSubframeTask task = tasks[get_group_id(0)];
int bs = task.data.blocksize; int bs = task.data.blocksize;
int ro = task.data.residualOrder; int ro = task.data.residualOrder;
__global int *data = &samples[task.data.samplesOffs]; __global int *data = &samples[task.data.samplesOffs];
SWITCH_N(residual[task.data.residualOffs + pos] = convert_bps_sat(t)); __global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(0);
int r;
for (int p = 0; p < (1 << max_porder); p++)
pl[p] = 0UL;
__global int *rptr = residual + task.data.residualOffs;
if (psize == 16)
{
SWITCH_N((rptr[pos] = r = convert_bps_sat(t), pl[pos >> 4] += (uint)((r << 1) ^ (r >> 31))));
}
else
{
SWITCH_N((rptr[pos] = r = convert_bps_sat(t), pl[pos / psize] += (uint)((r << 1) ^ (r >> 31))));
}
} }
#else #else
// get_group_id(0) == task index // get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clEncodeResidual( void clEncodeResidual(
__global int *partition_lengths,
__global int *output, __global int *output,
__global int *samples, __global int *samples,
__global FLACCLSubframeTask *tasks __global FLACCLSubframeTask *tasks,
int max_porder, // <= 8
int psize // == task.blocksize >> max_porder?
) )
{ {
__local FLACCLSubframeTask task; __local FLACCLSubframeTask task;
@@ -1407,34 +1467,7 @@ void clEncodeResidual(
} }
#endif #endif
#ifdef FLACCL_CPU #ifndef FLACCL_CPU
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clCalcPartition(
__global ulong *partition_lengths,
__global int *residual,
__global FLACCLSubframeTask *tasks,
int max_porder, // <= 8
int psize // == task.blocksize >> max_porder?
)
{
FLACCLSubframeTask task = tasks[get_group_id(1)];
int bs = task.data.blocksize;
int ro = task.data.residualOrder;
//int psize = bs >> max_porder;
__global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
for (int p = 0; p < (1 << max_porder); p++)
pl[p] = 0UL;
for (int pos = ro; pos < bs; pos ++)
{
int s = residual[task.data.residualOffs + pos];
// convert to unsigned
uint t = (s << 1) ^ (s >> 31);
pl[pos / psize] += t;
}
}
#else
// get_group_id(0) == partition index / (GROUP_SIZE / 16) // get_group_id(0) == partition index / (GROUP_SIZE / 16)
// get_group_id(1) == task index // get_group_id(1) == task index
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))