optimizations

This commit is contained in:
chudov
2010-12-10 05:19:39 +00:00
parent c96f27b0de
commit aef331476c
2 changed files with 192 additions and 148 deletions

View File

@@ -1964,10 +1964,10 @@ namespace CUETools.Codecs.FLACCL
public unsafe void do_output_frames(int nFrames)
{
if (task2.frameCount > 0)
task2.openCLCQ.Finish();
send_to_GPU(task1, nFrames, eparams.block_size);
run_GPU_task(task1);
if (task2.frameCount > 0)
task2.openCLCQ.Finish();
if (task2.frameCount > 0)
{
if (cpu_tasks != null)
@@ -2681,9 +2681,11 @@ namespace CUETools.Codecs.FLACCL
if (UseGPUOnly)
{
clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
{
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
}
clSumPartition = openCLProgram.CreateKernel("clSumPartition");
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
@@ -2736,9 +2738,11 @@ namespace CUETools.Codecs.FLACCL
if (UseGPUOnly)
{
clEncodeResidual.Dispose();
clCalcPartition.Dispose();
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
{
clCalcPartition.Dispose();
clCalcPartition16.Dispose();
}
clSumPartition.Dispose();
clFindRiceParameter.Dispose();
clFindPartitionOrder.Dispose();
@@ -2942,11 +2946,19 @@ namespace CUETools.Codecs.FLACCL
groupSize,
nEstimateTasksPerChannel * channelsCount * frameCount); // 1 per channel, 4 channels
int tasksToSecondEstimate = nResidualTasksPerChannel - nEstimateTasksPerChannel;
//if (nEstimateTasksPerChannel < nTasksPerWindow * nWindowFunctions)
//tasksToSecondEstimate -= (nEstimateTasksPerChannel / nWindowFunctions) * (nWindowFunctions - 1);
clSelectStereoTasks.SetArgs(
clResidualTasks,
clSelectedTasks,
clSelectedTasksSecondEstimate,
clSelectedTasksBestMethod,
nTasksPerWindow,
nWindowFunctions,
tasksToSecondEstimate,
nResidualTasksPerChannel,
nEstimateTasksPerChannel);
@@ -2954,7 +2966,7 @@ namespace CUETools.Codecs.FLACCL
clSelectStereoTasks,
0, frameCount);
if (nEstimateTasksPerChannel < nResidualTasksPerChannel)
if (tasksToSecondEstimate > 0)
{
clEstimateResidual.SetArgs(
clSamples,
@@ -2964,7 +2976,7 @@ namespace CUETools.Codecs.FLACCL
openCLCQ.EnqueueNDRangeKernel(
clEstimateResidual,
groupSize,
(nResidualTasksPerChannel - nEstimateTasksPerChannel) * channels * frameCount);
tasksToSecondEstimate * channels * frameCount);
}
clChooseBestMethod.SetArgs(
@@ -3003,47 +3015,46 @@ namespace CUETools.Codecs.FLACCL
if (UseGPUOnly)
{
clEncodeResidual.SetArgs(
clPartitions,
clResidual,
clSamples,
clBestResidualTasks);
clBestResidualTasks,
max_porder,
frameSize >> max_porder);
openCLCQ.EnqueueNDRangeKernel(
clEncodeResidual,
groupSize, channels * frameCount);
if ((frameSize >> max_porder == 16) && openCLCQ.Device.DeviceType != DeviceType.CPU)
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
{
clCalcPartition16.SetArgs(
clPartitions,
clResidual,
clBestResidualTasks,
max_porder);
if (frameSize >> max_porder == 16)
{
clCalcPartition16.SetArgs(
clPartitions,
clResidual,
clBestResidualTasks,
max_porder);
openCLCQ.EnqueueNDRangeKernel(
clCalcPartition16,
groupSize, channels * frameCount);
}
else
{
clCalcPartition.SetArgs(
clPartitions,
clResidual,
clBestResidualTasks,
max_porder,
frameSize >> max_porder);
if (openCLCQ.Device.DeviceType == DeviceType.CPU)
openCLCQ.EnqueueNDRangeKernel(
clCalcPartition,
groupSize, 1,
1,
channels * frameCount);
clCalcPartition16,
groupSize, channels * frameCount);
}
else
{
clCalcPartition.SetArgs(
clPartitions,
clResidual,
clBestResidualTasks,
max_porder,
frameSize >> max_porder);
openCLCQ.EnqueueNDRangeKernel(
clCalcPartition,
groupSize, 1,
1 + ((1 << max_porder) - 1) / (groupSize / 16),
channels * frameCount);
}
}
if (max_porder > 0)

View File

@@ -896,13 +896,13 @@ inline int fastclz64(long iv)
}
#if BITS_PER_SAMPLE > 16
typedef long residual_t;
#define residual_t long
#define residual_log(s) (63 - fastclz64(s))
#define convert_bps4 convert_long4
#define convert_bps_sat convert_int_sat
#define bpsint4 long4
#else
typedef int residual_t;
#define residual_t int
#define residual_log(s) (31 - fastclz(s))
#define convert_bps4
#define convert_bps_sat
@@ -967,88 +967,120 @@ void clEstimateResidual(
SWITCH_N((len[pos >> 6] += fabs((float)t)))
#else
float fcoef[32];
for (int tid = 0; tid < 32; tid++)
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
float4 fc0 = vload4(0, &fcoef[0]);
float4 fc1 = vload4(1, &fcoef[0]);
#if MAX_ORDER > 8
float4 fc2 = vload4(2, &fcoef[0]);
#endif
#if MAX_ORDER == 8
float fdata[32];
for (int pos = 0; pos < MAX_ORDER + ro; pos++)
fdata[pos] = pos < MAX_ORDER ? 0.0f : (float)(data[pos - MAX_ORDER] >> task.data.wbits);
float4 fd0 = vload4(0, &fdata[ro]);
float4 fd1 = vload4(1, &fdata[ro]);
for (int pos = ro; pos < bs; pos ++)
if (ro <= 4)
{
float4 sum = fc0 * fd0 + fc1 * fd1;
fd0 = fd0.s1230;
fd1 = fd1.s1230;
fd0.s3 = fd1.s3;
fd1.s3 = (float)(data[pos] >> task.data.wbits);
len[pos >> 6] += fabs(fd1.s3 + (sum.x + sum.y + sum.z + sum.w));
}
#elif MAX_ORDER == 12
float fdata[32];
for (int pos = 0; pos < MAX_ORDER + ro; pos++)
fdata[pos] = pos < MAX_ORDER ? 0.0f : (float)(data[pos - MAX_ORDER] >> task.data.wbits);
float4 fd0 = vload4(0, &fdata[ro]);
float4 fd1 = vload4(1, &fdata[ro]);
float4 fd2 = vload4(2, &fdata[ro]);
for (int pos = ro; pos < bs; pos ++)
{
float4 sum = fc0 * fd0 + fc1 * fd1 + fc2 * fd2;
fd0 = fd0.s1230;
fd1 = fd1.s1230;
fd2 = fd2.s1230;
fd0.s3 = fd1.s3;
fd1.s3 = fd2.s3;
fd2.s3 = (float)(data[pos] >> task.data.wbits);
len[pos >> 6] += fabs(fd2.s3 + (sum.x + sum.y + sum.z + sum.w));
}
#else
float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
for (int pos = 0; pos < MAX_ORDER; pos++)
fdata[pos] = 0.0f;
for (int pos = MAX_ORDER + TEMPBLOCK1; pos < MAX_ORDER + TEMPBLOCK1 + 32; pos++)
fdata[pos] = 0.0f;
for (int bpos = 0; bpos < bs; bpos += TEMPBLOCK1)
{
int end = min(bpos + TEMPBLOCK1, bs);
for (int pos = max(bpos - ro, 0); pos < max(bpos, ro); pos++)
fdata[MAX_ORDER + pos - bpos] = (float)(data[pos] >> task.data.wbits);
for (int pos = max(bpos, ro); pos < end; pos ++)
float fcoef[4];
for (int tid = 0; tid < 4; tid++)
fcoef[tid] = tid + ro - 4 < 0 ? 0.0f : - ((float) task.coefs[tid + ro - 4]) / (1 << task.data.shift);
float4 fc0 = vload4(0, &fcoef[0]);
float fdata[4];
for (int pos = 0; pos < 4; pos++)
fdata[pos] = pos + ro - 4 < 0 ? 0.0f : (float)(data[pos + ro - 4] >> task.data.wbits);
float4 fd0 = vload4(0, &fdata[0]);
for (int pos = ro; pos < bs; pos ++)
{
float next = (float)(data[pos] >> task.data.wbits);
float * dptr = fdata + pos - bpos;
dptr[MAX_ORDER] = next;
float4 sum
= fc0 * vload4(0, dptr)
+ fc1 * vload4(1, dptr)
#if MAX_ORDER > 8
+ fc2 * vload4(2, dptr)
#if MAX_ORDER > 12
+ vload4(3, &fcoef[0]) * vload4(3, dptr)
#if MAX_ORDER > 16
+ vload4(4, &fcoef[0]) * vload4(4, dptr)
+ vload4(5, &fcoef[0]) * vload4(5, dptr)
+ vload4(6, &fcoef[0]) * vload4(6, dptr)
+ vload4(7, &fcoef[0]) * vload4(7, dptr)
#endif
#endif
#endif
;
next += sum.x + sum.y + sum.z + sum.w;
len[pos >> 6] += fabs(next);
float4 sum4 = fc0 * fd0;
float2 sum2 = sum4.s01 + sum4.s23;
fd0 = fd0.s1230;
fd0.s3 = (float)(data[pos] >> task.data.wbits);
len[pos >> 6] += fabs(fd0.s3 + (sum2.x + sum2.y));
}
}
else if (ro <= 8)
{
float fcoef[8];
for (int tid = 0; tid < 8; tid++)
fcoef[tid] = tid + ro - 8 < 0 ? 0.0f : - ((float) task.coefs[tid + ro - 8]) / (1 << task.data.shift);
float8 fc0 = vload8(0, &fcoef[0]);
float fdata[8];
for (int pos = 0; pos < 8; pos++)
fdata[pos] = pos + ro - 8 < 0 ? 0.0f : (float)(data[pos + ro - 8] >> task.data.wbits);
float8 fd0 = vload8(0, &fdata[0]);
for (int pos = ro; pos < bs; pos ++)
{
float8 sum8 = fc0 * fd0;
float4 sum4 = sum8.s0123 + sum8.s4567;
float2 sum2 = sum4.s01 + sum4.s23;
fd0 = fd0.s12345670;
fd0.s7 = (float)(data[pos] >> task.data.wbits);
len[pos >> 6] += fabs(fd0.s7 + (sum2.x + sum2.y));
}
}
else if (ro <= 12)
{
float fcoef[12];
for (int tid = 0; tid < 12; tid++)
fcoef[tid] = tid + ro - 12 >= 0 ? - ((float) task.coefs[tid + ro - 12]) / (1 << task.data.shift) : 0.0f;
float4 fc0 = vload4(0, &fcoef[0]);
float4 fc1 = vload4(1, &fcoef[0]);
float4 fc2 = vload4(2, &fcoef[0]);
float fdata[12];
for (int pos = 0; pos < 12; pos++)
fdata[pos] = pos + ro - 12 < 0 ? 0.0f : (float)(data[pos + ro - 12] >> task.data.wbits);
float4 fd0 = vload4(0, &fdata[0]);
float4 fd1 = vload4(1, &fdata[0]);
float4 fd2 = vload4(2, &fdata[0]);
for (int pos = ro; pos < bs; pos ++)
{
float4 sum4 = fc0 * fd0 + fc1 * fd1 + fc2 * fd2;
float2 sum2 = sum4.s01 + sum4.s23;
fd0 = fd0.s1230;
fd1 = fd1.s1230;
fd2 = fd2.s1230;
fd0.s3 = fd1.s3;
fd1.s3 = fd2.s3;
fd2.s3 = (float)(data[pos] >> task.data.wbits);
len[pos >> 6] += fabs(fd2.s3 + (sum2.x + sum2.y));
}
}
else
{
float fcoef[32];
for (int tid = 0; tid < 32; tid++)
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
float4 fc0 = vload4(0, &fcoef[0]);
float4 fc1 = vload4(1, &fcoef[0]);
float4 fc2 = vload4(2, &fcoef[0]);
float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
for (int pos = 0; pos < MAX_ORDER; pos++)
fdata[pos] = 0.0f;
for (int pos = MAX_ORDER + TEMPBLOCK1; pos < MAX_ORDER + TEMPBLOCK1 + 32; pos++)
fdata[pos] = 0.0f;
for (int bpos = 0; bpos < bs; bpos += TEMPBLOCK1)
{
int end = min(bpos + TEMPBLOCK1, bs);
for (int pos = max(bpos - ro, 0); pos < max(bpos, ro); pos++)
fdata[MAX_ORDER + pos - bpos] = (float)(data[pos] >> task.data.wbits);
for (int pos = max(bpos, ro); pos < end; pos ++)
{
float next = (float)(data[pos] >> task.data.wbits);
float * dptr = fdata + pos - bpos;
dptr[MAX_ORDER] = next;
float4 sum
= fc0 * vload4(0, dptr)
+ fc1 * vload4(1, dptr)
#if MAX_ORDER > 8
+ fc2 * vload4(2, dptr)
#if MAX_ORDER > 12
+ vload4(3, &fcoef[0]) * vload4(3, dptr)
#if MAX_ORDER > 16
+ vload4(4, &fcoef[0]) * vload4(4, dptr)
+ vload4(5, &fcoef[0]) * vload4(5, dptr)
+ vload4(6, &fcoef[0]) * vload4(6, dptr)
+ vload4(7, &fcoef[0]) * vload4(7, dptr)
#endif
#endif
#endif
;
next += sum.x + sum.y + sum.z + sum.w;
len[pos >> 6] += fabs(next);
}
}
}
#endif
#endif
int total = 0;
for (int i = 0; i < ERPARTS; i++)
@@ -1257,22 +1289,31 @@ void clSelectStereoTasks(
__global int*selectedTasks,
__global int*selectedTasksSecondEstimate,
__global int*selectedTasksBestMethod,
int tasksWindow,
int windowCount,
int tasksToSecondEstimate,
int taskCount,
int selectedCount
)
{
int best_size[4];
int best_wind[4];
for (int ch = 0; ch < 4; ch++)
{
int first_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount];
int best_len = tasks[first_no].data.size;
int best_wnd = 0;
for (int i = 1; i < selectedCount; i++)
{
int task_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + i];
int task_len = tasks[task_no].data.size;
int task_wnd = (task_no - first_no) / tasksWindow;
task_wnd = select(0, task_wnd, task_wnd < windowCount);
best_wnd = select(best_wnd, task_wnd, task_len < best_len);
best_len = min(task_len, best_len);
}
best_size[ch] = best_len;
best_wind[ch] = best_wnd;
}
int bitsBest = best_size[2] + best_size[3]; // MidSide
@@ -1291,16 +1332,17 @@ void clSelectStereoTasks(
int ch = select(chMask & 3, chMask >> 2, ich > 0);
int roffs = tasks[(get_global_id(0) * 4 + ich) * taskCount].data.samplesOffs;
int nonSelectedNo = 0;
for (int i = 0; i < taskCount; i++)
for (int j = taskCount - 1; j >= 0; j--)
{
int i = select(j, (j % windowCount) * tasksWindow + (j / windowCount), j < windowCount * tasksWindow);
int no = (get_global_id(0) * 4 + ch) * taskCount + i;
selectedTasksBestMethod[(get_global_id(0) * 2 + ich) * taskCount + i] = no;
tasks[no].data.residualOffs = roffs;
int selectedFound = 0;
for(int selectedNo = 0; selectedNo < selectedCount; selectedNo++)
selectedFound |= (selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + selectedNo] == no);
if (!selectedFound)
selectedTasksSecondEstimate[(get_global_id(0) * 2 + ich) * (taskCount - selectedCount) + nonSelectedNo++] = no;
if (j >= selectedCount)
tasks[no].data.size = 0x7fffffff;
if (nonSelectedNo < tasksToSecondEstimate)
if (tasksToSecondEstimate == taskCount - selectedCount || best_wind[ch] == i / tasksWindow || i >= windowCount * tasksWindow)
selectedTasksSecondEstimate[(get_global_id(0) * 2 + ich) * tasksToSecondEstimate + nonSelectedNo++] = no;
}
}
}
@@ -1330,24 +1372,42 @@ void clChooseBestMethod(
// get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clEncodeResidual(
__global ulong *partition_lengths,
__global int *residual,
__global int *samples,
__global FLACCLSubframeTask *tasks
__global FLACCLSubframeTask *tasks,
int max_porder, // <= 8
int psize // == task.blocksize >> max_porder?
)
{
FLACCLSubframeTask task = tasks[get_group_id(0)];
int bs = task.data.blocksize;
int ro = task.data.residualOrder;
__global int *data = &samples[task.data.samplesOffs];
SWITCH_N(residual[task.data.residualOffs + pos] = convert_bps_sat(t));
__global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(0);
int r;
for (int p = 0; p < (1 << max_porder); p++)
pl[p] = 0UL;
__global int *rptr = residual + task.data.residualOffs;
if (psize == 16)
{
SWITCH_N((rptr[pos] = r = convert_bps_sat(t), pl[pos >> 4] += (uint)((r << 1) ^ (r >> 31))));
}
else
{
SWITCH_N((rptr[pos] = r = convert_bps_sat(t), pl[pos / psize] += (uint)((r << 1) ^ (r >> 31))));
}
}
#else
// get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clEncodeResidual(
__global int *partition_lengths,
__global int *output,
__global int *samples,
__global FLACCLSubframeTask *tasks
__global FLACCLSubframeTask *tasks,
int max_porder, // <= 8
int psize // == task.blocksize >> max_porder?
)
{
__local FLACCLSubframeTask task;
@@ -1407,34 +1467,7 @@ void clEncodeResidual(
}
#endif
#ifdef FLACCL_CPU
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clCalcPartition(
__global ulong *partition_lengths,
__global int *residual,
__global FLACCLSubframeTask *tasks,
int max_porder, // <= 8
int psize // == task.blocksize >> max_porder?
)
{
FLACCLSubframeTask task = tasks[get_group_id(1)];
int bs = task.data.blocksize;
int ro = task.data.residualOrder;
//int psize = bs >> max_porder;
__global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
for (int p = 0; p < (1 << max_porder); p++)
pl[p] = 0UL;
for (int pos = ro; pos < bs; pos ++)
{
int s = residual[task.data.residualOffs + pos];
// convert to unsigned
uint t = (s << 1) ^ (s >> 31);
pl[pos / psize] += t;
}
}
#else
#ifndef FLACCL_CPU
// get_group_id(0) == partition index / (GROUP_SIZE / 16)
// get_group_id(1) == task index
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))