mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
optimizations
This commit is contained in:
@@ -1964,10 +1964,10 @@ namespace CUETools.Codecs.FLACCL
|
||||
|
||||
public unsafe void do_output_frames(int nFrames)
|
||||
{
|
||||
if (task2.frameCount > 0)
|
||||
task2.openCLCQ.Finish();
|
||||
send_to_GPU(task1, nFrames, eparams.block_size);
|
||||
run_GPU_task(task1);
|
||||
if (task2.frameCount > 0)
|
||||
task2.openCLCQ.Finish();
|
||||
if (task2.frameCount > 0)
|
||||
{
|
||||
if (cpu_tasks != null)
|
||||
@@ -2681,9 +2681,11 @@ namespace CUETools.Codecs.FLACCL
|
||||
if (UseGPUOnly)
|
||||
{
|
||||
clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
|
||||
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
|
||||
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
|
||||
{
|
||||
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
|
||||
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
|
||||
}
|
||||
clSumPartition = openCLProgram.CreateKernel("clSumPartition");
|
||||
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
|
||||
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
|
||||
@@ -2736,9 +2738,11 @@ namespace CUETools.Codecs.FLACCL
|
||||
if (UseGPUOnly)
|
||||
{
|
||||
clEncodeResidual.Dispose();
|
||||
clCalcPartition.Dispose();
|
||||
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
|
||||
{
|
||||
clCalcPartition.Dispose();
|
||||
clCalcPartition16.Dispose();
|
||||
}
|
||||
clSumPartition.Dispose();
|
||||
clFindRiceParameter.Dispose();
|
||||
clFindPartitionOrder.Dispose();
|
||||
@@ -2942,11 +2946,19 @@ namespace CUETools.Codecs.FLACCL
|
||||
groupSize,
|
||||
nEstimateTasksPerChannel * channelsCount * frameCount); // 1 per channel, 4 channels
|
||||
|
||||
int tasksToSecondEstimate = nResidualTasksPerChannel - nEstimateTasksPerChannel;
|
||||
|
||||
//if (nEstimateTasksPerChannel < nTasksPerWindow * nWindowFunctions)
|
||||
//tasksToSecondEstimate -= (nEstimateTasksPerChannel / nWindowFunctions) * (nWindowFunctions - 1);
|
||||
|
||||
clSelectStereoTasks.SetArgs(
|
||||
clResidualTasks,
|
||||
clSelectedTasks,
|
||||
clSelectedTasksSecondEstimate,
|
||||
clSelectedTasksBestMethod,
|
||||
nTasksPerWindow,
|
||||
nWindowFunctions,
|
||||
tasksToSecondEstimate,
|
||||
nResidualTasksPerChannel,
|
||||
nEstimateTasksPerChannel);
|
||||
|
||||
@@ -2954,7 +2966,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
clSelectStereoTasks,
|
||||
0, frameCount);
|
||||
|
||||
if (nEstimateTasksPerChannel < nResidualTasksPerChannel)
|
||||
if (tasksToSecondEstimate > 0)
|
||||
{
|
||||
clEstimateResidual.SetArgs(
|
||||
clSamples,
|
||||
@@ -2964,7 +2976,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clEstimateResidual,
|
||||
groupSize,
|
||||
(nResidualTasksPerChannel - nEstimateTasksPerChannel) * channels * frameCount);
|
||||
tasksToSecondEstimate * channels * frameCount);
|
||||
}
|
||||
|
||||
clChooseBestMethod.SetArgs(
|
||||
@@ -3003,47 +3015,46 @@ namespace CUETools.Codecs.FLACCL
|
||||
if (UseGPUOnly)
|
||||
{
|
||||
clEncodeResidual.SetArgs(
|
||||
clPartitions,
|
||||
clResidual,
|
||||
clSamples,
|
||||
clBestResidualTasks);
|
||||
clBestResidualTasks,
|
||||
max_porder,
|
||||
frameSize >> max_porder);
|
||||
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clEncodeResidual,
|
||||
groupSize, channels * frameCount);
|
||||
|
||||
if ((frameSize >> max_porder == 16) && openCLCQ.Device.DeviceType != DeviceType.CPU)
|
||||
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
|
||||
{
|
||||
clCalcPartition16.SetArgs(
|
||||
clPartitions,
|
||||
clResidual,
|
||||
clBestResidualTasks,
|
||||
max_porder);
|
||||
if (frameSize >> max_porder == 16)
|
||||
{
|
||||
clCalcPartition16.SetArgs(
|
||||
clPartitions,
|
||||
clResidual,
|
||||
clBestResidualTasks,
|
||||
max_porder);
|
||||
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clCalcPartition16,
|
||||
groupSize, channels * frameCount);
|
||||
}
|
||||
else
|
||||
{
|
||||
clCalcPartition.SetArgs(
|
||||
clPartitions,
|
||||
clResidual,
|
||||
clBestResidualTasks,
|
||||
max_porder,
|
||||
frameSize >> max_porder);
|
||||
|
||||
if (openCLCQ.Device.DeviceType == DeviceType.CPU)
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clCalcPartition,
|
||||
groupSize, 1,
|
||||
1,
|
||||
channels * frameCount);
|
||||
clCalcPartition16,
|
||||
groupSize, channels * frameCount);
|
||||
}
|
||||
else
|
||||
{
|
||||
clCalcPartition.SetArgs(
|
||||
clPartitions,
|
||||
clResidual,
|
||||
clBestResidualTasks,
|
||||
max_porder,
|
||||
frameSize >> max_porder);
|
||||
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clCalcPartition,
|
||||
groupSize, 1,
|
||||
1 + ((1 << max_porder) - 1) / (groupSize / 16),
|
||||
channels * frameCount);
|
||||
}
|
||||
}
|
||||
|
||||
if (max_porder > 0)
|
||||
|
||||
@@ -896,13 +896,13 @@ inline int fastclz64(long iv)
|
||||
}
|
||||
|
||||
#if BITS_PER_SAMPLE > 16
|
||||
typedef long residual_t;
|
||||
#define residual_t long
|
||||
#define residual_log(s) (63 - fastclz64(s))
|
||||
#define convert_bps4 convert_long4
|
||||
#define convert_bps_sat convert_int_sat
|
||||
#define bpsint4 long4
|
||||
#else
|
||||
typedef int residual_t;
|
||||
#define residual_t int
|
||||
#define residual_log(s) (31 - fastclz(s))
|
||||
#define convert_bps4
|
||||
#define convert_bps_sat
|
||||
@@ -967,88 +967,120 @@ void clEstimateResidual(
|
||||
|
||||
SWITCH_N((len[pos >> 6] += fabs((float)t)))
|
||||
#else
|
||||
float fcoef[32];
|
||||
for (int tid = 0; tid < 32; tid++)
|
||||
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
|
||||
|
||||
float4 fc0 = vload4(0, &fcoef[0]);
|
||||
float4 fc1 = vload4(1, &fcoef[0]);
|
||||
#if MAX_ORDER > 8
|
||||
float4 fc2 = vload4(2, &fcoef[0]);
|
||||
#endif
|
||||
|
||||
#if MAX_ORDER == 8
|
||||
float fdata[32];
|
||||
for (int pos = 0; pos < MAX_ORDER + ro; pos++)
|
||||
fdata[pos] = pos < MAX_ORDER ? 0.0f : (float)(data[pos - MAX_ORDER] >> task.data.wbits);
|
||||
float4 fd0 = vload4(0, &fdata[ro]);
|
||||
float4 fd1 = vload4(1, &fdata[ro]);
|
||||
for (int pos = ro; pos < bs; pos ++)
|
||||
if (ro <= 4)
|
||||
{
|
||||
float4 sum = fc0 * fd0 + fc1 * fd1;
|
||||
fd0 = fd0.s1230;
|
||||
fd1 = fd1.s1230;
|
||||
fd0.s3 = fd1.s3;
|
||||
fd1.s3 = (float)(data[pos] >> task.data.wbits);
|
||||
len[pos >> 6] += fabs(fd1.s3 + (sum.x + sum.y + sum.z + sum.w));
|
||||
}
|
||||
#elif MAX_ORDER == 12
|
||||
float fdata[32];
|
||||
for (int pos = 0; pos < MAX_ORDER + ro; pos++)
|
||||
fdata[pos] = pos < MAX_ORDER ? 0.0f : (float)(data[pos - MAX_ORDER] >> task.data.wbits);
|
||||
float4 fd0 = vload4(0, &fdata[ro]);
|
||||
float4 fd1 = vload4(1, &fdata[ro]);
|
||||
float4 fd2 = vload4(2, &fdata[ro]);
|
||||
for (int pos = ro; pos < bs; pos ++)
|
||||
{
|
||||
float4 sum = fc0 * fd0 + fc1 * fd1 + fc2 * fd2;
|
||||
fd0 = fd0.s1230;
|
||||
fd1 = fd1.s1230;
|
||||
fd2 = fd2.s1230;
|
||||
fd0.s3 = fd1.s3;
|
||||
fd1.s3 = fd2.s3;
|
||||
fd2.s3 = (float)(data[pos] >> task.data.wbits);
|
||||
len[pos >> 6] += fabs(fd2.s3 + (sum.x + sum.y + sum.z + sum.w));
|
||||
}
|
||||
#else
|
||||
float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
|
||||
for (int pos = 0; pos < MAX_ORDER; pos++)
|
||||
fdata[pos] = 0.0f;
|
||||
for (int pos = MAX_ORDER + TEMPBLOCK1; pos < MAX_ORDER + TEMPBLOCK1 + 32; pos++)
|
||||
fdata[pos] = 0.0f;
|
||||
for (int bpos = 0; bpos < bs; bpos += TEMPBLOCK1)
|
||||
{
|
||||
int end = min(bpos + TEMPBLOCK1, bs);
|
||||
|
||||
for (int pos = max(bpos - ro, 0); pos < max(bpos, ro); pos++)
|
||||
fdata[MAX_ORDER + pos - bpos] = (float)(data[pos] >> task.data.wbits);
|
||||
|
||||
for (int pos = max(bpos, ro); pos < end; pos ++)
|
||||
float fcoef[4];
|
||||
for (int tid = 0; tid < 4; tid++)
|
||||
fcoef[tid] = tid + ro - 4 < 0 ? 0.0f : - ((float) task.coefs[tid + ro - 4]) / (1 << task.data.shift);
|
||||
float4 fc0 = vload4(0, &fcoef[0]);
|
||||
float fdata[4];
|
||||
for (int pos = 0; pos < 4; pos++)
|
||||
fdata[pos] = pos + ro - 4 < 0 ? 0.0f : (float)(data[pos + ro - 4] >> task.data.wbits);
|
||||
float4 fd0 = vload4(0, &fdata[0]);
|
||||
for (int pos = ro; pos < bs; pos ++)
|
||||
{
|
||||
float next = (float)(data[pos] >> task.data.wbits);
|
||||
float * dptr = fdata + pos - bpos;
|
||||
dptr[MAX_ORDER] = next;
|
||||
float4 sum
|
||||
= fc0 * vload4(0, dptr)
|
||||
+ fc1 * vload4(1, dptr)
|
||||
#if MAX_ORDER > 8
|
||||
+ fc2 * vload4(2, dptr)
|
||||
#if MAX_ORDER > 12
|
||||
+ vload4(3, &fcoef[0]) * vload4(3, dptr)
|
||||
#if MAX_ORDER > 16
|
||||
+ vload4(4, &fcoef[0]) * vload4(4, dptr)
|
||||
+ vload4(5, &fcoef[0]) * vload4(5, dptr)
|
||||
+ vload4(6, &fcoef[0]) * vload4(6, dptr)
|
||||
+ vload4(7, &fcoef[0]) * vload4(7, dptr)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
;
|
||||
next += sum.x + sum.y + sum.z + sum.w;
|
||||
len[pos >> 6] += fabs(next);
|
||||
float4 sum4 = fc0 * fd0;
|
||||
float2 sum2 = sum4.s01 + sum4.s23;
|
||||
fd0 = fd0.s1230;
|
||||
fd0.s3 = (float)(data[pos] >> task.data.wbits);
|
||||
len[pos >> 6] += fabs(fd0.s3 + (sum2.x + sum2.y));
|
||||
}
|
||||
}
|
||||
else if (ro <= 8)
|
||||
{
|
||||
float fcoef[8];
|
||||
for (int tid = 0; tid < 8; tid++)
|
||||
fcoef[tid] = tid + ro - 8 < 0 ? 0.0f : - ((float) task.coefs[tid + ro - 8]) / (1 << task.data.shift);
|
||||
float8 fc0 = vload8(0, &fcoef[0]);
|
||||
float fdata[8];
|
||||
for (int pos = 0; pos < 8; pos++)
|
||||
fdata[pos] = pos + ro - 8 < 0 ? 0.0f : (float)(data[pos + ro - 8] >> task.data.wbits);
|
||||
float8 fd0 = vload8(0, &fdata[0]);
|
||||
for (int pos = ro; pos < bs; pos ++)
|
||||
{
|
||||
float8 sum8 = fc0 * fd0;
|
||||
float4 sum4 = sum8.s0123 + sum8.s4567;
|
||||
float2 sum2 = sum4.s01 + sum4.s23;
|
||||
fd0 = fd0.s12345670;
|
||||
fd0.s7 = (float)(data[pos] >> task.data.wbits);
|
||||
len[pos >> 6] += fabs(fd0.s7 + (sum2.x + sum2.y));
|
||||
}
|
||||
}
|
||||
else if (ro <= 12)
|
||||
{
|
||||
float fcoef[12];
|
||||
for (int tid = 0; tid < 12; tid++)
|
||||
fcoef[tid] = tid + ro - 12 >= 0 ? - ((float) task.coefs[tid + ro - 12]) / (1 << task.data.shift) : 0.0f;
|
||||
float4 fc0 = vload4(0, &fcoef[0]);
|
||||
float4 fc1 = vload4(1, &fcoef[0]);
|
||||
float4 fc2 = vload4(2, &fcoef[0]);
|
||||
float fdata[12];
|
||||
for (int pos = 0; pos < 12; pos++)
|
||||
fdata[pos] = pos + ro - 12 < 0 ? 0.0f : (float)(data[pos + ro - 12] >> task.data.wbits);
|
||||
float4 fd0 = vload4(0, &fdata[0]);
|
||||
float4 fd1 = vload4(1, &fdata[0]);
|
||||
float4 fd2 = vload4(2, &fdata[0]);
|
||||
for (int pos = ro; pos < bs; pos ++)
|
||||
{
|
||||
float4 sum4 = fc0 * fd0 + fc1 * fd1 + fc2 * fd2;
|
||||
float2 sum2 = sum4.s01 + sum4.s23;
|
||||
fd0 = fd0.s1230;
|
||||
fd1 = fd1.s1230;
|
||||
fd2 = fd2.s1230;
|
||||
fd0.s3 = fd1.s3;
|
||||
fd1.s3 = fd2.s3;
|
||||
fd2.s3 = (float)(data[pos] >> task.data.wbits);
|
||||
len[pos >> 6] += fabs(fd2.s3 + (sum2.x + sum2.y));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
float fcoef[32];
|
||||
for (int tid = 0; tid < 32; tid++)
|
||||
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
|
||||
|
||||
float4 fc0 = vload4(0, &fcoef[0]);
|
||||
float4 fc1 = vload4(1, &fcoef[0]);
|
||||
float4 fc2 = vload4(2, &fcoef[0]);
|
||||
|
||||
float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
|
||||
for (int pos = 0; pos < MAX_ORDER; pos++)
|
||||
fdata[pos] = 0.0f;
|
||||
for (int pos = MAX_ORDER + TEMPBLOCK1; pos < MAX_ORDER + TEMPBLOCK1 + 32; pos++)
|
||||
fdata[pos] = 0.0f;
|
||||
for (int bpos = 0; bpos < bs; bpos += TEMPBLOCK1)
|
||||
{
|
||||
int end = min(bpos + TEMPBLOCK1, bs);
|
||||
|
||||
for (int pos = max(bpos - ro, 0); pos < max(bpos, ro); pos++)
|
||||
fdata[MAX_ORDER + pos - bpos] = (float)(data[pos] >> task.data.wbits);
|
||||
|
||||
for (int pos = max(bpos, ro); pos < end; pos ++)
|
||||
{
|
||||
float next = (float)(data[pos] >> task.data.wbits);
|
||||
float * dptr = fdata + pos - bpos;
|
||||
dptr[MAX_ORDER] = next;
|
||||
float4 sum
|
||||
= fc0 * vload4(0, dptr)
|
||||
+ fc1 * vload4(1, dptr)
|
||||
#if MAX_ORDER > 8
|
||||
+ fc2 * vload4(2, dptr)
|
||||
#if MAX_ORDER > 12
|
||||
+ vload4(3, &fcoef[0]) * vload4(3, dptr)
|
||||
#if MAX_ORDER > 16
|
||||
+ vload4(4, &fcoef[0]) * vload4(4, dptr)
|
||||
+ vload4(5, &fcoef[0]) * vload4(5, dptr)
|
||||
+ vload4(6, &fcoef[0]) * vload4(6, dptr)
|
||||
+ vload4(7, &fcoef[0]) * vload4(7, dptr)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
;
|
||||
next += sum.x + sum.y + sum.z + sum.w;
|
||||
len[pos >> 6] += fabs(next);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
int total = 0;
|
||||
for (int i = 0; i < ERPARTS; i++)
|
||||
@@ -1257,22 +1289,31 @@ void clSelectStereoTasks(
|
||||
__global int*selectedTasks,
|
||||
__global int*selectedTasksSecondEstimate,
|
||||
__global int*selectedTasksBestMethod,
|
||||
int tasksWindow,
|
||||
int windowCount,
|
||||
int tasksToSecondEstimate,
|
||||
int taskCount,
|
||||
int selectedCount
|
||||
)
|
||||
{
|
||||
int best_size[4];
|
||||
int best_wind[4];
|
||||
for (int ch = 0; ch < 4; ch++)
|
||||
{
|
||||
int first_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount];
|
||||
int best_len = tasks[first_no].data.size;
|
||||
int best_wnd = 0;
|
||||
for (int i = 1; i < selectedCount; i++)
|
||||
{
|
||||
int task_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + i];
|
||||
int task_len = tasks[task_no].data.size;
|
||||
int task_wnd = (task_no - first_no) / tasksWindow;
|
||||
task_wnd = select(0, task_wnd, task_wnd < windowCount);
|
||||
best_wnd = select(best_wnd, task_wnd, task_len < best_len);
|
||||
best_len = min(task_len, best_len);
|
||||
}
|
||||
best_size[ch] = best_len;
|
||||
best_wind[ch] = best_wnd;
|
||||
}
|
||||
|
||||
int bitsBest = best_size[2] + best_size[3]; // MidSide
|
||||
@@ -1291,16 +1332,17 @@ void clSelectStereoTasks(
|
||||
int ch = select(chMask & 3, chMask >> 2, ich > 0);
|
||||
int roffs = tasks[(get_global_id(0) * 4 + ich) * taskCount].data.samplesOffs;
|
||||
int nonSelectedNo = 0;
|
||||
for (int i = 0; i < taskCount; i++)
|
||||
for (int j = taskCount - 1; j >= 0; j--)
|
||||
{
|
||||
int i = select(j, (j % windowCount) * tasksWindow + (j / windowCount), j < windowCount * tasksWindow);
|
||||
int no = (get_global_id(0) * 4 + ch) * taskCount + i;
|
||||
selectedTasksBestMethod[(get_global_id(0) * 2 + ich) * taskCount + i] = no;
|
||||
tasks[no].data.residualOffs = roffs;
|
||||
int selectedFound = 0;
|
||||
for(int selectedNo = 0; selectedNo < selectedCount; selectedNo++)
|
||||
selectedFound |= (selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + selectedNo] == no);
|
||||
if (!selectedFound)
|
||||
selectedTasksSecondEstimate[(get_global_id(0) * 2 + ich) * (taskCount - selectedCount) + nonSelectedNo++] = no;
|
||||
if (j >= selectedCount)
|
||||
tasks[no].data.size = 0x7fffffff;
|
||||
if (nonSelectedNo < tasksToSecondEstimate)
|
||||
if (tasksToSecondEstimate == taskCount - selectedCount || best_wind[ch] == i / tasksWindow || i >= windowCount * tasksWindow)
|
||||
selectedTasksSecondEstimate[(get_global_id(0) * 2 + ich) * tasksToSecondEstimate + nonSelectedNo++] = no;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1330,24 +1372,42 @@ void clChooseBestMethod(
|
||||
// get_group_id(0) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clEncodeResidual(
|
||||
__global ulong *partition_lengths,
|
||||
__global int *residual,
|
||||
__global int *samples,
|
||||
__global FLACCLSubframeTask *tasks
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
int max_porder, // <= 8
|
||||
int psize // == task.blocksize >> max_porder?
|
||||
)
|
||||
{
|
||||
FLACCLSubframeTask task = tasks[get_group_id(0)];
|
||||
int bs = task.data.blocksize;
|
||||
int ro = task.data.residualOrder;
|
||||
__global int *data = &samples[task.data.samplesOffs];
|
||||
SWITCH_N(residual[task.data.residualOffs + pos] = convert_bps_sat(t));
|
||||
__global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(0);
|
||||
int r;
|
||||
for (int p = 0; p < (1 << max_porder); p++)
|
||||
pl[p] = 0UL;
|
||||
__global int *rptr = residual + task.data.residualOffs;
|
||||
if (psize == 16)
|
||||
{
|
||||
SWITCH_N((rptr[pos] = r = convert_bps_sat(t), pl[pos >> 4] += (uint)((r << 1) ^ (r >> 31))));
|
||||
}
|
||||
else
|
||||
{
|
||||
SWITCH_N((rptr[pos] = r = convert_bps_sat(t), pl[pos / psize] += (uint)((r << 1) ^ (r >> 31))));
|
||||
}
|
||||
}
|
||||
#else
|
||||
// get_group_id(0) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||
void clEncodeResidual(
|
||||
__global int *partition_lengths,
|
||||
__global int *output,
|
||||
__global int *samples,
|
||||
__global FLACCLSubframeTask *tasks
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
int max_porder, // <= 8
|
||||
int psize // == task.blocksize >> max_porder?
|
||||
)
|
||||
{
|
||||
__local FLACCLSubframeTask task;
|
||||
@@ -1407,34 +1467,7 @@ void clEncodeResidual(
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef FLACCL_CPU
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clCalcPartition(
|
||||
__global ulong *partition_lengths,
|
||||
__global int *residual,
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
int max_porder, // <= 8
|
||||
int psize // == task.blocksize >> max_porder?
|
||||
)
|
||||
{
|
||||
FLACCLSubframeTask task = tasks[get_group_id(1)];
|
||||
int bs = task.data.blocksize;
|
||||
int ro = task.data.residualOrder;
|
||||
//int psize = bs >> max_porder;
|
||||
__global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
|
||||
|
||||
for (int p = 0; p < (1 << max_porder); p++)
|
||||
pl[p] = 0UL;
|
||||
|
||||
for (int pos = ro; pos < bs; pos ++)
|
||||
{
|
||||
int s = residual[task.data.residualOffs + pos];
|
||||
// convert to unsigned
|
||||
uint t = (s << 1) ^ (s >> 31);
|
||||
pl[pos / psize] += t;
|
||||
}
|
||||
}
|
||||
#else
|
||||
#ifndef FLACCL_CPU
|
||||
// get_group_id(0) == partition index / (GROUP_SIZE / 16)
|
||||
// get_group_id(1) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||
|
||||
Reference in New Issue
Block a user