mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
optimizations
This commit is contained in:
@@ -251,7 +251,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
if (_settings.DeviceType == OpenCLDeviceType.CPU)
|
||||
{
|
||||
_settings.GroupSize = 1;
|
||||
_settings.GPUOnly = false;
|
||||
//_settings.GPUOnly = true;
|
||||
_settings.MappedMemory = true;
|
||||
}
|
||||
eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
|
||||
@@ -2657,11 +2657,18 @@ namespace CUETools.Codecs.FLACCL
|
||||
max_porder,
|
||||
frameSize >> max_porder);
|
||||
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clCalcPartition,
|
||||
groupSize, 1,
|
||||
1 + ((1 << max_porder) - 1) / (groupSize / 16),
|
||||
channels * frameCount);
|
||||
if (openCLCQ.Device.DeviceType == DeviceType.CPU)
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clCalcPartition,
|
||||
groupSize, 1,
|
||||
1,
|
||||
channels * frameCount);
|
||||
else
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clCalcPartition,
|
||||
groupSize, 1,
|
||||
1 + ((1 << max_porder) - 1) / (groupSize / 16),
|
||||
channels * frameCount);
|
||||
}
|
||||
|
||||
if (max_porder > 0)
|
||||
@@ -2670,26 +2677,32 @@ namespace CUETools.Codecs.FLACCL
|
||||
clPartitions,
|
||||
max_porder);
|
||||
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clSumPartition,
|
||||
128, 1,
|
||||
(Flake.MAX_RICE_PARAM + 1),
|
||||
channels * frameCount);
|
||||
if (openCLCQ.Device.DeviceType == DeviceType.CPU)
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clSumPartition,
|
||||
1, 1, 1,
|
||||
channels * frameCount);
|
||||
else
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clSumPartition,
|
||||
128, 1,
|
||||
(Flake.MAX_RICE_PARAM + 1),
|
||||
channels * frameCount);
|
||||
}
|
||||
|
||||
clFindRiceParameter.SetArgs(
|
||||
clBestResidualTasks,
|
||||
clRiceParams,
|
||||
clPartitions,
|
||||
max_porder);
|
||||
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clFindRiceParameter,
|
||||
groupSize, 1,
|
||||
Math.Max(1, (2 << max_porder) / groupSize),
|
||||
channels * frameCount);
|
||||
groupSize, channels * frameCount);
|
||||
|
||||
//if (max_porder > 0) // need to run even if max_porder==0 just to calculate the final frame size
|
||||
clFindPartitionOrder.SetArgs(
|
||||
clResidual,
|
||||
clBestRiceParams,
|
||||
clBestResidualTasks,
|
||||
clRiceParams,
|
||||
|
||||
@@ -874,7 +874,7 @@ void clSumPartition(
|
||||
const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1));
|
||||
|
||||
// fetch partition lengths
|
||||
int2 pl = get_local_id(0) * 2 < (1 << max_porder) ? *(__global int2*)&partition_lengths[pos + get_local_id(0) * 2] : 0;
|
||||
int2 pl = get_local_id(0) * 2 < (1 << max_porder) ? vload2(get_local_id(0),&partition_lengths[pos]) : 0;
|
||||
data[get_local_id(0)] = pl.x + pl.y;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
@@ -893,22 +893,19 @@ void clSumPartition(
|
||||
partition_lengths[pos + (1 << max_porder) + get_local_size(0) + get_local_id(0)] = data[get_local_size(0) + get_local_id(0)];
|
||||
}
|
||||
|
||||
// Finds optimal rice parameter for several partitions at a time.
|
||||
// get_group_id(0) == chunk index (chunk size is GROUP_SIZE, total task size is (2 << max_porder))
|
||||
// get_group_id(1) == task index
|
||||
// Finds optimal rice parameter for each partition.
|
||||
// get_group_id(0) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||
void clFindRiceParameter(
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
__global int* rice_parameters,
|
||||
__global int* partition_lengths,
|
||||
int max_porder
|
||||
)
|
||||
{
|
||||
const int tid = get_local_id(0);
|
||||
const int parts = min(GROUP_SIZE, 2 << max_porder);
|
||||
const int pos = (15 << (max_porder + 1)) * get_group_id(1) + get_group_id(0) * GROUP_SIZE + tid;
|
||||
|
||||
if (tid < parts)
|
||||
for (int offs = get_local_id(0); offs < (2 << max_porder); offs += GROUP_SIZE)
|
||||
{
|
||||
const int pos = (15 << (max_porder + 1)) * get_group_id(0) + offs;
|
||||
int best_l = partition_lengths[pos];
|
||||
int best_k = 0;
|
||||
for (int k = 1; k <= 14; k++)
|
||||
@@ -919,15 +916,16 @@ void clFindRiceParameter(
|
||||
}
|
||||
|
||||
// output rice parameter
|
||||
rice_parameters[(get_group_id(1) << (max_porder + 2)) + get_group_id(0) * GROUP_SIZE + tid] = best_k;
|
||||
rice_parameters[(get_group_id(0) << (max_porder + 2)) + offs] = best_k;
|
||||
// output length
|
||||
rice_parameters[(get_group_id(1) << (max_porder + 2)) + (1 << (max_porder + 1)) + get_group_id(0) * GROUP_SIZE + tid] = best_l;
|
||||
rice_parameters[(get_group_id(0) << (max_porder + 2)) + (1 << (max_porder + 1)) + offs] = best_l;
|
||||
}
|
||||
}
|
||||
|
||||
// get_group_id(0) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||
void clFindPartitionOrder(
|
||||
__global int *residual,
|
||||
__global int* best_rice_parameters,
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
__global int* rice_parameters,
|
||||
@@ -973,9 +971,8 @@ void clFindPartitionOrder(
|
||||
task.type == Constant ? obits : obits * task.blocksize;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for (int offs = 0; offs < (1 << best_porder); offs += GROUP_SIZE)
|
||||
if (offs + get_local_id(0) < (1 << best_porder))
|
||||
best_rice_parameters[(get_group_id(0) << max_porder) + offs + get_local_id(0)] = rice_parameters[pos - (2 << best_porder) + offs + get_local_id(0)];
|
||||
for (int offs = get_local_id(0); offs < (1 << best_porder); offs += GROUP_SIZE)
|
||||
best_rice_parameters[(get_group_id(0) << max_porder) + offs] = rice_parameters[pos - (2 << best_porder) + offs];
|
||||
// FIXME: should be bytes?
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -131,6 +131,8 @@ void clFindWastedBits(
|
||||
}
|
||||
}
|
||||
|
||||
#define TEMPBLOCK 64
|
||||
|
||||
// get_num_groups(0) == number of tasks
|
||||
// get_num_groups(1) == number of windows
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
@@ -145,43 +147,31 @@ void clComputeAutocor(
|
||||
FLACCLSubframeData task = tasks[get_group_id(0) * taskCount].data;
|
||||
int len = task.blocksize;
|
||||
int windowOffs = get_group_id(1) * len;
|
||||
float data1[4096 + 32];
|
||||
float data[TEMPBLOCK + MAX_ORDER + 3];
|
||||
double ac[MAX_ORDER + 4];
|
||||
|
||||
// TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32]
|
||||
|
||||
for (int tid = 0; tid < len; tid++)
|
||||
data1[tid] = samples[task.samplesOffs + tid] * window[windowOffs + tid];
|
||||
data1[len] = 0.0f;
|
||||
__global float * pout = &output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1)];
|
||||
for (int l = 1; l < MAX_ORDER; l++)
|
||||
data1[len + l] = 0.0f;
|
||||
|
||||
// double ac0 = 0.0, ac1 = 0.0, ac2 = 0.0, ac3 = 0.0;
|
||||
// for (int j = 0; j < len; j++)
|
||||
// {
|
||||
//float dj = data1[j];
|
||||
//ac0 += dj * dj;
|
||||
//ac1 += dj * data1[j + 1];
|
||||
//ac2 += dj * data1[j + 2];
|
||||
//ac3 += dj * data1[j + 3];
|
||||
// }
|
||||
// pout[0] = ac0;
|
||||
// pout[1] = ac1;
|
||||
// pout[2] = ac2;
|
||||
// pout[3] = ac3;
|
||||
for (int i = 0; i <= MAX_ORDER; ++i)
|
||||
{
|
||||
double temp = 1.0;
|
||||
double temp2 = 1.0;
|
||||
float* finish = data1 + len - i;
|
||||
ac[i] = 0.0;
|
||||
|
||||
for (float* pdata = data1; pdata < finish; pdata += 2)
|
||||
for (int pos = 0; pos < len; pos += TEMPBLOCK)
|
||||
{
|
||||
for (int tid = 0; tid < TEMPBLOCK + MAX_ORDER + 3; tid++)
|
||||
data[tid] = tid < len - pos ? samples[task.samplesOffs + pos + tid] * window[windowOffs + pos + tid] : 0.0f;
|
||||
|
||||
for (int i = 0; i <= MAX_ORDER; i += 4)
|
||||
{
|
||||
temp += pdata[i] * pdata[0];
|
||||
temp2 += pdata[i + 1] * pdata[1];
|
||||
float4 temp = 0.0;
|
||||
for (int j = 0; j < min(TEMPBLOCK, len - pos); j++)
|
||||
temp += data[j] * vload4(0, &data[j + i]);
|
||||
ac[i] += temp.x;
|
||||
ac[i+1] += temp.y;
|
||||
ac[i+2] += temp.z;
|
||||
ac[i+3] += temp.w;
|
||||
}
|
||||
pout[i] = temp + temp2;
|
||||
}
|
||||
__global float * pout = &output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1)];
|
||||
for (int i = 0; i <= MAX_ORDER; ++i)
|
||||
pout[i] = ac[i];
|
||||
}
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
@@ -319,13 +309,36 @@ void clQuantizeLPC(
|
||||
}
|
||||
}
|
||||
|
||||
#define ESTIMATE_N(ro,sum) for (int pos = ro; pos < bs; pos ++) { \
|
||||
__global int *ptr = data + pos - ro; \
|
||||
int t = clamp((data[pos] - ((sum) >> task.data.shift)) >> task.data.wbits, -0x7fffff, 0x7fffff); \
|
||||
len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31); \
|
||||
}
|
||||
inline int calc_residual(__global int *ptr, int * coefs, int ro)
|
||||
{
|
||||
int sum = 0;
|
||||
for (int i = 0; i < ro; i++)
|
||||
sum += ptr[i] * coefs[i];
|
||||
return sum;
|
||||
}
|
||||
|
||||
// int sum = 0; for (int i = 0; i < ro; i++) sum += *(ptr++) * task.coefs[i];
|
||||
#define ENCODE_N(cro,action) for (int pos = cro; pos < bs; pos ++) { \
|
||||
int t = (data[pos] - (calc_residual(data + pos - cro, task.coefs, cro) >> task.data.shift)) >> task.data.wbits; \
|
||||
action; \
|
||||
}
|
||||
#define SWITCH_N(action) \
|
||||
switch (ro) \
|
||||
{ \
|
||||
case 0: ENCODE_N(0, action) break; \
|
||||
case 1: ENCODE_N(1, action) break; \
|
||||
case 2: ENCODE_N(2, action) /*if (task.coefs[0] == -1 && task.coefs[1] == 2) ENCODE_N(2, 2 * ptr[1] - ptr[0], action) else*/ break; \
|
||||
case 3: ENCODE_N(3, action) break; \
|
||||
case 4: ENCODE_N(4, action) break; \
|
||||
case 5: ENCODE_N(5, action) break; \
|
||||
case 6: ENCODE_N(6, action) break; \
|
||||
case 7: ENCODE_N(7, action) break; \
|
||||
case 8: ENCODE_N(8, action) break; \
|
||||
case 9: ENCODE_N(9, action) break; \
|
||||
case 10: ENCODE_N(10, action) break; \
|
||||
case 11: ENCODE_N(11, action) break; \
|
||||
case 12: ENCODE_N(12, action) break; \
|
||||
default: ENCODE_N(ro, action) \
|
||||
}
|
||||
|
||||
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clEstimateResidual(
|
||||
@@ -337,72 +350,15 @@ void clEstimateResidual(
|
||||
int ro = task.data.residualOrder;
|
||||
int bs = task.data.blocksize;
|
||||
#define EPO 6
|
||||
int len[1 << EPO];
|
||||
int len[1 << EPO]; // blocksize / 64!!!!
|
||||
|
||||
#if 0
|
||||
//float data[4096 + 32];
|
||||
//float fcoef[32];
|
||||
|
||||
// TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32]
|
||||
|
||||
for (int tid = 0; tid < bs; tid++)
|
||||
data[tid] = (float)samples[task.data.samplesOffs + tid] / (1 << task.data.wbits);
|
||||
for (int tid = 0; tid < 32; tid++)
|
||||
fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
|
||||
float4 c0 = vload4(0, &fcoef[0]);
|
||||
float4 c1 = vload4(1, &fcoef[0]);
|
||||
float4 c2 = vload4(2, &fcoef[0]);
|
||||
#else
|
||||
__global int *data = &samples[task.data.samplesOffs];
|
||||
for (int i = ro; i < 32; i++)
|
||||
task.coefs[i] = 0;
|
||||
#endif
|
||||
// for (int i = ro; i < 32; i++)
|
||||
//task.coefs[i] = 0;
|
||||
for (int i = 0; i < 1 << EPO; i++)
|
||||
len[i] = 0;
|
||||
|
||||
switch (ro)
|
||||
{
|
||||
case 0: ESTIMATE_N(0, 0) break;
|
||||
case 1: ESTIMATE_N(1, *ptr * task.coefs[0]) break;
|
||||
case 2: ESTIMATE_N(2, *(ptr++) * task.coefs[0] + *ptr * task.coefs[1]) break;
|
||||
case 3: ESTIMATE_N(3, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *ptr * task.coefs[2]) break;
|
||||
case 4: ESTIMATE_N(4, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *ptr * task.coefs[3]) break;
|
||||
case 5: ESTIMATE_N(5, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *ptr * task.coefs[4]) break;
|
||||
case 6: ESTIMATE_N(6, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *ptr * task.coefs[5]) break;
|
||||
case 7: ESTIMATE_N(7, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *ptr * task.coefs[6]) break;
|
||||
case 8: ESTIMATE_N(8, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *ptr * task.coefs[7]) break;
|
||||
case 9: ESTIMATE_N(9, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *ptr * task.coefs[8]) break;
|
||||
case 10: ESTIMATE_N(10, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *ptr * task.coefs[9]) break;
|
||||
case 11: ESTIMATE_N(11, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *ptr * task.coefs[10]) break;
|
||||
case 12: ESTIMATE_N(12, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *ptr * task.coefs[11]) break;
|
||||
default:
|
||||
for (int pos = ro; pos < bs; pos ++)
|
||||
{
|
||||
#if 0
|
||||
float sum = dot(vload4(0, data + pos - ro), c0)
|
||||
+ dot(vload4(1, data + pos - ro), c1)
|
||||
+ dot(vload4(2, data + pos - ro), c2)
|
||||
;
|
||||
int t = convert_int_rte(data[pos] + sum);
|
||||
#else
|
||||
__global int *ptr = data + pos - ro;
|
||||
int sum =
|
||||
*(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3]
|
||||
+ *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7]
|
||||
+ *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *(ptr++) * task.coefs[11]
|
||||
;
|
||||
for (int i = 12; i < ro; i++)
|
||||
sum += *(ptr++) * task.coefs[i];
|
||||
int t = (data[pos] - (sum >> task.data.shift)) >> task.data.wbits;
|
||||
#endif
|
||||
// overflow protection
|
||||
t = clamp(t, -0x7fffff, 0x7fffff);
|
||||
// convert to unsigned
|
||||
t = (t << 1) ^ (t >> 31);
|
||||
len[pos >> (12 - EPO)] += t;
|
||||
}
|
||||
break;
|
||||
}
|
||||
SWITCH_N((t = clamp(t, -0x7fffff, 0x7fffff), len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31)))
|
||||
|
||||
int total = 0;
|
||||
for (int i = 0; i < 1 << EPO; i++)
|
||||
@@ -497,4 +453,183 @@ void clCopyBestMethodStereo(
|
||||
tasks_out[2 * get_group_id(0) + 1].data.residualOffs = tasks[best_index[1]].data.residualOffs;
|
||||
}
|
||||
|
||||
// get_group_id(0) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clEncodeResidual(
|
||||
__global int *residual,
|
||||
__global int *samples,
|
||||
__global FLACCLSubframeTask *tasks
|
||||
)
|
||||
{
|
||||
FLACCLSubframeTask task = tasks[get_group_id(0)];
|
||||
int bs = task.data.blocksize;
|
||||
int ro = task.data.residualOrder;
|
||||
__global int *data = &samples[task.data.samplesOffs];
|
||||
SWITCH_N(residual[task.data.residualOffs + pos] = t);
|
||||
}
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clCalcPartition(
|
||||
__global int *partition_lengths,
|
||||
__global int *residual,
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
int max_porder, // <= 8
|
||||
int psize // == task.blocksize >> max_porder?
|
||||
)
|
||||
{
|
||||
FLACCLSubframeTask task = tasks[get_group_id(1)];
|
||||
int bs = task.data.blocksize;
|
||||
int ro = task.data.residualOrder;
|
||||
//int psize = bs >> max_porder;
|
||||
__global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
|
||||
|
||||
for (int p = 0; p < (1 << max_porder); p++)
|
||||
pl[p] = 0;
|
||||
|
||||
for (int pos = ro; pos < bs; pos ++)
|
||||
{
|
||||
int t = residual[task.data.residualOffs + pos];
|
||||
// overflow protection
|
||||
t = clamp(t, -0x7fffff, 0x7fffff);
|
||||
// convert to unsigned
|
||||
t = (t << 1) ^ (t >> 31);
|
||||
pl[pos / psize] += t;
|
||||
}
|
||||
}
|
||||
// get_group_id(0) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clCalcPartition16(
|
||||
__global int *partition_lengths,
|
||||
__global int *residual,
|
||||
__global int *samples,
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
int max_porder // <= 8
|
||||
)
|
||||
{
|
||||
FLACCLSubframeTask task = tasks[get_group_id(0)];
|
||||
int bs = task.data.blocksize;
|
||||
int ro = task.data.residualOrder;
|
||||
__global int *data = &samples[task.data.samplesOffs];
|
||||
__global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(0);
|
||||
for (int p = 0; p < (1 << max_porder); p++)
|
||||
pl[p] = 0;
|
||||
SWITCH_N((residual[task.data.residualOffs + pos] = t, t = clamp(t, -0x7fffff, 0x7fffff), t = (t << 1) ^ (t >> 31), pl[pos >> 4] += t));
|
||||
}
|
||||
|
||||
// Sums partition lengths for a certain k == get_group_id(0)
|
||||
// get_group_id(0) == k
|
||||
// get_group_id(1) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clSumPartition(
|
||||
__global int* partition_lengths,
|
||||
int max_porder
|
||||
)
|
||||
{
|
||||
if (get_group_id(0) != 0) // ignore k != 0
|
||||
return;
|
||||
__global int * sums = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
|
||||
for (int i = max_porder - 1; i >= 0; i--)
|
||||
{
|
||||
for (int j = 0; j < (1 << i); j++)
|
||||
{
|
||||
sums[(2 << i) + j] = sums[2 * j] + sums[2 * j + 1];
|
||||
// if (get_group_id(1) == 0)
|
||||
//printf("[%d][%d]: %d + %d == %d\n", i, j, sums[2 * j], sums[2 * j + 1], sums[2 * j] + sums[2 * j + 1]);
|
||||
}
|
||||
sums += 2 << i;
|
||||
}
|
||||
}
|
||||
|
||||
// Finds optimal rice parameter for each partition.
|
||||
// get_group_id(0) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clFindRiceParameter(
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
__global int* rice_parameters,
|
||||
__global int* partition_lengths,
|
||||
int max_porder
|
||||
)
|
||||
{
|
||||
__global FLACCLSubframeTask* task = tasks + get_group_id(0);
|
||||
const int tid = get_local_id(0);
|
||||
int lim = (2 << max_porder) - 1;
|
||||
int psize = task->data.blocksize >> max_porder;
|
||||
int bs = task->data.blocksize;
|
||||
int ro = task->data.residualOrder;
|
||||
for (int offs = 0; offs < lim; offs ++)
|
||||
{
|
||||
int pl = partition_lengths[(1 << (max_porder + 1)) * get_group_id(0) + offs];
|
||||
int porder = 31 - clz(lim - offs);
|
||||
int ps = (bs >> porder) - select(0, ro, offs == lim + 1 - (2 << porder));
|
||||
//if (ps <= 0)
|
||||
// printf("max_porder == %d, porder == %d, ro == %d\n", max_porder, porder, ro);
|
||||
int k = clamp(31 - clz(pl / max(1, ps)), 0, 14);
|
||||
int plk = ps * (k + 1) + (pl >> k);
|
||||
|
||||
// output rice parameter
|
||||
rice_parameters[(get_group_id(0) << (max_porder + 2)) + offs] = k;
|
||||
// output length
|
||||
rice_parameters[(get_group_id(0) << (max_porder + 2)) + (1 << (max_porder + 1)) + offs] = plk;
|
||||
}
|
||||
}
|
||||
|
||||
// get_group_id(0) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clFindPartitionOrder(
|
||||
__global int *residual,
|
||||
__global int* best_rice_parameters,
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
__global int* rice_parameters,
|
||||
int max_porder
|
||||
)
|
||||
{
|
||||
__global FLACCLSubframeTask* task = tasks + get_group_id(0);
|
||||
int partlen[9];
|
||||
for (int p = 0; p < 9; p++)
|
||||
partlen[p] = 0;
|
||||
// fetch partition lengths
|
||||
const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder);
|
||||
int lim = (2 << max_porder) - 1;
|
||||
for (int offs = 0; offs < lim; offs ++)
|
||||
{
|
||||
int len = rice_parameters[pos + offs];
|
||||
int porder = 31 - clz(lim - offs);
|
||||
partlen[porder] += len;
|
||||
}
|
||||
|
||||
int best_length = partlen[0] + 4;
|
||||
int best_porder = 0;
|
||||
for (int porder = 1; porder <= max_porder; porder++)
|
||||
{
|
||||
int length = (4 << porder) + partlen[porder];
|
||||
best_porder = select(best_porder, porder, length < best_length);
|
||||
best_length = min(best_length, length);
|
||||
}
|
||||
|
||||
best_length = (4 << best_porder) + task->data.blocksize - task->data.residualOrder;
|
||||
int best_psize = task->data.blocksize >> best_porder;
|
||||
int start = task->data.residualOffs + task->data.residualOrder;
|
||||
int fin = task->data.residualOffs + best_psize;
|
||||
for (int p = 0; p < (1 << best_porder); p++)
|
||||
{
|
||||
int k = rice_parameters[pos - (2 << best_porder) + p];
|
||||
best_length += k * (fin - start);
|
||||
for (int i = start; i < fin; i++)
|
||||
{
|
||||
int t = residual[i];
|
||||
best_length += ((t << 1) ^ (t >> 31)) >> k;
|
||||
}
|
||||
start = fin;
|
||||
fin += best_psize;
|
||||
}
|
||||
|
||||
int obits = task->data.obits - task->data.wbits;
|
||||
task->data.porder = best_porder;
|
||||
task->data.size =
|
||||
task->data.type == Fixed ? task->data.residualOrder * obits + 6 + best_length :
|
||||
task->data.type == LPC ? task->data.residualOrder * obits + 6 + best_length + 4 + 5 + task->data.residualOrder * task->data.cbits :
|
||||
task->data.type == Constant ? obits : obits * task->data.blocksize;
|
||||
for (int offs = 0; offs < (1 << best_porder); offs ++)
|
||||
best_rice_parameters[(get_group_id(0) << max_porder) + offs] = rice_parameters[pos - (2 << best_porder) + offs];
|
||||
}
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user