optimizations

This commit is contained in:
chudov
2010-12-11 07:20:54 +00:00
parent 26328d99be
commit 51639e3c7b
2 changed files with 98 additions and 129 deletions

View File

@@ -964,6 +964,10 @@ namespace CUETools.Codecs.FLACCL
{
switch (sub.best.type)
{
case SubframeType.Constant:
return (uint)sub.obits;
case SubframeType.Verbatim:
return (uint)(sub.obits * frame.blocksize);
case SubframeType.Fixed:
return measure_subframe_fixed(frame, sub);
case SubframeType.LPC:
@@ -1214,99 +1218,10 @@ namespace CUETools.Codecs.FLACCL
}
}
unsafe void encode_residual(FLACCLTask task)
unsafe void encode_residual(FLACCLTask task, int channelsCount, int iFrame)
{
bool unpacked = false;
unpack_samples(task, Math.Min(32, task.frameSize));
for (int ch = 0; ch < channels; ch++)
{
switch (task.frame.subframes[ch].best.type)
{
case SubframeType.Constant:
break;
case SubframeType.Verbatim:
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
break;
case SubframeType.Fixed:
if (!task.UseGPUOnly)
{
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples,
task.frame.blocksize, task.frame.subframes[ch].best.order);
FlacFrame frame = task.frame;
int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 6;
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0);
}
break;
case SubframeType.LPC:
fixed (int* coefs = task.frame.subframes[ch].best.coefs)
{
ulong csum = 0;
for (int i = task.frame.subframes[ch].best.order; i > 0; i--)
csum += (ulong)Math.Abs(coefs[i - 1]);
#if DEBUG
// check size
if (task.UseGPUOnly && !task.UseGPURice)
{
uint real_size = measure_subframe(task.frame, task.frame.subframes[ch]);
if (real_size != task.frame.subframes[ch].best.size)
throw new Exception("size reported incorrectly");
}
#endif
if ((((csum << task.frame.subframes[ch].obits) >= 1UL << 32) && PCM.BitsPerSample == 16) || !task.UseGPUOnly)
{
if (task.UseGPURice)
#if DEBUG
// throw new Exception("DoRice failed");
break;
#else
break;
#endif
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32)
lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
else
lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 4 + 5 + (uint)task.frame.subframes[ch].best.order * (uint)task.frame.subframes[ch].best.cbits + 6;
#if KLJLKJLKJL
uint oldsize = task.frame.subframes[ch].best.size;
RiceContext rc1 = task.frame.subframes[ch].best.rc;
task.frame.subframes[ch].best.rc = new RiceContext();
#endif
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0);
task.frame.subframes[ch].best.size = measure_subframe(task.frame, task.frame.subframes[ch]);
#if KJHKJH
// check size
if (task.UseGPUOnly && oldsize > task.frame.subframes[ch].best.size)
throw new Exception("unoptimal size reported");
#endif
//if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize &&
// oldsize <= task.frame.subframes[ch].obits * (uint)task.frame.blocksize)
// throw new Exception("oops");
}
}
break;
}
if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * task.frame.blocksize)
{
#if DEBUG
throw new Exception("larger than verbatim");
#endif
task.frame.subframes[ch].best.type = SubframeType.Verbatim;
task.frame.subframes[ch].best.size = (uint)(task.frame.subframes[ch].obits * task.frame.blocksize);
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
}
}
}
unsafe void select_best_methods(FlacFrame frame, int channelsCount, int iFrame, FLACCLTask task)
{
if (channelsCount == 4 && channels == 2 && frame.blocksize > 4)
{
if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
@@ -1325,49 +1240,106 @@ namespace CUETools.Codecs.FLACCL
else
frame.ch_mode = channels != 2 ? ChannelMode.NotStereo : ChannelMode.LeftRight;
// calculate wbits before unpacking samples.
for (int ch = 0; ch < channels; ch++)
{
int index = ch + iFrame * channels;
frame.subframes[ch].wbits = frame.blocksize > 4
? task.BestResidualTasks[index].wbits : 0;
}
unpack_samples(task, Math.Min(task.frameSize, eparams.max_prediction_order));
for (int ch = 0; ch < channels; ch++)
{
int index = ch + iFrame * channels;
frame.subframes[ch].best.residual = ((int*)task.clResidualPtr) + task.BestResidualTasks[index].residualOffs;
frame.subframes[ch].best.type = SubframeType.Verbatim;
frame.subframes[ch].best.size = (uint)(frame.subframes[ch].obits * frame.blocksize);
frame.subframes[ch].wbits = 0;
if (frame.blocksize <= Math.Max(4, eparams.max_prediction_order))
continue;
if (task.BestResidualTasks[index].size < 0)
throw new Exception("internal error");
if (frame.subframes[ch].best.size > task.BestResidualTasks[index].size
&& (SubframeType)task.BestResidualTasks[index].type != SubframeType.Verbatim)
if (frame.blocksize > Math.Max(4, eparams.max_prediction_order))
{
frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type;
frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size;
frame.subframes[ch].best.order = task.BestResidualTasks[index].residualOrder;
frame.subframes[ch].best.cbits = task.BestResidualTasks[index].cbits;
frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift;
frame.subframes[ch].obits -= task.BestResidualTasks[index].wbits;
frame.subframes[ch].wbits = task.BestResidualTasks[index].wbits;
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
frame.subframes[ch].best.rc.coding_method = task.BestResidualTasks[index].coding_method;
if (task.UseGPUOnly && !task.UseGPURice && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
//if (task.UseGPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
if (task.BestResidualTasks[index].size < 0)
throw new Exception("internal error");
if (frame.subframes[ch].best.size > task.BestResidualTasks[index].size
&& (SubframeType)task.BestResidualTasks[index].type != SubframeType.Verbatim)
{
int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder);
fixed (int* dstParams = frame.subframes[ch].best.rc.rparams)
AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder));
//for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++)
// frame.subframes[ch].best.rc.rparams[i] = riceParams[i];
uint real_size = measure_subframe(frame, frame.subframes[ch]);
if (real_size != task.frame.subframes[ch].best.size)
frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type;
frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size;
frame.subframes[ch].best.order = task.BestResidualTasks[index].residualOrder;
frame.subframes[ch].best.cbits = task.BestResidualTasks[index].cbits;
frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift;
frame.subframes[ch].obits -= task.BestResidualTasks[index].wbits;
frame.subframes[ch].wbits = task.BestResidualTasks[index].wbits;
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
frame.subframes[ch].best.rc.coding_method = task.BestResidualTasks[index].coding_method;
if (task.UseGPUOnly && !task.UseGPURice)
{
if (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)
{
int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder);
fixed (int* dstParams = frame.subframes[ch].best.rc.rparams)
AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder));
}
uint real_size = measure_subframe(frame, frame.subframes[ch]);
if (real_size != task.frame.subframes[ch].best.size)
throw new Exception("size reported incorrectly");
}
}
else
{
if (task.UseGPURice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size)
throw new Exception("size reported incorrectly");
}
}
else
switch (task.frame.subframes[ch].best.type)
{
if (task.UseGPURice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size)
throw new Exception("size reported incorrectly");
case SubframeType.Constant:
break;
case SubframeType.Verbatim:
unpack_samples(task, task.frameSize);
break;
case SubframeType.Fixed:
if (!task.UseGPUOnly)
{
unpack_samples(task, task.frameSize);
encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples,
task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0);
}
break;
case SubframeType.LPC:
if (!task.UseGPUOnly)
{
unpack_samples(task, task.frameSize);
fixed (int* coefs = task.frame.subframes[ch].best.coefs)
{
if (PCM.BitsPerSample > 16)
lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
else
lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
}
int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0);
}
break;
}
if (!task.UseGPUOnly)
{
task.frame.subframes[ch].best.size = measure_subframe(task.frame, task.frame.subframes[ch]);
if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * task.frame.blocksize)
{
task.frame.subframes[ch].best.type = SubframeType.Verbatim;
task.frame.subframes[ch].best.size = (uint)(task.frame.subframes[ch].obits * task.frame.blocksize);
}
}
}
}
@@ -1577,9 +1549,7 @@ namespace CUETools.Codecs.FLACCL
((int*)task.clResidualPtr) + ch * task.channelSize + iFrame * task.frameSize,
_pcm.BitsPerSample + (doMidside && ch == 3 ? 1 : 0), 0);
select_best_methods(task.frame, channelCount, iFrame, task);
//unpack_samples(task);
encode_residual(task);
encode_residual(task, channelCount, iFrame);
//task.frame.writer.Reset();
task.frame.frame_number = current_frame_number;

View File

@@ -616,6 +616,7 @@ void clQuantizeLPC(
{
int bs = tasks[get_group_id(1) * taskCount].data.blocksize;
int abits = tasks[get_group_id(1) * taskCount].data.abits;
int obits = tasks[get_group_id(1) * taskCount].data.obits;
int lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
float error[MAX_ORDER];
int best_orders[MAX_ORDER];
@@ -667,7 +668,7 @@ void clQuantizeLPC(
#if BITS_PER_SAMPLE > 16
int cbits = max(3, min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits));
#else
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - abits));
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - obits));
#endif
// calculate shift based on precision and number of leading zeroes in coeffs
int shift = max(0,min(15, clz(tmpi) - 18 + cbits));
@@ -822,7 +823,7 @@ void clQuantizeLPC(
#if BITS_PER_SAMPLE > 16
int cbits = max(3, min(min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), 15));
#else
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits));
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.obits));
#endif
// calculate shift based on precision and number of leading zeroes in coeffs
int shift = max(0,min(15, clz(shared.maxcoef[i]) - 18 + cbits));
@@ -1145,7 +1146,6 @@ void clEstimateResidual(
atom_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
#endif
}
#if 1
if (pos < bs)
{
// fetch samples
@@ -1192,7 +1192,6 @@ void clEstimateResidual(
atom_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
#endif
}
#endif
// calculate rice partition bit length for every 32 samples
barrier(CLK_LOCAL_MEM_FENCE);
@@ -1421,7 +1420,7 @@ void clEncodeResidual(
int4 cptr2 = vload4(2, &task.coefs[0]);
#endif
// We tweaked coeffs so that (task.cbits + task.abits + clz(ro) <= 32)
// We tweaked coeffs so that (task.cbits + task.obits + log2i(ro) <= 32)
// when BITS_PER_SAMPLE == 16, so we don't need 64-bit arithmetic.
data[tid] = 0;