opencl flac encoder

This commit is contained in:
chudov
2010-10-11 08:46:36 +00:00
parent 04ca40e627
commit badfb2fec8
3 changed files with 108 additions and 47 deletions

View File

@@ -754,6 +754,32 @@ namespace CUETools.Codecs.FLACCL
frame.writer.writebits(8, crc);
}
unsafe int measure_residual(FlacFrame frame, FlacSubframeInfo sub, int pos, int cnt, int k)
{
int q = 0;
for (int i = pos; i < pos + cnt; i++)
{
int v = sub.best.residual[i];
v = (v << 1) ^ (v >> 31);
q += (v >> k);
}
return (k + 1) * cnt + q;
}
unsafe int measure_residual(FlacFrame frame, FlacSubframeInfo sub)
{
// partition order
int porder = sub.best.rc.porder;
int psize = frame.blocksize >> porder;
//assert(porder >= 0);
int size = 6 + (4 << porder);
size += measure_residual(frame, sub, sub.best.order, psize - sub.best.order, sub.best.rc.rparams[0]);
// residual
for (int p = 1; p < (1 << porder); p++)
size += measure_residual(frame, sub, p * psize, psize, sub.best.rc.rparams[p]);
return size;
}
unsafe void output_residual(FlacFrame frame, FlacSubframeInfo sub)
{
// rice-encoded block
@@ -806,6 +832,12 @@ namespace CUETools.Codecs.FLACCL
output_residual(frame, sub);
}
unsafe uint
measure_subframe_lpc(FlacFrame frame, FlacSubframeInfo sub)
{
return (uint)(sub.best.order * sub.obits + 9 + sub.best.order * sub.best.cbits + measure_residual(frame, sub));
}
unsafe void
output_subframe_lpc(FlacFrame frame, FlacSubframeInfo sub)
{
@@ -898,8 +930,8 @@ namespace CUETools.Codecs.FLACCL
task.nResidualTasks = 0;
task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window);
task.nResidualTasksPerChannel = _windowcount * task.nTasksPerWindow + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order;
if (task.nResidualTasksPerChannel >= 4)
task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7;
//if (task.nResidualTasksPerChannel >= 4)
// task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7;
task.nAutocorTasksPerChannel = _windowcount;
for (int iFrame = 0; iFrame < nFrames; iFrame++)
{
@@ -973,20 +1005,20 @@ namespace CUETools.Codecs.FLACCL
}
task.nResidualTasks++;
}
// Filler
while ((task.nResidualTasks % task.nResidualTasksPerChannel) != 0)
{
task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Verbatim;
task.ResidualTasks[task.nResidualTasks].channel = ch;
task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
task.ResidualTasks[task.nResidualTasks].residualOrder = 0;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].shift = 0;
task.nResidualTasks++;
}
//// Filler
//while ((task.nResidualTasks % task.nResidualTasksPerChannel) != 0)
//{
// task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Verbatim;
// task.ResidualTasks[task.nResidualTasks].channel = ch;
// task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
// task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
// task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
// task.ResidualTasks[task.nResidualTasks].residualOrder = 0;
// task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize;
// task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
// task.ResidualTasks[task.nResidualTasks].shift = 0;
// task.nResidualTasks++;
//}
}
}
if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen)
@@ -1029,6 +1061,17 @@ namespace CUETools.Codecs.FLACCL
ulong csum = 0;
for (int i = task.frame.subframes[ch].best.order; i > 0; i--)
csum += (ulong)Math.Abs(coefs[i - 1]);
#if DEBUG
// check size
if (_settings.GPUOnly)
{
uint real_size = measure_subframe_lpc(task.frame, task.frame.subframes[ch]);
if (real_size != task.frame.subframes[ch].best.size)
throw new Exception("size reported incorrectly");
}
#endif
if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly)
{
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
@@ -1039,8 +1082,18 @@ namespace CUETools.Codecs.FLACCL
int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 4 + 5 + (uint)task.frame.subframes[ch].best.order * (uint)task.frame.subframes[ch].best.cbits + 6;
//uint oldsize = task.frame.subframes[ch].best.size;
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order);
#if KLJLKJLKJL
uint oldsize = task.frame.subframes[ch].best.size;
RiceContext rc1 = task.frame.subframes[ch].best.rc;
task.frame.subframes[ch].best.rc = new RiceContext();
#endif
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order);
task.frame.subframes[ch].best.size = measure_subframe_lpc(task.frame, task.frame.subframes[ch]);
#if KJHKJH
// check size
if (_settings.GPUOnly && oldsize > task.frame.subframes[ch].best.size)
throw new Exception("unoptimal size reported");
#endif
//if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize &&
// oldsize <= task.frame.subframes[ch].obits * (uint)task.frame.blocksize)
// throw new Exception("oops");
@@ -1137,19 +1190,19 @@ namespace CUETools.Codecs.FLACCL
task.cudaComputeLPC.SetArg(0, task.cudaResidualTasks);
task.cudaComputeLPC.SetArg(1, task.cudaAutocorOutput);
task.cudaComputeLPC.SetArg(2, task.cudaLPCData);
task.cudaComputeLPC.SetArg(3, (uint)task.nResidualTasksPerChannel);
task.cudaComputeLPC.SetArg(3, task.nResidualTasksPerChannel);
task.cudaComputeLPC.SetArg(4, (uint)_windowcount);
task.cudaQuantizeLPC.SetArg(0, task.cudaResidualTasks);
task.cudaQuantizeLPC.SetArg(1, task.cudaLPCData);
task.cudaQuantizeLPC.SetArg(2, (uint)task.nResidualTasksPerChannel);
task.cudaQuantizeLPC.SetArg(2, task.nResidualTasksPerChannel);
task.cudaQuantizeLPC.SetArg(3, (uint)task.nTasksPerWindow);
task.cudaQuantizeLPC.SetArg(4, (uint)eparams.lpc_min_precision_search);
task.cudaQuantizeLPC.SetArg(5, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search));
task.cudaCopyBestMethod.SetArg(0, task.cudaBestResidualTasks);
task.cudaCopyBestMethod.SetArg(1, task.cudaResidualTasks);
task.cudaCopyBestMethod.SetArg(2, (uint)task.nResidualTasksPerChannel);
task.cudaCopyBestMethod.SetArg(2, task.nResidualTasksPerChannel);
task.cudaCopyBestMethodStereo.SetArg(0, task.cudaBestResidualTasks);
task.cudaCopyBestMethodStereo.SetArg(1, task.cudaResidualTasks);
@@ -2384,8 +2437,8 @@ namespace CUETools.Codecs.FLACCL
cudaComputeAutocor.SetArg(1, cudaSamples);
cudaComputeAutocor.SetArg(2, cudaWindow);
cudaComputeAutocor.SetArg(3, cudaResidualTasks);
cudaComputeAutocor.SetArg(4, (uint)nAutocorTasksPerChannel - 1);
cudaComputeAutocor.SetArg(5, (uint)nResidualTasksPerChannel);
cudaComputeAutocor.SetArg(4, nAutocorTasksPerChannel - 1);
cudaComputeAutocor.SetArg(5, nResidualTasksPerChannel);
int workX = max_prediction_order / 4 + 1;
int workY = nAutocorTasksPerChannel * channelsCount * frameCount;
@@ -2406,7 +2459,7 @@ namespace CUETools.Codecs.FLACCL
{
cudaChooseBestMethod.SetArg(0, cudaResidualTasks);
cudaChooseBestMethod.SetArg(1, cudaResidualOutput);
cudaChooseBestMethod.SetArg(2, (uint)nResidualTasksPerChannel);
cudaChooseBestMethod.SetArg(2, nResidualTasksPerChannel);
openCLCQ.EnqueueNDRangeKernel(cudaChooseBestMethod, 2, null, new int[] { 32, channelsCount * frameCount }, new int[] { 32, 1 });
}

View File

@@ -621,18 +621,18 @@ void cudaChooseBestMethod(
tasks[tid + taskCount * get_group_id(1)].data.size = shared.length[tid];
int l1 = shared.length[tid];
for (int sh = 4; sh > 0; sh --)
for (int l = 16; l > 0; l >>= 1)
{
if (tid < (1 << sh))
if (tid < l)
{
int l2 = shared.length[tid + (1 << sh)];
shared.index[tid] = shared.index[tid + ((l2 < l1) << sh)];
int l2 = shared.length[tid + l];
shared.index[tid] = shared.index[tid + select(0, l, l2 < l1)];
shared.length[tid] = l1 = min(l1, l2);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0)
tasks[taskCount * get_group_id(1)].data.best_index = taskCount * get_group_id(1) + shared.index[shared.length[1] < shared.length[0]];
tasks[taskCount * get_group_id(1)].data.best_index = taskCount * get_group_id(1) + shared.index[0];
}
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
@@ -921,13 +921,14 @@ void cudaFindPartitionOrder(
if (get_local_id(0) < 32)
shared.index[get_local_id(0)] = get_local_id(0);
barrier(CLK_LOCAL_MEM_FENCE);
//atom_min(shared.index[get_local_id(0)],);
int l1 = get_local_id(0) <= max_porder ? shared.length[get_local_id(0)] : 0xfffffff;
for (int sh = 3; sh >= 0; sh --)
for (int l = 8; l > 0; l >>= 1)
{
if (get_local_id(0) < (1 << sh))
if (get_local_id(0) < l)
{
int l2 = shared.length[get_local_id(0) + (1 << sh)];
shared.index[get_local_id(0)] = shared.index[get_local_id(0) + ((l2 < l1) << sh)];
int l2 = shared.length[get_local_id(0) + l];
shared.index[get_local_id(0)] = shared.index[get_local_id(0) + select(0, l, l2 < l1)];
shared.length[get_local_id(0)] = l1 = min(l1, l2);
}
barrier(CLK_LOCAL_MEM_FENCE);