24-bit/multichannel support

This commit is contained in:
chudov
2010-12-07 22:52:34 +00:00
parent 6783bba2e9
commit 6585ea2001
12 changed files with 691 additions and 432 deletions

View File

@@ -87,7 +87,7 @@ namespace CUETools.Codecs.FLACCL
[SRDescription(typeof(Properties.Resources), "DescriptionDeviceType")] [SRDescription(typeof(Properties.Resources), "DescriptionDeviceType")]
public OpenCLDeviceType DeviceType { get; set; } public OpenCLDeviceType DeviceType { get; set; }
int cpu_threads = 1; int cpu_threads = 0;
[DefaultValue(1)] [DefaultValue(1)]
[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")] [SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
public int CPUThreads public int CPUThreads
@@ -214,10 +214,11 @@ namespace CUETools.Codecs.FLACCL
{ {
_pcm = pcm; _pcm = pcm;
if (pcm.BitsPerSample != 16) // FIXME: For now, only 16-bit encoding is supported
if (pcm.BitsPerSample != 16 && pcm.BitsPerSample != 24)
throw new Exception("Bits per sample must be 16."); throw new Exception("Bits per sample must be 16.");
if (pcm.ChannelCount != 2) //if (pcm.ChannelCount != 2)
throw new Exception("ChannelCount must be 2."); // throw new Exception("ChannelCount must be 2.");
channels = pcm.ChannelCount; channels = pcm.ChannelCount;
sample_rate = pcm.SampleRate; sample_rate = pcm.SampleRate;
@@ -288,12 +289,6 @@ namespace CUETools.Codecs.FLACCL
if (value as FLACCLWriterSettings == null) if (value as FLACCLWriterSettings == null)
throw new Exception("Unsupported options " + value); throw new Exception("Unsupported options " + value);
_settings = value as FLACCLWriterSettings; _settings = value as FLACCLWriterSettings;
if (_settings.DeviceType == OpenCLDeviceType.CPU)
{
_settings.GroupSize = 1;
//_settings.GPUOnly = true;
_settings.MappedMemory = true;
}
} }
} }
@@ -644,24 +639,28 @@ namespace CUETools.Codecs.FLACCL
} }
} }
static unsafe uint calc_optimal_rice_params(int porder, int* parm, uint* sums, uint n, uint pred_order) static unsafe uint calc_optimal_rice_params(int porder, int* parm, ulong* sums, uint n, uint pred_order, ref int method)
{ {
uint part = (1U << porder); uint part = (1U << porder);
uint cnt = (n >> porder) - pred_order; uint cnt = (n >> porder) - pred_order;
int k = cnt > 0 ? Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[0] / cnt)) : 0; int maxK = method > 0 ? 30 : Flake.MAX_RICE_PARAM;
uint all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k); int k = cnt > 0 ? Math.Min(maxK, BitReader.log2i(sums[0] / cnt)) : 0;
int realMaxK0 = k;
ulong all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k);
parm[0] = k; parm[0] = k;
cnt = (n >> porder); cnt = (n >> porder);
for (uint i = 1; i < part; i++) for (uint i = 1; i < part; i++)
{ {
k = Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[i] / cnt)); k = Math.Min(maxK, BitReader.log2i(sums[i] / cnt));
realMaxK0 = Math.Max(realMaxK0, k);
all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k); all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k);
parm[i] = k; parm[i] = k;
} }
return all_bits + (4 * part); method = realMaxK0 > Flake.MAX_RICE_PARAM ? 1 : 0;
return (uint)all_bits + ((4U + (uint)method) * part);
} }
static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums) static unsafe void calc_lower_sums(int pmin, int pmax, ulong* sums)
{ {
for (int i = pmax - 1; i >= pmin; i--) for (int i = pmax - 1; i >= pmin; i--)
{ {
@@ -674,12 +673,12 @@ namespace CUETools.Codecs.FLACCL
} }
} }
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
{ {
int parts = (1 << pmax); int parts = (1 << pmax);
uint* res = data + pred_order; uint* res = data + pred_order;
uint cnt = (n >> pmax) - pred_order; uint cnt = (n >> pmax) - pred_order;
uint sum = 0; ulong sum = 0;
for (uint j = cnt; j > 0; j--) for (uint j = cnt; j > 0; j--)
sum += *(res++); sum += *(res++);
sums[0] = sum; sums[0] = sum;
@@ -702,18 +701,18 @@ namespace CUETools.Codecs.FLACCL
/// <param name="n"></param> /// <param name="n"></param>
/// <param name="pred_order"></param> /// <param name="pred_order"></param>
/// <param name="sums"></param> /// <param name="sums"></param>
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
{ {
int parts = (1 << pmax); int parts = (1 << pmax);
uint* res = data + pred_order; uint* res = data + pred_order;
uint cnt = 18 - pred_order; uint cnt = 18 - pred_order;
uint sum = 0; ulong sum = 0UL;
for (uint j = cnt; j > 0; j--) for (uint j = cnt; j > 0; j--)
sum += *(res++); sum += *(res++);
sums[0] = sum; sums[0] = sum;
for (int i = 1; i < parts; i++) for (int i = 1; i < parts; i++)
{ {
sums[i] = sums[i] = 0UL +
*(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) +
@@ -731,18 +730,18 @@ namespace CUETools.Codecs.FLACCL
/// <param name="n"></param> /// <param name="n"></param>
/// <param name="pred_order"></param> /// <param name="pred_order"></param>
/// <param name="sums"></param> /// <param name="sums"></param>
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
{ {
int parts = (1 << pmax); int parts = (1 << pmax);
uint* res = data + pred_order; uint* res = data + pred_order;
uint cnt = 16 - pred_order; uint cnt = 16 - pred_order;
uint sum = 0; ulong sum = 0UL;
for (uint j = cnt; j > 0; j--) for (uint j = cnt; j > 0; j--)
sum += *(res++); sum += *(res++);
sums[0] = sum; sums[0] = sum;
for (int i = 1; i < parts; i++) for (int i = 1; i < parts; i++)
{ {
sums[i] = sums[i] = 0UL +
*(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) +
@@ -750,10 +749,10 @@ namespace CUETools.Codecs.FLACCL
} }
} }
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order) static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order, int max_method)
{ {
uint* udata = stackalloc uint[(int)n]; uint* udata = stackalloc uint[(int)n];
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS]; ulong* sums = stackalloc ulong[(pmax + 1) * Flake.MAX_PARTITIONS];
int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS]; int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS];
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER]; //uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
@@ -776,17 +775,21 @@ namespace CUETools.Codecs.FLACCL
uint opt_bits = AudioSamples.UINT32_MAX; uint opt_bits = AudioSamples.UINT32_MAX;
int opt_porder = pmin; int opt_porder = pmin;
int opt_method = 0;
for (int i = pmin; i <= pmax; i++) for (int i = pmin; i <= pmax; i++)
{ {
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order); int method = max_method;
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order, ref method);
if (bits <= opt_bits) if (bits <= opt_bits)
{ {
opt_bits = bits; opt_bits = bits;
opt_porder = i; opt_porder = i;
opt_method = method;
} }
} }
rc.porder = opt_porder; rc.porder = opt_porder;
rc.coding_method = opt_method;
fixed (int* rparms = rc.rparams) fixed (int* rparms = rc.rparams)
AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder)); AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder));
@@ -845,8 +848,8 @@ namespace CUETools.Codecs.FLACCL
for (int i = pos; i < pos + cnt; i++) for (int i = pos; i < pos + cnt; i++)
{ {
int v = sub.best.residual[i]; int v = sub.best.residual[i];
v = (v << 1) ^ (v >> 31); uint uv = (uint)((v << 1) ^ (v >> 31));
q += (v >> k); q += (int)(uv >> k);
} }
return (k + 1) * cnt + q; return (k + 1) * cnt + q;
} }
@@ -857,7 +860,7 @@ namespace CUETools.Codecs.FLACCL
int porder = sub.best.rc.porder; int porder = sub.best.rc.porder;
int psize = frame.blocksize >> porder; int psize = frame.blocksize >> porder;
//assert(porder >= 0); //assert(porder >= 0);
int size = 6 + (4 << porder); int size = 6 + ((4 + sub.best.rc.coding_method) << porder);
size += measure_residual(frame, sub, sub.best.order, psize - sub.best.order, sub.best.rc.rparams[0]); size += measure_residual(frame, sub, sub.best.order, psize - sub.best.order, sub.best.rc.rparams[0]);
// residual // residual
for (int p = 1; p < (1 << porder); p++) for (int p = 1; p < (1 << porder); p++)
@@ -870,13 +873,13 @@ namespace CUETools.Codecs.FLACCL
FlacFrame frame = task.frame; FlacFrame frame = task.frame;
// rice-encoded block // rice-encoded block
frame.writer.writebits(2, 0); frame.writer.writebits(2, sub.best.rc.coding_method);
// partition order // partition order
int porder = sub.best.rc.porder; int porder = sub.best.rc.porder;
//assert(porder >= 0); //assert(porder >= 0);
frame.writer.writebits(4, porder); frame.writer.writebits(4, porder);
if (_settings.GPUOnly && _settings.DoRice) if (task.UseGPURice)
{ {
int len = task.BestResidualTasks[index].size - task.BestResidualTasks[index].headerLen; int len = task.BestResidualTasks[index].size - task.BestResidualTasks[index].headerLen;
int pos = task.BestResidualTasks[index].encodingOffset; int pos = task.BestResidualTasks[index].encodingOffset;
@@ -901,7 +904,7 @@ namespace CUETools.Codecs.FLACCL
for (int p = 0; p < (1 << porder); p++) for (int p = 0; p < (1 << porder); p++)
{ {
int k = sub.best.rc.rparams[p]; int k = sub.best.rc.rparams[p];
frame.writer.writebits(4, k); frame.writer.writebits(4 + sub.best.rc.coding_method, k);
if (p == 1) res_cnt = psize; if (p == 1) res_cnt = psize;
int cnt = Math.Min(res_cnt, frame.blocksize - j); int cnt = Math.Min(res_cnt, frame.blocksize - j);
frame.writer.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt); frame.writer.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt);
@@ -1069,7 +1072,7 @@ namespace CUETools.Codecs.FLACCL
calculate_window(task, lpc.window_bartlett, WindowFunction.Bartlett); calculate_window(task, lpc.window_bartlett, WindowFunction.Bartlett);
if (task.nWindowFunctions == 0) if (task.nWindowFunctions == 0)
throw new Exception("invalid windowfunction"); throw new Exception("invalid windowfunction");
if (!_settings.MappedMemory) if (!task.UseMappedMemory)
task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, false, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr); task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, false, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr);
} }
@@ -1116,6 +1119,7 @@ namespace CUETools.Codecs.FLACCL
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize; task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].wbits = 0; task.ResidualTasks[task.nResidualTasks].wbits = 0;
task.ResidualTasks[task.nResidualTasks].coding_method = PCM.BitsPerSample > 16 ? 1 : 0;
task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize; task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize;
task.nResidualTasks++; task.nResidualTasks++;
} }
@@ -1131,6 +1135,7 @@ namespace CUETools.Codecs.FLACCL
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize; task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].wbits = 0; task.ResidualTasks[task.nResidualTasks].wbits = 0;
task.ResidualTasks[task.nResidualTasks].coding_method = PCM.BitsPerSample > 16 ? 1 : 0;
task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize; task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOrder = 1; task.ResidualTasks[task.nResidualTasks].residualOrder = 1;
task.ResidualTasks[task.nResidualTasks].shift = 0; task.ResidualTasks[task.nResidualTasks].shift = 0;
@@ -1149,6 +1154,7 @@ namespace CUETools.Codecs.FLACCL
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize; task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].wbits = 0; task.ResidualTasks[task.nResidualTasks].wbits = 0;
task.ResidualTasks[task.nResidualTasks].coding_method = PCM.BitsPerSample > 16 ? 1 : 0;
task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize; task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize;
task.ResidualTasks[task.nResidualTasks].shift = 0; task.ResidualTasks[task.nResidualTasks].shift = 0;
switch (order) switch (order)
@@ -1195,11 +1201,12 @@ namespace CUETools.Codecs.FLACCL
if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen) if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen)
throw new Exception("oops"); throw new Exception("oops");
if (!_settings.MappedMemory) if (!task.UseMappedMemory)
{
task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr); task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr);
if (!_settings.MappedMemory)
task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, false, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr); task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, false, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr);
} }
}
unsafe void encode_residual(FLACCLTask task) unsafe void encode_residual(FLACCLTask task)
{ {
@@ -1215,7 +1222,7 @@ namespace CUETools.Codecs.FLACCL
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
break; break;
case SubframeType.Fixed: case SubframeType.Fixed:
if (!_settings.GPUOnly) if (!task.UseGPUOnly)
{ {
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples,
@@ -1224,7 +1231,7 @@ namespace CUETools.Codecs.FLACCL
int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 6; uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 6;
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order); task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0);
} }
break; break;
case SubframeType.LPC: case SubframeType.LPC:
@@ -1236,7 +1243,7 @@ namespace CUETools.Codecs.FLACCL
#if DEBUG #if DEBUG
// check size // check size
if (_settings.GPUOnly && !_settings.DoRice) if (task.UseGPUOnly && !task.UseGPURice)
{ {
uint real_size = measure_subframe(task.frame, task.frame.subframes[ch]); uint real_size = measure_subframe(task.frame, task.frame.subframes[ch]);
if (real_size != task.frame.subframes[ch].best.size) if (real_size != task.frame.subframes[ch].best.size)
@@ -1244,9 +1251,9 @@ namespace CUETools.Codecs.FLACCL
} }
#endif #endif
if (((csum << task.frame.subframes[ch].obits) >= 1UL << 32) || !_settings.GPUOnly) if ((((csum << task.frame.subframes[ch].obits) >= 1UL << 32) && PCM.BitsPerSample == 16) || !task.UseGPUOnly)
{ {
if (_settings.GPUOnly && _settings.DoRice) if (task.UseGPURice)
#if DEBUG #if DEBUG
// throw new Exception("DoRice failed"); // throw new Exception("DoRice failed");
break; break;
@@ -1266,11 +1273,11 @@ namespace CUETools.Codecs.FLACCL
RiceContext rc1 = task.frame.subframes[ch].best.rc; RiceContext rc1 = task.frame.subframes[ch].best.rc;
task.frame.subframes[ch].best.rc = new RiceContext(); task.frame.subframes[ch].best.rc = new RiceContext();
#endif #endif
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order); task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0);
task.frame.subframes[ch].best.size = measure_subframe(task.frame, task.frame.subframes[ch]); task.frame.subframes[ch].best.size = measure_subframe(task.frame, task.frame.subframes[ch]);
#if KJHKJH #if KJHKJH
// check size // check size
if (_settings.GPUOnly && oldsize > task.frame.subframes[ch].best.size) if (task.UseGPUOnly && oldsize > task.frame.subframes[ch].best.size)
throw new Exception("unoptimal size reported"); throw new Exception("unoptimal size reported");
#endif #endif
//if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize && //if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize &&
@@ -1337,8 +1344,9 @@ namespace CUETools.Codecs.FLACCL
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++) for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder; frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
if (_settings.GPUOnly && !_settings.DoRice && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) frame.subframes[ch].best.rc.coding_method = task.BestResidualTasks[index].coding_method;
//if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) if (task.UseGPUOnly && !task.UseGPURice && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
//if (task.UseGPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
{ {
int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder); int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder);
fixed (int* dstParams = frame.subframes[ch].best.rc.rparams) fixed (int* dstParams = frame.subframes[ch].best.rc.rparams)
@@ -1352,7 +1360,7 @@ namespace CUETools.Codecs.FLACCL
} }
else else
{ {
if (_settings.GPUOnly && _settings.DoRice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size) if (task.UseGPURice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size)
throw new Exception("size reported incorrectly"); throw new Exception("size reported incorrectly");
} }
} }
@@ -1369,10 +1377,9 @@ namespace CUETools.Codecs.FLACCL
/// </summary> /// </summary>
/// <param name="task"></param> /// <param name="task"></param>
/// <param name="doMidside"></param> /// <param name="doMidside"></param>
unsafe void unpack_samples(FLACCLTask task, int count) unsafe void unpack_samples_16(FLACCLTask task, byte * srcptr, int count)
{ {
int iFrame = task.frame.frame_number; short* src = (short*)srcptr;
short* src = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize;
switch (task.frame.ch_mode) switch (task.frame.ch_mode)
{ {
@@ -1382,7 +1389,7 @@ namespace CUETools.Codecs.FLACCL
int* s = task.frame.subframes[ch].samples; int* s = task.frame.subframes[ch].samples;
int wbits = (int)task.frame.subframes[ch].wbits; int wbits = (int)task.frame.subframes[ch].wbits;
for (int i = 0; i < count; i++) for (int i = 0; i < count; i++)
s[i] = src[i * channels + ch] >>= wbits; s[i] = src[i * channels + ch] >> wbits;
} }
break; break;
case ChannelMode.LeftRight: case ChannelMode.LeftRight:
@@ -1448,6 +1455,108 @@ namespace CUETools.Codecs.FLACCL
} }
} }
/// <summary>
/// Splits packed 24-bit interleaved input (least significant byte first) into the
/// per-channel sample buffers of the task's current frame. The stereo channel modes
/// store left/right, left/side, side/right or mid/side in subframes 0 and 1; every
/// stored value is additionally shifted right by the owning subframe's wbits.
/// </summary>
/// <param name="task">Task whose frame.subframes[..].samples buffers receive the output.</param>
/// <param name="srcptr">Pointer to the first input byte of the frame being unpacked.</param>
/// <param name="count">Number of samples to produce for each channel.</param>
unsafe void unpack_samples_24(FLACCLTask task, byte* srcptr, int count)
{
    switch (task.frame.ch_mode)
    {
        case ChannelMode.NotStereo:
            for (int ch = 0; ch < channels; ch++)
            {
                int* dst = task.frame.subframes[ch].samples;
                // 8 bits undo the sign-extension placement, the rest is the subframe's wbits.
                int shift = 8 + (int)task.frame.subframes[ch].wbits;
                byte* p = srcptr + ch * 3;
                for (int i = 0; i < count; i++, p += PCM.BlockAlign)
                    dst[i] = ((p[0] << 8) + (p[1] << 16) + (p[2] << 24)) >> shift;
            }
            break;

        case ChannelMode.LeftRight:
            {
                int* out0 = task.frame.subframes[0].samples;
                int* out1 = task.frame.subframes[1].samples;
                int shift0 = (int)task.frame.subframes[0].wbits;
                int shift1 = (int)task.frame.subframes[1].wbits;
                byte* p = srcptr;
                // Each iteration consumes one left and one right sample: 2 * 3 bytes.
                for (int i = 0; i < count; i++, p += 6)
                {
                    // Assemble in the top 24 bits, then shift down so the sign bit extends.
                    int l = ((p[0] << 8) + (p[1] << 16) + (p[2] << 24)) >> 8;
                    int r = ((p[3] << 8) + (p[4] << 16) + (p[5] << 24)) >> 8;
                    out0[i] = l >> shift0;
                    out1[i] = r >> shift1;
                }
                break;
            }

        case ChannelMode.LeftSide:
            {
                int* out0 = task.frame.subframes[0].samples;
                int* out1 = task.frame.subframes[1].samples;
                int shift0 = (int)task.frame.subframes[0].wbits;
                int shift1 = (int)task.frame.subframes[1].wbits;
                byte* p = srcptr;
                for (int i = 0; i < count; i++, p += 6)
                {
                    int l = ((p[0] << 8) + (p[1] << 16) + (p[2] << 24)) >> 8;
                    int r = ((p[3] << 8) + (p[4] << 16) + (p[5] << 24)) >> 8;
                    out0[i] = l >> shift0;
                    out1[i] = (l - r) >> shift1;
                }
                break;
            }

        case ChannelMode.RightSide:
            {
                int* out0 = task.frame.subframes[0].samples;
                int* out1 = task.frame.subframes[1].samples;
                int shift0 = (int)task.frame.subframes[0].wbits;
                int shift1 = (int)task.frame.subframes[1].wbits;
                byte* p = srcptr;
                for (int i = 0; i < count; i++, p += 6)
                {
                    int l = ((p[0] << 8) + (p[1] << 16) + (p[2] << 24)) >> 8;
                    int r = ((p[3] << 8) + (p[4] << 16) + (p[5] << 24)) >> 8;
                    out0[i] = (l - r) >> shift0;
                    out1[i] = r >> shift1;
                }
                break;
            }

        case ChannelMode.MidSide:
            {
                int* out0 = task.frame.subframes[0].samples;
                int* out1 = task.frame.subframes[1].samples;
                int shift0 = (int)task.frame.subframes[0].wbits;
                int shift1 = (int)task.frame.subframes[1].wbits;
                byte* p = srcptr;
                for (int i = 0; i < count; i++, p += 6)
                {
                    int l = ((p[0] << 8) + (p[1] << 16) + (p[2] << 24)) >> 8;
                    int r = ((p[3] << 8) + (p[4] << 16) + (p[5] << 24)) >> 8;
                    // Mid is the halved sum; the extra bit is folded into the shift.
                    out0[i] = (l + r) >> (1 + shift0);
                    out1[i] = (l - r) >> shift1;
                }
                break;
            }
    }
}
/// <summary>
/// Locates the current frame's bytes inside the task's sample byte buffer and hands
/// them to the unpacker that matches PCM.BitsPerSample (16 or 24).
/// </summary>
/// <param name="task">Task providing the sample buffer and the frame to fill.</param>
/// <param name="count">Number of samples per channel, passed through to the unpacker.</param>
/// <exception cref="Exception">PCM.BitsPerSample is neither 16 nor 24.</exception>
unsafe void unpack_samples(FLACCLTask task, int count)
{
    int frameIndex = task.frame.frame_number;
    // Frames are laid out back to back, frameSize * BlockAlign bytes each.
    byte* frameStart = ((byte*)task.clSamplesBytesPtr) + frameIndex * task.frameSize * PCM.BlockAlign;

    switch (PCM.BitsPerSample)
    {
        case 16:
            unpack_samples_16(task, frameStart, count);
            break;
        case 24:
            unpack_samples_24(task, frameStart, count);
            break;
        default:
            throw new Exception("Invalid BPS");
    }
}
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FLACCLTask task, int current_frame_number) unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FLACCLTask task, int current_frame_number)
{ {
task.frame.InitSize(task.frameSize, eparams.variable_block_size != 0); task.frame.InitSize(task.frameSize, eparams.variable_block_size != 0);
@@ -1492,8 +1601,8 @@ namespace CUETools.Codecs.FLACCL
task.framePos = frame_pos; task.framePos = frame_pos;
frame_count += nFrames; frame_count += nFrames;
frame_pos += nFrames * blocksize; frame_pos += nFrames * blocksize;
if (!_settings.MappedMemory) if (!task.UseMappedMemory)
task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, sizeof(short) * channels * blocksize * nFrames, task.clSamplesBytesPtr); task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, PCM.BlockAlign * blocksize * nFrames, task.clSamplesBytesPtr);
//task.openCLCQ.EnqueueUnmapMemObject(task.clSamplesBytes, task.clSamplesBytes.HostPtr); //task.openCLCQ.EnqueueUnmapMemObject(task.clSamplesBytes, task.clSamplesBytes.HostPtr);
//task.openCLCQ.EnqueueMapBuffer(task.clSamplesBytes, true, MapFlags.WRITE, 0, task.samplesBufferLen / 2); //task.openCLCQ.EnqueueMapBuffer(task.clSamplesBytes, true, MapFlags.WRITE, 0, task.samplesBufferLen / 2);
} }
@@ -1530,21 +1639,39 @@ namespace CUETools.Codecs.FLACCL
{ {
int decoded = task.verify.DecodeFrame(task.frame.writer.Buffer, task.frame.writer_offset, fs); int decoded = task.verify.DecodeFrame(task.frame.writer.Buffer, task.frame.writer_offset, fs);
if (decoded != fs || task.verify.Remaining != task.frameSize) if (decoded != fs || task.verify.Remaining != task.frameSize)
throw new Exception("validation failed! frame size mismatch"); throw new Exception(string.Format("validation failed! frame size mismatch, iFrame={0}, decoded=={1}, fs=={2}", fn, decoded, fs));
fixed (int* r = task.verify.Samples) fixed (int* r = task.verify.Samples)
{ {
for (int ch = 0; ch < channels; ch++) for (int ch = 0; ch < channels; ch++)
{ {
short* res = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize + ch; byte* res = ((byte*)task.clSamplesBytesPtr) + PCM.BlockAlign * iFrame * task.frameSize + ch * (PCM.BlockAlign / channels);
int* smp = r + ch * Flake.MAX_BLOCKSIZE; int* smp = r + ch * Flake.MAX_BLOCKSIZE;
int ba = PCM.BlockAlign;
if (PCM.BitsPerSample == 16)
{
for (int i = task.frameSize; i > 0; i--) for (int i = task.frameSize; i > 0; i--)
{ {
//if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize)) //if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
if (*res != *(smp++)) int ress = *(short*)res;
throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", iFrame, ch)); if (ress != *(smp++))
res += channels; throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", fn, ch));
res += ba;
} }
} }
else if (PCM.BitsPerSample == 24)
{
for (int i = task.frameSize; i > 0; i--)
{
//if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
int ress = (((int)res[0] << 8) + ((int)res[1] << 16) + ((int)res[2] << 24)) >> (8);
if (ress != *(smp++))
throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", iFrame, ch));
res += ba;
}
}
else
throw new Exception("Invalid BPS");
}
} }
} }
@@ -1644,10 +1771,21 @@ namespace CUETools.Codecs.FLACCL
} }
OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType); OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
this.framesPerTask = (int)OCLMan.Context.Devices[0].MaxComputeUnits * _settings.TaskSize; this.framesPerTask = (int)OCLMan.Context.Devices[0].MaxComputeUnits * Math.Max(1, _settings.TaskSize / channels);
if (!OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics")) bool UseGPUOnly = _settings.GPUOnly && OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics");
_settings.GPUOnly = false; bool UseGPURice = UseGPUOnly && _settings.DoRice;
if (_blocksize == 0)
{
if (eparams.block_size == 0)
eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms);
_blocksize = eparams.block_size;
}
else
eparams.block_size = _blocksize;
int maxBS = 1 << (BitReader.log2i(eparams.block_size - 1) + 1);
// The Defines string gets prepended to any and all sources that are compiled // The Defines string gets prepended to any and all sources that are compiled
// and serve as a convenient way to pass configuration information to the compilation process // and serve as a convenient way to pass configuration information to the compilation process
@@ -1655,8 +1793,11 @@ namespace CUETools.Codecs.FLACCL
"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" + "#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
"#define GROUP_SIZE " + groupSize.ToString() + "\n" + "#define GROUP_SIZE " + groupSize.ToString() + "\n" +
"#define FLACCL_VERSION \"" + vendor_string + "\"\n" + "#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
(_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") + (UseGPUOnly ? "#define DO_PARTITIONS\n" : "") +
(_settings.DoRice ? "#define DO_RICE\n" : "") + (UseGPURice ? "#define DO_RICE\n" : "") +
"#define BITS_PER_SAMPLE " + PCM.BitsPerSample + "\n" +
"#define MAX_BLOCKSIZE " + maxBS + "\n" +
"#define MAX_CHANNELS " + PCM.ChannelCount + "\n" +
#if DEBUG #if DEBUG
"#define DEBUG\n" + "#define DEBUG\n" +
#endif #endif
@@ -1718,13 +1859,13 @@ namespace CUETools.Codecs.FLACCL
if (_IO.CanSeek) if (_IO.CanSeek)
first_frame_offset = _IO.Position; first_frame_offset = _IO.Position;
task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize); task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice);
task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize); task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice);
if (_settings.CPUThreads > 0) if (_settings.CPUThreads > 0)
{ {
cpu_tasks = new FLACCLTask[_settings.CPUThreads]; cpu_tasks = new FLACCLTask[_settings.CPUThreads];
for (int i = 0; i < cpu_tasks.Length; i++) for (int i = 0; i < cpu_tasks.Length; i++)
cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize); cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice);
} }
inited = true; inited = true;
} }
@@ -1823,10 +1964,10 @@ namespace CUETools.Codecs.FLACCL
public unsafe void do_output_frames(int nFrames) public unsafe void do_output_frames(int nFrames)
{ {
send_to_GPU(task1, nFrames, eparams.block_size);
run_GPU_task(task1);
if (task2.frameCount > 0) if (task2.frameCount > 0)
task2.openCLCQ.Finish(); task2.openCLCQ.Finish();
send_to_GPU(task1, nFrames, eparams.block_size);
run_GPU_task(task1);
if (task2.frameCount > 0) if (task2.frameCount > 0)
{ {
if (cpu_tasks != null) if (cpu_tasks != null)
@@ -1871,15 +2012,16 @@ namespace CUETools.Codecs.FLACCL
{ {
int blocksize = Flake.flac_blocksizes[1]; int blocksize = Flake.flac_blocksizes[1];
int target = (samplerate * time_ms) / 1000; int target = (samplerate * time_ms) / 1000;
if (eparams.variable_block_size > 0)
{
blocksize = 1024;
while (target >= blocksize)
blocksize <<= 1;
return blocksize >> 1;
}
for (int i = 0; i < Flake.flac_blocksizes.Length; i++) ////if (eparams.variable_block_size > 0)
////{
//// blocksize = 1024;
//// while (target >= blocksize)
//// blocksize <<= 1;
//// return blocksize >> 1;
////}
for (int i = 8; i < Flake.flac_blocksizes.Length; i++)
if (target >= Flake.flac_blocksizes[i] && Flake.flac_blocksizes[i] > blocksize) if (target >= Flake.flac_blocksizes[i] && Flake.flac_blocksizes[i] > blocksize)
{ {
blocksize = Flake.flac_blocksizes[i]; blocksize = Flake.flac_blocksizes[i];
@@ -2052,18 +2194,6 @@ namespace CUETools.Codecs.FLACCL
} }
if (i == 8) if (i == 8)
throw new Exception("non-standard bps"); throw new Exception("non-standard bps");
// FIXME: For now, only 16-bit encoding is supported
if (bits_per_sample != 16)
throw new Exception("non-standard bps");
if (_blocksize == 0)
{
if (eparams.block_size == 0)
eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms);
_blocksize = eparams.block_size;
}
else
eparams.block_size = _blocksize;
// set maximum encoded frame size (if larger, re-encodes in verbatim mode) // set maximum encoded frame size (if larger, re-encodes in verbatim mode)
if (channels == 2) if (channels == 2)
@@ -2332,7 +2462,7 @@ namespace CUETools.Codecs.FLACCL
public int type; public int type;
public int obits; public int obits;
public int blocksize; public int blocksize;
public int best_index; public int coding_method;
public int channel; public int channel;
public int residualOffs; public int residualOffs;
public int wbits; public int wbits;
@@ -2350,6 +2480,7 @@ namespace CUETools.Codecs.FLACCL
public Kernel clStereoDecorr; public Kernel clStereoDecorr;
//public Kernel cudaChannelDecorr; //public Kernel cudaChannelDecorr;
public Kernel clChannelDecorr2; public Kernel clChannelDecorr2;
public Kernel clChannelDecorrX;
public Kernel clFindWastedBits; public Kernel clFindWastedBits;
public Kernel clComputeAutocor; public Kernel clComputeAutocor;
public Kernel clComputeLPC; public Kernel clComputeLPC;
@@ -2428,9 +2559,15 @@ namespace CUETools.Codecs.FLACCL
public int groupSize = 128; public int groupSize = 128;
public int channels, channelsCount; public int channels, channelsCount;
public FLACCLWriter writer; public FLACCLWriter writer;
public bool UseGPUOnly = false;
public bool UseGPURice = false;
public bool UseMappedMemory = false;
unsafe public FLACCLTask(Program _openCLProgram, int channelsCount, int channels, uint bits_per_sample, int max_frame_size, FLACCLWriter writer, int groupSize) unsafe public FLACCLTask(Program _openCLProgram, int channelsCount, int channels, uint bits_per_sample, int max_frame_size, FLACCLWriter writer, int groupSize, bool gpuOnly, bool gpuRice)
{ {
this.UseGPUOnly = gpuOnly;
this.UseGPURice = gpuOnly && gpuRice;
this.UseMappedMemory = writer._settings.MappedMemory || writer._settings.DeviceType == OpenCLDeviceType.CPU;
this.groupSize = groupSize; this.groupSize = groupSize;
this.channels = channels; this.channels = channels;
this.channelsCount = channelsCount; this.channelsCount = channelsCount;
@@ -2448,9 +2585,9 @@ namespace CUETools.Codecs.FLACCL
int MAX_CHANNELSIZE = MAX_FRAMES * writer.eparams.block_size; int MAX_CHANNELSIZE = MAX_FRAMES * writer.eparams.block_size;
residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES; residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES;
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES; bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES;
int samplesBufferLen = sizeof(int) * MAX_CHANNELSIZE * channelsCount; int samplesBufferLen = writer.PCM.BlockAlign * MAX_CHANNELSIZE * channelsCount;
int residualBufferLen = sizeof(int) * MAX_CHANNELSIZE * channels; // need to adjust residualOffset? int residualBufferLen = sizeof(int) * MAX_CHANNELSIZE * channels; // need to adjust residualOffset?
int partitionsLen = sizeof(int) * (30 << 8) * channels * MAX_FRAMES; int partitionsLen = sizeof(int) * ((writer.PCM.BitsPerSample > 16 ? 31 : 15) * 2 << 8) * channels * MAX_FRAMES;
int riceParamsLen = sizeof(int) * (4 << 8) * channels * MAX_FRAMES; int riceParamsLen = sizeof(int) * (4 << 8) * channels * MAX_FRAMES;
int autocorLen = sizeof(float) * (MAX_ORDER + 1) * lpc.MAX_LPC_WINDOWS * channelsCount * MAX_FRAMES; int autocorLen = sizeof(float) * (MAX_ORDER + 1) * lpc.MAX_LPC_WINDOWS * channelsCount * MAX_FRAMES;
int lpcDataLen = autocorLen * 32; int lpcDataLen = autocorLen * 32;
@@ -2459,7 +2596,7 @@ namespace CUETools.Codecs.FLACCL
int selectedLen = sizeof(int) * 32 * channelsCount * MAX_FRAMES; int selectedLen = sizeof(int) * 32 * channelsCount * MAX_FRAMES;
int riceLen = sizeof(int) * channels * MAX_CHANNELSIZE; int riceLen = sizeof(int) * channels * MAX_CHANNELSIZE;
if (!writer._settings.MappedMemory) if (!this.UseMappedMemory)
{ {
clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen / 2); clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen / 2);
clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualBufferLen); clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualBufferLen);
@@ -2521,7 +2658,7 @@ namespace CUETools.Codecs.FLACCL
clAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, autocorLen); clAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, autocorLen);
clSelectedTasksSecondEstimate = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen); clSelectedTasksSecondEstimate = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen);
clSelectedTasksBestMethod = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen); clSelectedTasksBestMethod = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen);
if (writer._settings.GPUOnly) if (UseGPUOnly)
{ {
clPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen); clPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen);
clRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen); clRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen);
@@ -2533,6 +2670,7 @@ namespace CUETools.Codecs.FLACCL
clStereoDecorr = openCLProgram.CreateKernel("clStereoDecorr"); clStereoDecorr = openCLProgram.CreateKernel("clStereoDecorr");
//cudaChannelDecorr = openCLProgram.CreateKernel("clChannelDecorr"); //cudaChannelDecorr = openCLProgram.CreateKernel("clChannelDecorr");
clChannelDecorr2 = openCLProgram.CreateKernel("clChannelDecorr2"); clChannelDecorr2 = openCLProgram.CreateKernel("clChannelDecorr2");
clChannelDecorrX = openCLProgram.CreateKernel("clChannelDecorrX");
clFindWastedBits = openCLProgram.CreateKernel("clFindWastedBits"); clFindWastedBits = openCLProgram.CreateKernel("clFindWastedBits");
clComputeLPC = openCLProgram.CreateKernel("clComputeLPC"); clComputeLPC = openCLProgram.CreateKernel("clComputeLPC");
clQuantizeLPC = openCLProgram.CreateKernel("clQuantizeLPC"); clQuantizeLPC = openCLProgram.CreateKernel("clQuantizeLPC");
@@ -2540,15 +2678,16 @@ namespace CUETools.Codecs.FLACCL
clSelectStereoTasks = openCLProgram.CreateKernel("clSelectStereoTasks"); clSelectStereoTasks = openCLProgram.CreateKernel("clSelectStereoTasks");
clEstimateResidual = openCLProgram.CreateKernel("clEstimateResidual"); clEstimateResidual = openCLProgram.CreateKernel("clEstimateResidual");
clChooseBestMethod = openCLProgram.CreateKernel("clChooseBestMethod"); clChooseBestMethod = openCLProgram.CreateKernel("clChooseBestMethod");
if (writer._settings.GPUOnly) if (UseGPUOnly)
{ {
clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual"); clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition"); clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16"); clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
clSumPartition = openCLProgram.CreateKernel("clSumPartition"); clSumPartition = openCLProgram.CreateKernel("clSumPartition");
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter"); clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder"); clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
if (writer._settings.DoRice) if (UseGPURice)
{ {
clCalcOutputOffsets = openCLProgram.CreateKernel("clCalcOutputOffsets"); clCalcOutputOffsets = openCLProgram.CreateKernel("clCalcOutputOffsets");
clRiceEncoding = openCLProgram.CreateKernel("clRiceEncoding"); clRiceEncoding = openCLProgram.CreateKernel("clRiceEncoding");
@@ -2586,6 +2725,7 @@ namespace CUETools.Codecs.FLACCL
clStereoDecorr.Dispose(); clStereoDecorr.Dispose();
//cudaChannelDecorr.Dispose(); //cudaChannelDecorr.Dispose();
clChannelDecorr2.Dispose(); clChannelDecorr2.Dispose();
clChannelDecorrX.Dispose();
clFindWastedBits.Dispose(); clFindWastedBits.Dispose();
clComputeLPC.Dispose(); clComputeLPC.Dispose();
clQuantizeLPC.Dispose(); clQuantizeLPC.Dispose();
@@ -2593,15 +2733,16 @@ namespace CUETools.Codecs.FLACCL
clSelectStereoTasks.Dispose(); clSelectStereoTasks.Dispose();
clEstimateResidual.Dispose(); clEstimateResidual.Dispose();
clChooseBestMethod.Dispose(); clChooseBestMethod.Dispose();
if (writer._settings.GPUOnly) if (UseGPUOnly)
{ {
clEncodeResidual.Dispose(); clEncodeResidual.Dispose();
clCalcPartition.Dispose(); clCalcPartition.Dispose();
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
clCalcPartition16.Dispose(); clCalcPartition16.Dispose();
clSumPartition.Dispose(); clSumPartition.Dispose();
clFindRiceParameter.Dispose(); clFindRiceParameter.Dispose();
clFindPartitionOrder.Dispose(); clFindPartitionOrder.Dispose();
if (writer._settings.DoRice) if (UseGPURice)
{ {
clCalcOutputOffsets.Dispose(); clCalcOutputOffsets.Dispose();
clRiceEncoding.Dispose(); clRiceEncoding.Dispose();
@@ -2611,7 +2752,7 @@ namespace CUETools.Codecs.FLACCL
clRiceParams.Dispose(); clRiceParams.Dispose();
} }
if (!writer._settings.MappedMemory) if (!this.UseMappedMemory)
{ {
if (clSamplesBytesPtr != IntPtr.Zero) if (clSamplesBytesPtr != IntPtr.Zero)
openCLCQ.EnqueueUnmapMemObject(clSamplesBytesPinned, clSamplesBytesPtr); openCLCQ.EnqueueUnmapMemObject(clSamplesBytesPinned, clSamplesBytesPtr);
@@ -2701,19 +2842,36 @@ namespace CUETools.Codecs.FLACCL
while ((frameSize >> max_porder) < 16 && max_porder > 0) while ((frameSize >> max_porder) < 16 && max_porder > 0)
this.max_porder--; this.max_porder--;
if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel
Kernel clChannelDecorr = channels == 2 ? (channelsCount == 4 ? clStereoDecorr : clChannelDecorr2) : null;// cudaChannelDecorr;
// openCLCQ.EnqueueMapBuffer(cudaSamplesBytes // openCLCQ.EnqueueMapBuffer(cudaSamplesBytes
//openCLCQ.EnqueueUnmapMemObject(cudaSamplesBytes, cudaSamplesBytes.HostPtr); //openCLCQ.EnqueueUnmapMemObject(cudaSamplesBytes, cudaSamplesBytes.HostPtr);
// issue work to the GPU // issue work to the GPU
if (channels == 2)
{
Kernel clChannelDecorr = channelsCount == 4 ? clStereoDecorr : clChannelDecorr2;
int channelSize1 = writer.PCM.BitsPerSample == 16 ? channelSize / 4 : channelSize;
clChannelDecorr.SetArgs( clChannelDecorr.SetArgs(
clSamples, clSamples,
clSamplesBytes, clSamplesBytes,
channelSize / 4); channelSize1);
openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, channelSize / 4); openCLCQ.EnqueueNDRangeKernel(
clChannelDecorr,
0,
channelSize1);
}
else
{
clChannelDecorrX.SetArgs(
clSamples,
clSamplesBytes,
channelSize);
openCLCQ.EnqueueNDRangeKernel(
clChannelDecorrX,
0,
channelSize);
}
//openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, (frameSize * frameCount + 3) / 4); //openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, (frameSize * frameCount + 3) / 4);
if (eparams.do_wasted) if (eparams.do_wasted)
@@ -2842,22 +3000,7 @@ namespace CUETools.Codecs.FLACCL
0, channels * frameCount); 0, channels * frameCount);
} }
if (writer._settings.GPUOnly) if (UseGPUOnly)
{
if (frameSize >> max_porder == 16)
{
clCalcPartition16.SetArgs(
clPartitions,
clResidual,
clSamples,
clBestResidualTasks,
max_porder);
openCLCQ.EnqueueNDRangeKernel(
clCalcPartition16,
groupSize, channels * frameCount);
}
else
{ {
clEncodeResidual.SetArgs( clEncodeResidual.SetArgs(
clResidual, clResidual,
@@ -2868,6 +3011,20 @@ namespace CUETools.Codecs.FLACCL
clEncodeResidual, clEncodeResidual,
groupSize, channels * frameCount); groupSize, channels * frameCount);
if ((frameSize >> max_porder == 16) && openCLCQ.Device.DeviceType != DeviceType.CPU)
{
clCalcPartition16.SetArgs(
clPartitions,
clResidual,
clBestResidualTasks,
max_porder);
openCLCQ.EnqueueNDRangeKernel(
clCalcPartition16,
groupSize, channels * frameCount);
}
else
{
clCalcPartition.SetArgs( clCalcPartition.SetArgs(
clPartitions, clPartitions,
clResidual, clResidual,
@@ -2895,6 +3052,7 @@ namespace CUETools.Codecs.FLACCL
clPartitions, clPartitions,
max_porder); max_porder);
int maxK = writer.PCM.BitsPerSample > 16 ? 30 : Flake.MAX_RICE_PARAM;
if (openCLCQ.Device.DeviceType == DeviceType.CPU) if (openCLCQ.Device.DeviceType == DeviceType.CPU)
openCLCQ.EnqueueNDRangeKernel( openCLCQ.EnqueueNDRangeKernel(
clSumPartition, clSumPartition,
@@ -2904,7 +3062,7 @@ namespace CUETools.Codecs.FLACCL
openCLCQ.EnqueueNDRangeKernel( openCLCQ.EnqueueNDRangeKernel(
clSumPartition, clSumPartition,
128, 1, 128, 1,
(Flake.MAX_RICE_PARAM + 1), (maxK + 1),
channels * frameCount); channels * frameCount);
} }
@@ -2931,7 +3089,7 @@ namespace CUETools.Codecs.FLACCL
groupSize, groupSize,
channels * frameCount); channels * frameCount);
if (writer._settings.DoRice) if (UseGPURice)
{ {
clCalcOutputOffsets.SetArgs( clCalcOutputOffsets.SetArgs(
clResidual, clResidual,
@@ -2960,10 +3118,10 @@ namespace CUETools.Codecs.FLACCL
channels * frameCount); channels * frameCount);
} }
if (!writer._settings.MappedMemory) if (!this.UseMappedMemory)
{ {
if (writer._settings.DoRice) if (UseGPURice)
openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, (channels * frameSize * 17 + 128) / 8 * frameCount, clRiceOutputPtr); openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, (channels * frameSize * (writer.PCM.BitsPerSample + 1) + 256) / 8 * frameCount, clRiceOutputPtr);
else else
{ {
openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParamsPtr); openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParamsPtr);
@@ -2971,7 +3129,7 @@ namespace CUETools.Codecs.FLACCL
} }
} }
} }
if (!writer._settings.MappedMemory) if (!this.UseMappedMemory)
openCLCQ.EnqueueReadBuffer(clBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * frameCount, clBestResidualTasksPtr); openCLCQ.EnqueueReadBuffer(clBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * frameCount, clBestResidualTasksPtr);
} }
} }

View File

@@ -65,6 +65,14 @@
#define WARP_SIZE 32 #define WARP_SIZE 32
// Rice-coding limits, selected by the sample width this program is compiled for.
// MAX_RICE_PARAM is the upper bound kernels clamp to when choosing a Rice parameter.
// RICE_PARAM_BITS is the per-parameter bit cost added to subframe size estimates.
#if BITS_PER_SAMPLE > 16
#define MAX_RICE_PARAM 30
#define RICE_PARAM_BITS 5
#else
#define MAX_RICE_PARAM 14
#define RICE_PARAM_BITS 4
#endif
typedef enum typedef enum
{ {
Constant = 0, Constant = 0,
@@ -83,7 +91,7 @@ typedef struct
int type; int type;
int obits; int obits;
int blocksize; int blocksize;
int best_index; int coding_method;
int channel; int channel;
int residualOffs; int residualOffs;
int wbits; int wbits;
@@ -125,6 +133,49 @@ __kernel void clWindowTukey(__global float* window, int windowOffset, float p)
} }
#endif #endif
#if BITS_PER_SAMPLE > 16
__kernel void clStereoDecorr(
	__global int *samples,
	__global unsigned char *src,
	int offset
)
{
	// One work-item handles one interleaved two-channel frame of packed
	// 3-byte samples (lowest byte first), i.e. 6 source bytes.
	const int idx = get_global_id(0);
	__global unsigned char *frame = src + idx * 6;

	// Build each sample in the upper 24 bits of an int, then shift right by 8
	// so the sign bit is propagated.
	const int ch0 = (((int)frame[0] << 8) | ((int)frame[1] << 16) | ((int)frame[2] << 24)) >> 8;
	const int ch1 = (((int)frame[3] << 8) | ((int)frame[4] << 16) | ((int)frame[5] << 24)) >> 8;

	// Four planes, each `offset` ints apart: both channels as-is,
	// their halved sum, and their difference.
	samples[idx] = ch0;
	samples[offset + idx] = ch1;
	samples[2 * offset + idx] = (ch0 + ch1) >> 1;
	samples[3 * offset + idx] = ch0 - ch1;
}
__kernel void clChannelDecorr2(
	__global int *samples,
	__global unsigned char *src,
	int offset
)
{
	// One work-item handles one interleaved two-channel frame of packed
	// 3-byte samples (lowest byte first) and splits it into two planes
	// that sit `offset` ints apart in `samples`.
	const int idx = get_global_id(0);
	__global unsigned char *frame = src + idx * 6;

	// Each sample is assembled in the upper 24 bits and shifted down by 8
	// so the sign bit is propagated.
	samples[idx] = (((int)frame[0] << 8) | ((int)frame[1] << 16) | ((int)frame[2] << 24)) >> 8;
	samples[offset + idx] = (((int)frame[3] << 8) | ((int)frame[4] << 16) | ((int)frame[5] << 24)) >> 8;
}
__kernel void clChannelDecorrX(
	__global int *samples,
	__global unsigned char *src,
	int offset
)
{
	// One work-item de-interleaves one frame of MAX_CHANNELS packed 3-byte
	// samples (lowest byte first) into per-channel planes `offset` ints apart.
	const int idx = get_global_id(0);
	__global unsigned char *frame = src + 3 * (idx * MAX_CHANNELS);

	for (int ch = 0; ch < MAX_CHANNELS; ch++)
	{
		__global unsigned char *b = frame + 3 * ch;
		// Assemble in the upper 24 bits, then shift down by 8 to propagate the sign.
		samples[offset * ch + idx] = (((int)b[0] << 8) | ((int)b[1] << 16) | ((int)b[2] << 24)) >> 8;
	}
}
#else
__kernel void clStereoDecorr( __kernel void clStereoDecorr(
__global int4 *samples, __global int4 *samples,
__global int4 *src, __global int4 *src,
@@ -153,6 +204,21 @@ __kernel void clChannelDecorr2(
samples[offset + pos] = s >> 16; samples[offset + pos] = s >> 16;
} }
__kernel void clChannelDecorrX(
	__global int *samples,
	__global short *src,
	int offset
)
{
	// One work-item de-interleaves one frame of MAX_CHANNELS 16-bit samples
	// into per-channel planes that sit `offset` ints apart in `samples`.
	const int idx = get_global_id(0);
	__global short *frame = src + idx * MAX_CHANNELS;

	for (int ch = 0; ch < MAX_CHANNELS; ch++)
		samples[offset * ch + idx] = frame[ch];
}
#endif
//__kernel void clChannelDecorr( //__kernel void clChannelDecorr(
// int *samples, // int *samples,
// short *src, // short *src,
@@ -598,7 +664,11 @@ void clQuantizeLPC(
} }
// choose precision // choose precision
//int cbits = max(3, min(10, 5 + (abits >> 1))); // - convert_int_rte(shared.PE[order - 1]) //int cbits = max(3, min(10, 5 + (abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
#if BITS_PER_SAMPLE > 16
int cbits = max(3, min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits));
#else
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - abits)); int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - abits));
#endif
// calculate shift based on precision and number of leading zeroes in coeffs // calculate shift based on precision and number of leading zeroes in coeffs
int shift = max(0,min(15, clz(tmpi) - 18 + cbits)); int shift = max(0,min(15, clz(tmpi) - 18 + cbits));
@@ -749,7 +819,11 @@ void clQuantizeLPC(
//SUM32(shared.tmpi,tid,|=); //SUM32(shared.tmpi,tid,|=);
// choose precision // choose precision
//int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - convert_int_rte(shared.PE[order - 1]) //int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
#if BITS_PER_SAMPLE > 16
int cbits = max(3, min(min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), 15));
#else
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits)); int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits));
#endif
// calculate shift based on precision and number of leading zeroes in coeffs // calculate shift based on precision and number of leading zeroes in coeffs
int shift = max(0,min(15, clz(shared.maxcoef[i]) - 18 + cbits)); int shift = max(0,min(15, clz(shared.maxcoef[i]) - 18 + cbits));
@@ -797,7 +871,6 @@ void clQuantizeLPC(
#endif #endif
#ifdef FLACCL_CPU #ifdef FLACCL_CPU
inline int fastclz(int iv) inline int fastclz(int iv)
{ {
unsigned int v = (unsigned int)iv; unsigned int v = (unsigned int)iv;
@@ -809,17 +882,44 @@ inline int fastclz(int iv)
x += (0 != (v >> x)); x += (0 != (v >> x));
return 32 - x; return 32 - x;
} }
#else
inline int calc_residual(__global int *ptr, int * coefs, int ro) inline int fastclz(int iv)
{ {
int sum = 0; return clz(iv);
}
#endif
inline int fastclz64(long iv)
{
	// Leading-zero count of a 64-bit value, built on the 32-bit fastclz:
	// when the upper word is non-zero only it matters; otherwise the upper
	// word contributes 32 zeros and the lower word is counted.
	const unsigned long bits = (unsigned long)iv;
	const unsigned long upper = bits >> 32;
	if (upper != 0)
		return fastclz(upper);
	return 32 + fastclz(bits);
}
// Width-dependent helpers for residual arithmetic.
//   residual_t       accumulator type used for prediction sums (long above 16 bps, int otherwise).
//   residual_log(s)  bit index of the highest set bit of s: type width minus one, minus leading zeros.
//   convert_bps4     widens an int4 to bpsint4 above 16 bps; expands to nothing otherwise.
//   convert_bps_sat  saturating conversion to int above 16 bps; expands to nothing otherwise.
//   bpsint4          4-lane vector type matching residual_t.
#if BITS_PER_SAMPLE > 16
typedef long residual_t;
#define residual_log(s) (63 - fastclz64(s))
#define convert_bps4 convert_long4
#define convert_bps_sat convert_int_sat
#define bpsint4 long4
#else
typedef int residual_t;
#define residual_log(s) (31 - fastclz(s))
#define convert_bps4
#define convert_bps_sat
#define bpsint4 int4
#endif
#ifdef FLACCL_CPU
inline residual_t calc_residual(__global int *ptr, int * coefs, int ro)
{
residual_t sum = 0;
for (int i = 0; i < ro; i++) for (int i = 0; i < ro; i++)
sum += ptr[i] * coefs[i]; sum += (residual_t) ptr[i] * coefs[i];
return sum; return sum;
} }
#define ENCODE_N(cro,action) for (int pos = cro; pos < bs; pos ++) { \ #define ENCODE_N(cro,action) for (int pos = cro; pos < bs; pos ++) { \
int t = (data[pos] - (calc_residual(data + pos - cro, task.coefs, cro) >> task.data.shift)) >> task.data.wbits; \ residual_t t = (data[pos] - (calc_residual(data + pos - cro, task.coefs, cro) >> task.data.shift)) >> task.data.wbits; \
action; \ action; \
} }
#define SWITCH_N(action) \ #define SWITCH_N(action) \
@@ -861,7 +961,7 @@ void clEstimateResidual(
for (int i = 0; i < 1 << EPO; i++) for (int i = 0; i < 1 << EPO; i++)
len[i] = 0; len[i] = 0;
#ifdef AMD #if defined(AMD) || BITS_PER_SAMPLE > 16
SWITCH_N((t = (t << 1) ^ (t >> 31), len[pos >> (12 - EPO)] += t & 0x7fffff)) SWITCH_N((t = (t << 1) ^ (t >> 31), len[pos >> (12 - EPO)] += t & 0x7fffff))
#else #else
int4 c0 = vload4(0, &task.coefs[0]); int4 c0 = vload4(0, &task.coefs[0]);
@@ -884,21 +984,19 @@ void clEstimateResidual(
int total = 0; int total = 0;
for (int i = 0; i < 1 << EPO; i++) for (int i = 0; i < 1 << EPO; i++)
{ {
int res = min(0x7fffff,len[i]); int res = len[i];
int k = iclamp(31 - (12 - EPO) - fastclz(res), 0, 14); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64) int k = iclamp(31 - fastclz(res) - (12 - EPO), 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
total += (k << (12 - EPO)) + (res >> k); total += (k << (12 - EPO)) + (res >> k);
} }
int partLen = min(0x7ffffff, total) + (bs - ro); int partLen = min(0x7ffffff, total) + (bs - ro);
int obits = task.data.obits - task.data.wbits; int obits = task.data.obits - task.data.wbits;
tasks[selectedTask].data.size = min(obits * bs, tasks[selectedTask].data.size = min(obits * bs,
task.data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen : task.data.type == Fixed ? ro * obits + 6 + RICE_PARAM_BITS + partLen :
task.data.type == LPC ? ro * obits + 4 + 5 + ro * task.data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen : task.data.type == LPC ? ro * obits + 4 + 5 + ro * task.data.cbits + 6 + RICE_PARAM_BITS/* << porder */ + partLen :
task.data.type == Constant ? obits * select(1, bs, partLen != bs - ro) : task.data.type == Constant ? obits * select(1, bs, partLen != bs - ro) :
obits * bs); obits * bs);
} }
#else #else
#define MAX_BLOCKSIZE 4096
#define ESTPARTLOG 5 #define ESTPARTLOG 5
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
@@ -1049,7 +1147,7 @@ void clEstimateResidual(
// calculate rice partition bit length for every 32 samples // calculate rice partition bit length for every 32 samples
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Bug: if (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) > GROUP_SIZE // Bug: if (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) > GROUP_SIZE
int pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? pl = psum[tid * 2] + psum[tid * 2 + 1] : 0; int pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? psum[tid * 2] + psum[tid * 2 + 1] : 0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// for (int pos = 0; pos < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2; pos += GROUP_SIZE) // for (int pos = 0; pos < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2; pos += GROUP_SIZE)
// { // {
@@ -1060,7 +1158,7 @@ void clEstimateResidual(
//if (offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2) //if (offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
// psum[offs] = pl; // psum[offs] = pl;
// } // }
int k = iclamp(31 - (ESTPARTLOG + 1) - clz(pl), 0, 14); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32) int k = iclamp(31 - fastclz(pl) - (ESTPARTLOG + 1), 0, MAX_RICE_PARAM); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32)
if (tid < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2) if (tid < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
psum[tid] = (k << (ESTPARTLOG + 1)) + (pl >> k); psum[tid] = (k << (ESTPARTLOG + 1)) + (pl >> k);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@@ -1075,8 +1173,8 @@ void clEstimateResidual(
int pl = psum[0] + (bs - ro); int pl = psum[0] + (bs - ro);
int obits = task.data.obits - task.data.wbits; int obits = task.data.obits - task.data.wbits;
int len = min(obits * task.data.blocksize, int len = min(obits * task.data.blocksize,
task.data.type == Fixed ? task.data.residualOrder * obits + 6 + (4 * 1/2) + pl : task.data.type == Fixed ? task.data.residualOrder * obits + 6 + RICE_PARAM_BITS + pl :
task.data.type == LPC ? task.data.residualOrder * obits + 4 + 5 + task.data.residualOrder * task.data.cbits + 6 + (4 * 1/2)/* << porder */ + pl : task.data.type == LPC ? task.data.residualOrder * obits + 4 + 5 + task.data.residualOrder * task.data.cbits + 6 + RICE_PARAM_BITS/* << porder */ + pl :
task.data.type == Constant ? obits * select(1, task.data.blocksize, pl != task.data.blocksize - task.data.residualOrder) : task.data.type == Constant ? obits * select(1, task.data.blocksize, pl != task.data.blocksize - task.data.residualOrder) :
obits * task.data.blocksize); obits * task.data.blocksize);
tasks[selectedTask].data.size = len; tasks[selectedTask].data.size = len;
@@ -1172,7 +1270,7 @@ void clEncodeResidual(
int bs = task.data.blocksize; int bs = task.data.blocksize;
int ro = task.data.residualOrder; int ro = task.data.residualOrder;
__global int *data = &samples[task.data.samplesOffs]; __global int *data = &samples[task.data.samplesOffs];
SWITCH_N(residual[task.data.residualOffs + pos] = t); SWITCH_N(residual[task.data.residualOffs + pos] = convert_bps_sat(t));
} }
#else #else
// get_group_id(0) == task index // get_group_id(0) == task index
@@ -1198,12 +1296,10 @@ void clEncodeResidual(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#ifdef AMD bpsint4 cptr0 = convert_bps4(vload4(0, &task.coefs[0]));
int4 cptr0 = vload4(0, &task.coefs[0]); bpsint4 cptr1 = convert_bps4(vload4(1, &task.coefs[0]));
int4 cptr1 = vload4(1, &task.coefs[0]);
#if MAX_ORDER > 8 #if MAX_ORDER > 8
int4 cptr2 = vload4(2, &task.coefs[0]); bpsint4 cptr2 = convert_bps4(vload4(2, &task.coefs[0]));
#endif
#endif #endif
data[tid] = 0; data[tid] = 0;
@@ -1217,33 +1313,24 @@ void clEncodeResidual(
// compute residual // compute residual
__local int* dptr = &data[tid + GROUP_SIZE - ro]; __local int* dptr = &data[tid + GROUP_SIZE - ro];
int4 sum bpsint4 sum
#ifdef AMD = cptr0 * convert_bps4(vload4(0, dptr))
= cptr0 * vload4(0, dptr) + cptr1 * convert_bps4(vload4(1, dptr))
+ cptr1 * vload4(1, dptr)
#else
= vload4(0, &task.coefs[0]) * vload4(0, dptr)
+ vload4(1, &task.coefs[0]) * vload4(1, dptr)
#endif
#if MAX_ORDER > 8 #if MAX_ORDER > 8
#ifdef AMD + cptr2 * convert_bps4(vload4(2, dptr))
+ cptr2 * vload4(2, dptr)
#else
+ vload4(2, &task.coefs[0]) * vload4(2, dptr)
#endif
#if MAX_ORDER > 12 #if MAX_ORDER > 12
+ vload4(3, &task.coefs[0]) * vload4(3, dptr) + convert_bps4(vload4(3, &task.coefs[0])) * convert_bps4(vload4(3, dptr))
#if MAX_ORDER > 16 #if MAX_ORDER > 16
+ vload4(4, &task.coefs[0]) * vload4(4, dptr) + convert_bps4(vload4(4, &task.coefs[0])) * convert_bps4(vload4(4, dptr))
+ vload4(5, &task.coefs[0]) * vload4(5, dptr) + convert_bps4(vload4(5, &task.coefs[0])) * convert_bps4(vload4(5, dptr))
+ vload4(6, &task.coefs[0]) * vload4(6, dptr) + convert_bps4(vload4(6, &task.coefs[0])) * convert_bps4(vload4(6, dptr))
+ vload4(7, &task.coefs[0]) * vload4(7, dptr) + convert_bps4(vload4(7, &task.coefs[0])) * convert_bps4(vload4(7, dptr))
#endif #endif
#endif #endif
#endif #endif
; ;
if (off >= ro && off < bs) if (off >= ro && off < bs)
output[task.data.residualOffs + off] = data[tid + GROUP_SIZE] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift); output[task.data.residualOffs + off] = convert_bps_sat(nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
data[tid] = nextData; data[tid] = nextData;
@@ -1254,7 +1341,7 @@ void clEncodeResidual(
#ifdef FLACCL_CPU #ifdef FLACCL_CPU
__kernel __attribute__((reqd_work_group_size(1, 1, 1))) __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clCalcPartition( void clCalcPartition(
__global int *partition_lengths, __global ulong *partition_lengths,
__global int *residual, __global int *residual,
__global FLACCLSubframeTask *tasks, __global FLACCLSubframeTask *tasks,
int max_porder, // <= 8 int max_porder, // <= 8
@@ -1265,18 +1352,16 @@ void clCalcPartition(
int bs = task.data.blocksize; int bs = task.data.blocksize;
int ro = task.data.residualOrder; int ro = task.data.residualOrder;
//int psize = bs >> max_porder; //int psize = bs >> max_porder;
__global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1); __global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
for (int p = 0; p < (1 << max_porder); p++) for (int p = 0; p < (1 << max_porder); p++)
pl[p] = 0; pl[p] = 0UL;
for (int pos = ro; pos < bs; pos ++) for (int pos = ro; pos < bs; pos ++)
{ {
int t = residual[task.data.residualOffs + pos]; int s = residual[task.data.residualOffs + pos];
// overflow protection
t = clamp(t, -0x7fffff, 0x7fffff);
// convert to unsigned // convert to unsigned
t = (t << 1) ^ (t >> 31); uint t = (s << 1) ^ (s >> 31);
pl[pos / psize] += t; pl[pos / psize] += t;
} }
} }
@@ -1292,15 +1377,15 @@ void clCalcPartition(
int psize // == task.blocksize >> max_porder? int psize // == task.blocksize >> max_porder?
) )
{ {
__local int pl[(GROUP_SIZE / 8)][15]; __local uint pl[(GROUP_SIZE / 16)][MAX_RICE_PARAM + 1];
__local FLACCLSubframeData task; __local FLACCLSubframeData task;
const int tid = get_local_id(0); const int tid = get_local_id(0);
if (tid < sizeof(task) / sizeof(int)) if (tid < sizeof(task) / sizeof(int))
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid]; ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
if (tid < (GROUP_SIZE / 8)) if (tid < (GROUP_SIZE / 16))
{ {
for (int k = 0; k <= 14; k++) for (int k = 0; k <= MAX_RICE_PARAM; k++)
pl[tid][k] = 0; pl[tid][k] = 0;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@@ -1311,14 +1396,14 @@ void clCalcPartition(
{ {
// fetch residual // fetch residual
int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0; int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0;
// overflow protection
s = iclamp(s, -0x7fffff, 0x7fffff);
// convert to unsigned // convert to unsigned
s = (s << 1) ^ (s >> 31); uint t = (s << 1) ^ (s >> 31);
// calc number of unary bits for each residual sample with each rice paramater // calc number of unary bits for each residual sample with each rice paramater
int part = (offs - start) / psize + (tid & 1) * (GROUP_SIZE / 16); int part = (offs - start) / psize;
for (int k = 0; k <= 14; k++) // we must ensure that psize * (t >> k) doesn't overflow;
atom_add(&pl[part][k], s >> k); // i.e. t < ((1 << 32) >> (log2(psize) - k)) <= (1 << 32) >> (32 - clz(MAX_BLOCKSIZE) - k)
for (int k = 0; k <= MAX_RICE_PARAM; k++)
atom_add(&pl[part][k], min(t, 0xffffffffU >> max(0, 32 - clz(MAX_BLOCKSIZE) - k)) >> k);
//pl[part][k] += s >> k; //pl[part][k] += s >> k;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@@ -1326,141 +1411,79 @@ void clCalcPartition(
int part = get_group_id(0) * (GROUP_SIZE / 16) + tid; int part = get_group_id(0) * (GROUP_SIZE / 16) + tid;
if (tid < (GROUP_SIZE / 16) && part < (1 << max_porder)) if (tid < (GROUP_SIZE / 16) && part < (1 << max_porder))
{ {
for (int k = 0; k <= 14; k++) for (int k = 0; k <= MAX_RICE_PARAM; k++)
{ {
// output length // output length
const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1)); const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
int plen = pl[tid][k] + pl[tid + (GROUP_SIZE / 16)][k]; uint plen = pl[tid][k];
partition_lengths[pos + part] = min(0x7fffff, plen) + (psize - select(0, task.residualOrder, part == 0)) * (k + 1); partition_lengths[pos + part] = min(0x007fffffU, plen) + (uint)(psize - select(0, task.residualOrder, part == 0)) * (k + 1);
// if (get_group_id(1) == 0) // if (get_group_id(1) == 0)
//printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1)); //printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1));
} }
} }
} }
#endif
#ifdef FLACCL_CPU
// get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clCalcPartition16(
__global int *partition_lengths,
__global int *residual,
__global int *samples,
__global FLACCLSubframeTask *tasks,
int max_porder // <= 8
)
{
FLACCLSubframeTask task = tasks[get_global_id(0)];
int bs = task.data.blocksize;
int ro = task.data.residualOrder;
__global int *data = &samples[task.data.samplesOffs];
__global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_global_id(0);
for (int p = 0; p < (1 << max_porder); p++)
pl[p] = 0;
__global int *rptr = residual + task.data.residualOffs;
SWITCH_N((rptr[pos] = t, pl[pos >> 4] += (t << 1) ^ (t >> 31)));
//SWITCH_N((residual[task.data.residualOffs + pos] = t, t = (t << 1) ^ (t >> 31), pl[pos >> 4] += t));
}
#else
// get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clCalcPartition16( void clCalcPartition16(
__global int *partition_lengths, __global unsigned int *partition_lengths,
__global int *residual, __global int *residual,
__global int *samples,
__global FLACCLSubframeTask *tasks, __global FLACCLSubframeTask *tasks,
int max_porder // <= 8 int max_porder // <= 8
) )
{ {
__local FLACCLSubframeTask task; __local FLACCLSubframeData task;
__local int data[GROUP_SIZE * 2]; __local unsigned int res[GROUP_SIZE];
__local int res[GROUP_SIZE]; __local unsigned int pl[GROUP_SIZE >> 4][MAX_RICE_PARAM + 1];
__local int pl[GROUP_SIZE >> 4][15];
const int tid = get_local_id(0); const int tid = get_local_id(0);
if (tid < sizeof(task) / sizeof(int)) if (tid < sizeof(task) / sizeof(int))
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid]; ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int bs = task.data.blocksize; int bs = task.blocksize;
int ro = task.data.residualOrder; int ro = task.residualOrder;
int sh = task.data.shift;
if (tid >= ro && tid < 32)
task.coefs[tid] = 0;
int k = tid & 15;
int x = tid / 16;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
__global int * rptr = &residual[task.data.residualOffs];
__global int * plptr = &partition_lengths[(15 << (max_porder + 1)) * get_group_id(0) + (k << (max_porder + 1))];
__local int* dptr = &data[tid + GROUP_SIZE - ro];
int4 cptr0 = vload4(0, &task.coefs[0]);
int4 cptr1 = vload4(1, &task.coefs[0]);
int4 cptr2 = vload4(2, &task.coefs[0]);
data[tid] = 0;
for (int pos = 0; pos < bs; pos += GROUP_SIZE) for (int pos = 0; pos < bs; pos += GROUP_SIZE)
{ {
int offs = pos + tid; int offs = pos + tid;
// fetch samples // fetch residual
int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0; int s = (offs >= ro && offs < bs) ? residual[task.residualOffs + offs] : 0;
data[tid + GROUP_SIZE] = nextData;
barrier(CLK_LOCAL_MEM_FENCE);
// compute residual
int4 sum = cptr0 * vload4(0, dptr)
#if MAX_ORDER > 4
+ cptr1 * vload4(1, dptr)
#if MAX_ORDER > 8
+ cptr2 * vload4(2, dptr)
#if MAX_ORDER > 12
+ vload4(3, &task.coefs[0]) * vload4(3, dptr)
#if MAX_ORDER > 16
+ vload4(4, &task.coefs[0]) * vload4(4, dptr)
+ vload4(5, &task.coefs[0]) * vload4(5, dptr)
+ vload4(6, &task.coefs[0]) * vload4(6, dptr)
+ vload4(7, &task.coefs[0]) * vload4(7, dptr)
#endif
#endif
#endif
#endif
;
int s = select(0, nextData - ((sum.x + sum.y + sum.z + sum.w) >> sh), offs >= ro && offs < bs);
// output residual
if (offs < bs)
rptr[offs] = s;
s = iclamp(s, -0x7fffff, 0x7fffff);
// convert to unsigned // convert to unsigned
res[tid] = (s << 1) ^ (s >> 31); res[tid] = (s << 1) ^ (s >> 31);
// for (int k = 0; k < 15; k++) atom_add(&pl[x][k], s >> k); barrier(CLK_LOCAL_MEM_FENCE);
for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16)
{
// calc number of unary bits for each group of 16 residual samples
// with each rice parameter.
int k = k0 + (tid & 15);
int x = tid >> 4;
// we must ensure that psize * (t >> k) doesn't overflow;
// i.e. t < ((1 << 32) >> (log2(16) - k)) <= (1 << 32) >> (4 - k)
uint4 lim = 0xffffffffU >> max(0, 4 - k);
__local uint * chunk = &res[x << 4];
uint4 rsum = (min(lim,vload4(0,chunk)) >> k) + (min(lim,vload4(1,chunk)) >> k) + (min(lim,vload4(2,chunk)) >> k) + (min(lim,vload4(3,chunk)) >> k);
uint rs = rsum.x + rsum.y + rsum.z + rsum.w;
// We can safely limit length here to 0x007fffffU, not causing length
// mismatch, because any such length would cause Verbatim frame anyway.
// And this limit protects us from overflows when calculating larger
// partitions, as we can have a maximum of 2^8 partitions, resulting
// in maximum partition length of 0x7fffffffU + change.
if (k <= MAX_RICE_PARAM) pl[x][k] = min(0x007fffffU, rs) + (uint)(16 - select(0, ro, offs < 16)) * (k + 1);
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
data[tid] = nextData;
// calc number of unary bits for each residual sample with each rice paramater for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16)
__local int * chunk = &res[x << 4]; {
sum = (vload4(0,chunk) >> k) + (vload4(1,chunk) >> k) + (vload4(2,chunk) >> k) + (vload4(3,chunk) >> k); int k1 = k0 + (tid >> 3), x1 = tid & 7;
s = sum.x + sum.y + sum.z + sum.w; if (k1 <= MAX_RICE_PARAM && (pos >> 4) + x1 < (1 << max_porder))
partition_lengths[((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1];
#if 0 }
if (k <= 14 && offs < bs)
plptr[offs >> 4] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
#else
if (k <= 14) pl[x][k] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
barrier(CLK_LOCAL_MEM_FENCE);
int k1 = tid >> 3, x1 = tid & 7;
if (k1 <= 14 && (pos >> 4) + x1 < (1 << max_porder))
partition_lengths[(15 << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1];
#endif
// if (task.data.blocksize == 16 && x == 0 && k <= 14)
// printf("[%d] = %d = s:%d + %d * (k:%d + 1), ro=%d, offs=%d, lpos=%d\n", k, partition_lengths[lpos], s, (16 - select(0, ro, offs < 16)), k, ro, offs, lpos);
} }
} }
#endif #endif
@@ -1471,13 +1494,13 @@ void clCalcPartition16(
// get_group_id(1) == task index // get_group_id(1) == task index
__kernel __attribute__((reqd_work_group_size(1, 1, 1))) __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clSumPartition( void clSumPartition(
__global int* partition_lengths, __global ulong* partition_lengths,
int max_porder int max_porder
) )
{ {
if (get_group_id(0) != 0) // ignore k != 0 if (get_group_id(0) != 0) // ignore k != 0
return; return;
__global int * sums = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1); __global ulong * sums = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
for (int i = max_porder - 1; i >= 0; i--) for (int i = max_porder - 1; i >= 0; i--)
{ {
for (int j = 0; j < (1 << i); j++) for (int j = 0; j < (1 << i); j++)
@@ -1496,15 +1519,15 @@ void clSumPartition(
// get_group_id(1) == task index // get_group_id(1) == task index
__kernel __attribute__((reqd_work_group_size(128, 1, 1))) __kernel __attribute__((reqd_work_group_size(128, 1, 1)))
void clSumPartition( void clSumPartition(
__global int* partition_lengths, __global uint* partition_lengths,
int max_porder int max_porder
) )
{ {
__local int data[256]; // max_porder <= 8, data length <= 1 << 9. __local uint data[256]; // max_porder <= 8, data length <= 1 << 9.
const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1)); const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1));
// fetch partition lengths // fetch partition lengths
int2 pl = get_local_id(0) * 2 < (1 << max_porder) ? vload2(get_local_id(0),&partition_lengths[pos]) : 0; uint2 pl = get_local_id(0) * 2 < (1 << max_porder) ? vload2(get_local_id(0),&partition_lengths[pos]) : 0;
data[get_local_id(0)] = pl.x + pl.y; data[get_local_id(0)] = pl.x + pl.y;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@@ -1531,7 +1554,7 @@ __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clFindRiceParameter( void clFindRiceParameter(
__global FLACCLSubframeTask *tasks, __global FLACCLSubframeTask *tasks,
__global int* rice_parameters, __global int* rice_parameters,
__global int* partition_lengths, __global ulong* partition_lengths,
int max_porder int max_porder
) )
{ {
@@ -1541,7 +1564,7 @@ void clFindRiceParameter(
//int psize = task->data.blocksize >> max_porder; //int psize = task->data.blocksize >> max_porder;
int bs = task->data.blocksize; int bs = task->data.blocksize;
int ro = task->data.residualOrder; int ro = task->data.residualOrder;
__global int* ppl = &partition_lengths[get_group_id(0) << (max_porder + 1)]; __global ulong* ppl = &partition_lengths[get_group_id(0) << (max_porder + 1)];
__global int* prp = &rice_parameters[get_group_id(0) << (max_porder + 2)]; __global int* prp = &rice_parameters[get_group_id(0) << (max_porder + 2)];
__global int* pol = prp + (1 << (max_porder + 1)); __global int* pol = prp + (1 << (max_porder + 1));
for (int porder = max_porder; porder >= 0; porder--) for (int porder = max_porder; porder >= 0; porder--)
@@ -1549,10 +1572,10 @@ void clFindRiceParameter(
int pos = (2 << max_porder) - (2 << porder); int pos = (2 << max_porder) - (2 << porder);
int fin = pos + (1 << porder); int fin = pos + (1 << porder);
int pl = ppl[pos]; ulong pl = ppl[pos];
int ps = (bs >> porder) - ro; int ps = (bs >> porder) - ro;
int k = iclamp(31 - fastclz(pl / max(1, ps)), 0, 14); int k = iclamp(63 - fastclz64(pl / max(1, ps)), 0, MAX_RICE_PARAM);
int plk = ps * (k + 1) + (pl >> k); int plk = ps * (k + 1) + (int)(pl >> k);
// output rice parameter // output rice parameter
prp[pos] = k; prp[pos] = k;
@@ -1564,8 +1587,8 @@ void clFindRiceParameter(
for (int offs = pos + 1; offs < fin; offs++) for (int offs = pos + 1; offs < fin; offs++)
{ {
pl = ppl[offs]; pl = ppl[offs];
k = iclamp(31 - fastclz(pl / ps), 0, 14); k = iclamp(63 - fastclz64(pl / ps), 0, MAX_RICE_PARAM);
plk = ps * (k + 1) + (pl >> k); plk = ps * (k + 1) + (int)(pl >> k);
// output rice parameter // output rice parameter
prp[offs] = k; prp[offs] = k;
@@ -1581,18 +1604,18 @@ __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clFindRiceParameter( void clFindRiceParameter(
__global FLACCLSubframeTask *tasks, __global FLACCLSubframeTask *tasks,
__global int* rice_parameters, __global int* rice_parameters,
__global int* partition_lengths, __global uint* partition_lengths,
int max_porder int max_porder
) )
{ {
for (int offs = get_local_id(0); offs < (2 << max_porder); offs += GROUP_SIZE) for (int offs = get_local_id(0); offs < (2 << max_porder); offs += GROUP_SIZE)
{ {
const int pos = (15 << (max_porder + 1)) * get_group_id(0) + offs; const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + offs;
int best_l = partition_lengths[pos]; uint best_l = partition_lengths[pos];
int best_k = 0; int best_k = 0;
for (int k = 1; k <= 14; k++) for (int k = 1; k <= MAX_RICE_PARAM; k++)
{ {
int l = partition_lengths[pos + (k << (max_porder + 1))]; uint l = partition_lengths[pos + (k << (max_porder + 1))];
best_k = select(best_k, k, l < best_l); best_k = select(best_k, k, l < best_l);
best_l = min(best_l, l); best_l = min(best_l, l);
} }
@@ -1630,16 +1653,16 @@ void clFindPartitionOrder(
partlen[porder] += rice_parameters[pos + start + offs]; partlen[porder] += rice_parameters[pos + start + offs];
} }
int best_length = partlen[0] + 4; int best_length = partlen[0] + RICE_PARAM_BITS;
int best_porder = 0; int best_porder = 0;
for (int porder = 1; porder <= max_porder; porder++) for (int porder = 1; porder <= max_porder; porder++)
{ {
int length = (4 << porder) + partlen[porder]; int length = (RICE_PARAM_BITS << porder) + partlen[porder];
best_porder = select(best_porder, porder, length < best_length); best_porder = select(best_porder, porder, length < best_length);
best_length = min(best_length, length); best_length = min(best_length, length);
} }
best_length = (4 << best_porder) + task->data.blocksize - task->data.residualOrder; best_length = (RICE_PARAM_BITS << best_porder) + task->data.blocksize - task->data.residualOrder;
int best_psize = task->data.blocksize >> best_porder; int best_psize = task->data.blocksize >> best_porder;
int start = task->data.residualOffs + task->data.residualOrder; int start = task->data.residualOffs + task->data.residualOrder;
int fin = task->data.residualOffs + best_psize; int fin = task->data.residualOffs + best_psize;
@@ -1704,11 +1727,11 @@ void clFindPartitionOrder(
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int best_length = partlen[0] + 4; int best_length = partlen[0] + RICE_PARAM_BITS;
int best_porder = 0; int best_porder = 0;
for (int porder = 1; porder <= max_porder; porder++) for (int porder = 1; porder <= max_porder; porder++)
{ {
int length = (4 << porder) + partlen[porder]; int length = (RICE_PARAM_BITS << porder) + partlen[porder];
best_porder = select(best_porder, porder, length < best_length); best_porder = select(best_porder, porder, length < best_length);
best_length = min(best_length, length); best_length = min(best_length, length);
} }
@@ -1836,14 +1859,14 @@ void clCalcOutputOffsets(
) )
{ {
const int channels = 2; const int channels = 2;
__local FLACCLSubframeData ltasks[2]; __local FLACCLSubframeData ltasks[MAX_CHANNELS];
__local volatile int mypos[2]; __local volatile int mypos[MAX_CHANNELS];
int offset = 0; int offset = 0;
for (int iFrame = 0; iFrame < frameCount; iFrame++) for (int iFrame = 0; iFrame < frameCount; iFrame++)
{ {
if (get_local_id(0) < sizeof(ltasks[0]) / sizeof(int)) if (get_local_id(0) < sizeof(ltasks[0]) / sizeof(int))
for (int ch = 0; ch < channels; ch++) for (int ch = 0; ch < MAX_CHANNELS; ch++)
((__local int*)&ltasks[ch])[get_local_id(0)] = ((__global int*)(&tasks[iFrame * channels + ch]))[get_local_id(0)]; ((__local int*)&ltasks[ch])[get_local_id(0)] = ((__global int*)(&tasks[iFrame * MAX_CHANNELS + ch]))[get_local_id(0)];
//printf("len_utf8(%d) == %d\n", firstFrame + iFrame, len_utf8(firstFrame + iFrame)); //printf("len_utf8(%d) == %d\n", firstFrame + iFrame, len_utf8(firstFrame + iFrame));
offset += 15 + 1 + 4 + 4 + 4 + 3 + 1 + len_utf8(firstFrame + iFrame) offset += 15 + 1 + 4 + 4 + 4 + 3 + 1 + len_utf8(firstFrame + iFrame)
@@ -1856,18 +1879,18 @@ void clCalcOutputOffsets(
// assert (offset % 8) == 0 // assert (offset % 8) == 0
offset += 8; offset += 8;
if (get_local_id(0) < channels) if (get_local_id(0) < MAX_CHANNELS)
{ {
int ch = get_local_id(0); int ch = get_local_id(0);
// Add 64 bits to separate frames if header is too small so they can intersect // Add 64 bits to separate frames if header is too small so they can intersect
int mylen = 8 + ltasks[ch].wbits + 64 + ltasks[ch].size; int mylen = 8 + ltasks[ch].wbits + 64 + ltasks[ch].size;
mypos[ch] = mylen; mypos[ch] = mylen;
for (int offset = 1; offset < WARP_SIZE && offset < channels; offset <<= 1) for (int offset = 1; offset < WARP_SIZE && offset < MAX_CHANNELS; offset <<= 1)
if (ch >= offset) mypos[ch] += mypos[ch - offset]; if (ch >= offset) mypos[ch] += mypos[ch - offset];
mypos[ch] += offset; mypos[ch] += offset;
tasks[iFrame * channels + ch].data.encodingOffset = mypos[ch] - ltasks[ch].size + ltasks[ch].headerLen; tasks[iFrame * MAX_CHANNELS + ch].data.encodingOffset = mypos[ch] - ltasks[ch].size + ltasks[ch].headerLen;
} }
offset = mypos[channels - 1]; offset = mypos[MAX_CHANNELS - 1];
offset = (offset + 7) & ~7; offset = (offset + 7) & ~7;
offset += 16; offset += 16;
} }
@@ -1909,7 +1932,7 @@ void clRiceEncoding(
for (int p = 0; p < (1 << porder); p++) for (int p = 0; p < (1 << porder); p++)
{ {
int k = kptr[p]; int k = kptr[p];
writebits(&bw, 4, k); writebits(&bw, RICE_PARAM_BITS, k);
//if (get_group_id(0) == 0) printf("[%x] ", k); //if (get_group_id(0) == 0) printf("[%x] ", k);
//if (get_group_id(0) == 0) printf("(%x) ", bw.bit_buf); //if (get_group_id(0) == 0) printf("(%x) ", bw.bit_buf);
if (p == 1) res_cnt = psize; if (p == 1) res_cnt = psize;
@@ -1978,7 +2001,7 @@ void clRiceEncoding(
flush(&bw); flush(&bw);
} }
#else #else
__local unsigned int data[GROUP_SIZE]; __local uint data[GROUP_SIZE];
__local volatile int mypos[GROUP_SIZE+1]; __local volatile int mypos[GROUP_SIZE+1];
#if 0 #if 0
__local int brp[256]; __local int brp[256];
@@ -2006,12 +2029,12 @@ void clRiceEncoding(
int start = task.encodingOffset; int start = task.encodingOffset;
int plen = bs >> task.porder; int plen = bs >> task.porder;
//int plenoffs = 12 - task.porder; //int plenoffs = 12 - task.porder;
unsigned int remainder = 0U; uint remainder = 0U;
int pos; int pos;
for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE) for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE)
{ {
int offs = pos + tid; int offs = pos + tid;
int v = residual[task.residualOffs + offs]; int iv = residual[task.residualOffs + offs];
int part = offs / plen; // >> plenoffs; int part = offs / plen; // >> plenoffs;
#if 0 #if 0
int k = brp[part]; int k = brp[part];
@@ -2019,8 +2042,8 @@ void clRiceEncoding(
int k = best_rice_parameters[(get_group_id(0) << max_porder) + part]; int k = best_rice_parameters[(get_group_id(0) << max_porder) + part];
#endif #endif
int pstart = offs == task.residualOrder || offs == part * plen; int pstart = offs == task.residualOrder || offs == part * plen;
v = (v << 1) ^ (v >> 31); uint v = (iv << 1) ^ (iv >> 31);
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs); int mylen = select(0, (int)(v >> k) + 1 + k + select(0, RICE_PARAM_BITS, pstart), offs >= task.residualOrder && offs < bs);
mypos[tid] = mylen; mypos[tid] = mylen;
// Inclusive scan(+) // Inclusive scan(+)
@@ -2040,7 +2063,8 @@ void clRiceEncoding(
mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0); mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0);
int start32 = start >> 5; int start32 = start >> 5;
start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2]; start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2];
//if (start / 32 - start32 >= GROUP_SIZE - 3)
// tasks[get_group_id(0)].data.size = 1;
//if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32) //if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32)
// printf("Oops: %d\n", mypos[tid]); // printf("Oops: %d\n", mypos[tid]);
data[tid] = select(0U, remainder, tid == 0); data[tid] = select(0U, remainder, tid == 0);
@@ -2052,18 +2076,18 @@ void clRiceEncoding(
int kpos = mp - mylen; int kpos = mp - mylen;
int kpos0 = (kpos >> 5) - start32; int kpos0 = (kpos >> 5) - start32;
int kpos1 = kpos & 31; int kpos1 = kpos & 31;
unsigned int kval = (unsigned int)k << 28; uint kval = (uint)k << (32 - RICE_PARAM_BITS);
unsigned int kval0 = kval >> kpos1; uint kval0 = kval >> kpos1;
unsigned int kval1 = kval << (32 - kpos1); uint kval1 = kval << (32 - kpos1);
if (kval0) atom_or(&data[kpos0], kval0); if (kval0) atom_or(&data[kpos0], kval0);
if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1); if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1);
} }
int qpos = mp - k - 1; int qpos = mp - k - 1;
int qpos0 = (qpos >> 5) - start32; int qpos0 = (qpos >> 5) - start32;
int qpos1 = qpos & 31; int qpos1 = qpos & 31;
unsigned int qval = (1U << 31) | ((unsigned int)v << (31 - k)); uint qval = (1U << 31) | (v << (31 - k));
unsigned int qval0 = qval >> qpos1; uint qval0 = qval >> qpos1;
unsigned int qval1= qval << (32 - qpos1); uint qval1= qval << (32 - qpos1);
if (qval0) atom_or(&data[qpos0], qval0); if (qval0) atom_or(&data[qpos0], qval0);
if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1); if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1);
} }
@@ -2075,13 +2099,13 @@ void clRiceEncoding(
if (pos < bs) if (pos < bs)
{ {
int offs = pos + tid; int offs = pos + tid;
int v = offs < bs ? residual[task.residualOffs + offs] : 0; int iv = offs < bs ? residual[task.residualOffs + offs] : 0;
int part = offs / plen; // >> plenoffs; int part = offs / plen; // >> plenoffs;
//int k = brp[min(255, part)]; //int k = brp[min(255, part)];
int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0; int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0;
int pstart = offs == task.residualOrder || offs == part * plen; int pstart = offs == task.residualOrder || offs == part * plen;
v = (v << 1) ^ (v >> 31); uint v = (iv << 1) ^ (iv >> 31);
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs); int mylen = select(0, (int)(v >> k) + 1 + k + select(0, RICE_PARAM_BITS, pstart), offs >= task.residualOrder && offs < bs);
mypos[tid] = mylen; mypos[tid] = mylen;
// Inclusive scan(+) // Inclusive scan(+)
@@ -2113,18 +2137,18 @@ void clRiceEncoding(
int kpos = mp - mylen; int kpos = mp - mylen;
int kpos0 = (kpos >> 5) - start32; int kpos0 = (kpos >> 5) - start32;
int kpos1 = kpos & 31; int kpos1 = kpos & 31;
unsigned int kval = (unsigned int)k << 28; uint kval = (uint)k << (32 - RICE_PARAM_BITS);
unsigned int kval0 = kval >> kpos1; uint kval0 = kval >> kpos1;
unsigned int kval1 = kval << (32 - kpos1); uint kval1 = kval << (32 - kpos1);
if (kval0) atom_or(&data[kpos0], kval0); if (kval0) atom_or(&data[kpos0], kval0);
if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1); if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1);
} }
int qpos = mp - k - 1; int qpos = mp - k - 1;
int qpos0 = (qpos >> 5) - start32; int qpos0 = (qpos >> 5) - start32;
int qpos1 = qpos & 31; int qpos1 = qpos & 31;
unsigned int qval = (1U << 31) | ((unsigned int)v << (31 - k)); uint qval = (1U << 31) | (v << (31 - k));
unsigned int qval0 = qval >> qpos1; uint qval0 = qval >> qpos1;
unsigned int qval1= qval << (32 - qpos1); uint qval1= qval << (32 - qpos1);
if (qval0) atom_or(&data[qpos0], qval0); if (qval0) atom_or(&data[qpos0], qval0);
if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1); if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1);
} }

View File

@@ -83,6 +83,11 @@ namespace CUETools.Codecs.FLAKE
/// </summary> /// </summary>
public int porder; public int porder;
/// <summary>
/// coding method: rice parameters use 4 bits for coding_method 0 and 5 bits for coding_method 1
/// </summary>
public int coding_method;
/// <summary> /// <summary>
/// Rice parameters /// Rice parameters
/// </summary> /// </summary>

View File

@@ -102,7 +102,7 @@ namespace CUETools.Codecs.FLAKE
} }
_samplesInBuffer = 0; _samplesInBuffer = 0;
if (PCM.BitsPerSample != 16 || PCM.ChannelCount != 2 || PCM.SampleRate != 44100) if ((PCM.BitsPerSample != 16 && PCM.BitsPerSample != 24) || PCM.ChannelCount != 2 || (PCM.SampleRate != 44100 && PCM.SampleRate != 48000))
throw new Exception("invalid flac file"); throw new Exception("invalid flac file");
samplesBuffer = new int[Flake.MAX_BLOCKSIZE * PCM.ChannelCount]; samplesBuffer = new int[Flake.MAX_BLOCKSIZE * PCM.ChannelCount];
@@ -362,8 +362,9 @@ namespace CUETools.Codecs.FLAKE
unsafe void decode_residual(BitReader bitreader, FlacFrame frame, int ch) unsafe void decode_residual(BitReader bitreader, FlacFrame frame, int ch)
{ {
// rice-encoded block // rice-encoded block
uint coding_method = bitreader.readbits(2); // ????? == 0 // coding method
if (coding_method != 0 && coding_method != 1) // if 1, then parameter length == 5 bits instead of 4 frame.subframes[ch].best.rc.coding_method = (int)bitreader.readbits(2); // ????? == 0
if (frame.subframes[ch].best.rc.coding_method != 0 && frame.subframes[ch].best.rc.coding_method != 1)
throw new Exception("unsupported residual coding"); throw new Exception("unsupported residual coding");
// partition order // partition order
frame.subframes[ch].best.rc.porder = (int)bitreader.readbits(4); frame.subframes[ch].best.rc.porder = (int)bitreader.readbits(4);
@@ -372,7 +373,7 @@ namespace CUETools.Codecs.FLAKE
int psize = frame.blocksize >> frame.subframes[ch].best.rc.porder; int psize = frame.blocksize >> frame.subframes[ch].best.rc.porder;
int res_cnt = psize - frame.subframes[ch].best.order; int res_cnt = psize - frame.subframes[ch].best.order;
int rice_len = 4 + (int)coding_method; int rice_len = 4 + frame.subframes[ch].best.rc.coding_method;
// residual // residual
int j = frame.subframes[ch].best.order; int j = frame.subframes[ch].best.order;
int* r = frame.subframes[ch].best.residual + j; int* r = frame.subframes[ch].best.residual + j;

View File

@@ -125,8 +125,8 @@ namespace CUETools.Codecs.FLAKE
{ {
_pcm = pcm; _pcm = pcm;
if (_pcm.BitsPerSample != 16) //if (_pcm.BitsPerSample != 16)
throw new Exception("Bits per sample must be 16."); // throw new Exception("Bits per sample must be 16.");
if (_pcm.ChannelCount != 2) if (_pcm.ChannelCount != 2)
throw new Exception("ChannelCount must be 2."); throw new Exception("ChannelCount must be 2.");
@@ -571,14 +571,14 @@ namespace CUETools.Codecs.FLAKE
samplesInBuffer += block; samplesInBuffer += block;
} }
unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize) //unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize)
{ //{
for (int i = 0; i < blocksize; i++) // for (int i = 0; i < blocksize; i++)
{ // {
leftM[i] = (leftS[i] + rightS[i]) >> 1; // leftM[i] = (leftS[i] + rightS[i]) >> 1;
rightM[i] = leftS[i] - rightS[i]; // rightM[i] = leftS[i] - rightS[i];
} // }
} //}
unsafe void encode_residual_verbatim(int* res, int* smp, uint n) unsafe void encode_residual_verbatim(int* res, int* smp, uint n)
{ {
@@ -638,24 +638,28 @@ namespace CUETools.Codecs.FLAKE
} }
} }
static unsafe uint calc_optimal_rice_params(int porder, int* parm, uint* sums, uint n, uint pred_order) static unsafe uint calc_optimal_rice_params(int porder, int* parm, ulong* sums, uint n, uint pred_order, ref int method)
{ {
uint part = (1U << porder); uint part = (1U << porder);
uint cnt = (n >> porder) - pred_order; uint cnt = (n >> porder) - pred_order;
int k = cnt > 0 ? Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[0] / cnt)) : 0; int maxK = method > 0 ? 30 : Flake.MAX_RICE_PARAM;
uint all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k); int k = cnt > 0 ? Math.Min(maxK, BitReader.log2i(sums[0] / cnt)) : 0;
int realMaxK0 = k;
ulong all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k);
parm[0] = k; parm[0] = k;
cnt = (n >> porder); cnt = (n >> porder);
for (uint i = 1; i < part; i++) for (uint i = 1; i < part; i++)
{ {
k = Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[i] / cnt)); k = Math.Min(maxK, BitReader.log2i(sums[i] / cnt));
realMaxK0 = Math.Max(realMaxK0, k);
all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k); all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k);
parm[i] = k; parm[i] = k;
} }
return all_bits + (4 * part); method = realMaxK0 > Flake.MAX_RICE_PARAM ? 1 : 0;
return (uint)all_bits + ((4U + (uint)method) * part);
} }
static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums) static unsafe void calc_lower_sums(int pmin, int pmax, ulong* sums)
{ {
for (int i = pmax - 1; i >= pmin; i--) for (int i = pmax - 1; i >= pmin; i--)
{ {
@@ -668,12 +672,12 @@ namespace CUETools.Codecs.FLAKE
} }
} }
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
{ {
int parts = (1 << pmax); int parts = (1 << pmax);
uint* res = data + pred_order; uint* res = data + pred_order;
uint cnt = (n >> pmax) - pred_order; uint cnt = (n >> pmax) - pred_order;
uint sum = 0; ulong sum = 0;
for (uint j = cnt; j > 0; j--) for (uint j = cnt; j > 0; j--)
sum += *(res++); sum += *(res++);
sums[0] = sum; sums[0] = sum;
@@ -696,18 +700,18 @@ namespace CUETools.Codecs.FLAKE
/// <param name="n"></param> /// <param name="n"></param>
/// <param name="pred_order"></param> /// <param name="pred_order"></param>
/// <param name="sums"></param> /// <param name="sums"></param>
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
{ {
int parts = (1 << pmax); int parts = (1 << pmax);
uint* res = data + pred_order; uint* res = data + pred_order;
uint cnt = 18 - pred_order; uint cnt = 18 - pred_order;
uint sum = 0; ulong sum = 0;
for (uint j = cnt; j > 0; j--) for (uint j = cnt; j > 0; j--)
sum += *(res++); sum += *(res++);
sums[0] = sum; sums[0] = sum;
for (int i = 1; i < parts; i++) for (int i = 1; i < parts; i++)
{ {
sums[i] = sums[i] = 0UL +
*(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) +
@@ -725,18 +729,18 @@ namespace CUETools.Codecs.FLAKE
/// <param name="n"></param> /// <param name="n"></param>
/// <param name="pred_order"></param> /// <param name="pred_order"></param>
/// <param name="sums"></param> /// <param name="sums"></param>
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
{ {
int parts = (1 << pmax); int parts = (1 << pmax);
uint* res = data + pred_order; uint* res = data + pred_order;
uint cnt = 16 - pred_order; uint cnt = 16 - pred_order;
uint sum = 0; ulong sum = 0;
for (uint j = cnt; j > 0; j--) for (uint j = cnt; j > 0; j--)
sum += *(res++); sum += *(res++);
sums[0] = sum; sums[0] = sum;
for (int i = 1; i < parts; i++) for (int i = 1; i < parts; i++)
{ {
sums[i] = sums[i] = 0UL +
*(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) +
@@ -744,10 +748,10 @@ namespace CUETools.Codecs.FLAKE
} }
} }
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order) static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order, int bps)
{ {
uint* udata = stackalloc uint[(int)n]; uint* udata = stackalloc uint[(int)n];
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS]; ulong* sums = stackalloc ulong[(pmax + 1) * Flake.MAX_PARTITIONS];
int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS]; int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS];
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER]; //uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
@@ -770,17 +774,21 @@ namespace CUETools.Codecs.FLAKE
uint opt_bits = AudioSamples.UINT32_MAX; uint opt_bits = AudioSamples.UINT32_MAX;
int opt_porder = pmin; int opt_porder = pmin;
int opt_method = 0;
for (int i = pmin; i <= pmax; i++) for (int i = pmin; i <= pmax; i++)
{ {
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order); int method = bps > 16 ? 1 : 0;
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order, ref method);
if (bits <= opt_bits) if (bits <= opt_bits)
{ {
opt_bits = bits; opt_bits = bits;
opt_porder = i; opt_porder = i;
opt_method = method;
} }
} }
rc.porder = opt_porder; rc.porder = opt_porder;
rc.coding_method = opt_method;
fixed (int* rparms = rc.rparams) fixed (int* rparms = rc.rparams)
AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder)); AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder));
@@ -841,7 +849,7 @@ namespace CUETools.Codecs.FLAKE
} }
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.current.order); int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.current.order);
int pmin = Math.Min(eparams.min_partition_order, pmax); int pmin = Math.Min(eparams.min_partition_order, pmax);
uint best_size = calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order); uint best_size = calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order, PCM.BitsPerSample);
// not working // not working
//for (int o = 1; o <= frame.current.order; o++) //for (int o = 1; o <= frame.current.order; o++)
//{ //{
@@ -877,7 +885,7 @@ namespace CUETools.Codecs.FLAKE
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.current.order); int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.current.order);
int pmin = Math.Min(eparams.min_partition_order, pmax); int pmin = Math.Min(eparams.min_partition_order, pmax);
frame.current.size = (uint)(frame.current.order * frame.subframes[ch].obits) + 6 frame.current.size = (uint)(frame.current.order * frame.subframes[ch].obits) + 6
+ calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order); + calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order, PCM.BitsPerSample);
frame.subframes[ch].done_fixed |= (1U << order); frame.subframes[ch].done_fixed |= (1U << order);
@@ -1054,7 +1062,7 @@ namespace CUETools.Codecs.FLAKE
unsafe void output_residual(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) unsafe void output_residual(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub)
{ {
// rice-encoded block // rice-encoded block
bitwriter.writebits(2, 0); bitwriter.writebits(2, sub.best.rc.coding_method);
// partition order // partition order
int porder = sub.best.rc.porder; int porder = sub.best.rc.porder;
@@ -1063,13 +1071,14 @@ namespace CUETools.Codecs.FLAKE
bitwriter.writebits(4, porder); bitwriter.writebits(4, porder);
int res_cnt = psize - sub.best.order; int res_cnt = psize - sub.best.order;
int rice_len = 4 + sub.best.rc.coding_method;
// residual // residual
int j = sub.best.order; int j = sub.best.order;
fixed (byte* fixbuf = &frame_buffer[0]) fixed (byte* fixbuf = &frame_buffer[0])
for (int p = 0; p < (1 << porder); p++) for (int p = 0; p < (1 << porder); p++)
{ {
int k = sub.best.rc.rparams[p]; int k = sub.best.rc.rparams[p];
bitwriter.writebits(4, k); bitwriter.writebits(rice_len, k);
if (p == 1) res_cnt = psize; if (p == 1) res_cnt = psize;
int cnt = Math.Min(res_cnt, frame.blocksize - j); int cnt = Math.Min(res_cnt, frame.blocksize - j);
bitwriter.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt); bitwriter.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt);
@@ -1436,6 +1445,9 @@ namespace CUETools.Codecs.FLAKE
output_subframes(frame, bitwriter); output_subframes(frame, bitwriter);
output_frame_footer(bitwriter); output_frame_footer(bitwriter);
if (bitwriter.Length >= max_frame_size)
throw new Exception("buffer overflow");
if (frame_buffer != null) if (frame_buffer != null)
{ {
if (eparams.variable_block_size > 0) if (eparams.variable_block_size > 0)
@@ -1732,9 +1744,6 @@ namespace CUETools.Codecs.FLAKE
} }
if (i == 8) if (i == 8)
throw new Exception("non-standard bps"); throw new Exception("non-standard bps");
// FIXME: For now, only 16-bit encoding is supported
if (_pcm.BitsPerSample != 16)
throw new Exception("non-standard bps");
if (_blocksize == 0) if (_blocksize == 0)
{ {

View File

@@ -35,6 +35,15 @@ namespace CUETools.Codecs
return log2i((uint)v); return log2i((uint)v);
} }
/// <summary>
/// 64-bit overload of log2i: narrows <paramref name="v"/> to a single byte
/// by discarding the low 32, 16 and 8 bits whenever anything is set above
/// them, then finishes with a lookup in byte_to_log2_table.
/// </summary>
/// <param name="v">Value to measure.</param>
/// <returns>
/// The total number of bits shifted out plus the table entry for the
/// remaining byte (presumably floor(log2(v)); confirm against the contents
/// of byte_to_log2_table).
/// </returns>
public static int log2i(ulong v)
{
    int shifted = 0;
    // Anything above bit 31? Keep only the upper half.
    if (v > 0xffffffffUL)
    {
        shifted += 32;
        v >>= 32;
    }
    // v now fits in 32 bits; anything above bit 15?
    if (v > 0xffffUL)
    {
        shifted += 16;
        v >>= 16;
    }
    // v now fits in 16 bits; anything above bit 7?
    if (v > 0xffUL)
    {
        shifted += 8;
        v >>= 8;
    }
    // v is now in 0..255 and can index the byte table directly.
    return byte_to_log2_table[v] + shifted;
}
public static int log2i(uint v) public static int log2i(uint v)
{ {
int n = 0; int n = 0;

View File

@@ -422,8 +422,10 @@ namespace CUETools.Codecs
unsafe public void Interlace(int pos, int* src1, int* src2, int n) unsafe public void Interlace(int pos, int* src1, int* src2, int n)
{ {
if (PCM.ChannelCount != 2 || PCM.BitsPerSample != 16) if (PCM.ChannelCount != 2)
throw new Exception(""); throw new Exception("Must be stereo");
if (PCM.BitsPerSample == 16)
{
fixed (byte* bs = Bytes) fixed (byte* bs = Bytes)
{ {
int* res = ((int*)bs) + pos; int* res = ((int*)bs) + pos;
@@ -431,6 +433,31 @@ namespace CUETools.Codecs
*(res++) = (*(src1++) & 0xffff) ^ (*(src2++) << 16); *(res++) = (*(src1++) & 0xffff) ^ (*(src2++) << 16);
} }
} }
else if (PCM.BitsPerSample == 24)
{
fixed (byte* bs = Bytes)
{
byte* res= bs + pos * 6;
for (int i = n; i > 0; i--)
{
uint sample_out = (uint)*(src1++);
*(res++) = (byte)(sample_out & 0xFF);
sample_out >>= 8;
*(res++) = (byte)(sample_out & 0xFF);
sample_out >>= 8;
*(res++) = (byte)(sample_out & 0xFF);
sample_out = (uint)*(src2++);
*(res++) = (byte)(sample_out & 0xFF);
sample_out >>= 8;
*(res++) = (byte)(sample_out & 0xFF);
sample_out >>= 8;
*(res++) = (byte)(sample_out & 0xFF);
}
}
}
else
throw new Exception("Unsupported BPS");
}
//public void Clear() //public void Clear()
//{ //{
@@ -451,6 +478,7 @@ namespace CUETools.Codecs
short* pOutSamples = (short*)outSamples; short* pOutSamples = (short*)outSamples;
for (int i = 0; i < loopCount; i++) for (int i = 0; i < loopCount; i++)
pOutSamples[i] = (short)pInSamples[i]; pOutSamples[i] = (short)pInSamples[i];
//*(pOutSamples++) = (short)*(pInSamples++);
} }
} }
@@ -465,19 +493,8 @@ namespace CUETools.Codecs
throw new IndexOutOfRangeException(); throw new IndexOutOfRangeException();
} }
fixed (int* pInSamplesFixed = &inSamples[inSampleOffset, 0])
{
fixed (byte* pOutSamplesFixed = &outSamples[outByteOffset]) fixed (byte* pOutSamplesFixed = &outSamples[outByteOffset])
{ FLACSamplesToBytes_16(inSamples, inSampleOffset, pOutSamplesFixed, sampleCount, channelCount);
int* pInSamples = pInSamplesFixed;
short* pOutSamples = (short*)pOutSamplesFixed;
for (int i = 0; i < loopCount; i++)
{
*(pOutSamples++) = (short)*(pInSamples++);
}
}
}
} }
public static unsafe void FLACSamplesToBytes_24(int[,] inSamples, int inSampleOffset, public static unsafe void FLACSamplesToBytes_24(int[,] inSamples, int inSampleOffset,
@@ -917,16 +934,16 @@ namespace CUETools.Codecs
private AudioPCMConfig pcm; private AudioPCMConfig pcm;
private int _sampleVal; private int _sampleVal;
public SilenceGenerator(long sampleCount, int sampleVal) public SilenceGenerator(AudioPCMConfig pcm, long sampleCount, int sampleVal)
{ {
_sampleVal = sampleVal; this._sampleVal = sampleVal;
_sampleOffset = 0; this._sampleOffset = 0;
_sampleCount = sampleCount; this._sampleCount = sampleCount;
pcm = AudioPCMConfig.RedBook; this.pcm = pcm;
} }
public SilenceGenerator(long sampleCount) public SilenceGenerator(long sampleCount)
: this(sampleCount, 0) : this(AudioPCMConfig.RedBook, sampleCount, 0)
{ {
} }
@@ -1091,19 +1108,29 @@ namespace CUETools.Codecs
{ {
foundFormat = true; foundFormat = true;
if (_br.ReadUInt16() != 1) uint fmtTag = _br.ReadUInt16();
{
throw new Exception("WAVE must be PCM format.");
}
int _channelCount = _br.ReadInt16(); int _channelCount = _br.ReadInt16();
int _sampleRate = _br.ReadInt32(); int _sampleRate = _br.ReadInt32();
_br.ReadInt32(); _br.ReadInt32(); // bytes per second
int _blockAlign = _br.ReadInt16(); int _blockAlign = _br.ReadInt16();
int _bitsPerSample = _br.ReadInt16(); int _bitsPerSample = _br.ReadInt16();
pos += 16;
if (fmtTag == 0xFFFEU && ckSize >= 34) // WAVE_FORMAT_EXTENSIBLE
{
_br.ReadInt16(); // CbSize
_br.ReadInt16(); // ValidBitsPerSample
int channelMask = _br.ReadInt32();
fmtTag = _br.ReadUInt16();
pos += 10;
}
if (fmtTag != 1) // WAVE_FORMAT_PCM
throw new Exception("WAVE format tag not WAVE_FORMAT_PCM.");
pcm = new AudioPCMConfig(_bitsPerSample, _channelCount, _sampleRate); pcm = new AudioPCMConfig(_bitsPerSample, _channelCount, _sampleRate);
if (pcm.BlockAlign != _blockAlign) if (pcm.BlockAlign != _blockAlign)
throw new Exception("WAVE has strange BlockAlign"); throw new Exception("WAVE has strange BlockAlign");
pos += 16;
} }
else if (ckID == fccData) else if (ckID == fccData)
{ {

View File

@@ -23,7 +23,7 @@
<DefineConstants>DEBUG;TRACE</DefineConstants> <DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport> <ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel> <WarningLevel>4</WarningLevel>
<PlatformTarget>x86</PlatformTarget> <PlatformTarget>AnyCPU</PlatformTarget>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType> <DebugType>pdbonly</DebugType>

View File

@@ -87,7 +87,7 @@ namespace CUETools.FLACCL.cmd
min_precision = -1, max_precision = -1, min_precision = -1, max_precision = -1,
orders_per_window = -1, orders_per_channel = -1, orders_per_window = -1, orders_per_channel = -1,
blocksize = -1; blocksize = -1;
int input_len = 4096, input_val = 0; int input_len = 4096, input_val = 0, input_bps = 16, input_ch = 2, input_rate = 44100;
int level = -1, padding = -1, vbr_mode = -1; int level = -1, padding = -1, vbr_mode = -1;
bool do_seektable = true; bool do_seektable = true;
bool buffered = false; bool buffered = false;
@@ -136,6 +136,10 @@ namespace CUETools.FLACCL.cmd
input_len = intarg; input_len = intarg;
else if (args[arg] == "--input-value" && ++arg < args.Length && int.TryParse(args[arg], out intarg)) else if (args[arg] == "--input-value" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
input_val = intarg; input_val = intarg;
else if (args[arg] == "--input-bps" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
input_bps = intarg;
else if (args[arg] == "--input-channels" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
input_ch = intarg;
else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length) else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
output_file = args[arg]; output_file = args[arg];
else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length) else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
@@ -211,10 +215,12 @@ namespace CUETools.FLACCL.cmd
} }
IAudioSource audioSource; IAudioSource audioSource;
try
{
if (input_file == "-") if (input_file == "-")
audioSource = new WAVReader("", Console.OpenStandardInput()); audioSource = new WAVReader("", Console.OpenStandardInput());
else if (input_file == "nul") else if (input_file == "nul")
audioSource = new SilenceGenerator(input_len, input_val); audioSource = new SilenceGenerator(new AudioPCMConfig(input_bps, input_ch, input_rate), input_len, input_val);
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".wav") else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".wav")
audioSource = new WAVReader(input_file, null); audioSource = new WAVReader(input_file, null);
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".flac") else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".flac")
@@ -224,6 +230,14 @@ namespace CUETools.FLACCL.cmd
Usage(); Usage();
return 2; return 2;
} }
}
catch (Exception ex)
{
Usage();
Console.WriteLine("");
Console.WriteLine("Error: {0}.", ex.Message);
return 3;
}
if (buffered) if (buffered)
audioSource = new AudioPipe(audioSource, FLACCLWriter.MAX_BLOCKSIZE); audioSource = new AudioPipe(audioSource, FLACCLWriter.MAX_BLOCKSIZE);
if (output_file == null) if (output_file == null)

View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="utf-8" ?>
<!-- Application configuration: runtime assembly-binding settings. -->
<configuration>
<runtime>
<assemblyBinding xmlns="urn:schemas-microsoft-com:asm.v1">
<!-- Adds the "plugins" subdirectory of the application base to the
     locations the runtime probes when resolving private assemblies. -->
<probing privatePath="plugins"/>
</assemblyBinding>
</runtime>
</configuration>

View File

@@ -2,7 +2,7 @@
<PropertyGroup> <PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>8.0.50727</ProductVersion> <ProductVersion>9.0.30729</ProductVersion>
<SchemaVersion>2.0</SchemaVersion> <SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{2379BAAF-A406-4477-BF53-2D6A326C24C8}</ProjectGuid> <ProjectGuid>{2379BAAF-A406-4477-BF53-2D6A326C24C8}</ProjectGuid>
<OutputType>Exe</OutputType> <OutputType>Exe</OutputType>
@@ -19,7 +19,7 @@
<DebugSymbols>true</DebugSymbols> <DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType> <DebugType>full</DebugType>
<Optimize>false</Optimize> <Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath> <OutputPath>..\bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants> <DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport> <ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel> <WarningLevel>4</WarningLevel>
@@ -52,6 +52,9 @@
<Name>CUETools.Codecs</Name> <Name>CUETools.Codecs</Name>
</ProjectReference> </ProjectReference>
</ItemGroup> </ItemGroup>
<ItemGroup>
<None Include="App.config" />
</ItemGroup>
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" /> <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it. <!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets. Other similar extension points exist, see Microsoft.Common.targets.

View File

@@ -279,9 +279,10 @@ namespace CUETools.FlakeExe
if (!quiet) if (!quiet)
{ {
Console.Error.Write("\r \r"); Console.Error.Write("\r \r");
Console.WriteLine("Results : {0:0.00}x; {1}", Console.WriteLine("Results : {0:0.00}x; {2} bytes in {1} seconds;",
audioSource.Position / totalElapsed.TotalSeconds / audioSource.PCM.SampleRate, audioSource.Position / totalElapsed.TotalSeconds / audioSource.PCM.SampleRate,
totalElapsed totalElapsed,
flake.TotalSize
); );
} }
audioSource.Close(); audioSource.Close();