mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
24-bit/multichannel support
This commit is contained in:
@@ -87,7 +87,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
[SRDescription(typeof(Properties.Resources), "DescriptionDeviceType")]
|
||||
public OpenCLDeviceType DeviceType { get; set; }
|
||||
|
||||
int cpu_threads = 1;
|
||||
int cpu_threads = 0;
|
||||
[DefaultValue(1)]
|
||||
[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
|
||||
public int CPUThreads
|
||||
@@ -214,10 +214,11 @@ namespace CUETools.Codecs.FLACCL
|
||||
{
|
||||
_pcm = pcm;
|
||||
|
||||
if (pcm.BitsPerSample != 16)
|
||||
// FIXME: For now, only 16-bit encoding is supported
|
||||
if (pcm.BitsPerSample != 16 && pcm.BitsPerSample != 24)
|
||||
throw new Exception("Bits per sample must be 16.");
|
||||
if (pcm.ChannelCount != 2)
|
||||
throw new Exception("ChannelCount must be 2.");
|
||||
//if (pcm.ChannelCount != 2)
|
||||
// throw new Exception("ChannelCount must be 2.");
|
||||
|
||||
channels = pcm.ChannelCount;
|
||||
sample_rate = pcm.SampleRate;
|
||||
@@ -288,12 +289,6 @@ namespace CUETools.Codecs.FLACCL
|
||||
if (value as FLACCLWriterSettings == null)
|
||||
throw new Exception("Unsupported options " + value);
|
||||
_settings = value as FLACCLWriterSettings;
|
||||
if (_settings.DeviceType == OpenCLDeviceType.CPU)
|
||||
{
|
||||
_settings.GroupSize = 1;
|
||||
//_settings.GPUOnly = true;
|
||||
_settings.MappedMemory = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -644,24 +639,28 @@ namespace CUETools.Codecs.FLACCL
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Picks a Rice parameter for every partition of the given partition order and
/// returns the estimated number of bits needed to code the block with them.
/// </summary>
/// <param name="porder">Partition order; the block is split into <c>1 &lt;&lt; porder</c> partitions.</param>
/// <param name="parm">Receives the chosen Rice parameter for each partition.</param>
/// <param name="sums">Per-partition sums supplied by the caller; one entry per partition is read.</param>
/// <param name="n">Block size in samples.</param>
/// <param name="pred_order">Prediction order; the first partition holds this many fewer residuals.</param>
/// <param name="method">
/// On entry, a value greater than zero allows parameters up to 30 instead of
/// <c>Flake.MAX_RICE_PARAM</c>. On exit, 1 if any chosen parameter exceeds
/// <c>Flake.MAX_RICE_PARAM</c>, otherwise 0.
/// </param>
/// <returns>
/// Estimated size in bits including the per-partition parameter fields
/// (4 bits each, 5 when <paramref name="method"/> comes back as 1).
/// The value saturates at <c>uint.MaxValue</c> instead of wrapping around.
/// </returns>
static unsafe uint calc_optimal_rice_params(int porder, int* parm, ulong* sums, uint n, uint pred_order, ref int method)
{
    uint part = (1U << porder);
    int maxK = method > 0 ? 30 : Flake.MAX_RICE_PARAM;

    // First partition: pred_order samples are not part of the residual.
    uint cnt = (n >> porder) - pred_order;
    int k = cnt > 0 ? Math.Min(maxK, BitReader.log2i(sums[0] / cnt)) : 0;
    int realMaxK0 = k;
    // Widen before multiplying so the product cannot wrap in 32 bits.
    ulong all_bits = (ulong)cnt * ((uint)k + 1U) + (sums[0] >> k);
    parm[0] = k;

    // Remaining partitions all have the full length.
    cnt = (n >> porder);
    for (uint i = 1; i < part; i++)
    {
        k = Math.Min(maxK, BitReader.log2i(sums[i] / cnt));
        realMaxK0 = Math.Max(realMaxK0, k);
        all_bits += (ulong)cnt * ((uint)k + 1U) + (sums[i] >> k);
        parm[i] = k;
    }

    // Only report the wider coding method when a parameter actually needs it.
    method = realMaxK0 > Flake.MAX_RICE_PARAM ? 1 : 0;

    // Saturate: a wrapped (small) value would wrongly look like the best choice to the caller.
    ulong total = all_bits + (4UL + (ulong)method) * part;
    return total > uint.MaxValue ? uint.MaxValue : (uint)total;
}
|
||||
|
||||
static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums)
|
||||
static unsafe void calc_lower_sums(int pmin, int pmax, ulong* sums)
|
||||
{
|
||||
for (int i = pmax - 1; i >= pmin; i--)
|
||||
{
|
||||
@@ -674,12 +673,12 @@ namespace CUETools.Codecs.FLACCL
|
||||
}
|
||||
}
|
||||
|
||||
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
|
||||
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
|
||||
{
|
||||
int parts = (1 << pmax);
|
||||
uint* res = data + pred_order;
|
||||
uint cnt = (n >> pmax) - pred_order;
|
||||
uint sum = 0;
|
||||
ulong sum = 0;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
sum += *(res++);
|
||||
sums[0] = sum;
|
||||
@@ -702,18 +701,18 @@ namespace CUETools.Codecs.FLACCL
|
||||
/// <param name="n"></param>
|
||||
/// <param name="pred_order"></param>
|
||||
/// <param name="sums"></param>
|
||||
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
|
||||
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
|
||||
{
|
||||
int parts = (1 << pmax);
|
||||
uint* res = data + pred_order;
|
||||
uint cnt = 18 - pred_order;
|
||||
uint sum = 0;
|
||||
ulong sum = 0UL;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
sum += *(res++);
|
||||
sums[0] = sum;
|
||||
for (int i = 1; i < parts; i++)
|
||||
{
|
||||
sums[i] =
|
||||
sums[i] = 0UL +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
@@ -731,18 +730,18 @@ namespace CUETools.Codecs.FLACCL
|
||||
/// <param name="n"></param>
|
||||
/// <param name="pred_order"></param>
|
||||
/// <param name="sums"></param>
|
||||
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
|
||||
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
|
||||
{
|
||||
int parts = (1 << pmax);
|
||||
uint* res = data + pred_order;
|
||||
uint cnt = 16 - pred_order;
|
||||
uint sum = 0;
|
||||
ulong sum = 0UL;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
sum += *(res++);
|
||||
sums[0] = sum;
|
||||
for (int i = 1; i < parts; i++)
|
||||
{
|
||||
sums[i] =
|
||||
sums[i] = 0UL +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
@@ -750,10 +749,10 @@ namespace CUETools.Codecs.FLACCL
|
||||
}
|
||||
}
|
||||
|
||||
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order)
|
||||
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order, int max_method)
|
||||
{
|
||||
uint* udata = stackalloc uint[(int)n];
|
||||
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
|
||||
ulong* sums = stackalloc ulong[(pmax + 1) * Flake.MAX_PARTITIONS];
|
||||
int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS];
|
||||
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
|
||||
|
||||
@@ -776,17 +775,21 @@ namespace CUETools.Codecs.FLACCL
|
||||
|
||||
uint opt_bits = AudioSamples.UINT32_MAX;
|
||||
int opt_porder = pmin;
|
||||
int opt_method = 0;
|
||||
for (int i = pmin; i <= pmax; i++)
|
||||
{
|
||||
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order);
|
||||
int method = max_method;
|
||||
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order, ref method);
|
||||
if (bits <= opt_bits)
|
||||
{
|
||||
opt_bits = bits;
|
||||
opt_porder = i;
|
||||
opt_method = method;
|
||||
}
|
||||
}
|
||||
|
||||
rc.porder = opt_porder;
|
||||
rc.coding_method = opt_method;
|
||||
fixed (int* rparms = rc.rparams)
|
||||
AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder));
|
||||
|
||||
@@ -845,8 +848,8 @@ namespace CUETools.Codecs.FLACCL
|
||||
for (int i = pos; i < pos + cnt; i++)
|
||||
{
|
||||
int v = sub.best.residual[i];
|
||||
v = (v << 1) ^ (v >> 31);
|
||||
q += (v >> k);
|
||||
uint uv = (uint)((v << 1) ^ (v >> 31));
|
||||
q += (int)(uv >> k);
|
||||
}
|
||||
return (k + 1) * cnt + q;
|
||||
}
|
||||
@@ -857,7 +860,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
int porder = sub.best.rc.porder;
|
||||
int psize = frame.blocksize >> porder;
|
||||
//assert(porder >= 0);
|
||||
int size = 6 + (4 << porder);
|
||||
int size = 6 + ((4 + sub.best.rc.coding_method) << porder);
|
||||
size += measure_residual(frame, sub, sub.best.order, psize - sub.best.order, sub.best.rc.rparams[0]);
|
||||
// residual
|
||||
for (int p = 1; p < (1 << porder); p++)
|
||||
@@ -870,13 +873,13 @@ namespace CUETools.Codecs.FLACCL
|
||||
FlacFrame frame = task.frame;
|
||||
|
||||
// rice-encoded block
|
||||
frame.writer.writebits(2, 0);
|
||||
frame.writer.writebits(2, sub.best.rc.coding_method);
|
||||
// partition order
|
||||
int porder = sub.best.rc.porder;
|
||||
//assert(porder >= 0);
|
||||
frame.writer.writebits(4, porder);
|
||||
|
||||
if (_settings.GPUOnly && _settings.DoRice)
|
||||
if (task.UseGPURice)
|
||||
{
|
||||
int len = task.BestResidualTasks[index].size - task.BestResidualTasks[index].headerLen;
|
||||
int pos = task.BestResidualTasks[index].encodingOffset;
|
||||
@@ -901,7 +904,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
for (int p = 0; p < (1 << porder); p++)
|
||||
{
|
||||
int k = sub.best.rc.rparams[p];
|
||||
frame.writer.writebits(4, k);
|
||||
frame.writer.writebits(4 + sub.best.rc.coding_method, k);
|
||||
if (p == 1) res_cnt = psize;
|
||||
int cnt = Math.Min(res_cnt, frame.blocksize - j);
|
||||
frame.writer.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt);
|
||||
@@ -1069,7 +1072,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
calculate_window(task, lpc.window_bartlett, WindowFunction.Bartlett);
|
||||
if (task.nWindowFunctions == 0)
|
||||
throw new Exception("invalid windowfunction");
|
||||
if (!_settings.MappedMemory)
|
||||
if (!task.UseMappedMemory)
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, false, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr);
|
||||
}
|
||||
|
||||
@@ -1116,6 +1119,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
|
||||
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
|
||||
task.ResidualTasks[task.nResidualTasks].wbits = 0;
|
||||
task.ResidualTasks[task.nResidualTasks].coding_method = PCM.BitsPerSample > 16 ? 1 : 0;
|
||||
task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize;
|
||||
task.nResidualTasks++;
|
||||
}
|
||||
@@ -1131,6 +1135,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
|
||||
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
|
||||
task.ResidualTasks[task.nResidualTasks].wbits = 0;
|
||||
task.ResidualTasks[task.nResidualTasks].coding_method = PCM.BitsPerSample > 16 ? 1 : 0;
|
||||
task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize;
|
||||
task.ResidualTasks[task.nResidualTasks].residualOrder = 1;
|
||||
task.ResidualTasks[task.nResidualTasks].shift = 0;
|
||||
@@ -1149,6 +1154,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
|
||||
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
|
||||
task.ResidualTasks[task.nResidualTasks].wbits = 0;
|
||||
task.ResidualTasks[task.nResidualTasks].coding_method = PCM.BitsPerSample > 16 ? 1 : 0;
|
||||
task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize;
|
||||
task.ResidualTasks[task.nResidualTasks].shift = 0;
|
||||
switch (order)
|
||||
@@ -1195,10 +1201,11 @@ namespace CUETools.Codecs.FLACCL
|
||||
if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen)
|
||||
throw new Exception("oops");
|
||||
|
||||
if (!_settings.MappedMemory)
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr);
|
||||
if (!_settings.MappedMemory)
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, false, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr);
|
||||
if (!task.UseMappedMemory)
|
||||
{
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr);
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, false, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr);
|
||||
}
|
||||
}
|
||||
|
||||
unsafe void encode_residual(FLACCLTask task)
|
||||
@@ -1215,7 +1222,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
|
||||
break;
|
||||
case SubframeType.Fixed:
|
||||
if (!_settings.GPUOnly)
|
||||
if (!task.UseGPUOnly)
|
||||
{
|
||||
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
|
||||
encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples,
|
||||
@@ -1224,7 +1231,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
|
||||
int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
|
||||
uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 6;
|
||||
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order);
|
||||
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0);
|
||||
}
|
||||
break;
|
||||
case SubframeType.LPC:
|
||||
@@ -1236,7 +1243,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
|
||||
#if DEBUG
|
||||
// check size
|
||||
if (_settings.GPUOnly && !_settings.DoRice)
|
||||
if (task.UseGPUOnly && !task.UseGPURice)
|
||||
{
|
||||
uint real_size = measure_subframe(task.frame, task.frame.subframes[ch]);
|
||||
if (real_size != task.frame.subframes[ch].best.size)
|
||||
@@ -1244,9 +1251,9 @@ namespace CUETools.Codecs.FLACCL
|
||||
}
|
||||
#endif
|
||||
|
||||
if (((csum << task.frame.subframes[ch].obits) >= 1UL << 32) || !_settings.GPUOnly)
|
||||
if ((((csum << task.frame.subframes[ch].obits) >= 1UL << 32) && PCM.BitsPerSample == 16) || !task.UseGPUOnly)
|
||||
{
|
||||
if (_settings.GPUOnly && _settings.DoRice)
|
||||
if (task.UseGPURice)
|
||||
#if DEBUG
|
||||
// throw new Exception("DoRice failed");
|
||||
break;
|
||||
@@ -1266,11 +1273,11 @@ namespace CUETools.Codecs.FLACCL
|
||||
RiceContext rc1 = task.frame.subframes[ch].best.rc;
|
||||
task.frame.subframes[ch].best.rc = new RiceContext();
|
||||
#endif
|
||||
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order);
|
||||
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0);
|
||||
task.frame.subframes[ch].best.size = measure_subframe(task.frame, task.frame.subframes[ch]);
|
||||
#if KJHKJH
|
||||
// check size
|
||||
if (_settings.GPUOnly && oldsize > task.frame.subframes[ch].best.size)
|
||||
if (task.UseGPUOnly && oldsize > task.frame.subframes[ch].best.size)
|
||||
throw new Exception("unoptimal size reported");
|
||||
#endif
|
||||
//if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize &&
|
||||
@@ -1337,8 +1344,9 @@ namespace CUETools.Codecs.FLACCL
|
||||
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
|
||||
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
|
||||
frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
|
||||
if (_settings.GPUOnly && !_settings.DoRice && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
|
||||
//if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
|
||||
frame.subframes[ch].best.rc.coding_method = task.BestResidualTasks[index].coding_method;
|
||||
if (task.UseGPUOnly && !task.UseGPURice && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
|
||||
//if (task.UseGPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
|
||||
{
|
||||
int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder);
|
||||
fixed (int* dstParams = frame.subframes[ch].best.rc.rparams)
|
||||
@@ -1352,7 +1360,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
}
|
||||
else
|
||||
{
|
||||
if (_settings.GPUOnly && _settings.DoRice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size)
|
||||
if (task.UseGPURice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size)
|
||||
throw new Exception("size reported incorrectly");
|
||||
}
|
||||
}
|
||||
@@ -1369,10 +1377,9 @@ namespace CUETools.Codecs.FLACCL
|
||||
/// </summary>
|
||||
/// <param name="task"></param>
|
||||
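/// <param name="srcptr">Start of the frame's channel-interleaved 16-bit samples.</param>
/// <param name="count">Number of samples per channel to unpack.</param>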
|
||||
unsafe void unpack_samples(FLACCLTask task, int count)
|
||||
unsafe void unpack_samples_16(FLACCLTask task, byte * srcptr, int count)
|
||||
{
|
||||
int iFrame = task.frame.frame_number;
|
||||
short* src = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize;
|
||||
short* src = (short*)srcptr;
|
||||
|
||||
switch (task.frame.ch_mode)
|
||||
{
|
||||
@@ -1382,7 +1389,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
int* s = task.frame.subframes[ch].samples;
|
||||
int wbits = (int)task.frame.subframes[ch].wbits;
|
||||
for (int i = 0; i < count; i++)
|
||||
s[i] = src[i * channels + ch] >>= wbits;
|
||||
s[i] = src[i * channels + ch] >> wbits;
|
||||
}
|
||||
break;
|
||||
case ChannelMode.LeftRight:
|
||||
@@ -1448,6 +1455,108 @@ namespace CUETools.Codecs.FLACCL
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Copy channel-interleaved 24-bit input samples into separate subframes,
/// applying the channel decorrelation selected by <c>task.frame.ch_mode</c>
/// and shifting out each subframe's <c>wbits</c> low-order bits.
/// </summary>
/// <param name="task">Task whose current frame's subframes receive the samples.</param>
/// <param name="srcptr">Start of the frame's interleaved samples; each sample is 3 bytes, least significant byte first.</param>
/// <param name="count">Number of samples per channel to unpack.</param>
unsafe void unpack_samples_24(FLACCLTask task, byte* srcptr, int count)
{
    switch (task.frame.ch_mode)
    {
        case ChannelMode.NotStereo:
            for (int ch = 0; ch < channels; ch++)
            {
                int* s = task.frame.subframes[ch].samples;
                int wbits = (int)task.frame.subframes[ch].wbits;
                byte* src = srcptr + ch * 3;
                for (int i = 0; i < count; i++)
                {
                    // One shift both sign-extends the sample and drops the wasted bits.
                    s[i] = load_int24_msb_aligned(src) >> (8 + wbits);
                    src += PCM.BlockAlign;
                }
            }
            break;
        case ChannelMode.LeftRight:
            {
                int* left = task.frame.subframes[0].samples;
                int* right = task.frame.subframes[1].samples;
                int lwbits = (int)task.frame.subframes[0].wbits;
                int rwbits = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = load_int24_msb_aligned(srcptr) >> 8;
                    int r = load_int24_msb_aligned(srcptr + 3) >> 8;
                    srcptr += 6;
                    left[i] = l >> lwbits;
                    right[i] = r >> rwbits;
                }
                break;
            }
        case ChannelMode.LeftSide:
            {
                int* left = task.frame.subframes[0].samples;
                int* right = task.frame.subframes[1].samples;
                int lwbits = (int)task.frame.subframes[0].wbits;
                int rwbits = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = load_int24_msb_aligned(srcptr) >> 8;
                    int r = load_int24_msb_aligned(srcptr + 3) >> 8;
                    srcptr += 6;
                    left[i] = l >> lwbits;
                    right[i] = (l - r) >> rwbits;
                }
                break;
            }
        case ChannelMode.RightSide:
            {
                int* left = task.frame.subframes[0].samples;
                int* right = task.frame.subframes[1].samples;
                int lwbits = (int)task.frame.subframes[0].wbits;
                int rwbits = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = load_int24_msb_aligned(srcptr) >> 8;
                    int r = load_int24_msb_aligned(srcptr + 3) >> 8;
                    srcptr += 6;
                    left[i] = (l - r) >> lwbits;
                    right[i] = r >> rwbits;
                }
                break;
            }
        case ChannelMode.MidSide:
            {
                int* left = task.frame.subframes[0].samples;
                int* right = task.frame.subframes[1].samples;
                int lwbits = (int)task.frame.subframes[0].wbits;
                int rwbits = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = load_int24_msb_aligned(srcptr) >> 8;
                    int r = load_int24_msb_aligned(srcptr + 3) >> 8;
                    srcptr += 6;
                    // Subframe 0 gets the halved sum, subframe 1 the difference.
                    left[i] = (l + r) >> (1 + lwbits);
                    right[i] = (l - r) >> rwbits;
                }
                break;
            }
    }
}

/// <summary>
/// Loads three little-endian bytes into bits 8..31 of an int, so that a following
/// arithmetic right shift by 8 (or more) yields the sign-extended 24-bit sample.
/// </summary>
/// <param name="p">Pointer to the sample's least significant byte.</param>
/// <returns>The sample placed in the upper 24 bits; the low 8 bits are zero.</returns>
static unsafe int load_int24_msb_aligned(byte* p)
{
    return (p[0] << 8) | (p[1] << 16) | (p[2] << 24);
}
|
||||
|
||||
/// <summary>
/// Unpack the current frame's channel-interleaved input samples into separate
/// subframes, dispatching on the input sample width (16 or 24 bits).
/// </summary>
/// <param name="task">Task whose current frame is unpacked.</param>
/// <param name="count">Number of samples per channel to unpack.</param>
unsafe void unpack_samples(FLACCLTask task, int count)
{
    // Byte offset of this frame inside the task's sample buffer.
    int frameOffset = task.frame.frame_number * task.frameSize * PCM.BlockAlign;
    byte* frameStart = ((byte*)task.clSamplesBytesPtr) + frameOffset;

    switch (PCM.BitsPerSample)
    {
        case 16:
            unpack_samples_16(task, frameStart, count);
            break;
        case 24:
            unpack_samples_24(task, frameStart, count);
            break;
        default:
            throw new Exception("Invalid BPS");
    }
}
|
||||
|
||||
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FLACCLTask task, int current_frame_number)
|
||||
{
|
||||
task.frame.InitSize(task.frameSize, eparams.variable_block_size != 0);
|
||||
@@ -1492,8 +1601,8 @@ namespace CUETools.Codecs.FLACCL
|
||||
task.framePos = frame_pos;
|
||||
frame_count += nFrames;
|
||||
frame_pos += nFrames * blocksize;
|
||||
if (!_settings.MappedMemory)
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, sizeof(short) * channels * blocksize * nFrames, task.clSamplesBytesPtr);
|
||||
if (!task.UseMappedMemory)
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, PCM.BlockAlign * blocksize * nFrames, task.clSamplesBytesPtr);
|
||||
//task.openCLCQ.EnqueueUnmapMemObject(task.clSamplesBytes, task.clSamplesBytes.HostPtr);
|
||||
//task.openCLCQ.EnqueueMapBuffer(task.clSamplesBytes, true, MapFlags.WRITE, 0, task.samplesBufferLen / 2);
|
||||
}
|
||||
@@ -1530,20 +1639,38 @@ namespace CUETools.Codecs.FLACCL
|
||||
{
|
||||
int decoded = task.verify.DecodeFrame(task.frame.writer.Buffer, task.frame.writer_offset, fs);
|
||||
if (decoded != fs || task.verify.Remaining != task.frameSize)
|
||||
throw new Exception("validation failed! frame size mismatch");
|
||||
throw new Exception(string.Format("validation failed! frame size mismatch, iFrame={0}, decoded=={1}, fs=={2}", fn, decoded, fs));
|
||||
fixed (int* r = task.verify.Samples)
|
||||
{
|
||||
for (int ch = 0; ch < channels; ch++)
|
||||
{
|
||||
short* res = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize + ch;
|
||||
byte* res = ((byte*)task.clSamplesBytesPtr) + PCM.BlockAlign * iFrame * task.frameSize + ch * (PCM.BlockAlign / channels);
|
||||
int* smp = r + ch * Flake.MAX_BLOCKSIZE;
|
||||
for (int i = task.frameSize; i > 0; i--)
|
||||
int ba = PCM.BlockAlign;
|
||||
if (PCM.BitsPerSample == 16)
|
||||
{
|
||||
//if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
|
||||
if (*res != *(smp++))
|
||||
throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", iFrame, ch));
|
||||
res += channels;
|
||||
for (int i = task.frameSize; i > 0; i--)
|
||||
{
|
||||
//if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
|
||||
int ress = *(short*)res;
|
||||
if (ress != *(smp++))
|
||||
throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", fn, ch));
|
||||
res += ba;
|
||||
}
|
||||
}
|
||||
else if (PCM.BitsPerSample == 24)
|
||||
{
|
||||
for (int i = task.frameSize; i > 0; i--)
|
||||
{
|
||||
//if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
|
||||
int ress = (((int)res[0] << 8) + ((int)res[1] << 16) + ((int)res[2] << 24)) >> (8);
|
||||
if (ress != *(smp++))
|
||||
throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", iFrame, ch));
|
||||
res += ba;
|
||||
}
|
||||
}
|
||||
else
|
||||
throw new Exception("Invalid BPS");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1644,10 +1771,21 @@ namespace CUETools.Codecs.FLACCL
|
||||
}
|
||||
OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
|
||||
|
||||
this.framesPerTask = (int)OCLMan.Context.Devices[0].MaxComputeUnits * _settings.TaskSize;
|
||||
this.framesPerTask = (int)OCLMan.Context.Devices[0].MaxComputeUnits * Math.Max(1, _settings.TaskSize / channels);
|
||||
|
||||
if (!OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics"))
|
||||
_settings.GPUOnly = false;
|
||||
bool UseGPUOnly = _settings.GPUOnly && OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics");
|
||||
bool UseGPURice = UseGPUOnly && _settings.DoRice;
|
||||
|
||||
if (_blocksize == 0)
|
||||
{
|
||||
if (eparams.block_size == 0)
|
||||
eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms);
|
||||
_blocksize = eparams.block_size;
|
||||
}
|
||||
else
|
||||
eparams.block_size = _blocksize;
|
||||
|
||||
int maxBS = 1 << (BitReader.log2i(eparams.block_size - 1) + 1);
|
||||
|
||||
// The Defines string gets prepended to any and all sources that are compiled
|
||||
// and serve as a convenient way to pass configuration information to the compilation process
|
||||
@@ -1655,8 +1793,11 @@ namespace CUETools.Codecs.FLACCL
|
||||
"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
|
||||
"#define GROUP_SIZE " + groupSize.ToString() + "\n" +
|
||||
"#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
|
||||
(_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") +
|
||||
(_settings.DoRice ? "#define DO_RICE\n" : "") +
|
||||
(UseGPUOnly ? "#define DO_PARTITIONS\n" : "") +
|
||||
(UseGPURice ? "#define DO_RICE\n" : "") +
|
||||
"#define BITS_PER_SAMPLE " + PCM.BitsPerSample + "\n" +
|
||||
"#define MAX_BLOCKSIZE " + maxBS + "\n" +
|
||||
"#define MAX_CHANNELS " + PCM.ChannelCount + "\n" +
|
||||
#if DEBUG
|
||||
"#define DEBUG\n" +
|
||||
#endif
|
||||
@@ -1718,13 +1859,13 @@ namespace CUETools.Codecs.FLACCL
|
||||
if (_IO.CanSeek)
|
||||
first_frame_offset = _IO.Position;
|
||||
|
||||
task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize);
|
||||
task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize);
|
||||
task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice);
|
||||
task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice);
|
||||
if (_settings.CPUThreads > 0)
|
||||
{
|
||||
cpu_tasks = new FLACCLTask[_settings.CPUThreads];
|
||||
for (int i = 0; i < cpu_tasks.Length; i++)
|
||||
cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize);
|
||||
cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice);
|
||||
}
|
||||
inited = true;
|
||||
}
|
||||
@@ -1823,10 +1964,10 @@ namespace CUETools.Codecs.FLACCL
|
||||
|
||||
public unsafe void do_output_frames(int nFrames)
|
||||
{
|
||||
send_to_GPU(task1, nFrames, eparams.block_size);
|
||||
run_GPU_task(task1);
|
||||
if (task2.frameCount > 0)
|
||||
task2.openCLCQ.Finish();
|
||||
send_to_GPU(task1, nFrames, eparams.block_size);
|
||||
run_GPU_task(task1);
|
||||
if (task2.frameCount > 0)
|
||||
{
|
||||
if (cpu_tasks != null)
|
||||
@@ -1871,15 +2012,16 @@ namespace CUETools.Codecs.FLACCL
|
||||
{
|
||||
int blocksize = Flake.flac_blocksizes[1];
|
||||
int target = (samplerate * time_ms) / 1000;
|
||||
if (eparams.variable_block_size > 0)
|
||||
{
|
||||
blocksize = 1024;
|
||||
while (target >= blocksize)
|
||||
blocksize <<= 1;
|
||||
return blocksize >> 1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < Flake.flac_blocksizes.Length; i++)
|
||||
////if (eparams.variable_block_size > 0)
|
||||
////{
|
||||
//// blocksize = 1024;
|
||||
//// while (target >= blocksize)
|
||||
//// blocksize <<= 1;
|
||||
//// return blocksize >> 1;
|
||||
////}
|
||||
|
||||
for (int i = 8; i < Flake.flac_blocksizes.Length; i++)
|
||||
if (target >= Flake.flac_blocksizes[i] && Flake.flac_blocksizes[i] > blocksize)
|
||||
{
|
||||
blocksize = Flake.flac_blocksizes[i];
|
||||
@@ -2052,18 +2194,6 @@ namespace CUETools.Codecs.FLACCL
|
||||
}
|
||||
if (i == 8)
|
||||
throw new Exception("non-standard bps");
|
||||
// FIXME: For now, only 16-bit encoding is supported
|
||||
if (bits_per_sample != 16)
|
||||
throw new Exception("non-standard bps");
|
||||
|
||||
if (_blocksize == 0)
|
||||
{
|
||||
if (eparams.block_size == 0)
|
||||
eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms);
|
||||
_blocksize = eparams.block_size;
|
||||
}
|
||||
else
|
||||
eparams.block_size = _blocksize;
|
||||
|
||||
// set maximum encoded frame size (if larger, re-encodes in verbatim mode)
|
||||
if (channels == 2)
|
||||
@@ -2332,7 +2462,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
public int type;
|
||||
public int obits;
|
||||
public int blocksize;
|
||||
public int best_index;
|
||||
public int coding_method;
|
||||
public int channel;
|
||||
public int residualOffs;
|
||||
public int wbits;
|
||||
@@ -2350,6 +2480,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
public Kernel clStereoDecorr;
|
||||
//public Kernel cudaChannelDecorr;
|
||||
public Kernel clChannelDecorr2;
|
||||
public Kernel clChannelDecorrX;
|
||||
public Kernel clFindWastedBits;
|
||||
public Kernel clComputeAutocor;
|
||||
public Kernel clComputeLPC;
|
||||
@@ -2428,9 +2559,15 @@ namespace CUETools.Codecs.FLACCL
|
||||
public int groupSize = 128;
|
||||
public int channels, channelsCount;
|
||||
public FLACCLWriter writer;
|
||||
public bool UseGPUOnly = false;
|
||||
public bool UseGPURice = false;
|
||||
public bool UseMappedMemory = false;
|
||||
|
||||
unsafe public FLACCLTask(Program _openCLProgram, int channelsCount, int channels, uint bits_per_sample, int max_frame_size, FLACCLWriter writer, int groupSize)
|
||||
unsafe public FLACCLTask(Program _openCLProgram, int channelsCount, int channels, uint bits_per_sample, int max_frame_size, FLACCLWriter writer, int groupSize, bool gpuOnly, bool gpuRice)
|
||||
{
|
||||
this.UseGPUOnly = gpuOnly;
|
||||
this.UseGPURice = gpuOnly && gpuRice;
|
||||
this.UseMappedMemory = writer._settings.MappedMemory || writer._settings.DeviceType == OpenCLDeviceType.CPU;
|
||||
this.groupSize = groupSize;
|
||||
this.channels = channels;
|
||||
this.channelsCount = channelsCount;
|
||||
@@ -2448,9 +2585,9 @@ namespace CUETools.Codecs.FLACCL
|
||||
int MAX_CHANNELSIZE = MAX_FRAMES * writer.eparams.block_size;
|
||||
residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES;
|
||||
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES;
|
||||
int samplesBufferLen = sizeof(int) * MAX_CHANNELSIZE * channelsCount;
|
||||
int samplesBufferLen = writer.PCM.BlockAlign * MAX_CHANNELSIZE * channelsCount;
|
||||
int residualBufferLen = sizeof(int) * MAX_CHANNELSIZE * channels; // need to adjust residualOffset?
|
||||
int partitionsLen = sizeof(int) * (30 << 8) * channels * MAX_FRAMES;
|
||||
int partitionsLen = sizeof(int) * ((writer.PCM.BitsPerSample > 16 ? 31 : 15) * 2 << 8) * channels * MAX_FRAMES;
|
||||
int riceParamsLen = sizeof(int) * (4 << 8) * channels * MAX_FRAMES;
|
||||
int autocorLen = sizeof(float) * (MAX_ORDER + 1) * lpc.MAX_LPC_WINDOWS * channelsCount * MAX_FRAMES;
|
||||
int lpcDataLen = autocorLen * 32;
|
||||
@@ -2459,7 +2596,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
int selectedLen = sizeof(int) * 32 * channelsCount * MAX_FRAMES;
|
||||
int riceLen = sizeof(int) * channels * MAX_CHANNELSIZE;
|
||||
|
||||
if (!writer._settings.MappedMemory)
|
||||
if (!this.UseMappedMemory)
|
||||
{
|
||||
clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen / 2);
|
||||
clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualBufferLen);
|
||||
@@ -2521,7 +2658,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
clAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, autocorLen);
|
||||
clSelectedTasksSecondEstimate = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen);
|
||||
clSelectedTasksBestMethod = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen);
|
||||
if (writer._settings.GPUOnly)
|
||||
if (UseGPUOnly)
|
||||
{
|
||||
clPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen);
|
||||
clRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen);
|
||||
@@ -2533,6 +2670,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
clStereoDecorr = openCLProgram.CreateKernel("clStereoDecorr");
|
||||
//cudaChannelDecorr = openCLProgram.CreateKernel("clChannelDecorr");
|
||||
clChannelDecorr2 = openCLProgram.CreateKernel("clChannelDecorr2");
|
||||
clChannelDecorrX = openCLProgram.CreateKernel("clChannelDecorrX");
|
||||
clFindWastedBits = openCLProgram.CreateKernel("clFindWastedBits");
|
||||
clComputeLPC = openCLProgram.CreateKernel("clComputeLPC");
|
||||
clQuantizeLPC = openCLProgram.CreateKernel("clQuantizeLPC");
|
||||
@@ -2540,15 +2678,16 @@ namespace CUETools.Codecs.FLACCL
|
||||
clSelectStereoTasks = openCLProgram.CreateKernel("clSelectStereoTasks");
|
||||
clEstimateResidual = openCLProgram.CreateKernel("clEstimateResidual");
|
||||
clChooseBestMethod = openCLProgram.CreateKernel("clChooseBestMethod");
|
||||
if (writer._settings.GPUOnly)
|
||||
if (UseGPUOnly)
|
||||
{
|
||||
clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
|
||||
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
|
||||
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
|
||||
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
|
||||
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
|
||||
clSumPartition = openCLProgram.CreateKernel("clSumPartition");
|
||||
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
|
||||
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
|
||||
if (writer._settings.DoRice)
|
||||
if (UseGPURice)
|
||||
{
|
||||
clCalcOutputOffsets = openCLProgram.CreateKernel("clCalcOutputOffsets");
|
||||
clRiceEncoding = openCLProgram.CreateKernel("clRiceEncoding");
|
||||
@@ -2586,6 +2725,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
clStereoDecorr.Dispose();
|
||||
//cudaChannelDecorr.Dispose();
|
||||
clChannelDecorr2.Dispose();
|
||||
clChannelDecorrX.Dispose();
|
||||
clFindWastedBits.Dispose();
|
||||
clComputeLPC.Dispose();
|
||||
clQuantizeLPC.Dispose();
|
||||
@@ -2593,15 +2733,16 @@ namespace CUETools.Codecs.FLACCL
|
||||
clSelectStereoTasks.Dispose();
|
||||
clEstimateResidual.Dispose();
|
||||
clChooseBestMethod.Dispose();
|
||||
if (writer._settings.GPUOnly)
|
||||
if (UseGPUOnly)
|
||||
{
|
||||
clEncodeResidual.Dispose();
|
||||
clCalcPartition.Dispose();
|
||||
clCalcPartition16.Dispose();
|
||||
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
|
||||
clCalcPartition16.Dispose();
|
||||
clSumPartition.Dispose();
|
||||
clFindRiceParameter.Dispose();
|
||||
clFindPartitionOrder.Dispose();
|
||||
if (writer._settings.DoRice)
|
||||
if (UseGPURice)
|
||||
{
|
||||
clCalcOutputOffsets.Dispose();
|
||||
clRiceEncoding.Dispose();
|
||||
@@ -2611,7 +2752,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
clRiceParams.Dispose();
|
||||
}
|
||||
|
||||
if (!writer._settings.MappedMemory)
|
||||
if (!this.UseMappedMemory)
|
||||
{
|
||||
if (clSamplesBytesPtr != IntPtr.Zero)
|
||||
openCLCQ.EnqueueUnmapMemObject(clSamplesBytesPinned, clSamplesBytesPtr);
|
||||
@@ -2701,19 +2842,36 @@ namespace CUETools.Codecs.FLACCL
|
||||
while ((frameSize >> max_porder) < 16 && max_porder > 0)
|
||||
this.max_porder--;
|
||||
|
||||
if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel
|
||||
Kernel clChannelDecorr = channels == 2 ? (channelsCount == 4 ? clStereoDecorr : clChannelDecorr2) : null;// cudaChannelDecorr;
|
||||
|
||||
// openCLCQ.EnqueueMapBuffer(cudaSamplesBytes
|
||||
//openCLCQ.EnqueueUnmapMemObject(cudaSamplesBytes, cudaSamplesBytes.HostPtr);
|
||||
|
||||
// issue work to the GPU
|
||||
clChannelDecorr.SetArgs(
|
||||
clSamples,
|
||||
clSamplesBytes,
|
||||
channelSize / 4);
|
||||
if (channels == 2)
|
||||
{
|
||||
Kernel clChannelDecorr = channelsCount == 4 ? clStereoDecorr : clChannelDecorr2;
|
||||
int channelSize1 = writer.PCM.BitsPerSample == 16 ? channelSize / 4 : channelSize;
|
||||
clChannelDecorr.SetArgs(
|
||||
clSamples,
|
||||
clSamplesBytes,
|
||||
channelSize1);
|
||||
|
||||
openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, channelSize / 4);
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clChannelDecorr,
|
||||
0,
|
||||
channelSize1);
|
||||
}
|
||||
else
|
||||
{
|
||||
clChannelDecorrX.SetArgs(
|
||||
clSamples,
|
||||
clSamplesBytes,
|
||||
channelSize);
|
||||
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clChannelDecorrX,
|
||||
0,
|
||||
channelSize);
|
||||
}
|
||||
//openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, (frameSize * frameCount + 3) / 4);
|
||||
|
||||
if (eparams.do_wasted)
|
||||
@@ -2842,14 +3000,22 @@ namespace CUETools.Codecs.FLACCL
|
||||
0, channels * frameCount);
|
||||
}
|
||||
|
||||
if (writer._settings.GPUOnly)
|
||||
if (UseGPUOnly)
|
||||
{
|
||||
if (frameSize >> max_porder == 16)
|
||||
clEncodeResidual.SetArgs(
|
||||
clResidual,
|
||||
clSamples,
|
||||
clBestResidualTasks);
|
||||
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clEncodeResidual,
|
||||
groupSize, channels * frameCount);
|
||||
|
||||
if ((frameSize >> max_porder == 16) && openCLCQ.Device.DeviceType != DeviceType.CPU)
|
||||
{
|
||||
clCalcPartition16.SetArgs(
|
||||
clPartitions,
|
||||
clResidual,
|
||||
clSamples,
|
||||
clBestResidualTasks,
|
||||
max_porder);
|
||||
|
||||
@@ -2859,15 +3025,6 @@ namespace CUETools.Codecs.FLACCL
|
||||
}
|
||||
else
|
||||
{
|
||||
clEncodeResidual.SetArgs(
|
||||
clResidual,
|
||||
clSamples,
|
||||
clBestResidualTasks);
|
||||
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clEncodeResidual,
|
||||
groupSize, channels * frameCount);
|
||||
|
||||
clCalcPartition.SetArgs(
|
||||
clPartitions,
|
||||
clResidual,
|
||||
@@ -2895,6 +3052,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
clPartitions,
|
||||
max_porder);
|
||||
|
||||
int maxK = writer.PCM.BitsPerSample > 16 ? 30 : Flake.MAX_RICE_PARAM;
|
||||
if (openCLCQ.Device.DeviceType == DeviceType.CPU)
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clSumPartition,
|
||||
@@ -2904,7 +3062,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clSumPartition,
|
||||
128, 1,
|
||||
(Flake.MAX_RICE_PARAM + 1),
|
||||
(maxK + 1),
|
||||
channels * frameCount);
|
||||
}
|
||||
|
||||
@@ -2931,7 +3089,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
groupSize,
|
||||
channels * frameCount);
|
||||
|
||||
if (writer._settings.DoRice)
|
||||
if (UseGPURice)
|
||||
{
|
||||
clCalcOutputOffsets.SetArgs(
|
||||
clResidual,
|
||||
@@ -2960,10 +3118,10 @@ namespace CUETools.Codecs.FLACCL
|
||||
channels * frameCount);
|
||||
}
|
||||
|
||||
if (!writer._settings.MappedMemory)
|
||||
if (!this.UseMappedMemory)
|
||||
{
|
||||
if (writer._settings.DoRice)
|
||||
openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, (channels * frameSize * 17 + 128) / 8 * frameCount, clRiceOutputPtr);
|
||||
if (UseGPURice)
|
||||
openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, (channels * frameSize * (writer.PCM.BitsPerSample + 1) + 256) / 8 * frameCount, clRiceOutputPtr);
|
||||
else
|
||||
{
|
||||
openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParamsPtr);
|
||||
@@ -2971,7 +3129,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!writer._settings.MappedMemory)
|
||||
if (!this.UseMappedMemory)
|
||||
openCLCQ.EnqueueReadBuffer(clBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * frameCount, clBestResidualTasksPtr);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,6 +65,14 @@
|
||||
|
||||
#define WARP_SIZE 32
|
||||
|
||||
// Rice-coding limits selected at compile time from BITS_PER_SAMPLE.
// MAX_RICE_PARAM is the upper clamp applied to the Rice parameter k and sizes the
// per-k partition tables (MAX_RICE_PARAM + 1 entries); RICE_PARAM_BITS is the
// per-parameter bit cost added to the Fixed/LPC subframe size estimates.
// Samples wider than 16 bits use the larger pair (30 / 5), otherwise 14 / 4.
#if BITS_PER_SAMPLE > 16
|
||||
#define MAX_RICE_PARAM 30
|
||||
#define RICE_PARAM_BITS 5
|
||||
#else
|
||||
#define MAX_RICE_PARAM 14
|
||||
#define RICE_PARAM_BITS 4
|
||||
#endif
|
||||
|
||||
typedef enum
|
||||
{
|
||||
Constant = 0,
|
||||
@@ -83,7 +91,7 @@ typedef struct
|
||||
int type;
|
||||
int obits;
|
||||
int blocksize;
|
||||
int best_index;
|
||||
int coding_method;
|
||||
int channel;
|
||||
int residualOffs;
|
||||
int wbits;
|
||||
@@ -125,6 +133,49 @@ __kernel void clWindowTukey(__global float* window, int windowOffset, float p)
|
||||
}
|
||||
#endif
|
||||
|
||||
#if BITS_PER_SAMPLE > 16
|
||||
// Variant for samples wider than 16 bits: each work-item reads one interleaved
// two-channel frame of packed 3-byte samples from src (6 bytes per frame, least
// significant byte first) and writes four planes into samples, `offset` ints
// apart: the first channel, the second channel, (first + second) >> 1, and
// first - second.
__kernel void clStereoDecorr(
    __global int *samples,
    __global unsigned char *src,
    int offset
)
{
    const int idx = get_global_id(0);
    const int byteBase = idx * 6;

    // Place the three bytes in bits 8..31, then shift right by 8 so the top byte's
    // sign bit is carried down into the result.
    int first = ((int)src[byteBase] << 8)
              | ((int)src[byteBase + 1] << 16)
              | ((int)src[byteBase + 2] << 24);
    first >>= 8;

    int second = ((int)src[byteBase + 3] << 8)
               | ((int)src[byteBase + 4] << 16)
               | ((int)src[byteBase + 5] << 24);
    second >>= 8;

    samples[idx] = first;
    samples[offset + idx] = second;
    samples[offset * 2 + idx] = (first + second) >> 1;
    samples[offset * 3 + idx] = first - second;
}
|
||||
|
||||
// Variant for samples wider than 16 bits: de-interleaves one two-channel frame of
// packed 3-byte samples (6 bytes per frame, least significant byte first) into two
// planes of samples that sit `offset` ints apart. No combined channels are produced.
__kernel void clChannelDecorr2(
    __global int *samples,
    __global unsigned char *src,
    int offset
)
{
    const int idx = get_global_id(0);
    const int byteBase = idx * 6;

    // Bytes are packed into bits 8..31 and shifted back down so the value keeps its sign.
    const int first = (((int)src[byteBase] << 8)
                     | ((int)src[byteBase + 1] << 16)
                     | ((int)src[byteBase + 2] << 24)) >> 8;
    samples[idx] = first;

    const int second = (((int)src[byteBase + 3] << 8)
                      | ((int)src[byteBase + 4] << 16)
                      | ((int)src[byteBase + 5] << 24)) >> 8;
    samples[offset + idx] = second;
}
|
||||
|
||||
// Variant for samples wider than 16 bits and an arbitrary channel count: each
// work-item de-interleaves one frame of MAX_CHANNELS packed 3-byte samples (least
// significant byte first) into MAX_CHANNELS planes of samples, `offset` ints apart.
__kernel void clChannelDecorrX(
    __global int *samples,
    __global unsigned char *src,
    int offset
)
{
    const int idx = get_global_id(0);
    int ch = 0;
    while (ch < MAX_CHANNELS)
    {
        // Byte position of this channel's sample inside the interleaved stream.
        const int byteBase = (idx * MAX_CHANNELS + ch) * 3;
        const int b0 = src[byteBase];
        const int b1 = src[byteBase + 1];
        const int b2 = src[byteBase + 2];
        // Assemble in bits 8..31, then shift down by 8 to keep the sign.
        samples[ch * offset + idx] = ((b0 << 8) | (b1 << 16) | (b2 << 24)) >> 8;
        ch++;
    }
}
|
||||
#else
|
||||
__kernel void clStereoDecorr(
|
||||
__global int4 *samples,
|
||||
__global int4 *src,
|
||||
@@ -153,6 +204,21 @@ __kernel void clChannelDecorr2(
|
||||
samples[offset + pos] = s >> 16;
|
||||
}
|
||||
|
||||
// 16-bit variant for an arbitrary channel count: each work-item copies one
// interleaved frame of MAX_CHANNELS short samples from src into MAX_CHANNELS planes
// of samples that sit `offset` ints apart, widening each value to int.
__kernel void clChannelDecorrX(
    __global int *samples,
    __global short *src,
    int offset
)
{
    const int idx = get_global_id(0);
    const int frameBase = idx * MAX_CHANNELS;
    int ch = 0;
    while (ch < MAX_CHANNELS)
    {
        samples[ch * offset + idx] = src[frameBase + ch];
        ch++;
    }
}
|
||||
#endif
|
||||
|
||||
//__kernel void clChannelDecorr(
|
||||
// int *samples,
|
||||
// short *src,
|
||||
@@ -598,7 +664,11 @@ void clQuantizeLPC(
|
||||
}
|
||||
// choose precision
|
||||
//int cbits = max(3, min(10, 5 + (abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
|
||||
#if BITS_PER_SAMPLE > 16
|
||||
int cbits = max(3, min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits));
|
||||
#else
|
||||
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - abits));
|
||||
#endif
|
||||
// calculate shift based on precision and number of leading zeroes in coeffs
|
||||
int shift = max(0,min(15, clz(tmpi) - 18 + cbits));
|
||||
|
||||
@@ -749,7 +819,11 @@ void clQuantizeLPC(
|
||||
//SUM32(shared.tmpi,tid,|=);
|
||||
// choose precision
|
||||
//int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
|
||||
#if BITS_PER_SAMPLE > 16
|
||||
int cbits = max(3, min(min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), 15));
|
||||
#else
|
||||
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits));
|
||||
#endif
|
||||
// calculate shift based on precision and number of leading zeroes in coeffs
|
||||
int shift = max(0,min(15, clz(shared.maxcoef[i]) - 18 + cbits));
|
||||
|
||||
@@ -797,7 +871,6 @@ void clQuantizeLPC(
|
||||
#endif
|
||||
|
||||
#ifdef FLACCL_CPU
|
||||
|
||||
inline int fastclz(int iv)
|
||||
{
|
||||
unsigned int v = (unsigned int)iv;
|
||||
@@ -809,17 +882,44 @@ inline int fastclz(int iv)
|
||||
x += (0 != (v >> x));
|
||||
return 32 - x;
|
||||
}
|
||||
|
||||
inline int calc_residual(__global int *ptr, int * coefs, int ro)
|
||||
#else
|
||||
// Non-CPU build: leading-zero count is delegated directly to the built-in clz().
// The stray `int sum = 0;` local (unused, left over from another function) is removed.
inline int fastclz(int iv)
{
    return clz(iv);
}
|
||||
#endif
|
||||
// Leading-zero count of a 64-bit value built on the 32-bit fastclz():
// if any of the upper 32 bits is set the answer is the count for the upper word,
// otherwise it is 32 plus the count for the lower word.
inline int fastclz64(long iv)
{
    unsigned long bits = (unsigned long)iv;
    if ((bits >> 32) != 0)
        return fastclz(bits >> 32);
    return 32 + fastclz(bits);
}
|
||||
|
||||
// Width-dependent arithmetic used when computing prediction residuals.
//   residual_t      - accumulator type returned by calc_residual (long above 16 bits, else int)
//   residual_log(s) - index of the highest set bit of s, via the matching clz helper
//   bpsint4 / convert_bps4 - 4-wide vector type for the products and the conversion into it
//   convert_bps_sat - applied to a residual before it is stored as int
// Above 16 bits per sample the wide forms (long, long4, convert_long4, convert_int_sat)
// are used; otherwise the conversion macros expand to nothing and plain int is kept.
#if BITS_PER_SAMPLE > 16
|
||||
typedef long residual_t;
|
||||
#define residual_log(s) (63 - fastclz64(s))
|
||||
#define convert_bps4 convert_long4
|
||||
#define convert_bps_sat convert_int_sat
|
||||
#define bpsint4 long4
|
||||
#else
|
||||
typedef int residual_t;
|
||||
#define residual_log(s) (31 - fastclz(s))
|
||||
#define convert_bps4
|
||||
#define convert_bps_sat
|
||||
#define bpsint4 int4
|
||||
#endif
|
||||
|
||||
#ifdef FLACCL_CPU
|
||||
// Returns the dot product of `ro` consecutive samples starting at ptr with the
// first `ro` entries of coefs, accumulated in residual_t.
// Each product is added exactly once; the superseded un-widened accumulation line
// that was also present in the loop body is dropped, since it doubled the sum.
inline residual_t calc_residual(__global int *ptr, int * coefs, int ro)
{
    residual_t sum = 0;
    for (int i = 0; i < ro; i++)
    {
        // Widen the sample first so the multiplication is carried out in residual_t.
        sum += (residual_t) ptr[i] * coefs[i];
    }
    return sum;
}
|
||||
|
||||
// Expands to a loop over pos in [cro, bs). For each position it forms the residual
//   t = (data[pos] - (prediction >> task.data.shift)) >> task.data.wbits
// where prediction is calc_residual over the cro samples preceding pos, and then
// evaluates `action`, which may read or modify t.
// Expects bs, data and task to be in scope at the expansion site.
// t is declared once, as residual_t; the duplicate `int t` declaration is removed.
#define ENCODE_N(cro,action) for (int pos = cro; pos < bs; pos ++) { \
    residual_t t = (data[pos] - (calc_residual(data + pos - cro, task.coefs, cro) >> task.data.shift)) >> task.data.wbits; \
    action; \
}
|
||||
#define SWITCH_N(action) \
|
||||
@@ -861,7 +961,7 @@ void clEstimateResidual(
|
||||
for (int i = 0; i < 1 << EPO; i++)
|
||||
len[i] = 0;
|
||||
|
||||
#ifdef AMD
|
||||
#if defined(AMD) || BITS_PER_SAMPLE > 16
|
||||
SWITCH_N((t = (t << 1) ^ (t >> 31), len[pos >> (12 - EPO)] += t & 0x7fffff))
|
||||
#else
|
||||
int4 c0 = vload4(0, &task.coefs[0]);
|
||||
@@ -884,21 +984,19 @@ void clEstimateResidual(
|
||||
int total = 0;
|
||||
for (int i = 0; i < 1 << EPO; i++)
|
||||
{
|
||||
int res = min(0x7fffff,len[i]);
|
||||
int k = iclamp(31 - (12 - EPO) - fastclz(res), 0, 14); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
|
||||
int res = len[i];
|
||||
int k = iclamp(31 - fastclz(res) - (12 - EPO), 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
|
||||
total += (k << (12 - EPO)) + (res >> k);
|
||||
}
|
||||
int partLen = min(0x7ffffff, total) + (bs - ro);
|
||||
int obits = task.data.obits - task.data.wbits;
|
||||
tasks[selectedTask].data.size = min(obits * bs,
|
||||
task.data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen :
|
||||
task.data.type == LPC ? ro * obits + 4 + 5 + ro * task.data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
|
||||
task.data.type == Fixed ? ro * obits + 6 + RICE_PARAM_BITS + partLen :
|
||||
task.data.type == LPC ? ro * obits + 4 + 5 + ro * task.data.cbits + 6 + RICE_PARAM_BITS/* << porder */ + partLen :
|
||||
task.data.type == Constant ? obits * select(1, bs, partLen != bs - ro) :
|
||||
obits * bs);
|
||||
}
|
||||
#else
|
||||
|
||||
#define MAX_BLOCKSIZE 4096
|
||||
#define ESTPARTLOG 5
|
||||
|
||||
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||
@@ -1049,7 +1147,7 @@ void clEstimateResidual(
|
||||
// calculate rice partition bit length for every 32 samples
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
// Bug: if (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) > GROUP_SIZE
|
||||
int pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? pl = psum[tid * 2] + psum[tid * 2 + 1] : 0;
|
||||
int pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? psum[tid * 2] + psum[tid * 2 + 1] : 0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
// for (int pos = 0; pos < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2; pos += GROUP_SIZE)
|
||||
// {
|
||||
@@ -1060,7 +1158,7 @@ void clEstimateResidual(
|
||||
//if (offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
|
||||
// psum[offs] = pl;
|
||||
// }
|
||||
int k = iclamp(31 - (ESTPARTLOG + 1) - clz(pl), 0, 14); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32)
|
||||
int k = iclamp(31 - fastclz(pl) - (ESTPARTLOG + 1), 0, MAX_RICE_PARAM); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32)
|
||||
if (tid < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
|
||||
psum[tid] = (k << (ESTPARTLOG + 1)) + (pl >> k);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -1075,8 +1173,8 @@ void clEstimateResidual(
|
||||
int pl = psum[0] + (bs - ro);
|
||||
int obits = task.data.obits - task.data.wbits;
|
||||
int len = min(obits * task.data.blocksize,
|
||||
task.data.type == Fixed ? task.data.residualOrder * obits + 6 + (4 * 1/2) + pl :
|
||||
task.data.type == LPC ? task.data.residualOrder * obits + 4 + 5 + task.data.residualOrder * task.data.cbits + 6 + (4 * 1/2)/* << porder */ + pl :
|
||||
task.data.type == Fixed ? task.data.residualOrder * obits + 6 + RICE_PARAM_BITS + pl :
|
||||
task.data.type == LPC ? task.data.residualOrder * obits + 4 + 5 + task.data.residualOrder * task.data.cbits + 6 + RICE_PARAM_BITS/* << porder */ + pl :
|
||||
task.data.type == Constant ? obits * select(1, task.data.blocksize, pl != task.data.blocksize - task.data.residualOrder) :
|
||||
obits * task.data.blocksize);
|
||||
tasks[selectedTask].data.size = len;
|
||||
@@ -1172,7 +1270,7 @@ void clEncodeResidual(
|
||||
int bs = task.data.blocksize;
|
||||
int ro = task.data.residualOrder;
|
||||
__global int *data = &samples[task.data.samplesOffs];
|
||||
SWITCH_N(residual[task.data.residualOffs + pos] = t);
|
||||
SWITCH_N(residual[task.data.residualOffs + pos] = convert_bps_sat(t));
|
||||
}
|
||||
#else
|
||||
// get_group_id(0) == task index
|
||||
@@ -1198,12 +1296,10 @@ void clEncodeResidual(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
#ifdef AMD
|
||||
int4 cptr0 = vload4(0, &task.coefs[0]);
|
||||
int4 cptr1 = vload4(1, &task.coefs[0]);
|
||||
bpsint4 cptr0 = convert_bps4(vload4(0, &task.coefs[0]));
|
||||
bpsint4 cptr1 = convert_bps4(vload4(1, &task.coefs[0]));
|
||||
#if MAX_ORDER > 8
|
||||
int4 cptr2 = vload4(2, &task.coefs[0]);
|
||||
#endif
|
||||
bpsint4 cptr2 = convert_bps4(vload4(2, &task.coefs[0]));
|
||||
#endif
|
||||
|
||||
data[tid] = 0;
|
||||
@@ -1217,33 +1313,24 @@ void clEncodeResidual(
|
||||
|
||||
// compute residual
|
||||
__local int* dptr = &data[tid + GROUP_SIZE - ro];
|
||||
int4 sum
|
||||
#ifdef AMD
|
||||
= cptr0 * vload4(0, dptr)
|
||||
+ cptr1 * vload4(1, dptr)
|
||||
#else
|
||||
= vload4(0, &task.coefs[0]) * vload4(0, dptr)
|
||||
+ vload4(1, &task.coefs[0]) * vload4(1, dptr)
|
||||
#endif
|
||||
bpsint4 sum
|
||||
= cptr0 * convert_bps4(vload4(0, dptr))
|
||||
+ cptr1 * convert_bps4(vload4(1, dptr))
|
||||
#if MAX_ORDER > 8
|
||||
#ifdef AMD
|
||||
+ cptr2 * vload4(2, dptr)
|
||||
#else
|
||||
+ vload4(2, &task.coefs[0]) * vload4(2, dptr)
|
||||
#endif
|
||||
+ cptr2 * convert_bps4(vload4(2, dptr))
|
||||
#if MAX_ORDER > 12
|
||||
+ vload4(3, &task.coefs[0]) * vload4(3, dptr)
|
||||
+ convert_bps4(vload4(3, &task.coefs[0])) * convert_bps4(vload4(3, dptr))
|
||||
#if MAX_ORDER > 16
|
||||
+ vload4(4, &task.coefs[0]) * vload4(4, dptr)
|
||||
+ vload4(5, &task.coefs[0]) * vload4(5, dptr)
|
||||
+ vload4(6, &task.coefs[0]) * vload4(6, dptr)
|
||||
+ vload4(7, &task.coefs[0]) * vload4(7, dptr)
|
||||
+ convert_bps4(vload4(4, &task.coefs[0])) * convert_bps4(vload4(4, dptr))
|
||||
+ convert_bps4(vload4(5, &task.coefs[0])) * convert_bps4(vload4(5, dptr))
|
||||
+ convert_bps4(vload4(6, &task.coefs[0])) * convert_bps4(vload4(6, dptr))
|
||||
+ convert_bps4(vload4(7, &task.coefs[0])) * convert_bps4(vload4(7, dptr))
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
;
|
||||
if (off >= ro && off < bs)
|
||||
output[task.data.residualOffs + off] = data[tid + GROUP_SIZE] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift);
|
||||
output[task.data.residualOffs + off] = convert_bps_sat(nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift));
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
data[tid] = nextData;
|
||||
@@ -1254,7 +1341,7 @@ void clEncodeResidual(
|
||||
#ifdef FLACCL_CPU
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clCalcPartition(
|
||||
__global int *partition_lengths,
|
||||
__global ulong *partition_lengths,
|
||||
__global int *residual,
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
int max_porder, // <= 8
|
||||
@@ -1265,18 +1352,16 @@ void clCalcPartition(
|
||||
int bs = task.data.blocksize;
|
||||
int ro = task.data.residualOrder;
|
||||
//int psize = bs >> max_porder;
|
||||
__global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
|
||||
__global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
|
||||
|
||||
for (int p = 0; p < (1 << max_porder); p++)
|
||||
pl[p] = 0;
|
||||
pl[p] = 0UL;
|
||||
|
||||
for (int pos = ro; pos < bs; pos ++)
|
||||
{
|
||||
int t = residual[task.data.residualOffs + pos];
|
||||
// overflow protection
|
||||
t = clamp(t, -0x7fffff, 0x7fffff);
|
||||
int s = residual[task.data.residualOffs + pos];
|
||||
// convert to unsigned
|
||||
t = (t << 1) ^ (t >> 31);
|
||||
uint t = (s << 1) ^ (s >> 31);
|
||||
pl[pos / psize] += t;
|
||||
}
|
||||
}
|
||||
@@ -1292,15 +1377,15 @@ void clCalcPartition(
|
||||
int psize // == task.blocksize >> max_porder?
|
||||
)
|
||||
{
|
||||
__local int pl[(GROUP_SIZE / 8)][15];
|
||||
__local uint pl[(GROUP_SIZE / 16)][MAX_RICE_PARAM + 1];
|
||||
__local FLACCLSubframeData task;
|
||||
|
||||
const int tid = get_local_id(0);
|
||||
if (tid < sizeof(task) / sizeof(int))
|
||||
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
|
||||
if (tid < (GROUP_SIZE / 8))
|
||||
if (tid < (GROUP_SIZE / 16))
|
||||
{
|
||||
for (int k = 0; k <= 14; k++)
|
||||
for (int k = 0; k <= MAX_RICE_PARAM; k++)
|
||||
pl[tid][k] = 0;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -1311,14 +1396,14 @@ void clCalcPartition(
|
||||
{
|
||||
// fetch residual
|
||||
int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0;
|
||||
// overflow protection
|
||||
s = iclamp(s, -0x7fffff, 0x7fffff);
|
||||
// convert to unsigned
|
||||
s = (s << 1) ^ (s >> 31);
|
||||
uint t = (s << 1) ^ (s >> 31);
|
||||
// calc number of unary bits for each residual sample with each rice parameter
|
||||
int part = (offs - start) / psize + (tid & 1) * (GROUP_SIZE / 16);
|
||||
for (int k = 0; k <= 14; k++)
|
||||
atom_add(&pl[part][k], s >> k);
|
||||
int part = (offs - start) / psize;
|
||||
// we must ensure that psize * (t >> k) doesn't overflow;
|
||||
// i.e. t < ((1 << 32) >> (log2(psize) - k)) <= (1 << 32) >> (32 - clz(MAX_BLOCKSIZE) - k)
|
||||
for (int k = 0; k <= MAX_RICE_PARAM; k++)
|
||||
atom_add(&pl[part][k], min(t, 0xffffffffU >> max(0, 32 - clz(MAX_BLOCKSIZE) - k)) >> k);
|
||||
//pl[part][k] += s >> k;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -1326,141 +1411,79 @@ void clCalcPartition(
|
||||
int part = get_group_id(0) * (GROUP_SIZE / 16) + tid;
|
||||
if (tid < (GROUP_SIZE / 16) && part < (1 << max_porder))
|
||||
{
|
||||
for (int k = 0; k <= 14; k++)
|
||||
for (int k = 0; k <= MAX_RICE_PARAM; k++)
|
||||
{
|
||||
// output length
|
||||
const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
|
||||
int plen = pl[tid][k] + pl[tid + (GROUP_SIZE / 16)][k];
|
||||
partition_lengths[pos + part] = min(0x7fffff, plen) + (psize - select(0, task.residualOrder, part == 0)) * (k + 1);
|
||||
const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
|
||||
uint plen = pl[tid][k];
|
||||
partition_lengths[pos + part] = min(0x007fffffU, plen) + (uint)(psize - select(0, task.residualOrder, part == 0)) * (k + 1);
|
||||
// if (get_group_id(1) == 0)
|
||||
//printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef FLACCL_CPU
|
||||
// get_group_id(0) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clCalcPartition16(
|
||||
__global int *partition_lengths,
|
||||
__global int *residual,
|
||||
__global int *samples,
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
int max_porder // <= 8
|
||||
)
|
||||
{
|
||||
FLACCLSubframeTask task = tasks[get_global_id(0)];
|
||||
int bs = task.data.blocksize;
|
||||
int ro = task.data.residualOrder;
|
||||
__global int *data = &samples[task.data.samplesOffs];
|
||||
__global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_global_id(0);
|
||||
for (int p = 0; p < (1 << max_porder); p++)
|
||||
pl[p] = 0;
|
||||
__global int *rptr = residual + task.data.residualOffs;
|
||||
SWITCH_N((rptr[pos] = t, pl[pos >> 4] += (t << 1) ^ (t >> 31)));
|
||||
//SWITCH_N((residual[task.data.residualOffs + pos] = t, t = (t << 1) ^ (t >> 31), pl[pos >> 4] += t));
|
||||
}
|
||||
#else
|
||||
// get_group_id(0) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||
void clCalcPartition16(
|
||||
__global int *partition_lengths,
|
||||
__global unsigned int *partition_lengths,
|
||||
__global int *residual,
|
||||
__global int *samples,
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
int max_porder // <= 8
|
||||
)
|
||||
{
|
||||
__local FLACCLSubframeTask task;
|
||||
__local int data[GROUP_SIZE * 2];
|
||||
__local int res[GROUP_SIZE];
|
||||
__local int pl[GROUP_SIZE >> 4][15];
|
||||
__local FLACCLSubframeData task;
|
||||
__local unsigned int res[GROUP_SIZE];
|
||||
__local unsigned int pl[GROUP_SIZE >> 4][MAX_RICE_PARAM + 1];
|
||||
|
||||
const int tid = get_local_id(0);
|
||||
if (tid < sizeof(task) / sizeof(int))
|
||||
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int bs = task.data.blocksize;
|
||||
int ro = task.data.residualOrder;
|
||||
int sh = task.data.shift;
|
||||
|
||||
if (tid >= ro && tid < 32)
|
||||
task.coefs[tid] = 0;
|
||||
|
||||
int k = tid & 15;
|
||||
int x = tid / 16;
|
||||
int bs = task.blocksize;
|
||||
int ro = task.residualOrder;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__global int * rptr = &residual[task.data.residualOffs];
|
||||
__global int * plptr = &partition_lengths[(15 << (max_porder + 1)) * get_group_id(0) + (k << (max_porder + 1))];
|
||||
__local int* dptr = &data[tid + GROUP_SIZE - ro];
|
||||
|
||||
int4 cptr0 = vload4(0, &task.coefs[0]);
|
||||
int4 cptr1 = vload4(1, &task.coefs[0]);
|
||||
int4 cptr2 = vload4(2, &task.coefs[0]);
|
||||
data[tid] = 0;
|
||||
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
||||
{
|
||||
int offs = pos + tid;
|
||||
// fetch samples
|
||||
int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0;
|
||||
data[tid + GROUP_SIZE] = nextData;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// compute residual
|
||||
int4 sum = cptr0 * vload4(0, dptr)
|
||||
#if MAX_ORDER > 4
|
||||
+ cptr1 * vload4(1, dptr)
|
||||
#if MAX_ORDER > 8
|
||||
+ cptr2 * vload4(2, dptr)
|
||||
#if MAX_ORDER > 12
|
||||
+ vload4(3, &task.coefs[0]) * vload4(3, dptr)
|
||||
#if MAX_ORDER > 16
|
||||
+ vload4(4, &task.coefs[0]) * vload4(4, dptr)
|
||||
+ vload4(5, &task.coefs[0]) * vload4(5, dptr)
|
||||
+ vload4(6, &task.coefs[0]) * vload4(6, dptr)
|
||||
+ vload4(7, &task.coefs[0]) * vload4(7, dptr)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
;
|
||||
int s = select(0, nextData - ((sum.x + sum.y + sum.z + sum.w) >> sh), offs >= ro && offs < bs);
|
||||
|
||||
// output residual
|
||||
if (offs < bs)
|
||||
rptr[offs] = s;
|
||||
|
||||
s = iclamp(s, -0x7fffff, 0x7fffff);
|
||||
// fetch residual
|
||||
int s = (offs >= ro && offs < bs) ? residual[task.residualOffs + offs] : 0;
|
||||
// convert to unsigned
|
||||
res[tid] = (s << 1) ^ (s >> 31);
|
||||
|
||||
// for (int k = 0; k < 15; k++) atom_add(&pl[x][k], s >> k);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16)
|
||||
{
|
||||
// calc number of unary bits for each group of 16 residual samples
|
||||
// with each rice parameter.
|
||||
int k = k0 + (tid & 15);
|
||||
int x = tid >> 4;
|
||||
// we must ensure that psize * (t >> k) doesn't overflow;
|
||||
// i.e. t < ((1 << 32) >> (log2(16) - k)) <= (1 << 32) >> (4 - k)
|
||||
uint4 lim = 0xffffffffU >> max(0, 4 - k);
|
||||
__local uint * chunk = &res[x << 4];
|
||||
uint4 rsum = (min(lim,vload4(0,chunk)) >> k) + (min(lim,vload4(1,chunk)) >> k) + (min(lim,vload4(2,chunk)) >> k) + (min(lim,vload4(3,chunk)) >> k);
|
||||
uint rs = rsum.x + rsum.y + rsum.z + rsum.w;
|
||||
|
||||
// We can safely limit length here to 0x007fffffU, not causing length
|
||||
// mismatch, because any such length would cause Verbatim frame anyway.
|
||||
// And this limit protects us from overflows when calculating larger
|
||||
// partitions, as we can have a maximum of 2^8 partitions, resulting
|
||||
// in maximum partition length of 0x7fffffffU + change.
|
||||
if (k <= MAX_RICE_PARAM) pl[x][k] = min(0x007fffffU, rs) + (uint)(16 - select(0, ro, offs < 16)) * (k + 1);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
data[tid] = nextData;
|
||||
|
||||
// calc number of unary bits for each residual sample with each rice parameter
|
||||
__local int * chunk = &res[x << 4];
|
||||
sum = (vload4(0,chunk) >> k) + (vload4(1,chunk) >> k) + (vload4(2,chunk) >> k) + (vload4(3,chunk) >> k);
|
||||
s = sum.x + sum.y + sum.z + sum.w;
|
||||
|
||||
#if 0
|
||||
if (k <= 14 && offs < bs)
|
||||
plptr[offs >> 4] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
|
||||
#else
|
||||
if (k <= 14) pl[x][k] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int k1 = tid >> 3, x1 = tid & 7;
|
||||
if (k1 <= 14 && (pos >> 4) + x1 < (1 << max_porder))
|
||||
partition_lengths[(15 << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1];
|
||||
#endif
|
||||
|
||||
// if (task.data.blocksize == 16 && x == 0 && k <= 14)
|
||||
// printf("[%d] = %d = s:%d + %d * (k:%d + 1), ro=%d, offs=%d, lpos=%d\n", k, partition_lengths[lpos], s, (16 - select(0, ro, offs < 16)), k, ro, offs, lpos);
|
||||
|
||||
for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16)
|
||||
{
|
||||
int k1 = k0 + (tid >> 3), x1 = tid & 7;
|
||||
if (k1 <= MAX_RICE_PARAM && (pos >> 4) + x1 < (1 << max_porder))
|
||||
partition_lengths[((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1];
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -1471,13 +1494,13 @@ void clCalcPartition16(
|
||||
// get_group_id(1) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clSumPartition(
|
||||
__global int* partition_lengths,
|
||||
__global ulong* partition_lengths,
|
||||
int max_porder
|
||||
)
|
||||
{
|
||||
if (get_group_id(0) != 0) // ignore k != 0
|
||||
return;
|
||||
__global int * sums = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
|
||||
__global ulong * sums = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
|
||||
for (int i = max_porder - 1; i >= 0; i--)
|
||||
{
|
||||
for (int j = 0; j < (1 << i); j++)
|
||||
@@ -1496,15 +1519,15 @@ void clSumPartition(
|
||||
// get_group_id(1) == task index
|
||||
__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
|
||||
void clSumPartition(
|
||||
__global int* partition_lengths,
|
||||
__global uint* partition_lengths,
|
||||
int max_porder
|
||||
)
|
||||
{
|
||||
__local int data[256]; // max_porder <= 8, data length <= 1 << 9.
|
||||
const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1));
|
||||
__local uint data[256]; // max_porder <= 8, data length <= 1 << 9.
|
||||
const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1));
|
||||
|
||||
// fetch partition lengths
|
||||
int2 pl = get_local_id(0) * 2 < (1 << max_porder) ? vload2(get_local_id(0),&partition_lengths[pos]) : 0;
|
||||
uint2 pl = get_local_id(0) * 2 < (1 << max_porder) ? vload2(get_local_id(0),&partition_lengths[pos]) : 0;
|
||||
data[get_local_id(0)] = pl.x + pl.y;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
@@ -1512,7 +1535,7 @@ void clSumPartition(
|
||||
int out_pos = (1 << (max_porder - 1)) + get_local_id(0);
|
||||
for (int bs = 1 << (max_porder - 2); bs > 0; bs >>= 1)
|
||||
{
|
||||
if (get_local_id(0) < bs) data[out_pos] = data[in_pos] + data[in_pos + 1];
|
||||
if (get_local_id(0) < bs) data[out_pos] = data[in_pos] + data[in_pos + 1];
|
||||
in_pos += bs << 1;
|
||||
out_pos += bs;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -1531,7 +1554,7 @@ __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||
void clFindRiceParameter(
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
__global int* rice_parameters,
|
||||
__global int* partition_lengths,
|
||||
__global ulong* partition_lengths,
|
||||
int max_porder
|
||||
)
|
||||
{
|
||||
@@ -1541,7 +1564,7 @@ void clFindRiceParameter(
|
||||
//int psize = task->data.blocksize >> max_porder;
|
||||
int bs = task->data.blocksize;
|
||||
int ro = task->data.residualOrder;
|
||||
__global int* ppl = &partition_lengths[get_group_id(0) << (max_porder + 1)];
|
||||
__global ulong* ppl = &partition_lengths[get_group_id(0) << (max_porder + 1)];
|
||||
__global int* prp = &rice_parameters[get_group_id(0) << (max_porder + 2)];
|
||||
__global int* pol = prp + (1 << (max_porder + 1));
|
||||
for (int porder = max_porder; porder >= 0; porder--)
|
||||
@@ -1549,10 +1572,10 @@ void clFindRiceParameter(
|
||||
int pos = (2 << max_porder) - (2 << porder);
|
||||
int fin = pos + (1 << porder);
|
||||
|
||||
int pl = ppl[pos];
|
||||
ulong pl = ppl[pos];
|
||||
int ps = (bs >> porder) - ro;
|
||||
int k = iclamp(31 - fastclz(pl / max(1, ps)), 0, 14);
|
||||
int plk = ps * (k + 1) + (pl >> k);
|
||||
int k = iclamp(63 - fastclz64(pl / max(1, ps)), 0, MAX_RICE_PARAM);
|
||||
int plk = ps * (k + 1) + (int)(pl >> k);
|
||||
|
||||
// output rice parameter
|
||||
prp[pos] = k;
|
||||
@@ -1564,8 +1587,8 @@ void clFindRiceParameter(
|
||||
for (int offs = pos + 1; offs < fin; offs++)
|
||||
{
|
||||
pl = ppl[offs];
|
||||
k = iclamp(31 - fastclz(pl / ps), 0, 14);
|
||||
plk = ps * (k + 1) + (pl >> k);
|
||||
k = iclamp(63 - fastclz64(pl / ps), 0, MAX_RICE_PARAM);
|
||||
plk = ps * (k + 1) + (int)(pl >> k);
|
||||
|
||||
// output rice parameter
|
||||
prp[offs] = k;
|
||||
@@ -1581,18 +1604,18 @@ __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||
void clFindRiceParameter(
|
||||
__global FLACCLSubframeTask *tasks,
|
||||
__global int* rice_parameters,
|
||||
__global int* partition_lengths,
|
||||
__global uint* partition_lengths,
|
||||
int max_porder
|
||||
)
|
||||
{
|
||||
for (int offs = get_local_id(0); offs < (2 << max_porder); offs += GROUP_SIZE)
|
||||
{
|
||||
const int pos = (15 << (max_porder + 1)) * get_group_id(0) + offs;
|
||||
int best_l = partition_lengths[pos];
|
||||
const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + offs;
|
||||
uint best_l = partition_lengths[pos];
|
||||
int best_k = 0;
|
||||
for (int k = 1; k <= 14; k++)
|
||||
for (int k = 1; k <= MAX_RICE_PARAM; k++)
|
||||
{
|
||||
int l = partition_lengths[pos + (k << (max_porder + 1))];
|
||||
uint l = partition_lengths[pos + (k << (max_porder + 1))];
|
||||
best_k = select(best_k, k, l < best_l);
|
||||
best_l = min(best_l, l);
|
||||
}
|
||||
@@ -1630,16 +1653,16 @@ void clFindPartitionOrder(
|
||||
partlen[porder] += rice_parameters[pos + start + offs];
|
||||
}
|
||||
|
||||
int best_length = partlen[0] + 4;
|
||||
int best_length = partlen[0] + RICE_PARAM_BITS;
|
||||
int best_porder = 0;
|
||||
for (int porder = 1; porder <= max_porder; porder++)
|
||||
{
|
||||
int length = (4 << porder) + partlen[porder];
|
||||
int length = (RICE_PARAM_BITS << porder) + partlen[porder];
|
||||
best_porder = select(best_porder, porder, length < best_length);
|
||||
best_length = min(best_length, length);
|
||||
}
|
||||
|
||||
best_length = (4 << best_porder) + task->data.blocksize - task->data.residualOrder;
|
||||
best_length = (RICE_PARAM_BITS << best_porder) + task->data.blocksize - task->data.residualOrder;
|
||||
int best_psize = task->data.blocksize >> best_porder;
|
||||
int start = task->data.residualOffs + task->data.residualOrder;
|
||||
int fin = task->data.residualOffs + best_psize;
|
||||
@@ -1704,11 +1727,11 @@ void clFindPartitionOrder(
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int best_length = partlen[0] + 4;
|
||||
int best_length = partlen[0] + RICE_PARAM_BITS;
|
||||
int best_porder = 0;
|
||||
for (int porder = 1; porder <= max_porder; porder++)
|
||||
{
|
||||
int length = (4 << porder) + partlen[porder];
|
||||
int length = (RICE_PARAM_BITS << porder) + partlen[porder];
|
||||
best_porder = select(best_porder, porder, length < best_length);
|
||||
best_length = min(best_length, length);
|
||||
}
|
||||
@@ -1836,14 +1859,14 @@ void clCalcOutputOffsets(
|
||||
)
|
||||
{
|
||||
const int channels = 2;
|
||||
__local FLACCLSubframeData ltasks[2];
|
||||
__local volatile int mypos[2];
|
||||
__local FLACCLSubframeData ltasks[MAX_CHANNELS];
|
||||
__local volatile int mypos[MAX_CHANNELS];
|
||||
int offset = 0;
|
||||
for (int iFrame = 0; iFrame < frameCount; iFrame++)
|
||||
{
|
||||
if (get_local_id(0) < sizeof(ltasks[0]) / sizeof(int))
|
||||
for (int ch = 0; ch < channels; ch++)
|
||||
((__local int*)&ltasks[ch])[get_local_id(0)] = ((__global int*)(&tasks[iFrame * channels + ch]))[get_local_id(0)];
|
||||
for (int ch = 0; ch < MAX_CHANNELS; ch++)
|
||||
((__local int*)&ltasks[ch])[get_local_id(0)] = ((__global int*)(&tasks[iFrame * MAX_CHANNELS + ch]))[get_local_id(0)];
|
||||
|
||||
//printf("len_utf8(%d) == %d\n", firstFrame + iFrame, len_utf8(firstFrame + iFrame));
|
||||
offset += 15 + 1 + 4 + 4 + 4 + 3 + 1 + len_utf8(firstFrame + iFrame)
|
||||
@@ -1856,18 +1879,18 @@ void clCalcOutputOffsets(
|
||||
|
||||
// assert (offset % 8) == 0
|
||||
offset += 8;
|
||||
if (get_local_id(0) < channels)
|
||||
if (get_local_id(0) < MAX_CHANNELS)
|
||||
{
|
||||
int ch = get_local_id(0);
|
||||
// Add 64 bits to separate frames if header is too small so they can intersect
|
||||
int mylen = 8 + ltasks[ch].wbits + 64 + ltasks[ch].size;
|
||||
mypos[ch] = mylen;
|
||||
for (int offset = 1; offset < WARP_SIZE && offset < channels; offset <<= 1)
|
||||
for (int offset = 1; offset < WARP_SIZE && offset < MAX_CHANNELS; offset <<= 1)
|
||||
if (ch >= offset) mypos[ch] += mypos[ch - offset];
|
||||
mypos[ch] += offset;
|
||||
tasks[iFrame * channels + ch].data.encodingOffset = mypos[ch] - ltasks[ch].size + ltasks[ch].headerLen;
|
||||
tasks[iFrame * MAX_CHANNELS + ch].data.encodingOffset = mypos[ch] - ltasks[ch].size + ltasks[ch].headerLen;
|
||||
}
|
||||
offset = mypos[channels - 1];
|
||||
offset = mypos[MAX_CHANNELS - 1];
|
||||
offset = (offset + 7) & ~7;
|
||||
offset += 16;
|
||||
}
|
||||
@@ -1909,7 +1932,7 @@ void clRiceEncoding(
|
||||
for (int p = 0; p < (1 << porder); p++)
|
||||
{
|
||||
int k = kptr[p];
|
||||
writebits(&bw, 4, k);
|
||||
writebits(&bw, RICE_PARAM_BITS, k);
|
||||
//if (get_group_id(0) == 0) printf("[%x] ", k);
|
||||
//if (get_group_id(0) == 0) printf("(%x) ", bw.bit_buf);
|
||||
if (p == 1) res_cnt = psize;
|
||||
@@ -1978,7 +2001,7 @@ void clRiceEncoding(
|
||||
flush(&bw);
|
||||
}
|
||||
#else
|
||||
__local unsigned int data[GROUP_SIZE];
|
||||
__local uint data[GROUP_SIZE];
|
||||
__local volatile int mypos[GROUP_SIZE+1];
|
||||
#if 0
|
||||
__local int brp[256];
|
||||
@@ -2006,12 +2029,12 @@ void clRiceEncoding(
|
||||
int start = task.encodingOffset;
|
||||
int plen = bs >> task.porder;
|
||||
//int plenoffs = 12 - task.porder;
|
||||
unsigned int remainder = 0U;
|
||||
uint remainder = 0U;
|
||||
int pos;
|
||||
for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE)
|
||||
{
|
||||
int offs = pos + tid;
|
||||
int v = residual[task.residualOffs + offs];
|
||||
int iv = residual[task.residualOffs + offs];
|
||||
int part = offs / plen; // >> plenoffs;
|
||||
#if 0
|
||||
int k = brp[part];
|
||||
@@ -2019,10 +2042,10 @@ void clRiceEncoding(
|
||||
int k = best_rice_parameters[(get_group_id(0) << max_porder) + part];
|
||||
#endif
|
||||
int pstart = offs == task.residualOrder || offs == part * plen;
|
||||
v = (v << 1) ^ (v >> 31);
|
||||
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
|
||||
uint v = (iv << 1) ^ (iv >> 31);
|
||||
int mylen = select(0, (int)(v >> k) + 1 + k + select(0, RICE_PARAM_BITS, pstart), offs >= task.residualOrder && offs < bs);
|
||||
mypos[tid] = mylen;
|
||||
|
||||
|
||||
// Inclusive scan(+)
|
||||
int lane = (tid & (WARP_SIZE - 1));
|
||||
for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
|
||||
@@ -2040,7 +2063,8 @@ void clRiceEncoding(
|
||||
mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0);
|
||||
int start32 = start >> 5;
|
||||
start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2];
|
||||
|
||||
//if (start / 32 - start32 >= GROUP_SIZE - 3)
|
||||
// tasks[get_group_id(0)].data.size = 1;
|
||||
//if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32)
|
||||
// printf("Oops: %d\n", mypos[tid]);
|
||||
data[tid] = select(0U, remainder, tid == 0);
|
||||
@@ -2052,18 +2076,18 @@ void clRiceEncoding(
|
||||
int kpos = mp - mylen;
|
||||
int kpos0 = (kpos >> 5) - start32;
|
||||
int kpos1 = kpos & 31;
|
||||
unsigned int kval = (unsigned int)k << 28;
|
||||
unsigned int kval0 = kval >> kpos1;
|
||||
unsigned int kval1 = kval << (32 - kpos1);
|
||||
uint kval = (uint)k << (32 - RICE_PARAM_BITS);
|
||||
uint kval0 = kval >> kpos1;
|
||||
uint kval1 = kval << (32 - kpos1);
|
||||
if (kval0) atom_or(&data[kpos0], kval0);
|
||||
if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1);
|
||||
}
|
||||
int qpos = mp - k - 1;
|
||||
int qpos0 = (qpos >> 5) - start32;
|
||||
int qpos1 = qpos & 31;
|
||||
unsigned int qval = (1U << 31) | ((unsigned int)v << (31 - k));
|
||||
unsigned int qval0 = qval >> qpos1;
|
||||
unsigned int qval1= qval << (32 - qpos1);
|
||||
uint qval = (1U << 31) | (v << (31 - k));
|
||||
uint qval0 = qval >> qpos1;
|
||||
uint qval1= qval << (32 - qpos1);
|
||||
if (qval0) atom_or(&data[qpos0], qval0);
|
||||
if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1);
|
||||
}
|
||||
@@ -2075,13 +2099,13 @@ void clRiceEncoding(
|
||||
if (pos < bs)
|
||||
{
|
||||
int offs = pos + tid;
|
||||
int v = offs < bs ? residual[task.residualOffs + offs] : 0;
|
||||
int iv = offs < bs ? residual[task.residualOffs + offs] : 0;
|
||||
int part = offs / plen; // >> plenoffs;
|
||||
//int k = brp[min(255, part)];
|
||||
int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0;
|
||||
int pstart = offs == task.residualOrder || offs == part * plen;
|
||||
v = (v << 1) ^ (v >> 31);
|
||||
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
|
||||
uint v = (iv << 1) ^ (iv >> 31);
|
||||
int mylen = select(0, (int)(v >> k) + 1 + k + select(0, RICE_PARAM_BITS, pstart), offs >= task.residualOrder && offs < bs);
|
||||
mypos[tid] = mylen;
|
||||
|
||||
// Inclusive scan(+)
|
||||
@@ -2113,18 +2137,18 @@ void clRiceEncoding(
|
||||
int kpos = mp - mylen;
|
||||
int kpos0 = (kpos >> 5) - start32;
|
||||
int kpos1 = kpos & 31;
|
||||
unsigned int kval = (unsigned int)k << 28;
|
||||
unsigned int kval0 = kval >> kpos1;
|
||||
unsigned int kval1 = kval << (32 - kpos1);
|
||||
uint kval = (uint)k << (32 - RICE_PARAM_BITS);
|
||||
uint kval0 = kval >> kpos1;
|
||||
uint kval1 = kval << (32 - kpos1);
|
||||
if (kval0) atom_or(&data[kpos0], kval0);
|
||||
if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1);
|
||||
}
|
||||
int qpos = mp - k - 1;
|
||||
int qpos0 = (qpos >> 5) - start32;
|
||||
int qpos1 = qpos & 31;
|
||||
unsigned int qval = (1U << 31) | ((unsigned int)v << (31 - k));
|
||||
unsigned int qval0 = qval >> qpos1;
|
||||
unsigned int qval1= qval << (32 - qpos1);
|
||||
uint qval = (1U << 31) | (v << (31 - k));
|
||||
uint qval0 = qval >> qpos1;
|
||||
uint qval1= qval << (32 - qpos1);
|
||||
if (qval0) atom_or(&data[qpos0], qval0);
|
||||
if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1);
|
||||
}
|
||||
|
||||
@@ -83,6 +83,11 @@ namespace CUETools.Codecs.FLAKE
|
||||
/// </summary>
|
||||
public int porder;
|
||||
|
||||
/// <summary>
|
||||
/// coding method: rice parameters use 4 bits for coding_method 0 and 5 bits for coding_method 1
|
||||
/// </summary>
|
||||
public int coding_method;
|
||||
|
||||
/// <summary>
|
||||
/// Rice parameters
|
||||
/// </summary>
|
||||
|
||||
@@ -102,7 +102,7 @@ namespace CUETools.Codecs.FLAKE
|
||||
}
|
||||
_samplesInBuffer = 0;
|
||||
|
||||
if (PCM.BitsPerSample != 16 || PCM.ChannelCount != 2 || PCM.SampleRate != 44100)
|
||||
if ((PCM.BitsPerSample != 16 && PCM.BitsPerSample != 24) || PCM.ChannelCount != 2 || (PCM.SampleRate != 44100 && PCM.SampleRate != 48000))
|
||||
throw new Exception("invalid flac file");
|
||||
|
||||
samplesBuffer = new int[Flake.MAX_BLOCKSIZE * PCM.ChannelCount];
|
||||
@@ -362,8 +362,9 @@ namespace CUETools.Codecs.FLAKE
|
||||
unsafe void decode_residual(BitReader bitreader, FlacFrame frame, int ch)
|
||||
{
|
||||
// rice-encoded block
|
||||
uint coding_method = bitreader.readbits(2); // ????? == 0
|
||||
if (coding_method != 0 && coding_method != 1) // if 1, then parameter length == 5 bits instead of 4
|
||||
// coding method
|
||||
frame.subframes[ch].best.rc.coding_method = (int)bitreader.readbits(2); // ????? == 0
|
||||
if (frame.subframes[ch].best.rc.coding_method != 0 && frame.subframes[ch].best.rc.coding_method != 1)
|
||||
throw new Exception("unsupported residual coding");
|
||||
// partition order
|
||||
frame.subframes[ch].best.rc.porder = (int)bitreader.readbits(4);
|
||||
@@ -372,7 +373,7 @@ namespace CUETools.Codecs.FLAKE
|
||||
int psize = frame.blocksize >> frame.subframes[ch].best.rc.porder;
|
||||
int res_cnt = psize - frame.subframes[ch].best.order;
|
||||
|
||||
int rice_len = 4 + (int)coding_method;
|
||||
int rice_len = 4 + frame.subframes[ch].best.rc.coding_method;
|
||||
// residual
|
||||
int j = frame.subframes[ch].best.order;
|
||||
int* r = frame.subframes[ch].best.residual + j;
|
||||
|
||||
@@ -125,8 +125,8 @@ namespace CUETools.Codecs.FLAKE
|
||||
{
|
||||
_pcm = pcm;
|
||||
|
||||
if (_pcm.BitsPerSample != 16)
|
||||
throw new Exception("Bits per sample must be 16.");
|
||||
//if (_pcm.BitsPerSample != 16)
|
||||
// throw new Exception("Bits per sample must be 16.");
|
||||
if (_pcm.ChannelCount != 2)
|
||||
throw new Exception("ChannelCount must be 2.");
|
||||
|
||||
@@ -571,14 +571,14 @@ namespace CUETools.Codecs.FLAKE
|
||||
samplesInBuffer += block;
|
||||
}
|
||||
|
||||
/// <summary>
/// For each of the first <paramref name="blocksize"/> samples, writes the halved sum of
/// the two inputs to <paramref name="leftM"/> and their difference to <paramref name="rightM"/>.
/// </summary>
/// <param name="leftS">First input channel.</param>
/// <param name="rightS">Second input channel.</param>
/// <param name="leftM">Output: (leftS[i] + rightS[i]) >> 1.</param>
/// <param name="rightM">Output: leftS[i] - rightS[i].</param>
/// <param name="blocksize">Number of samples to process.</param>
unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize)
|
||||
{
|
||||
for (int i = 0; i < blocksize; i++)
|
||||
{
|
||||
// arithmetic shift drops the low bit of the sum
|
||||
leftM[i] = (leftS[i] + rightS[i]) >> 1;
|
||||
rightM[i] = leftS[i] - rightS[i];
|
||||
}
|
||||
}
|
||||
//unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize)
|
||||
//{
|
||||
// for (int i = 0; i < blocksize; i++)
|
||||
// {
|
||||
// leftM[i] = (leftS[i] + rightS[i]) >> 1;
|
||||
// rightM[i] = leftS[i] - rightS[i];
|
||||
// }
|
||||
//}
|
||||
|
||||
unsafe void encode_residual_verbatim(int* res, int* smp, uint n)
|
||||
{
|
||||
@@ -638,24 +638,28 @@ namespace CUETools.Codecs.FLAKE
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Picks a Rice parameter for each of the 2^porder partitions from the per-partition
/// sums and returns the estimated total size in bits.
/// NOTE(review): this span comes from a rendered commit diff; where a statement appears
/// twice in a row (signature, k, all_bits, return) the first looks like the pre-change
/// line and the second its replacement. The notes below describe the second form.
/// </summary>
/// <param name="porder">Partition order; 2^porder partitions are processed.</param>
/// <param name="parm">Output: parm[i] receives the parameter chosen for partition i.</param>
/// <param name="sums">Input: sums[i] is divided by the partition's sample count to pick the parameter
/// (presumably the sum of that partition's unsigned residuals - confirm against calc_sums).</param>
/// <param name="n">Block size in samples; each partition spans n >> porder samples.</param>
/// <param name="pred_order">Subtracted from the first partition's sample count.</param>
/// <param name="method">In: a value above 0 raises the parameter cap from Flake.MAX_RICE_PARAM to 30.
/// Out: 1 if any chosen parameter exceeds Flake.MAX_RICE_PARAM, otherwise 0.</param>
/// <returns>Accumulated bit count cast to uint, plus (4 + method) bits per partition.</returns>
static unsafe uint calc_optimal_rice_params(int porder, int* parm, uint* sums, uint n, uint pred_order)
|
||||
static unsafe uint calc_optimal_rice_params(int porder, int* parm, ulong* sums, uint n, uint pred_order, ref int method)
|
||||
{
|
||||
uint part = (1U << porder);
|
||||
// the first partition's count is reduced by pred_order
|
||||
uint cnt = (n >> porder) - pred_order;
|
||||
int k = cnt > 0 ? Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[0] / cnt)) : 0;
|
||||
uint all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k);
|
||||
// parameter cap depends on the method value passed in
|
||||
int maxK = method > 0 ? 30 : Flake.MAX_RICE_PARAM;
|
||||
int k = cnt > 0 ? Math.Min(maxK, BitReader.log2i(sums[0] / cnt)) : 0;
|
||||
// tracks the largest parameter chosen across all partitions
|
||||
int realMaxK0 = k;
|
||||
// cost estimate: (k + 1) bits per sample plus the partition sum shifted right by k
|
||||
ulong all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k);
|
||||
parm[0] = k;
|
||||
// remaining partitions use the full n >> porder count
|
||||
cnt = (n >> porder);
|
||||
for (uint i = 1; i < part; i++)
|
||||
{
|
||||
k = Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[i] / cnt));
|
||||
k = Math.Min(maxK, BitReader.log2i(sums[i] / cnt));
|
||||
realMaxK0 = Math.Max(realMaxK0, k);
|
||||
all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k);
|
||||
parm[i] = k;
|
||||
}
|
||||
return all_bits + (4 * part);
|
||||
// report whether any parameter went above Flake.MAX_RICE_PARAM
|
||||
method = realMaxK0 > Flake.MAX_RICE_PARAM ? 1 : 0;
|
||||
// NOTE(review): the (uint) cast keeps only the low 32 bits of the ulong total
|
||||
return (uint)all_bits + ((4U + (uint)method) * part);
|
||||
}
|
||||
|
||||
static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums)
|
||||
static unsafe void calc_lower_sums(int pmin, int pmax, ulong* sums)
|
||||
{
|
||||
for (int i = pmax - 1; i >= pmin; i--)
|
||||
{
|
||||
@@ -668,12 +672,12 @@ namespace CUETools.Codecs.FLAKE
|
||||
}
|
||||
}
|
||||
|
||||
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
|
||||
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
|
||||
{
|
||||
int parts = (1 << pmax);
|
||||
uint* res = data + pred_order;
|
||||
uint cnt = (n >> pmax) - pred_order;
|
||||
uint sum = 0;
|
||||
ulong sum = 0;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
sum += *(res++);
|
||||
sums[0] = sum;
|
||||
@@ -696,18 +700,18 @@ namespace CUETools.Codecs.FLAKE
|
||||
/// <param name="n"></param>
|
||||
/// <param name="pred_order"></param>
|
||||
/// <param name="sums"></param>
|
||||
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
|
||||
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
|
||||
{
|
||||
int parts = (1 << pmax);
|
||||
uint* res = data + pred_order;
|
||||
uint cnt = 18 - pred_order;
|
||||
uint sum = 0;
|
||||
ulong sum = 0;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
sum += *(res++);
|
||||
sums[0] = sum;
|
||||
for (int i = 1; i < parts; i++)
|
||||
{
|
||||
sums[i] =
|
||||
sums[i] = 0UL +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
@@ -725,18 +729,18 @@ namespace CUETools.Codecs.FLAKE
|
||||
/// <param name="n"></param>
|
||||
/// <param name="pred_order"></param>
|
||||
/// <param name="sums"></param>
|
||||
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
|
||||
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
|
||||
{
|
||||
int parts = (1 << pmax);
|
||||
uint* res = data + pred_order;
|
||||
uint cnt = 16 - pred_order;
|
||||
uint sum = 0;
|
||||
ulong sum = 0;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
sum += *(res++);
|
||||
sums[0] = sum;
|
||||
for (int i = 1; i < parts; i++)
|
||||
{
|
||||
sums[i] =
|
||||
sums[i] = 0UL +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
@@ -744,10 +748,10 @@ namespace CUETools.Codecs.FLAKE
|
||||
}
|
||||
}
|
||||
|
||||
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order)
|
||||
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order, int bps)
|
||||
{
|
||||
uint* udata = stackalloc uint[(int)n];
|
||||
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
|
||||
ulong* sums = stackalloc ulong[(pmax + 1) * Flake.MAX_PARTITIONS];
|
||||
int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS];
|
||||
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
|
||||
|
||||
@@ -770,17 +774,21 @@ namespace CUETools.Codecs.FLAKE
|
||||
|
||||
uint opt_bits = AudioSamples.UINT32_MAX;
|
||||
int opt_porder = pmin;
|
||||
int opt_method = 0;
|
||||
for (int i = pmin; i <= pmax; i++)
|
||||
{
|
||||
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order);
|
||||
int method = bps > 16 ? 1 : 0;
|
||||
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order, ref method);
|
||||
if (bits <= opt_bits)
|
||||
{
|
||||
opt_bits = bits;
|
||||
opt_porder = i;
|
||||
opt_method = method;
|
||||
}
|
||||
}
|
||||
|
||||
rc.porder = opt_porder;
|
||||
rc.coding_method = opt_method;
|
||||
fixed (int* rparms = rc.rparams)
|
||||
AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder));
|
||||
|
||||
@@ -841,7 +849,7 @@ namespace CUETools.Codecs.FLAKE
|
||||
}
|
||||
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.current.order);
|
||||
int pmin = Math.Min(eparams.min_partition_order, pmax);
|
||||
uint best_size = calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order);
|
||||
uint best_size = calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order, PCM.BitsPerSample);
|
||||
// not working
|
||||
//for (int o = 1; o <= frame.current.order; o++)
|
||||
//{
|
||||
@@ -877,7 +885,7 @@ namespace CUETools.Codecs.FLAKE
|
||||
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.current.order);
|
||||
int pmin = Math.Min(eparams.min_partition_order, pmax);
|
||||
frame.current.size = (uint)(frame.current.order * frame.subframes[ch].obits) + 6
|
||||
+ calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order);
|
||||
+ calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order, PCM.BitsPerSample);
|
||||
|
||||
frame.subframes[ch].done_fixed |= (1U << order);
|
||||
|
||||
@@ -1054,7 +1062,7 @@ namespace CUETools.Codecs.FLAKE
|
||||
unsafe void output_residual(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub)
|
||||
{
|
||||
// rice-encoded block
|
||||
bitwriter.writebits(2, 0);
|
||||
bitwriter.writebits(2, sub.best.rc.coding_method);
|
||||
|
||||
// partition order
|
||||
int porder = sub.best.rc.porder;
|
||||
@@ -1063,13 +1071,14 @@ namespace CUETools.Codecs.FLAKE
|
||||
bitwriter.writebits(4, porder);
|
||||
int res_cnt = psize - sub.best.order;
|
||||
|
||||
int rice_len = 4 + sub.best.rc.coding_method;
|
||||
// residual
|
||||
int j = sub.best.order;
|
||||
fixed (byte* fixbuf = &frame_buffer[0])
|
||||
for (int p = 0; p < (1 << porder); p++)
|
||||
{
|
||||
int k = sub.best.rc.rparams[p];
|
||||
bitwriter.writebits(4, k);
|
||||
bitwriter.writebits(rice_len, k);
|
||||
if (p == 1) res_cnt = psize;
|
||||
int cnt = Math.Min(res_cnt, frame.blocksize - j);
|
||||
bitwriter.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt);
|
||||
@@ -1436,6 +1445,9 @@ namespace CUETools.Codecs.FLAKE
|
||||
output_subframes(frame, bitwriter);
|
||||
output_frame_footer(bitwriter);
|
||||
|
||||
if (bitwriter.Length >= max_frame_size)
|
||||
throw new Exception("buffer overflow");
|
||||
|
||||
if (frame_buffer != null)
|
||||
{
|
||||
if (eparams.variable_block_size > 0)
|
||||
@@ -1732,9 +1744,6 @@ namespace CUETools.Codecs.FLAKE
|
||||
}
|
||||
if (i == 8)
|
||||
throw new Exception("non-standard bps");
|
||||
// FIXME: For now, only 16-bit encoding is supported
|
||||
if (_pcm.BitsPerSample != 16)
|
||||
throw new Exception("non-standard bps");
|
||||
|
||||
if (_blocksize == 0)
|
||||
{
|
||||
|
||||
@@ -35,6 +35,15 @@ namespace CUETools.Codecs
|
||||
return log2i((uint)v);
|
||||
}
|
||||
|
||||
/// <summary>
/// 64-bit overload of log2i. Shifts <paramref name="v"/> right by 32, 16 and 8 bits
/// whenever bits above the low byte are set, adding each shift to the result, then adds
/// the byte_to_log2_table entry for the remaining byte.
/// NOTE(review): presumably byte_to_log2_table[b] is floor(log2(b)), which would make the
/// result the index of the highest set bit of v - confirm against the table definition.
/// </summary>
/// <param name="v">Value to examine.</param>
/// <returns>Sum of the applied shifts and the table entry for the remaining low byte.</returns>
public static int log2i(ulong v)
|
||||
{
|
||||
int n = 0;
|
||||
// each test narrows v: after all three, v fits in 8 bits and can index the table
|
||||
if (0 != (v & 0xffffffff00000000)) { v >>= 32; n += 32; }
|
||||
if (0 != (v & 0xffff0000)) { v >>= 16; n += 16; }
|
||||
if (0 != (v & 0xff00)) { v >>= 8; n += 8; }
|
||||
return n + byte_to_log2_table[v];
|
||||
}
|
||||
|
||||
public static int log2i(uint v)
|
||||
{
|
||||
int n = 0;
|
||||
|
||||
@@ -422,14 +422,41 @@ namespace CUETools.Codecs
|
||||
|
||||
/// <summary>
/// Writes <paramref name="n"/> sample pairs into Bytes, alternating one sample from
/// <paramref name="src1"/> and one from <paramref name="src2"/>, starting at pair index
/// <paramref name="pos"/>.
/// NOTE(review): this span comes from a rendered commit diff; the first guard/fixed/loop
/// sequence looks like the pre-change lines and the 16/24-bit branches their replacement.
/// The notes below describe the branching form.
/// </summary>
/// <param name="pos">Index of the first pair to write (4 bytes per pair at 16 bits, 6 bytes per pair at 24 bits).</param>
/// <param name="src1">Source of the first sample of each pair.</param>
/// <param name="src2">Source of the second sample of each pair.</param>
/// <param name="n">Number of pairs to write.</param>
/// <exception cref="Exception">PCM.ChannelCount is not 2, or PCM.BitsPerSample is neither 16 nor 24.</exception>
unsafe public void Interlace(int pos, int* src1, int* src2, int n)
|
||||
{
|
||||
if (PCM.ChannelCount != 2 || PCM.BitsPerSample != 16)
|
||||
throw new Exception("");
|
||||
fixed (byte* bs = Bytes)
|
||||
if (PCM.ChannelCount != 2)
|
||||
throw new Exception("Must be stereo");
|
||||
if (PCM.BitsPerSample == 16)
|
||||
{
|
||||
int* res = ((int*)bs) + pos;
|
||||
for (int i = n; i > 0; i--)
|
||||
*(res++) = (*(src1++) & 0xffff) ^ (*(src2++) << 16);
|
||||
fixed (byte* bs = Bytes)
|
||||
{
|
||||
// one 32-bit word per pair: src1 in the low 16 bits, src2 in the high 16 bits
|
||||
int* res = ((int*)bs) + pos;
|
||||
for (int i = n; i > 0; i--)
|
||||
*(res++) = (*(src1++) & 0xffff) ^ (*(src2++) << 16);
|
||||
}
|
||||
}
|
||||
else if (PCM.BitsPerSample == 24)
|
||||
{
|
||||
fixed (byte* bs = Bytes)
|
||||
{
|
||||
// six bytes per pair: three for src1 then three for src2
|
||||
byte* res= bs + pos * 6;
|
||||
for (int i = n; i > 0; i--)
|
||||
{
|
||||
// emit the low three bytes of the src1 sample, least significant byte first
|
||||
uint sample_out = (uint)*(src1++);
|
||||
*(res++) = (byte)(sample_out & 0xFF);
|
||||
sample_out >>= 8;
|
||||
*(res++) = (byte)(sample_out & 0xFF);
|
||||
sample_out >>= 8;
|
||||
*(res++) = (byte)(sample_out & 0xFF);
|
||||
// same for the src2 sample
|
||||
sample_out = (uint)*(src2++);
|
||||
*(res++) = (byte)(sample_out & 0xFF);
|
||||
sample_out >>= 8;
|
||||
*(res++) = (byte)(sample_out & 0xFF);
|
||||
sample_out >>= 8;
|
||||
*(res++) = (byte)(sample_out & 0xFF);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
throw new Exception("Unsupported BPS");
|
||||
}
|
||||
|
||||
//public void Clear()
|
||||
@@ -451,6 +478,7 @@ namespace CUETools.Codecs
|
||||
short* pOutSamples = (short*)outSamples;
|
||||
for (int i = 0; i < loopCount; i++)
|
||||
pOutSamples[i] = (short)pInSamples[i];
|
||||
//*(pOutSamples++) = (short)*(pInSamples++);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -465,19 +493,8 @@ namespace CUETools.Codecs
|
||||
throw new IndexOutOfRangeException();
|
||||
}
|
||||
|
||||
fixed (int* pInSamplesFixed = &inSamples[inSampleOffset, 0])
|
||||
{
|
||||
fixed (byte* pOutSamplesFixed = &outSamples[outByteOffset])
|
||||
{
|
||||
int* pInSamples = pInSamplesFixed;
|
||||
short* pOutSamples = (short*)pOutSamplesFixed;
|
||||
|
||||
for (int i = 0; i < loopCount; i++)
|
||||
{
|
||||
*(pOutSamples++) = (short)*(pInSamples++);
|
||||
}
|
||||
}
|
||||
}
|
||||
fixed (byte* pOutSamplesFixed = &outSamples[outByteOffset])
|
||||
FLACSamplesToBytes_16(inSamples, inSampleOffset, pOutSamplesFixed, sampleCount, channelCount);
|
||||
}
|
||||
|
||||
public static unsafe void FLACSamplesToBytes_24(int[,] inSamples, int inSampleOffset,
|
||||
@@ -917,16 +934,16 @@ namespace CUETools.Codecs
|
||||
private AudioPCMConfig pcm;
|
||||
private int _sampleVal;
|
||||
|
||||
/// <summary>
/// Stores the sample value, sample count and PCM configuration and resets the sample offset to 0.
/// NOTE(review): this span comes from a rendered commit diff; the first signature and the
/// unqualified assignments (including pcm = AudioPCMConfig.RedBook) look like the pre-change
/// lines, the this.-qualified ones their replacement.
/// </summary>
/// <param name="pcm">PCM configuration stored in this.pcm.</param>
/// <param name="sampleCount">Stored in _sampleCount.</param>
/// <param name="sampleVal">Stored in _sampleVal (presumably the value produced for every sample - usage not visible here).</param>
public SilenceGenerator(long sampleCount, int sampleVal)
|
||||
public SilenceGenerator(AudioPCMConfig pcm, long sampleCount, int sampleVal)
|
||||
{
|
||||
_sampleVal = sampleVal;
|
||||
_sampleOffset = 0;
|
||||
_sampleCount = sampleCount;
|
||||
pcm = AudioPCMConfig.RedBook;
|
||||
this._sampleVal = sampleVal;
|
||||
this._sampleOffset = 0;
|
||||
this._sampleCount = sampleCount;
|
||||
this.pcm = pcm;
|
||||
}
|
||||
|
||||
/// <summary>
/// Convenience constructor: forwards to the fuller overload with a sample value of 0
/// (and, in the second initializer shown, AudioPCMConfig.RedBook as the PCM configuration).
/// </summary>
/// <param name="sampleCount">Passed through as the sample count.</param>
public SilenceGenerator(long sampleCount)
|
||||
: this(sampleCount, 0)
|
||||
: this(AudioPCMConfig.RedBook, sampleCount, 0)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -1091,19 +1108,29 @@ namespace CUETools.Codecs
|
||||
{
|
||||
foundFormat = true;
|
||||
|
||||
if (_br.ReadUInt16() != 1)
|
||||
{
|
||||
throw new Exception("WAVE must be PCM format.");
|
||||
}
|
||||
uint fmtTag = _br.ReadUInt16();
|
||||
int _channelCount = _br.ReadInt16();
|
||||
int _sampleRate = _br.ReadInt32();
|
||||
_br.ReadInt32();
|
||||
_br.ReadInt32(); // bytes per second
|
||||
int _blockAlign = _br.ReadInt16();
|
||||
int _bitsPerSample = _br.ReadInt16();
|
||||
pos += 16;
|
||||
|
||||
if (fmtTag == 0xFFFEU && ckSize >= 34) // WAVE_FORMAT_EXTENSIBLE
|
||||
{
|
||||
_br.ReadInt16(); // CbSize
|
||||
_br.ReadInt16(); // ValidBitsPerSample
|
||||
int channelMask = _br.ReadInt32();
|
||||
fmtTag = _br.ReadUInt16();
|
||||
pos += 10;
|
||||
}
|
||||
|
||||
if (fmtTag != 1) // WAVE_FORMAT_PCM
|
||||
throw new Exception("WAVE format tag not WAVE_FORMAT_PCM.");
|
||||
|
||||
pcm = new AudioPCMConfig(_bitsPerSample, _channelCount, _sampleRate);
|
||||
if (pcm.BlockAlign != _blockAlign)
|
||||
throw new Exception("WAVE has strange BlockAlign");
|
||||
pos += 16;
|
||||
}
|
||||
else if (ckID == fccData)
|
||||
{
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
<DefineConstants>DEBUG;TRACE</DefineConstants>
|
||||
<ErrorReport>prompt</ErrorReport>
|
||||
<WarningLevel>4</WarningLevel>
|
||||
<PlatformTarget>x86</PlatformTarget>
|
||||
<PlatformTarget>AnyCPU</PlatformTarget>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
|
||||
<DebugType>pdbonly</DebugType>
|
||||
|
||||
@@ -87,7 +87,7 @@ namespace CUETools.FLACCL.cmd
|
||||
min_precision = -1, max_precision = -1,
|
||||
orders_per_window = -1, orders_per_channel = -1,
|
||||
blocksize = -1;
|
||||
int input_len = 4096, input_val = 0;
|
||||
int input_len = 4096, input_val = 0, input_bps = 16, input_ch = 2, input_rate = 44100;
|
||||
int level = -1, padding = -1, vbr_mode = -1;
|
||||
bool do_seektable = true;
|
||||
bool buffered = false;
|
||||
@@ -136,6 +136,10 @@ namespace CUETools.FLACCL.cmd
|
||||
input_len = intarg;
|
||||
else if (args[arg] == "--input-value" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
|
||||
input_val = intarg;
|
||||
else if (args[arg] == "--input-bps" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
|
||||
input_bps = intarg;
|
||||
else if (args[arg] == "--input-channels" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
|
||||
input_ch = intarg;
|
||||
else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
|
||||
output_file = args[arg];
|
||||
else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
|
||||
@@ -211,18 +215,28 @@ namespace CUETools.FLACCL.cmd
|
||||
}
|
||||
|
||||
IAudioSource audioSource;
|
||||
if (input_file == "-")
|
||||
audioSource = new WAVReader("", Console.OpenStandardInput());
|
||||
else if (input_file == "nul")
|
||||
audioSource = new SilenceGenerator(input_len, input_val);
|
||||
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".wav")
|
||||
audioSource = new WAVReader(input_file, null);
|
||||
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".flac")
|
||||
audioSource = new FlakeReader(input_file, null);
|
||||
else
|
||||
try
|
||||
{
|
||||
if (input_file == "-")
|
||||
audioSource = new WAVReader("", Console.OpenStandardInput());
|
||||
else if (input_file == "nul")
|
||||
audioSource = new SilenceGenerator(new AudioPCMConfig(input_bps, input_ch, input_rate), input_len, input_val);
|
||||
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".wav")
|
||||
audioSource = new WAVReader(input_file, null);
|
||||
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".flac")
|
||||
audioSource = new FlakeReader(input_file, null);
|
||||
else
|
||||
{
|
||||
Usage();
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
Usage();
|
||||
return 2;
|
||||
Console.WriteLine("");
|
||||
Console.WriteLine("Error: {0}.", ex.Message);
|
||||
return 3;
|
||||
}
|
||||
if (buffered)
|
||||
audioSource = new AudioPipe(audioSource, FLACCLWriter.MAX_BLOCKSIZE);
|
||||
|
||||
8
CUETools.Flake/App.config
Normal file
8
CUETools.Flake/App.config
Normal file
@@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="utf-8" ?>
|
||||
<configuration>
|
||||
<runtime>
|
||||
<assemblyBinding xmlns="urn:schemas-microsoft-com:asm.v1">
|
||||
<probing privatePath="plugins"/>
|
||||
</assemblyBinding>
|
||||
</runtime>
|
||||
</configuration>
|
||||
@@ -2,7 +2,7 @@
|
||||
<PropertyGroup>
|
||||
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
|
||||
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
|
||||
<ProductVersion>8.0.50727</ProductVersion>
|
||||
<ProductVersion>9.0.30729</ProductVersion>
|
||||
<SchemaVersion>2.0</SchemaVersion>
|
||||
<ProjectGuid>{2379BAAF-A406-4477-BF53-2D6A326C24C8}</ProjectGuid>
|
||||
<OutputType>Exe</OutputType>
|
||||
@@ -19,7 +19,7 @@
|
||||
<DebugSymbols>true</DebugSymbols>
|
||||
<DebugType>full</DebugType>
|
||||
<Optimize>false</Optimize>
|
||||
<OutputPath>bin\Debug\</OutputPath>
|
||||
<OutputPath>..\bin\Debug\</OutputPath>
|
||||
<DefineConstants>DEBUG;TRACE</DefineConstants>
|
||||
<ErrorReport>prompt</ErrorReport>
|
||||
<WarningLevel>4</WarningLevel>
|
||||
@@ -52,6 +52,9 @@
|
||||
<Name>CUETools.Codecs</Name>
|
||||
</ProjectReference>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="App.config" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
|
||||
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
|
||||
Other similar extension points exist, see Microsoft.Common.targets.
|
||||
|
||||
@@ -279,9 +279,10 @@ namespace CUETools.FlakeExe
|
||||
if (!quiet)
|
||||
{
|
||||
Console.Error.Write("\r \r");
|
||||
Console.WriteLine("Results : {0:0.00}x; {1}",
|
||||
Console.WriteLine("Results : {0:0.00}x; {2} bytes in {1} seconds;",
|
||||
audioSource.Position / totalElapsed.TotalSeconds / audioSource.PCM.SampleRate,
|
||||
totalElapsed
|
||||
totalElapsed,
|
||||
flake.TotalSize
|
||||
);
|
||||
}
|
||||
audioSource.Close();
|
||||
|
||||
Reference in New Issue
Block a user