24-bit/multichannel support

This commit is contained in:
chudov
2010-12-07 22:52:34 +00:00
parent 6783bba2e9
commit 6585ea2001
12 changed files with 691 additions and 432 deletions

View File

@@ -87,7 +87,7 @@ namespace CUETools.Codecs.FLACCL
[SRDescription(typeof(Properties.Resources), "DescriptionDeviceType")]
public OpenCLDeviceType DeviceType { get; set; }
int cpu_threads = 1;
int cpu_threads = 0;
[DefaultValue(1)]
[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
public int CPUThreads
@@ -214,10 +214,11 @@ namespace CUETools.Codecs.FLACCL
{
_pcm = pcm;
if (pcm.BitsPerSample != 16)
// FIXME: For now, only 16-bit encoding is supported
if (pcm.BitsPerSample != 16 && pcm.BitsPerSample != 24)
throw new Exception("Bits per sample must be 16.");
if (pcm.ChannelCount != 2)
throw new Exception("ChannelCount must be 2.");
//if (pcm.ChannelCount != 2)
// throw new Exception("ChannelCount must be 2.");
channels = pcm.ChannelCount;
sample_rate = pcm.SampleRate;
@@ -288,12 +289,6 @@ namespace CUETools.Codecs.FLACCL
if (value as FLACCLWriterSettings == null)
throw new Exception("Unsupported options " + value);
_settings = value as FLACCLWriterSettings;
if (_settings.DeviceType == OpenCLDeviceType.CPU)
{
_settings.GroupSize = 1;
//_settings.GPUOnly = true;
_settings.MappedMemory = true;
}
}
}
@@ -644,24 +639,28 @@ namespace CUETools.Codecs.FLACCL
}
}
static unsafe uint calc_optimal_rice_params(int porder, int* parm, uint* sums, uint n, uint pred_order)
static unsafe uint calc_optimal_rice_params(int porder, int* parm, ulong* sums, uint n, uint pred_order, ref int method)
{
uint part = (1U << porder);
uint cnt = (n >> porder) - pred_order;
int k = cnt > 0 ? Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[0] / cnt)) : 0;
uint all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k);
int maxK = method > 0 ? 30 : Flake.MAX_RICE_PARAM;
int k = cnt > 0 ? Math.Min(maxK, BitReader.log2i(sums[0] / cnt)) : 0;
int realMaxK0 = k;
ulong all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k);
parm[0] = k;
cnt = (n >> porder);
for (uint i = 1; i < part; i++)
{
k = Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[i] / cnt));
k = Math.Min(maxK, BitReader.log2i(sums[i] / cnt));
realMaxK0 = Math.Max(realMaxK0, k);
all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k);
parm[i] = k;
}
return all_bits + (4 * part);
method = realMaxK0 > Flake.MAX_RICE_PARAM ? 1 : 0;
return (uint)all_bits + ((4U + (uint)method) * part);
}
static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums)
static unsafe void calc_lower_sums(int pmin, int pmax, ulong* sums)
{
for (int i = pmax - 1; i >= pmin; i--)
{
@@ -674,12 +673,12 @@ namespace CUETools.Codecs.FLACCL
}
}
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
{
int parts = (1 << pmax);
uint* res = data + pred_order;
uint cnt = (n >> pmax) - pred_order;
uint sum = 0;
ulong sum = 0;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[0] = sum;
@@ -702,18 +701,18 @@ namespace CUETools.Codecs.FLACCL
/// <param name="n"></param>
/// <param name="pred_order"></param>
/// <param name="sums"></param>
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
{
int parts = (1 << pmax);
uint* res = data + pred_order;
uint cnt = 18 - pred_order;
uint sum = 0;
ulong sum = 0UL;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[0] = sum;
for (int i = 1; i < parts; i++)
{
sums[i] =
sums[i] = 0UL +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
@@ -731,18 +730,18 @@ namespace CUETools.Codecs.FLACCL
/// <param name="n"></param>
/// <param name="pred_order"></param>
/// <param name="sums"></param>
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
{
int parts = (1 << pmax);
uint* res = data + pred_order;
uint cnt = 16 - pred_order;
uint sum = 0;
ulong sum = 0UL;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[0] = sum;
for (int i = 1; i < parts; i++)
{
sums[i] =
sums[i] = 0UL +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
@@ -750,10 +749,10 @@ namespace CUETools.Codecs.FLACCL
}
}
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order)
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order, int max_method)
{
uint* udata = stackalloc uint[(int)n];
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
ulong* sums = stackalloc ulong[(pmax + 1) * Flake.MAX_PARTITIONS];
int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS];
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
@@ -776,17 +775,21 @@ namespace CUETools.Codecs.FLACCL
uint opt_bits = AudioSamples.UINT32_MAX;
int opt_porder = pmin;
int opt_method = 0;
for (int i = pmin; i <= pmax; i++)
{
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order);
int method = max_method;
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order, ref method);
if (bits <= opt_bits)
{
opt_bits = bits;
opt_porder = i;
opt_method = method;
}
}
rc.porder = opt_porder;
rc.coding_method = opt_method;
fixed (int* rparms = rc.rparams)
AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder));
@@ -845,8 +848,8 @@ namespace CUETools.Codecs.FLACCL
for (int i = pos; i < pos + cnt; i++)
{
int v = sub.best.residual[i];
v = (v << 1) ^ (v >> 31);
q += (v >> k);
uint uv = (uint)((v << 1) ^ (v >> 31));
q += (int)(uv >> k);
}
return (k + 1) * cnt + q;
}
@@ -857,7 +860,7 @@ namespace CUETools.Codecs.FLACCL
int porder = sub.best.rc.porder;
int psize = frame.blocksize >> porder;
//assert(porder >= 0);
int size = 6 + (4 << porder);
int size = 6 + ((4 + sub.best.rc.coding_method) << porder);
size += measure_residual(frame, sub, sub.best.order, psize - sub.best.order, sub.best.rc.rparams[0]);
// residual
for (int p = 1; p < (1 << porder); p++)
@@ -870,13 +873,13 @@ namespace CUETools.Codecs.FLACCL
FlacFrame frame = task.frame;
// rice-encoded block
frame.writer.writebits(2, 0);
frame.writer.writebits(2, sub.best.rc.coding_method);
// partition order
int porder = sub.best.rc.porder;
//assert(porder >= 0);
frame.writer.writebits(4, porder);
if (_settings.GPUOnly && _settings.DoRice)
if (task.UseGPURice)
{
int len = task.BestResidualTasks[index].size - task.BestResidualTasks[index].headerLen;
int pos = task.BestResidualTasks[index].encodingOffset;
@@ -901,7 +904,7 @@ namespace CUETools.Codecs.FLACCL
for (int p = 0; p < (1 << porder); p++)
{
int k = sub.best.rc.rparams[p];
frame.writer.writebits(4, k);
frame.writer.writebits(4 + sub.best.rc.coding_method, k);
if (p == 1) res_cnt = psize;
int cnt = Math.Min(res_cnt, frame.blocksize - j);
frame.writer.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt);
@@ -1069,7 +1072,7 @@ namespace CUETools.Codecs.FLACCL
calculate_window(task, lpc.window_bartlett, WindowFunction.Bartlett);
if (task.nWindowFunctions == 0)
throw new Exception("invalid windowfunction");
if (!_settings.MappedMemory)
if (!task.UseMappedMemory)
task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, false, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr);
}
@@ -1116,6 +1119,7 @@ namespace CUETools.Codecs.FLACCL
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].wbits = 0;
task.ResidualTasks[task.nResidualTasks].coding_method = PCM.BitsPerSample > 16 ? 1 : 0;
task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize;
task.nResidualTasks++;
}
@@ -1131,6 +1135,7 @@ namespace CUETools.Codecs.FLACCL
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].wbits = 0;
task.ResidualTasks[task.nResidualTasks].coding_method = PCM.BitsPerSample > 16 ? 1 : 0;
task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOrder = 1;
task.ResidualTasks[task.nResidualTasks].shift = 0;
@@ -1149,6 +1154,7 @@ namespace CUETools.Codecs.FLACCL
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].wbits = 0;
task.ResidualTasks[task.nResidualTasks].coding_method = PCM.BitsPerSample > 16 ? 1 : 0;
task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize;
task.ResidualTasks[task.nResidualTasks].shift = 0;
switch (order)
@@ -1195,10 +1201,11 @@ namespace CUETools.Codecs.FLACCL
if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen)
throw new Exception("oops");
if (!_settings.MappedMemory)
task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr);
if (!_settings.MappedMemory)
task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, false, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr);
if (!task.UseMappedMemory)
{
task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr);
task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, false, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr);
}
}
unsafe void encode_residual(FLACCLTask task)
@@ -1215,7 +1222,7 @@ namespace CUETools.Codecs.FLACCL
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
break;
case SubframeType.Fixed:
if (!_settings.GPUOnly)
if (!task.UseGPUOnly)
{
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples,
@@ -1224,7 +1231,7 @@ namespace CUETools.Codecs.FLACCL
int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 6;
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order);
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0);
}
break;
case SubframeType.LPC:
@@ -1236,7 +1243,7 @@ namespace CUETools.Codecs.FLACCL
#if DEBUG
// check size
if (_settings.GPUOnly && !_settings.DoRice)
if (task.UseGPUOnly && !task.UseGPURice)
{
uint real_size = measure_subframe(task.frame, task.frame.subframes[ch]);
if (real_size != task.frame.subframes[ch].best.size)
@@ -1244,9 +1251,9 @@ namespace CUETools.Codecs.FLACCL
}
#endif
if (((csum << task.frame.subframes[ch].obits) >= 1UL << 32) || !_settings.GPUOnly)
if ((((csum << task.frame.subframes[ch].obits) >= 1UL << 32) && PCM.BitsPerSample == 16) || !task.UseGPUOnly)
{
if (_settings.GPUOnly && _settings.DoRice)
if (task.UseGPURice)
#if DEBUG
// throw new Exception("DoRice failed");
break;
@@ -1266,11 +1273,11 @@ namespace CUETools.Codecs.FLACCL
RiceContext rc1 = task.frame.subframes[ch].best.rc;
task.frame.subframes[ch].best.rc = new RiceContext();
#endif
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order);
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0);
task.frame.subframes[ch].best.size = measure_subframe(task.frame, task.frame.subframes[ch]);
#if KJHKJH
// check size
if (_settings.GPUOnly && oldsize > task.frame.subframes[ch].best.size)
if (task.UseGPUOnly && oldsize > task.frame.subframes[ch].best.size)
throw new Exception("unoptimal size reported");
#endif
//if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize &&
@@ -1337,8 +1344,9 @@ namespace CUETools.Codecs.FLACCL
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
if (_settings.GPUOnly && !_settings.DoRice && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
//if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
frame.subframes[ch].best.rc.coding_method = task.BestResidualTasks[index].coding_method;
if (task.UseGPUOnly && !task.UseGPURice && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
//if (task.UseGPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
{
int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder);
fixed (int* dstParams = frame.subframes[ch].best.rc.rparams)
@@ -1352,7 +1360,7 @@ namespace CUETools.Codecs.FLACCL
}
else
{
if (_settings.GPUOnly && _settings.DoRice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size)
if (task.UseGPURice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size)
throw new Exception("size reported incorrectly");
}
}
@@ -1369,10 +1377,9 @@ namespace CUETools.Codecs.FLACCL
/// </summary>
/// <param name="task"></param>
/// <param name="doMidside"></param>
unsafe void unpack_samples(FLACCLTask task, int count)
unsafe void unpack_samples_16(FLACCLTask task, byte * srcptr, int count)
{
int iFrame = task.frame.frame_number;
short* src = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize;
short* src = (short*)srcptr;
switch (task.frame.ch_mode)
{
@@ -1382,7 +1389,7 @@ namespace CUETools.Codecs.FLACCL
int* s = task.frame.subframes[ch].samples;
int wbits = (int)task.frame.subframes[ch].wbits;
for (int i = 0; i < count; i++)
s[i] = src[i * channels + ch] >>= wbits;
s[i] = src[i * channels + ch] >> wbits;
}
break;
case ChannelMode.LeftRight:
@@ -1448,6 +1455,108 @@ namespace CUETools.Codecs.FLACCL
}
}
/// <summary>
/// Split channel-interleaved 24-bit input samples into the per-channel subframe buffers.
/// Every sample is read as three bytes, least significant byte first, sign-extended to a
/// 32-bit int and shifted right by the wbits value of the subframe it is stored in.
/// For the two-channel modes the left/right pair is converted to the signals selected by
/// task.frame.ch_mode: left, right, side (l - r) or mid ((l + r) >> 1).
/// </summary>
/// <param name="task">Task whose frame.subframes[].samples buffers receive the unpacked values</param>
/// <param name="srcptr">First byte of the interleaved samples to unpack</param>
/// <param name="count">Number of samples to unpack for each channel</param>
unsafe void unpack_samples_24(FLACCLTask task, byte* srcptr, int count)
{
    switch (task.frame.ch_mode)
    {
        case ChannelMode.NotStereo:
            {
                // Any channel count: handle one channel at a time, stepping over a
                // whole sample frame (PCM.BlockAlign bytes) between consecutive samples.
                for (int ch = 0; ch < channels; ch++)
                {
                    int* dst = task.frame.subframes[ch].samples;
                    int shift = (int)task.frame.subframes[ch].wbits;
                    byte* cursor = srcptr + ch * 3;
                    for (int n = 0; n < count; n++, cursor += PCM.BlockAlign)
                        dst[n] = load_sample_24_hi(cursor) >> (8 + shift);
                }
                break;
            }
        case ChannelMode.LeftRight:
            {
                int* ch0 = task.frame.subframes[0].samples;
                int* ch1 = task.frame.subframes[1].samples;
                int shift0 = (int)task.frame.subframes[0].wbits;
                int shift1 = (int)task.frame.subframes[1].wbits;
                // Two packed 3-byte samples per frame: left first, then right.
                for (int n = 0; n < count; n++, srcptr += 6)
                {
                    int l = load_sample_24_hi(srcptr) >> 8;
                    int r = load_sample_24_hi(srcptr + 3) >> 8;
                    ch0[n] = l >> shift0;
                    ch1[n] = r >> shift1;
                }
                break;
            }
        case ChannelMode.LeftSide:
            {
                int* ch0 = task.frame.subframes[0].samples;
                int* ch1 = task.frame.subframes[1].samples;
                int shift0 = (int)task.frame.subframes[0].wbits;
                int shift1 = (int)task.frame.subframes[1].wbits;
                for (int n = 0; n < count; n++, srcptr += 6)
                {
                    int l = load_sample_24_hi(srcptr) >> 8;
                    int r = load_sample_24_hi(srcptr + 3) >> 8;
                    ch0[n] = l >> shift0;
                    ch1[n] = (l - r) >> shift1;
                }
                break;
            }
        case ChannelMode.RightSide:
            {
                int* ch0 = task.frame.subframes[0].samples;
                int* ch1 = task.frame.subframes[1].samples;
                int shift0 = (int)task.frame.subframes[0].wbits;
                int shift1 = (int)task.frame.subframes[1].wbits;
                for (int n = 0; n < count; n++, srcptr += 6)
                {
                    int l = load_sample_24_hi(srcptr) >> 8;
                    int r = load_sample_24_hi(srcptr + 3) >> 8;
                    ch0[n] = (l - r) >> shift0;
                    ch1[n] = r >> shift1;
                }
                break;
            }
        case ChannelMode.MidSide:
            {
                int* ch0 = task.frame.subframes[0].samples;
                int* ch1 = task.frame.subframes[1].samples;
                int shift0 = (int)task.frame.subframes[0].wbits;
                int shift1 = (int)task.frame.subframes[1].wbits;
                for (int n = 0; n < count; n++, srcptr += 6)
                {
                    int l = load_sample_24_hi(srcptr) >> 8;
                    int r = load_sample_24_hi(srcptr + 3) >> 8;
                    // The mid signal drops one extra bit on top of the subframe's wbits.
                    ch0[n] = (l + r) >> (1 + shift0);
                    ch1[n] = (l - r) >> shift1;
                }
                break;
            }
    }
}
/// <summary>
/// Assemble three bytes, least significant first, into bits 8..31 of an int
/// (the low 8 bits are zero). An arithmetic right shift by 8 of the result
/// yields the sign-extended 24-bit sample.
/// </summary>
/// <param name="p">Address of the first (least significant) byte of the sample</param>
/// <returns>The sample value placed in the upper 24 bits of the returned int</returns>
static unsafe int load_sample_24_hi(byte* p)
{
    return (p[0] << 8) | (p[1] << 16) | (p[2] << 24);
}
/// <summary>
/// Unpack the interleaved input samples of the task's current frame into its subframes.
/// Computes where the frame (task.frame.frame_number) starts inside the task's sample
/// byte buffer and hands off to the unpacker that matches PCM.BitsPerSample.
/// </summary>
/// <param name="task">Task that holds the sample byte buffer and the frame being filled</param>
/// <param name="count">Number of samples to unpack for each channel</param>
/// <exception cref="Exception">Thrown when PCM.BitsPerSample is neither 16 nor 24</exception>
unsafe void unpack_samples(FLACCLTask task, int count)
{
    int frameIndex = task.frame.frame_number;
    // Each frame occupies frameSize sample frames of PCM.BlockAlign bytes each.
    byte* frameStart = ((byte*)task.clSamplesBytesPtr) + frameIndex * task.frameSize * PCM.BlockAlign;
    switch (PCM.BitsPerSample)
    {
        case 16:
            unpack_samples_16(task, frameStart, count);
            break;
        case 24:
            unpack_samples_24(task, frameStart, count);
            break;
        default:
            throw new Exception("Invalid BPS");
    }
}
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FLACCLTask task, int current_frame_number)
{
task.frame.InitSize(task.frameSize, eparams.variable_block_size != 0);
@@ -1492,8 +1601,8 @@ namespace CUETools.Codecs.FLACCL
task.framePos = frame_pos;
frame_count += nFrames;
frame_pos += nFrames * blocksize;
if (!_settings.MappedMemory)
task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, sizeof(short) * channels * blocksize * nFrames, task.clSamplesBytesPtr);
if (!task.UseMappedMemory)
task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, PCM.BlockAlign * blocksize * nFrames, task.clSamplesBytesPtr);
//task.openCLCQ.EnqueueUnmapMemObject(task.clSamplesBytes, task.clSamplesBytes.HostPtr);
//task.openCLCQ.EnqueueMapBuffer(task.clSamplesBytes, true, MapFlags.WRITE, 0, task.samplesBufferLen / 2);
}
@@ -1530,20 +1639,38 @@ namespace CUETools.Codecs.FLACCL
{
int decoded = task.verify.DecodeFrame(task.frame.writer.Buffer, task.frame.writer_offset, fs);
if (decoded != fs || task.verify.Remaining != task.frameSize)
throw new Exception("validation failed! frame size mismatch");
throw new Exception(string.Format("validation failed! frame size mismatch, iFrame={0}, decoded=={1}, fs=={2}", fn, decoded, fs));
fixed (int* r = task.verify.Samples)
{
for (int ch = 0; ch < channels; ch++)
{
short* res = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize + ch;
byte* res = ((byte*)task.clSamplesBytesPtr) + PCM.BlockAlign * iFrame * task.frameSize + ch * (PCM.BlockAlign / channels);
int* smp = r + ch * Flake.MAX_BLOCKSIZE;
for (int i = task.frameSize; i > 0; i--)
int ba = PCM.BlockAlign;
if (PCM.BitsPerSample == 16)
{
//if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
if (*res != *(smp++))
throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", iFrame, ch));
res += channels;
for (int i = task.frameSize; i > 0; i--)
{
//if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
int ress = *(short*)res;
if (ress != *(smp++))
throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", fn, ch));
res += ba;
}
}
else if (PCM.BitsPerSample == 24)
{
for (int i = task.frameSize; i > 0; i--)
{
//if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
int ress = (((int)res[0] << 8) + ((int)res[1] << 16) + ((int)res[2] << 24)) >> (8);
if (ress != *(smp++))
throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", iFrame, ch));
res += ba;
}
}
else
throw new Exception("Invalid BPS");
}
}
}
@@ -1644,10 +1771,21 @@ namespace CUETools.Codecs.FLACCL
}
OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
this.framesPerTask = (int)OCLMan.Context.Devices[0].MaxComputeUnits * _settings.TaskSize;
this.framesPerTask = (int)OCLMan.Context.Devices[0].MaxComputeUnits * Math.Max(1, _settings.TaskSize / channels);
if (!OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics"))
_settings.GPUOnly = false;
bool UseGPUOnly = _settings.GPUOnly && OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics");
bool UseGPURice = UseGPUOnly && _settings.DoRice;
if (_blocksize == 0)
{
if (eparams.block_size == 0)
eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms);
_blocksize = eparams.block_size;
}
else
eparams.block_size = _blocksize;
int maxBS = 1 << (BitReader.log2i(eparams.block_size - 1) + 1);
// The Defines string gets prepended to any and all sources that are compiled
// and serve as a convenient way to pass configuration information to the compilation process
@@ -1655,8 +1793,11 @@ namespace CUETools.Codecs.FLACCL
"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
"#define GROUP_SIZE " + groupSize.ToString() + "\n" +
"#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
(_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") +
(_settings.DoRice ? "#define DO_RICE\n" : "") +
(UseGPUOnly ? "#define DO_PARTITIONS\n" : "") +
(UseGPURice ? "#define DO_RICE\n" : "") +
"#define BITS_PER_SAMPLE " + PCM.BitsPerSample + "\n" +
"#define MAX_BLOCKSIZE " + maxBS + "\n" +
"#define MAX_CHANNELS " + PCM.ChannelCount + "\n" +
#if DEBUG
"#define DEBUG\n" +
#endif
@@ -1718,13 +1859,13 @@ namespace CUETools.Codecs.FLACCL
if (_IO.CanSeek)
first_frame_offset = _IO.Position;
task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize);
task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize);
task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice);
task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice);
if (_settings.CPUThreads > 0)
{
cpu_tasks = new FLACCLTask[_settings.CPUThreads];
for (int i = 0; i < cpu_tasks.Length; i++)
cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize);
cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice);
}
inited = true;
}
@@ -1823,10 +1964,10 @@ namespace CUETools.Codecs.FLACCL
public unsafe void do_output_frames(int nFrames)
{
send_to_GPU(task1, nFrames, eparams.block_size);
run_GPU_task(task1);
if (task2.frameCount > 0)
task2.openCLCQ.Finish();
send_to_GPU(task1, nFrames, eparams.block_size);
run_GPU_task(task1);
if (task2.frameCount > 0)
{
if (cpu_tasks != null)
@@ -1871,15 +2012,16 @@ namespace CUETools.Codecs.FLACCL
{
int blocksize = Flake.flac_blocksizes[1];
int target = (samplerate * time_ms) / 1000;
if (eparams.variable_block_size > 0)
{
blocksize = 1024;
while (target >= blocksize)
blocksize <<= 1;
return blocksize >> 1;
}
for (int i = 0; i < Flake.flac_blocksizes.Length; i++)
////if (eparams.variable_block_size > 0)
////{
//// blocksize = 1024;
//// while (target >= blocksize)
//// blocksize <<= 1;
//// return blocksize >> 1;
////}
for (int i = 8; i < Flake.flac_blocksizes.Length; i++)
if (target >= Flake.flac_blocksizes[i] && Flake.flac_blocksizes[i] > blocksize)
{
blocksize = Flake.flac_blocksizes[i];
@@ -2052,18 +2194,6 @@ namespace CUETools.Codecs.FLACCL
}
if (i == 8)
throw new Exception("non-standard bps");
// FIXME: For now, only 16-bit encoding is supported
if (bits_per_sample != 16)
throw new Exception("non-standard bps");
if (_blocksize == 0)
{
if (eparams.block_size == 0)
eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms);
_blocksize = eparams.block_size;
}
else
eparams.block_size = _blocksize;
// set maximum encoded frame size (if larger, re-encodes in verbatim mode)
if (channels == 2)
@@ -2332,7 +2462,7 @@ namespace CUETools.Codecs.FLACCL
public int type;
public int obits;
public int blocksize;
public int best_index;
public int coding_method;
public int channel;
public int residualOffs;
public int wbits;
@@ -2350,6 +2480,7 @@ namespace CUETools.Codecs.FLACCL
public Kernel clStereoDecorr;
//public Kernel cudaChannelDecorr;
public Kernel clChannelDecorr2;
public Kernel clChannelDecorrX;
public Kernel clFindWastedBits;
public Kernel clComputeAutocor;
public Kernel clComputeLPC;
@@ -2428,9 +2559,15 @@ namespace CUETools.Codecs.FLACCL
public int groupSize = 128;
public int channels, channelsCount;
public FLACCLWriter writer;
public bool UseGPUOnly = false;
public bool UseGPURice = false;
public bool UseMappedMemory = false;
unsafe public FLACCLTask(Program _openCLProgram, int channelsCount, int channels, uint bits_per_sample, int max_frame_size, FLACCLWriter writer, int groupSize)
unsafe public FLACCLTask(Program _openCLProgram, int channelsCount, int channels, uint bits_per_sample, int max_frame_size, FLACCLWriter writer, int groupSize, bool gpuOnly, bool gpuRice)
{
this.UseGPUOnly = gpuOnly;
this.UseGPURice = gpuOnly && gpuRice;
this.UseMappedMemory = writer._settings.MappedMemory || writer._settings.DeviceType == OpenCLDeviceType.CPU;
this.groupSize = groupSize;
this.channels = channels;
this.channelsCount = channelsCount;
@@ -2448,9 +2585,9 @@ namespace CUETools.Codecs.FLACCL
int MAX_CHANNELSIZE = MAX_FRAMES * writer.eparams.block_size;
residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES;
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES;
int samplesBufferLen = sizeof(int) * MAX_CHANNELSIZE * channelsCount;
int samplesBufferLen = writer.PCM.BlockAlign * MAX_CHANNELSIZE * channelsCount;
int residualBufferLen = sizeof(int) * MAX_CHANNELSIZE * channels; // need to adjust residualOffset?
int partitionsLen = sizeof(int) * (30 << 8) * channels * MAX_FRAMES;
int partitionsLen = sizeof(int) * ((writer.PCM.BitsPerSample > 16 ? 31 : 15) * 2 << 8) * channels * MAX_FRAMES;
int riceParamsLen = sizeof(int) * (4 << 8) * channels * MAX_FRAMES;
int autocorLen = sizeof(float) * (MAX_ORDER + 1) * lpc.MAX_LPC_WINDOWS * channelsCount * MAX_FRAMES;
int lpcDataLen = autocorLen * 32;
@@ -2459,7 +2596,7 @@ namespace CUETools.Codecs.FLACCL
int selectedLen = sizeof(int) * 32 * channelsCount * MAX_FRAMES;
int riceLen = sizeof(int) * channels * MAX_CHANNELSIZE;
if (!writer._settings.MappedMemory)
if (!this.UseMappedMemory)
{
clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen / 2);
clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualBufferLen);
@@ -2521,7 +2658,7 @@ namespace CUETools.Codecs.FLACCL
clAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, autocorLen);
clSelectedTasksSecondEstimate = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen);
clSelectedTasksBestMethod = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen);
if (writer._settings.GPUOnly)
if (UseGPUOnly)
{
clPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen);
clRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen);
@@ -2533,6 +2670,7 @@ namespace CUETools.Codecs.FLACCL
clStereoDecorr = openCLProgram.CreateKernel("clStereoDecorr");
//cudaChannelDecorr = openCLProgram.CreateKernel("clChannelDecorr");
clChannelDecorr2 = openCLProgram.CreateKernel("clChannelDecorr2");
clChannelDecorrX = openCLProgram.CreateKernel("clChannelDecorrX");
clFindWastedBits = openCLProgram.CreateKernel("clFindWastedBits");
clComputeLPC = openCLProgram.CreateKernel("clComputeLPC");
clQuantizeLPC = openCLProgram.CreateKernel("clQuantizeLPC");
@@ -2540,15 +2678,16 @@ namespace CUETools.Codecs.FLACCL
clSelectStereoTasks = openCLProgram.CreateKernel("clSelectStereoTasks");
clEstimateResidual = openCLProgram.CreateKernel("clEstimateResidual");
clChooseBestMethod = openCLProgram.CreateKernel("clChooseBestMethod");
if (writer._settings.GPUOnly)
if (UseGPUOnly)
{
clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
clSumPartition = openCLProgram.CreateKernel("clSumPartition");
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
if (writer._settings.DoRice)
if (UseGPURice)
{
clCalcOutputOffsets = openCLProgram.CreateKernel("clCalcOutputOffsets");
clRiceEncoding = openCLProgram.CreateKernel("clRiceEncoding");
@@ -2586,6 +2725,7 @@ namespace CUETools.Codecs.FLACCL
clStereoDecorr.Dispose();
//cudaChannelDecorr.Dispose();
clChannelDecorr2.Dispose();
clChannelDecorrX.Dispose();
clFindWastedBits.Dispose();
clComputeLPC.Dispose();
clQuantizeLPC.Dispose();
@@ -2593,15 +2733,16 @@ namespace CUETools.Codecs.FLACCL
clSelectStereoTasks.Dispose();
clEstimateResidual.Dispose();
clChooseBestMethod.Dispose();
if (writer._settings.GPUOnly)
if (UseGPUOnly)
{
clEncodeResidual.Dispose();
clCalcPartition.Dispose();
clCalcPartition16.Dispose();
if (openCLCQ.Device.DeviceType != DeviceType.CPU)
clCalcPartition16.Dispose();
clSumPartition.Dispose();
clFindRiceParameter.Dispose();
clFindPartitionOrder.Dispose();
if (writer._settings.DoRice)
if (UseGPURice)
{
clCalcOutputOffsets.Dispose();
clRiceEncoding.Dispose();
@@ -2611,7 +2752,7 @@ namespace CUETools.Codecs.FLACCL
clRiceParams.Dispose();
}
if (!writer._settings.MappedMemory)
if (!this.UseMappedMemory)
{
if (clSamplesBytesPtr != IntPtr.Zero)
openCLCQ.EnqueueUnmapMemObject(clSamplesBytesPinned, clSamplesBytesPtr);
@@ -2701,19 +2842,36 @@ namespace CUETools.Codecs.FLACCL
while ((frameSize >> max_porder) < 16 && max_porder > 0)
this.max_porder--;
if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel
Kernel clChannelDecorr = channels == 2 ? (channelsCount == 4 ? clStereoDecorr : clChannelDecorr2) : null;// cudaChannelDecorr;
// openCLCQ.EnqueueMapBuffer(cudaSamplesBytes
//openCLCQ.EnqueueUnmapMemObject(cudaSamplesBytes, cudaSamplesBytes.HostPtr);
// issue work to the GPU
clChannelDecorr.SetArgs(
clSamples,
clSamplesBytes,
channelSize / 4);
if (channels == 2)
{
Kernel clChannelDecorr = channelsCount == 4 ? clStereoDecorr : clChannelDecorr2;
int channelSize1 = writer.PCM.BitsPerSample == 16 ? channelSize / 4 : channelSize;
clChannelDecorr.SetArgs(
clSamples,
clSamplesBytes,
channelSize1);
openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, channelSize / 4);
openCLCQ.EnqueueNDRangeKernel(
clChannelDecorr,
0,
channelSize1);
}
else
{
clChannelDecorrX.SetArgs(
clSamples,
clSamplesBytes,
channelSize);
openCLCQ.EnqueueNDRangeKernel(
clChannelDecorrX,
0,
channelSize);
}
//openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, (frameSize * frameCount + 3) / 4);
if (eparams.do_wasted)
@@ -2842,14 +3000,22 @@ namespace CUETools.Codecs.FLACCL
0, channels * frameCount);
}
if (writer._settings.GPUOnly)
if (UseGPUOnly)
{
if (frameSize >> max_porder == 16)
clEncodeResidual.SetArgs(
clResidual,
clSamples,
clBestResidualTasks);
openCLCQ.EnqueueNDRangeKernel(
clEncodeResidual,
groupSize, channels * frameCount);
if ((frameSize >> max_porder == 16) && openCLCQ.Device.DeviceType != DeviceType.CPU)
{
clCalcPartition16.SetArgs(
clPartitions,
clResidual,
clSamples,
clBestResidualTasks,
max_porder);
@@ -2859,15 +3025,6 @@ namespace CUETools.Codecs.FLACCL
}
else
{
clEncodeResidual.SetArgs(
clResidual,
clSamples,
clBestResidualTasks);
openCLCQ.EnqueueNDRangeKernel(
clEncodeResidual,
groupSize, channels * frameCount);
clCalcPartition.SetArgs(
clPartitions,
clResidual,
@@ -2895,6 +3052,7 @@ namespace CUETools.Codecs.FLACCL
clPartitions,
max_porder);
int maxK = writer.PCM.BitsPerSample > 16 ? 30 : Flake.MAX_RICE_PARAM;
if (openCLCQ.Device.DeviceType == DeviceType.CPU)
openCLCQ.EnqueueNDRangeKernel(
clSumPartition,
@@ -2904,7 +3062,7 @@ namespace CUETools.Codecs.FLACCL
openCLCQ.EnqueueNDRangeKernel(
clSumPartition,
128, 1,
(Flake.MAX_RICE_PARAM + 1),
(maxK + 1),
channels * frameCount);
}
@@ -2931,7 +3089,7 @@ namespace CUETools.Codecs.FLACCL
groupSize,
channels * frameCount);
if (writer._settings.DoRice)
if (UseGPURice)
{
clCalcOutputOffsets.SetArgs(
clResidual,
@@ -2960,10 +3118,10 @@ namespace CUETools.Codecs.FLACCL
channels * frameCount);
}
if (!writer._settings.MappedMemory)
if (!this.UseMappedMemory)
{
if (writer._settings.DoRice)
openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, (channels * frameSize * 17 + 128) / 8 * frameCount, clRiceOutputPtr);
if (UseGPURice)
openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, (channels * frameSize * (writer.PCM.BitsPerSample + 1) + 256) / 8 * frameCount, clRiceOutputPtr);
else
{
openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParamsPtr);
@@ -2971,7 +3129,7 @@ namespace CUETools.Codecs.FLACCL
}
}
}
if (!writer._settings.MappedMemory)
if (!this.UseMappedMemory)
openCLCQ.EnqueueReadBuffer(clBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * frameCount, clBestResidualTasksPtr);
}
}

View File

@@ -65,6 +65,14 @@
#define WARP_SIZE 32
// Rice parameter limits, selected by the BITS_PER_SAMPLE preprocessor symbol:
//   MAX_RICE_PARAM  - largest Rice parameter value the kernels will choose;
//   RICE_PARAM_BITS - number of bits used to write each Rice parameter.
// Inputs wider than 16 bits get the larger range (5-bit field, values 0..30);
// everything else keeps the 4-bit field with values 0..14.
// NOTE(review): presumably these correspond to FLAC's two residual coding
// methods (5-bit vs 4-bit Rice parameters) - confirm against the format spec.
#if BITS_PER_SAMPLE > 16
#define MAX_RICE_PARAM 30
#define RICE_PARAM_BITS 5
#else
#define MAX_RICE_PARAM 14
#define RICE_PARAM_BITS 4
#endif
typedef enum
{
Constant = 0,
@@ -83,7 +91,7 @@ typedef struct
int type;
int obits;
int blocksize;
int best_index;
int coding_method;
int channel;
int residualOffs;
int wbits;
@@ -125,6 +133,49 @@ __kernel void clWindowTukey(__global float* window, int windowOffset, float p)
}
#endif
#if BITS_PER_SAMPLE > 16
// Unpacks interleaved two-channel 24-bit samples and writes four planes.
// Each work-item handles one frame (6 bytes of src: 3 bytes for the first
// channel followed by 3 bytes for the second).
//   samples - output; four planes of `offset` ints each:
//             plane 0 = first channel, plane 1 = second channel,
//             plane 2 = (first + second) >> 1, plane 3 = first - second
//   src     - packed input bytes, least significant byte of a sample first
//   offset  - distance, in ints, between consecutive output planes
__kernel void clStereoDecorr(
    __global int *samples,
    __global unsigned char *src,
    int offset
)
{
    const int idx = get_global_id(0);
    __global unsigned char *frame = src + idx * 6;

    // Place the three bytes in the upper 24 bits of an int, then shift
    // right by 8 so the sign of the 24-bit value is extended.
    int first = (int)frame[0] << 8;
    first |= (int)frame[1] << 16;
    first |= (int)frame[2] << 24;
    first >>= 8;

    int second = (int)frame[3] << 8;
    second |= (int)frame[4] << 16;
    second |= (int)frame[5] << 24;
    second >>= 8;

    __global int *out = samples + idx;
    out[0] = first;
    out[1 * offset] = second;
    out[2 * offset] = (first + second) >> 1;
    out[3 * offset] = first - second;
}
// Splits interleaved two-channel 24-bit samples into two separate planes
// without any inter-channel transform. Each work-item handles one frame
// (6 bytes of src).
//   samples - output; plane 0 = first channel, plane 1 = second channel
//   src     - packed input bytes, least significant byte of a sample first
//   offset  - distance, in ints, between the two output planes
__kernel void clChannelDecorr2(
    __global int *samples,
    __global unsigned char *src,
    int offset
)
{
    const int idx = get_global_id(0);
    __global unsigned char *frame = src + idx * 6;

    // Build each value in the top 24 bits, then shift right by 8 to
    // sign-extend it into a full int.
    int first = (int)frame[0] << 8;
    first |= (int)frame[1] << 16;
    first |= (int)frame[2] << 24;

    int second = (int)frame[3] << 8;
    second |= (int)frame[4] << 16;
    second |= (int)frame[5] << 24;

    samples[idx] = first >> 8;
    samples[offset + idx] = second >> 8;
}
// Splits interleaved 24-bit samples with MAX_CHANNELS channels per frame
// into one plane per channel. Each work-item handles one frame
// (3 * MAX_CHANNELS bytes of src).
//   samples - output; plane `ch` starts at samples + offset * ch
//   src     - packed input bytes, least significant byte of a sample first
//   offset  - distance, in ints, between consecutive output planes
__kernel void clChannelDecorrX(
    __global int *samples,
    __global unsigned char *src,
    int offset
)
{
    const int idx = get_global_id(0);
    __global unsigned char *in = src + 3 * (idx * MAX_CHANNELS);
    __global int *out = samples + idx;

    for (int ch = 0; ch < MAX_CHANNELS; ch++, in += 3, out += offset)
    {
        // Assemble the sample in the upper 24 bits so that the final
        // right shift by 8 extends its sign.
        int value = (int)in[0] << 8;
        value |= (int)in[1] << 16;
        value |= (int)in[2] << 24;
        *out = value >> 8;
    }
}
#else
__kernel void clStereoDecorr(
__global int4 *samples,
__global int4 *src,
@@ -153,6 +204,21 @@ __kernel void clChannelDecorr2(
samples[offset + pos] = s >> 16;
}
// Splits interleaved 16-bit samples with MAX_CHANNELS channels per frame
// into one plane per channel, widening each sample to int. Each work-item
// handles one frame (MAX_CHANNELS shorts of src).
//   samples - output; plane `ch` starts at samples + offset * ch
//   src     - interleaved input samples
//   offset  - distance, in ints, between consecutive output planes
__kernel void clChannelDecorrX(
    __global int *samples,
    __global short *src,
    int offset
)
{
    const int idx = get_global_id(0);
    __global short *in = src + idx * MAX_CHANNELS;
    __global int *out = samples + idx;

    for (int ch = 0; ch < MAX_CHANNELS; ch++, out += offset)
        *out = in[ch];
}
#endif
//__kernel void clChannelDecorr(
// int *samples,
// short *src,
@@ -598,7 +664,11 @@ void clQuantizeLPC(
}
// choose precision
//int cbits = max(3, min(10, 5 + (abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
#if BITS_PER_SAMPLE > 16
int cbits = max(3, min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits));
#else
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - abits));
#endif
// calculate shift based on precision and number of leading zeroes in coeffs
int shift = max(0,min(15, clz(tmpi) - 18 + cbits));
@@ -749,7 +819,11 @@ void clQuantizeLPC(
//SUM32(shared.tmpi,tid,|=);
// choose precision
//int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
#if BITS_PER_SAMPLE > 16
int cbits = max(3, min(min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), 15));
#else
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits));
#endif
// calculate shift based on precision and number of leading zeroes in coeffs
int shift = max(0,min(15, clz(shared.maxcoef[i]) - 18 + cbits));
@@ -797,7 +871,6 @@ void clQuantizeLPC(
#endif
#ifdef FLACCL_CPU
inline int fastclz(int iv)
{
unsigned int v = (unsigned int)iv;
@@ -809,17 +882,44 @@ inline int fastclz(int iv)
x += (0 != (v >> x));
return 32 - x;
}
inline int calc_residual(__global int *ptr, int * coefs, int ro)
#else
inline int fastclz(int iv)
{
int sum = 0;
return clz(iv);
}
#endif
// Counts the leading zero bits of a 64-bit value by delegating to the
// 32-bit fastclz: if the upper half is non-zero the answer is its count,
// otherwise it is 32 plus the count for the lower half.
inline int fastclz64(long iv)
{
    const unsigned long bits = (unsigned long)iv;
    const unsigned long upper = bits >> 32;

    if (upper != 0)
        return fastclz(upper);

    // Upper half is empty: all 32 of its bits are leading zeros.
    return 32 + fastclz(bits);
}
// Width-dependent arithmetic helpers for residual computation, selected by
// the BITS_PER_SAMPLE preprocessor symbol:
//   residual_t      - scalar type used to accumulate prediction sums
//   residual_log(s) - index of the highest set bit of s (63/31 minus its
//                     leading-zero count)
//   convert_bps4    - converts a 4-component vector to the working width
//   convert_bps_sat - converts a working-width scalar back to int
//   bpsint4         - 4-component vector type of the working width
#if BITS_PER_SAMPLE > 16
// Wider than 16 bits: work in 64-bit integers and saturate when narrowing
// back to int.
typedef long residual_t;
#define residual_log(s) (63 - fastclz64(s))
#define convert_bps4 convert_long4
#define convert_bps_sat convert_int_sat
#define bpsint4 long4
#else
// 16 bits or fewer: stay in 32-bit integers. The two convert_* macros expand
// to nothing, so `convert_bps4(x)` / `convert_bps_sat(x)` become just `(x)`.
typedef int residual_t;
#define residual_log(s) (31 - fastclz(s))
#define convert_bps4
#define convert_bps_sat
#define bpsint4 int4
#endif
#ifdef FLACCL_CPU
inline residual_t calc_residual(__global int *ptr, int * coefs, int ro)
{
residual_t sum = 0;
for (int i = 0; i < ro; i++)
sum += ptr[i] * coefs[i];
sum += (residual_t) ptr[i] * coefs[i];
return sum;
}
#define ENCODE_N(cro,action) for (int pos = cro; pos < bs; pos ++) { \
int t = (data[pos] - (calc_residual(data + pos - cro, task.coefs, cro) >> task.data.shift)) >> task.data.wbits; \
residual_t t = (data[pos] - (calc_residual(data + pos - cro, task.coefs, cro) >> task.data.shift)) >> task.data.wbits; \
action; \
}
#define SWITCH_N(action) \
@@ -861,7 +961,7 @@ void clEstimateResidual(
for (int i = 0; i < 1 << EPO; i++)
len[i] = 0;
#ifdef AMD
#if defined(AMD) || BITS_PER_SAMPLE > 16
SWITCH_N((t = (t << 1) ^ (t >> 31), len[pos >> (12 - EPO)] += t & 0x7fffff))
#else
int4 c0 = vload4(0, &task.coefs[0]);
@@ -884,21 +984,19 @@ void clEstimateResidual(
int total = 0;
for (int i = 0; i < 1 << EPO; i++)
{
int res = min(0x7fffff,len[i]);
int k = iclamp(31 - (12 - EPO) - fastclz(res), 0, 14); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
int res = len[i];
int k = iclamp(31 - fastclz(res) - (12 - EPO), 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
total += (k << (12 - EPO)) + (res >> k);
}
int partLen = min(0x7ffffff, total) + (bs - ro);
int obits = task.data.obits - task.data.wbits;
tasks[selectedTask].data.size = min(obits * bs,
task.data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen :
task.data.type == LPC ? ro * obits + 4 + 5 + ro * task.data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
task.data.type == Fixed ? ro * obits + 6 + RICE_PARAM_BITS + partLen :
task.data.type == LPC ? ro * obits + 4 + 5 + ro * task.data.cbits + 6 + RICE_PARAM_BITS/* << porder */ + partLen :
task.data.type == Constant ? obits * select(1, bs, partLen != bs - ro) :
obits * bs);
}
#else
#define MAX_BLOCKSIZE 4096
#define ESTPARTLOG 5
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
@@ -1049,7 +1147,7 @@ void clEstimateResidual(
// calculate rice partition bit length for every 32 samples
barrier(CLK_LOCAL_MEM_FENCE);
// Bug: if (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) > GROUP_SIZE
int pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? pl = psum[tid * 2] + psum[tid * 2 + 1] : 0;
int pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? psum[tid * 2] + psum[tid * 2 + 1] : 0;
barrier(CLK_LOCAL_MEM_FENCE);
// for (int pos = 0; pos < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2; pos += GROUP_SIZE)
// {
@@ -1060,7 +1158,7 @@ void clEstimateResidual(
//if (offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
// psum[offs] = pl;
// }
int k = iclamp(31 - (ESTPARTLOG + 1) - clz(pl), 0, 14); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32)
int k = iclamp(31 - fastclz(pl) - (ESTPARTLOG + 1), 0, MAX_RICE_PARAM); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32)
if (tid < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
psum[tid] = (k << (ESTPARTLOG + 1)) + (pl >> k);
barrier(CLK_LOCAL_MEM_FENCE);
@@ -1075,8 +1173,8 @@ void clEstimateResidual(
int pl = psum[0] + (bs - ro);
int obits = task.data.obits - task.data.wbits;
int len = min(obits * task.data.blocksize,
task.data.type == Fixed ? task.data.residualOrder * obits + 6 + (4 * 1/2) + pl :
task.data.type == LPC ? task.data.residualOrder * obits + 4 + 5 + task.data.residualOrder * task.data.cbits + 6 + (4 * 1/2)/* << porder */ + pl :
task.data.type == Fixed ? task.data.residualOrder * obits + 6 + RICE_PARAM_BITS + pl :
task.data.type == LPC ? task.data.residualOrder * obits + 4 + 5 + task.data.residualOrder * task.data.cbits + 6 + RICE_PARAM_BITS/* << porder */ + pl :
task.data.type == Constant ? obits * select(1, task.data.blocksize, pl != task.data.blocksize - task.data.residualOrder) :
obits * task.data.blocksize);
tasks[selectedTask].data.size = len;
@@ -1172,7 +1270,7 @@ void clEncodeResidual(
int bs = task.data.blocksize;
int ro = task.data.residualOrder;
__global int *data = &samples[task.data.samplesOffs];
SWITCH_N(residual[task.data.residualOffs + pos] = t);
SWITCH_N(residual[task.data.residualOffs + pos] = convert_bps_sat(t));
}
#else
// get_group_id(0) == task index
@@ -1198,12 +1296,10 @@ void clEncodeResidual(
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef AMD
int4 cptr0 = vload4(0, &task.coefs[0]);
int4 cptr1 = vload4(1, &task.coefs[0]);
bpsint4 cptr0 = convert_bps4(vload4(0, &task.coefs[0]));
bpsint4 cptr1 = convert_bps4(vload4(1, &task.coefs[0]));
#if MAX_ORDER > 8
int4 cptr2 = vload4(2, &task.coefs[0]);
#endif
bpsint4 cptr2 = convert_bps4(vload4(2, &task.coefs[0]));
#endif
data[tid] = 0;
@@ -1217,33 +1313,24 @@ void clEncodeResidual(
// compute residual
__local int* dptr = &data[tid + GROUP_SIZE - ro];
int4 sum
#ifdef AMD
= cptr0 * vload4(0, dptr)
+ cptr1 * vload4(1, dptr)
#else
= vload4(0, &task.coefs[0]) * vload4(0, dptr)
+ vload4(1, &task.coefs[0]) * vload4(1, dptr)
#endif
bpsint4 sum
= cptr0 * convert_bps4(vload4(0, dptr))
+ cptr1 * convert_bps4(vload4(1, dptr))
#if MAX_ORDER > 8
#ifdef AMD
+ cptr2 * vload4(2, dptr)
#else
+ vload4(2, &task.coefs[0]) * vload4(2, dptr)
#endif
+ cptr2 * convert_bps4(vload4(2, dptr))
#if MAX_ORDER > 12
+ vload4(3, &task.coefs[0]) * vload4(3, dptr)
+ convert_bps4(vload4(3, &task.coefs[0])) * convert_bps4(vload4(3, dptr))
#if MAX_ORDER > 16
+ vload4(4, &task.coefs[0]) * vload4(4, dptr)
+ vload4(5, &task.coefs[0]) * vload4(5, dptr)
+ vload4(6, &task.coefs[0]) * vload4(6, dptr)
+ vload4(7, &task.coefs[0]) * vload4(7, dptr)
+ convert_bps4(vload4(4, &task.coefs[0])) * convert_bps4(vload4(4, dptr))
+ convert_bps4(vload4(5, &task.coefs[0])) * convert_bps4(vload4(5, dptr))
+ convert_bps4(vload4(6, &task.coefs[0])) * convert_bps4(vload4(6, dptr))
+ convert_bps4(vload4(7, &task.coefs[0])) * convert_bps4(vload4(7, dptr))
#endif
#endif
#endif
;
if (off >= ro && off < bs)
output[task.data.residualOffs + off] = data[tid + GROUP_SIZE] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift);
output[task.data.residualOffs + off] = convert_bps_sat(nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift));
barrier(CLK_LOCAL_MEM_FENCE);
data[tid] = nextData;
@@ -1254,7 +1341,7 @@ void clEncodeResidual(
#ifdef FLACCL_CPU
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clCalcPartition(
__global int *partition_lengths,
__global ulong *partition_lengths,
__global int *residual,
__global FLACCLSubframeTask *tasks,
int max_porder, // <= 8
@@ -1265,18 +1352,16 @@ void clCalcPartition(
int bs = task.data.blocksize;
int ro = task.data.residualOrder;
//int psize = bs >> max_porder;
__global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
__global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
for (int p = 0; p < (1 << max_porder); p++)
pl[p] = 0;
pl[p] = 0UL;
for (int pos = ro; pos < bs; pos ++)
{
int t = residual[task.data.residualOffs + pos];
// overflow protection
t = clamp(t, -0x7fffff, 0x7fffff);
int s = residual[task.data.residualOffs + pos];
// convert to unsigned
t = (t << 1) ^ (t >> 31);
uint t = (s << 1) ^ (s >> 31);
pl[pos / psize] += t;
}
}
@@ -1292,15 +1377,15 @@ void clCalcPartition(
int psize // == task.blocksize >> max_porder?
)
{
__local int pl[(GROUP_SIZE / 8)][15];
__local uint pl[(GROUP_SIZE / 16)][MAX_RICE_PARAM + 1];
__local FLACCLSubframeData task;
const int tid = get_local_id(0);
if (tid < sizeof(task) / sizeof(int))
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
if (tid < (GROUP_SIZE / 8))
if (tid < (GROUP_SIZE / 16))
{
for (int k = 0; k <= 14; k++)
for (int k = 0; k <= MAX_RICE_PARAM; k++)
pl[tid][k] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -1311,14 +1396,14 @@ void clCalcPartition(
{
// fetch residual
int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0;
// overflow protection
s = iclamp(s, -0x7fffff, 0x7fffff);
// convert to unsigned
s = (s << 1) ^ (s >> 31);
uint t = (s << 1) ^ (s >> 31);
// calc number of unary bits for each residual sample with each rice parameter
int part = (offs - start) / psize + (tid & 1) * (GROUP_SIZE / 16);
for (int k = 0; k <= 14; k++)
atom_add(&pl[part][k], s >> k);
int part = (offs - start) / psize;
// we must ensure that psize * (t >> k) doesn't overflow;
// i.e. t < ((1 << 32) >> (log2(psize) - k)) <= (1 << 32) >> (32 - clz(MAX_BLOCKSIZE) - k)
for (int k = 0; k <= MAX_RICE_PARAM; k++)
atom_add(&pl[part][k], min(t, 0xffffffffU >> max(0, 32 - clz(MAX_BLOCKSIZE) - k)) >> k);
//pl[part][k] += s >> k;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -1326,141 +1411,79 @@ void clCalcPartition(
int part = get_group_id(0) * (GROUP_SIZE / 16) + tid;
if (tid < (GROUP_SIZE / 16) && part < (1 << max_porder))
{
for (int k = 0; k <= 14; k++)
for (int k = 0; k <= MAX_RICE_PARAM; k++)
{
// output length
const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
int plen = pl[tid][k] + pl[tid + (GROUP_SIZE / 16)][k];
partition_lengths[pos + part] = min(0x7fffff, plen) + (psize - select(0, task.residualOrder, part == 0)) * (k + 1);
const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
uint plen = pl[tid][k];
partition_lengths[pos + part] = min(0x007fffffU, plen) + (uint)(psize - select(0, task.residualOrder, part == 0)) * (k + 1);
// if (get_group_id(1) == 0)
//printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1));
}
}
}
#endif
#ifdef FLACCL_CPU
// get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clCalcPartition16(
__global int *partition_lengths,
__global int *residual,
__global int *samples,
__global FLACCLSubframeTask *tasks,
int max_porder // <= 8
)
{
FLACCLSubframeTask task = tasks[get_global_id(0)];
int bs = task.data.blocksize;
int ro = task.data.residualOrder;
__global int *data = &samples[task.data.samplesOffs];
__global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_global_id(0);
for (int p = 0; p < (1 << max_porder); p++)
pl[p] = 0;
__global int *rptr = residual + task.data.residualOffs;
SWITCH_N((rptr[pos] = t, pl[pos >> 4] += (t << 1) ^ (t >> 31)));
//SWITCH_N((residual[task.data.residualOffs + pos] = t, t = (t << 1) ^ (t >> 31), pl[pos >> 4] += t));
}
#else
// get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clCalcPartition16(
__global int *partition_lengths,
__global unsigned int *partition_lengths,
__global int *residual,
__global int *samples,
__global FLACCLSubframeTask *tasks,
int max_porder // <= 8
)
{
__local FLACCLSubframeTask task;
__local int data[GROUP_SIZE * 2];
__local int res[GROUP_SIZE];
__local int pl[GROUP_SIZE >> 4][15];
__local FLACCLSubframeData task;
__local unsigned int res[GROUP_SIZE];
__local unsigned int pl[GROUP_SIZE >> 4][MAX_RICE_PARAM + 1];
const int tid = get_local_id(0);
if (tid < sizeof(task) / sizeof(int))
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
barrier(CLK_LOCAL_MEM_FENCE);
int bs = task.data.blocksize;
int ro = task.data.residualOrder;
int sh = task.data.shift;
if (tid >= ro && tid < 32)
task.coefs[tid] = 0;
int k = tid & 15;
int x = tid / 16;
int bs = task.blocksize;
int ro = task.residualOrder;
barrier(CLK_LOCAL_MEM_FENCE);
__global int * rptr = &residual[task.data.residualOffs];
__global int * plptr = &partition_lengths[(15 << (max_porder + 1)) * get_group_id(0) + (k << (max_porder + 1))];
__local int* dptr = &data[tid + GROUP_SIZE - ro];
int4 cptr0 = vload4(0, &task.coefs[0]);
int4 cptr1 = vload4(1, &task.coefs[0]);
int4 cptr2 = vload4(2, &task.coefs[0]);
data[tid] = 0;
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
{
int offs = pos + tid;
// fetch samples
int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0;
data[tid + GROUP_SIZE] = nextData;
barrier(CLK_LOCAL_MEM_FENCE);
// compute residual
int4 sum = cptr0 * vload4(0, dptr)
#if MAX_ORDER > 4
+ cptr1 * vload4(1, dptr)
#if MAX_ORDER > 8
+ cptr2 * vload4(2, dptr)
#if MAX_ORDER > 12
+ vload4(3, &task.coefs[0]) * vload4(3, dptr)
#if MAX_ORDER > 16
+ vload4(4, &task.coefs[0]) * vload4(4, dptr)
+ vload4(5, &task.coefs[0]) * vload4(5, dptr)
+ vload4(6, &task.coefs[0]) * vload4(6, dptr)
+ vload4(7, &task.coefs[0]) * vload4(7, dptr)
#endif
#endif
#endif
#endif
;
int s = select(0, nextData - ((sum.x + sum.y + sum.z + sum.w) >> sh), offs >= ro && offs < bs);
// output residual
if (offs < bs)
rptr[offs] = s;
s = iclamp(s, -0x7fffff, 0x7fffff);
// fetch residual
int s = (offs >= ro && offs < bs) ? residual[task.residualOffs + offs] : 0;
// convert to unsigned
res[tid] = (s << 1) ^ (s >> 31);
// for (int k = 0; k < 15; k++) atom_add(&pl[x][k], s >> k);
barrier(CLK_LOCAL_MEM_FENCE);
for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16)
{
// calc number of unary bits for each group of 16 residual samples
// with each rice parameter.
int k = k0 + (tid & 15);
int x = tid >> 4;
// we must ensure that psize * (t >> k) doesn't overflow;
// i.e. t < ((1 << 32) >> (log2(16) - k)) <= (1 << 32) >> (4 - k)
uint4 lim = 0xffffffffU >> max(0, 4 - k);
__local uint * chunk = &res[x << 4];
uint4 rsum = (min(lim,vload4(0,chunk)) >> k) + (min(lim,vload4(1,chunk)) >> k) + (min(lim,vload4(2,chunk)) >> k) + (min(lim,vload4(3,chunk)) >> k);
uint rs = rsum.x + rsum.y + rsum.z + rsum.w;
// We can safely limit length here to 0x007fffffU, not causing length
// mismatch, because any such length would cause Verbatim frame anyway.
// And this limit protects us from overflows when calculating larger
// partitions, as we can have a maximum of 2^8 partitions, resulting
// in maximum partition length of 0x7fffffffU + change.
if (k <= MAX_RICE_PARAM) pl[x][k] = min(0x007fffffU, rs) + (uint)(16 - select(0, ro, offs < 16)) * (k + 1);
}
barrier(CLK_LOCAL_MEM_FENCE);
data[tid] = nextData;
// calc number of unary bits for each residual sample with each rice paramater
__local int * chunk = &res[x << 4];
sum = (vload4(0,chunk) >> k) + (vload4(1,chunk) >> k) + (vload4(2,chunk) >> k) + (vload4(3,chunk) >> k);
s = sum.x + sum.y + sum.z + sum.w;
#if 0
if (k <= 14 && offs < bs)
plptr[offs >> 4] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
#else
if (k <= 14) pl[x][k] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
barrier(CLK_LOCAL_MEM_FENCE);
int k1 = tid >> 3, x1 = tid & 7;
if (k1 <= 14 && (pos >> 4) + x1 < (1 << max_porder))
partition_lengths[(15 << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1];
#endif
// if (task.data.blocksize == 16 && x == 0 && k <= 14)
// printf("[%d] = %d = s:%d + %d * (k:%d + 1), ro=%d, offs=%d, lpos=%d\n", k, partition_lengths[lpos], s, (16 - select(0, ro, offs < 16)), k, ro, offs, lpos);
for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16)
{
int k1 = k0 + (tid >> 3), x1 = tid & 7;
if (k1 <= MAX_RICE_PARAM && (pos >> 4) + x1 < (1 << max_porder))
partition_lengths[((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1];
}
}
}
#endif
@@ -1471,13 +1494,13 @@ void clCalcPartition16(
// get_group_id(1) == task index
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clSumPartition(
__global int* partition_lengths,
__global ulong* partition_lengths,
int max_porder
)
{
if (get_group_id(0) != 0) // ignore k != 0
return;
__global int * sums = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
__global ulong * sums = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
for (int i = max_porder - 1; i >= 0; i--)
{
for (int j = 0; j < (1 << i); j++)
@@ -1496,15 +1519,15 @@ void clSumPartition(
// get_group_id(1) == task index
__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
void clSumPartition(
__global int* partition_lengths,
__global uint* partition_lengths,
int max_porder
)
{
__local int data[256]; // max_porder <= 8, data length <= 1 << 9.
const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1));
__local uint data[256]; // max_porder <= 8, data length <= 1 << 9.
const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1));
// fetch partition lengths
int2 pl = get_local_id(0) * 2 < (1 << max_porder) ? vload2(get_local_id(0),&partition_lengths[pos]) : 0;
uint2 pl = get_local_id(0) * 2 < (1 << max_porder) ? vload2(get_local_id(0),&partition_lengths[pos]) : 0;
data[get_local_id(0)] = pl.x + pl.y;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -1512,7 +1535,7 @@ void clSumPartition(
int out_pos = (1 << (max_porder - 1)) + get_local_id(0);
for (int bs = 1 << (max_porder - 2); bs > 0; bs >>= 1)
{
if (get_local_id(0) < bs) data[out_pos] = data[in_pos] + data[in_pos + 1];
if (get_local_id(0) < bs) data[out_pos] = data[in_pos] + data[in_pos + 1];
in_pos += bs << 1;
out_pos += bs;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -1531,7 +1554,7 @@ __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clFindRiceParameter(
__global FLACCLSubframeTask *tasks,
__global int* rice_parameters,
__global int* partition_lengths,
__global ulong* partition_lengths,
int max_porder
)
{
@@ -1541,7 +1564,7 @@ void clFindRiceParameter(
//int psize = task->data.blocksize >> max_porder;
int bs = task->data.blocksize;
int ro = task->data.residualOrder;
__global int* ppl = &partition_lengths[get_group_id(0) << (max_porder + 1)];
__global ulong* ppl = &partition_lengths[get_group_id(0) << (max_porder + 1)];
__global int* prp = &rice_parameters[get_group_id(0) << (max_porder + 2)];
__global int* pol = prp + (1 << (max_porder + 1));
for (int porder = max_porder; porder >= 0; porder--)
@@ -1549,10 +1572,10 @@ void clFindRiceParameter(
int pos = (2 << max_porder) - (2 << porder);
int fin = pos + (1 << porder);
int pl = ppl[pos];
ulong pl = ppl[pos];
int ps = (bs >> porder) - ro;
int k = iclamp(31 - fastclz(pl / max(1, ps)), 0, 14);
int plk = ps * (k + 1) + (pl >> k);
int k = iclamp(63 - fastclz64(pl / max(1, ps)), 0, MAX_RICE_PARAM);
int plk = ps * (k + 1) + (int)(pl >> k);
// output rice parameter
prp[pos] = k;
@@ -1564,8 +1587,8 @@ void clFindRiceParameter(
for (int offs = pos + 1; offs < fin; offs++)
{
pl = ppl[offs];
k = iclamp(31 - fastclz(pl / ps), 0, 14);
plk = ps * (k + 1) + (pl >> k);
k = iclamp(63 - fastclz64(pl / ps), 0, MAX_RICE_PARAM);
plk = ps * (k + 1) + (int)(pl >> k);
// output rice parameter
prp[offs] = k;
@@ -1581,18 +1604,18 @@ __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clFindRiceParameter(
__global FLACCLSubframeTask *tasks,
__global int* rice_parameters,
__global int* partition_lengths,
__global uint* partition_lengths,
int max_porder
)
{
for (int offs = get_local_id(0); offs < (2 << max_porder); offs += GROUP_SIZE)
{
const int pos = (15 << (max_porder + 1)) * get_group_id(0) + offs;
int best_l = partition_lengths[pos];
const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + offs;
uint best_l = partition_lengths[pos];
int best_k = 0;
for (int k = 1; k <= 14; k++)
for (int k = 1; k <= MAX_RICE_PARAM; k++)
{
int l = partition_lengths[pos + (k << (max_porder + 1))];
uint l = partition_lengths[pos + (k << (max_porder + 1))];
best_k = select(best_k, k, l < best_l);
best_l = min(best_l, l);
}
@@ -1630,16 +1653,16 @@ void clFindPartitionOrder(
partlen[porder] += rice_parameters[pos + start + offs];
}
int best_length = partlen[0] + 4;
int best_length = partlen[0] + RICE_PARAM_BITS;
int best_porder = 0;
for (int porder = 1; porder <= max_porder; porder++)
{
int length = (4 << porder) + partlen[porder];
int length = (RICE_PARAM_BITS << porder) + partlen[porder];
best_porder = select(best_porder, porder, length < best_length);
best_length = min(best_length, length);
}
best_length = (4 << best_porder) + task->data.blocksize - task->data.residualOrder;
best_length = (RICE_PARAM_BITS << best_porder) + task->data.blocksize - task->data.residualOrder;
int best_psize = task->data.blocksize >> best_porder;
int start = task->data.residualOffs + task->data.residualOrder;
int fin = task->data.residualOffs + best_psize;
@@ -1704,11 +1727,11 @@ void clFindPartitionOrder(
}
barrier(CLK_LOCAL_MEM_FENCE);
int best_length = partlen[0] + 4;
int best_length = partlen[0] + RICE_PARAM_BITS;
int best_porder = 0;
for (int porder = 1; porder <= max_porder; porder++)
{
int length = (4 << porder) + partlen[porder];
int length = (RICE_PARAM_BITS << porder) + partlen[porder];
best_porder = select(best_porder, porder, length < best_length);
best_length = min(best_length, length);
}
@@ -1836,14 +1859,14 @@ void clCalcOutputOffsets(
)
{
const int channels = 2;
__local FLACCLSubframeData ltasks[2];
__local volatile int mypos[2];
__local FLACCLSubframeData ltasks[MAX_CHANNELS];
__local volatile int mypos[MAX_CHANNELS];
int offset = 0;
for (int iFrame = 0; iFrame < frameCount; iFrame++)
{
if (get_local_id(0) < sizeof(ltasks[0]) / sizeof(int))
for (int ch = 0; ch < channels; ch++)
((__local int*)&ltasks[ch])[get_local_id(0)] = ((__global int*)(&tasks[iFrame * channels + ch]))[get_local_id(0)];
for (int ch = 0; ch < MAX_CHANNELS; ch++)
((__local int*)&ltasks[ch])[get_local_id(0)] = ((__global int*)(&tasks[iFrame * MAX_CHANNELS + ch]))[get_local_id(0)];
//printf("len_utf8(%d) == %d\n", firstFrame + iFrame, len_utf8(firstFrame + iFrame));
offset += 15 + 1 + 4 + 4 + 4 + 3 + 1 + len_utf8(firstFrame + iFrame)
@@ -1856,18 +1879,18 @@ void clCalcOutputOffsets(
// assert (offset % 8) == 0
offset += 8;
if (get_local_id(0) < channels)
if (get_local_id(0) < MAX_CHANNELS)
{
int ch = get_local_id(0);
// Add 64 bits to separate frames if header is too small so they can intersect
int mylen = 8 + ltasks[ch].wbits + 64 + ltasks[ch].size;
mypos[ch] = mylen;
for (int offset = 1; offset < WARP_SIZE && offset < channels; offset <<= 1)
for (int offset = 1; offset < WARP_SIZE && offset < MAX_CHANNELS; offset <<= 1)
if (ch >= offset) mypos[ch] += mypos[ch - offset];
mypos[ch] += offset;
tasks[iFrame * channels + ch].data.encodingOffset = mypos[ch] - ltasks[ch].size + ltasks[ch].headerLen;
tasks[iFrame * MAX_CHANNELS + ch].data.encodingOffset = mypos[ch] - ltasks[ch].size + ltasks[ch].headerLen;
}
offset = mypos[channels - 1];
offset = mypos[MAX_CHANNELS - 1];
offset = (offset + 7) & ~7;
offset += 16;
}
@@ -1909,7 +1932,7 @@ void clRiceEncoding(
for (int p = 0; p < (1 << porder); p++)
{
int k = kptr[p];
writebits(&bw, 4, k);
writebits(&bw, RICE_PARAM_BITS, k);
//if (get_group_id(0) == 0) printf("[%x] ", k);
//if (get_group_id(0) == 0) printf("(%x) ", bw.bit_buf);
if (p == 1) res_cnt = psize;
@@ -1978,7 +2001,7 @@ void clRiceEncoding(
flush(&bw);
}
#else
__local unsigned int data[GROUP_SIZE];
__local uint data[GROUP_SIZE];
__local volatile int mypos[GROUP_SIZE+1];
#if 0
__local int brp[256];
@@ -2006,12 +2029,12 @@ void clRiceEncoding(
int start = task.encodingOffset;
int plen = bs >> task.porder;
//int plenoffs = 12 - task.porder;
unsigned int remainder = 0U;
uint remainder = 0U;
int pos;
for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE)
{
int offs = pos + tid;
int v = residual[task.residualOffs + offs];
int iv = residual[task.residualOffs + offs];
int part = offs / plen; // >> plenoffs;
#if 0
int k = brp[part];
@@ -2019,10 +2042,10 @@ void clRiceEncoding(
int k = best_rice_parameters[(get_group_id(0) << max_porder) + part];
#endif
int pstart = offs == task.residualOrder || offs == part * plen;
v = (v << 1) ^ (v >> 31);
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
uint v = (iv << 1) ^ (iv >> 31);
int mylen = select(0, (int)(v >> k) + 1 + k + select(0, RICE_PARAM_BITS, pstart), offs >= task.residualOrder && offs < bs);
mypos[tid] = mylen;
// Inclusive scan(+)
int lane = (tid & (WARP_SIZE - 1));
for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
@@ -2040,7 +2063,8 @@ void clRiceEncoding(
mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0);
int start32 = start >> 5;
start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2];
//if (start / 32 - start32 >= GROUP_SIZE - 3)
// tasks[get_group_id(0)].data.size = 1;
//if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32)
// printf("Oops: %d\n", mypos[tid]);
data[tid] = select(0U, remainder, tid == 0);
@@ -2052,18 +2076,18 @@ void clRiceEncoding(
int kpos = mp - mylen;
int kpos0 = (kpos >> 5) - start32;
int kpos1 = kpos & 31;
unsigned int kval = (unsigned int)k << 28;
unsigned int kval0 = kval >> kpos1;
unsigned int kval1 = kval << (32 - kpos1);
uint kval = (uint)k << (32 - RICE_PARAM_BITS);
uint kval0 = kval >> kpos1;
uint kval1 = kval << (32 - kpos1);
if (kval0) atom_or(&data[kpos0], kval0);
if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1);
}
int qpos = mp - k - 1;
int qpos0 = (qpos >> 5) - start32;
int qpos1 = qpos & 31;
unsigned int qval = (1U << 31) | ((unsigned int)v << (31 - k));
unsigned int qval0 = qval >> qpos1;
unsigned int qval1= qval << (32 - qpos1);
uint qval = (1U << 31) | (v << (31 - k));
uint qval0 = qval >> qpos1;
uint qval1= qval << (32 - qpos1);
if (qval0) atom_or(&data[qpos0], qval0);
if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1);
}
@@ -2075,13 +2099,13 @@ void clRiceEncoding(
if (pos < bs)
{
int offs = pos + tid;
int v = offs < bs ? residual[task.residualOffs + offs] : 0;
int iv = offs < bs ? residual[task.residualOffs + offs] : 0;
int part = offs / plen; // >> plenoffs;
//int k = brp[min(255, part)];
int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0;
int pstart = offs == task.residualOrder || offs == part * plen;
v = (v << 1) ^ (v >> 31);
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
uint v = (iv << 1) ^ (iv >> 31);
int mylen = select(0, (int)(v >> k) + 1 + k + select(0, RICE_PARAM_BITS, pstart), offs >= task.residualOrder && offs < bs);
mypos[tid] = mylen;
// Inclusive scan(+)
@@ -2113,18 +2137,18 @@ void clRiceEncoding(
int kpos = mp - mylen;
int kpos0 = (kpos >> 5) - start32;
int kpos1 = kpos & 31;
unsigned int kval = (unsigned int)k << 28;
unsigned int kval0 = kval >> kpos1;
unsigned int kval1 = kval << (32 - kpos1);
uint kval = (uint)k << (32 - RICE_PARAM_BITS);
uint kval0 = kval >> kpos1;
uint kval1 = kval << (32 - kpos1);
if (kval0) atom_or(&data[kpos0], kval0);
if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1);
}
int qpos = mp - k - 1;
int qpos0 = (qpos >> 5) - start32;
int qpos1 = qpos & 31;
unsigned int qval = (1U << 31) | ((unsigned int)v << (31 - k));
unsigned int qval0 = qval >> qpos1;
unsigned int qval1= qval << (32 - qpos1);
uint qval = (1U << 31) | (v << (31 - k));
uint qval0 = qval >> qpos1;
uint qval1= qval << (32 - qpos1);
if (qval0) atom_or(&data[qpos0], qval0);
if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1);
}

View File

@@ -83,6 +83,11 @@ namespace CUETools.Codecs.FLAKE
/// </summary>
public int porder;
/// <summary>
/// coding method: rice parameters use 4 bits for coding_method 0 and 5 bits for coding_method 1
/// </summary>
public int coding_method;
/// <summary>
/// Rice parameters
/// </summary>

View File

@@ -102,7 +102,7 @@ namespace CUETools.Codecs.FLAKE
}
_samplesInBuffer = 0;
if (PCM.BitsPerSample != 16 || PCM.ChannelCount != 2 || PCM.SampleRate != 44100)
if ((PCM.BitsPerSample != 16 && PCM.BitsPerSample != 24) || PCM.ChannelCount != 2 || (PCM.SampleRate != 44100 && PCM.SampleRate != 48000))
throw new Exception("invalid flac file");
samplesBuffer = new int[Flake.MAX_BLOCKSIZE * PCM.ChannelCount];
@@ -362,8 +362,9 @@ namespace CUETools.Codecs.FLAKE
unsafe void decode_residual(BitReader bitreader, FlacFrame frame, int ch)
{
// rice-encoded block
uint coding_method = bitreader.readbits(2); // ????? == 0
if (coding_method != 0 && coding_method != 1) // if 1, then parameter length == 5 bits instead of 4
// coding method
frame.subframes[ch].best.rc.coding_method = (int)bitreader.readbits(2); // ????? == 0
if (frame.subframes[ch].best.rc.coding_method != 0 && frame.subframes[ch].best.rc.coding_method != 1)
throw new Exception("unsupported residual coding");
// partition order
frame.subframes[ch].best.rc.porder = (int)bitreader.readbits(4);
@@ -372,7 +373,7 @@ namespace CUETools.Codecs.FLAKE
int psize = frame.blocksize >> frame.subframes[ch].best.rc.porder;
int res_cnt = psize - frame.subframes[ch].best.order;
int rice_len = 4 + (int)coding_method;
int rice_len = 4 + frame.subframes[ch].best.rc.coding_method;
// residual
int j = frame.subframes[ch].best.order;
int* r = frame.subframes[ch].best.residual + j;

View File

@@ -125,8 +125,8 @@ namespace CUETools.Codecs.FLAKE
{
_pcm = pcm;
if (_pcm.BitsPerSample != 16)
throw new Exception("Bits per sample must be 16.");
//if (_pcm.BitsPerSample != 16)
// throw new Exception("Bits per sample must be 16.");
if (_pcm.ChannelCount != 2)
throw new Exception("ChannelCount must be 2.");
@@ -571,14 +571,14 @@ namespace CUETools.Codecs.FLAKE
samplesInBuffer += block;
}
/// <summary>
/// Derives a decorrelated channel pair from two input channels, sample by sample.
/// For each of the first <paramref name="blocksize"/> samples, <paramref name="leftM"/>
/// receives the sum of both inputs shifted right by one bit, and
/// <paramref name="rightM"/> receives the first input minus the second.
/// </summary>
/// <param name="leftS">First input channel; read only.</param>
/// <param name="rightS">Second input channel; read only.</param>
/// <param name="leftM">Output buffer for the halved sum.</param>
/// <param name="rightM">Output buffer for the difference.</param>
/// <param name="blocksize">Number of samples to process in every buffer.</param>
unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize)
{
for (int i = 0; i < blocksize; i++)
{
// Arithmetic shift: the low bit of the sum is dropped here.
leftM[i] = (leftS[i] + rightS[i]) >> 1;
rightM[i] = leftS[i] - rightS[i];
}
}
//unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize)
//{
// for (int i = 0; i < blocksize; i++)
// {
// leftM[i] = (leftS[i] + rightS[i]) >> 1;
// rightM[i] = leftS[i] - rightS[i];
// }
//}
unsafe void encode_residual_verbatim(int* res, int* smp, uint n)
{
@@ -638,24 +638,28 @@ namespace CUETools.Codecs.FLAKE
}
}
/// <summary>
/// Picks a Rice parameter for every partition of one partition order and
/// estimates the number of bits needed to code the residual with them.
/// </summary>
/// <param name="porder">Partition order; the block is split into 2^porder partitions.</param>
/// <param name="parm">Output: receives one Rice parameter per partition (2^porder entries).</param>
/// <param name="sums">Per-partition sums for this partition order (2^porder entries).
/// Presumably sums of the unsigned-mapped residuals produced by calc_sums; confirm against the caller.</param>
/// <param name="n">Block size in samples.</param>
/// <param name="pred_order">Number of leading samples excluded from the first partition.
/// NOTE(review): assumed not to exceed n >> porder, otherwise the unsigned subtraction wraps; the caller is expected to bound porder.</param>
/// <param name="method">On entry: non-zero allows parameters up to 30 (the largest value a
/// 5-bit parameter field can carry besides the escape code); zero limits them to
/// Flake.MAX_RICE_PARAM. On exit: 1 if any chosen parameter exceeds Flake.MAX_RICE_PARAM
/// (5-bit fields required), otherwise 0.</param>
/// <returns>Estimated size in bits, including the per-partition parameter fields,
/// saturated at <see cref="uint.MaxValue"/> instead of wrapping around.</returns>
static unsafe uint calc_optimal_rice_params(int porder, int* parm, ulong* sums, uint n, uint pred_order, ref int method)
{
    uint part = (1U << porder);
    // The first partition is shorter: it does not contain the warm-up samples.
    uint cnt = (n >> porder) - pred_order;
    int maxK = method > 0 ? 30 : Flake.MAX_RICE_PARAM;
    int k = cnt > 0 ? Math.Min(maxK, BitReader.log2i(sums[0] / cnt)) : 0;
    int realMaxK0 = k;
    ulong all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k);
    parm[0] = k;
    cnt = (n >> porder);
    for (uint i = 1; i < part; i++)
    {
        k = Math.Min(maxK, BitReader.log2i(sums[i] / cnt));
        realMaxK0 = Math.Max(realMaxK0, k);
        all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k);
        parm[i] = k;
    }
    // Only fall back to the wider parameter field when a parameter really needs it.
    method = realMaxK0 > Flake.MAX_RICE_PARAM ? 1 : 0;
    // Keep the total in 64 bits and saturate: truncating to 32 bits could make an
    // extremely expensive partitioning look cheap to the caller's comparison.
    ulong total = all_bits + (4UL + (ulong)method) * part;
    return total > uint.MaxValue ? uint.MaxValue : (uint)total;
}
static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums)
static unsafe void calc_lower_sums(int pmin, int pmax, ulong* sums)
{
for (int i = pmax - 1; i >= pmin; i--)
{
@@ -668,12 +672,12 @@ namespace CUETools.Codecs.FLAKE
}
}
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
{
int parts = (1 << pmax);
uint* res = data + pred_order;
uint cnt = (n >> pmax) - pred_order;
uint sum = 0;
ulong sum = 0;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[0] = sum;
@@ -696,18 +700,18 @@ namespace CUETools.Codecs.FLAKE
/// <param name="n"></param>
/// <param name="pred_order"></param>
/// <param name="sums"></param>
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
{
int parts = (1 << pmax);
uint* res = data + pred_order;
uint cnt = 18 - pred_order;
uint sum = 0;
ulong sum = 0;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[0] = sum;
for (int i = 1; i < parts; i++)
{
sums[i] =
sums[i] = 0UL +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
@@ -725,18 +729,18 @@ namespace CUETools.Codecs.FLAKE
/// <param name="n"></param>
/// <param name="pred_order"></param>
/// <param name="sums"></param>
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
{
int parts = (1 << pmax);
uint* res = data + pred_order;
uint cnt = 16 - pred_order;
uint sum = 0;
ulong sum = 0;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[0] = sum;
for (int i = 1; i < parts; i++)
{
sums[i] =
sums[i] = 0UL +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
@@ -744,10 +748,10 @@ namespace CUETools.Codecs.FLAKE
}
}
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order)
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order, int bps)
{
uint* udata = stackalloc uint[(int)n];
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
ulong* sums = stackalloc ulong[(pmax + 1) * Flake.MAX_PARTITIONS];
int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS];
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
@@ -770,17 +774,21 @@ namespace CUETools.Codecs.FLAKE
uint opt_bits = AudioSamples.UINT32_MAX;
int opt_porder = pmin;
int opt_method = 0;
for (int i = pmin; i <= pmax; i++)
{
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order);
int method = bps > 16 ? 1 : 0;
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order, ref method);
if (bits <= opt_bits)
{
opt_bits = bits;
opt_porder = i;
opt_method = method;
}
}
rc.porder = opt_porder;
rc.coding_method = opt_method;
fixed (int* rparms = rc.rparams)
AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder));
@@ -841,7 +849,7 @@ namespace CUETools.Codecs.FLAKE
}
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.current.order);
int pmin = Math.Min(eparams.min_partition_order, pmax);
uint best_size = calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order);
uint best_size = calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order, PCM.BitsPerSample);
// not working
//for (int o = 1; o <= frame.current.order; o++)
//{
@@ -877,7 +885,7 @@ namespace CUETools.Codecs.FLAKE
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.current.order);
int pmin = Math.Min(eparams.min_partition_order, pmax);
frame.current.size = (uint)(frame.current.order * frame.subframes[ch].obits) + 6
+ calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order);
+ calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order, PCM.BitsPerSample);
frame.subframes[ch].done_fixed |= (1U << order);
@@ -1054,7 +1062,7 @@ namespace CUETools.Codecs.FLAKE
unsafe void output_residual(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub)
{
// rice-encoded block
bitwriter.writebits(2, 0);
bitwriter.writebits(2, sub.best.rc.coding_method);
// partition order
int porder = sub.best.rc.porder;
@@ -1063,13 +1071,14 @@ namespace CUETools.Codecs.FLAKE
bitwriter.writebits(4, porder);
int res_cnt = psize - sub.best.order;
int rice_len = 4 + sub.best.rc.coding_method;
// residual
int j = sub.best.order;
fixed (byte* fixbuf = &frame_buffer[0])
for (int p = 0; p < (1 << porder); p++)
{
int k = sub.best.rc.rparams[p];
bitwriter.writebits(4, k);
bitwriter.writebits(rice_len, k);
if (p == 1) res_cnt = psize;
int cnt = Math.Min(res_cnt, frame.blocksize - j);
bitwriter.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt);
@@ -1436,6 +1445,9 @@ namespace CUETools.Codecs.FLAKE
output_subframes(frame, bitwriter);
output_frame_footer(bitwriter);
if (bitwriter.Length >= max_frame_size)
throw new Exception("buffer overflow");
if (frame_buffer != null)
{
if (eparams.variable_block_size > 0)
@@ -1732,9 +1744,6 @@ namespace CUETools.Codecs.FLAKE
}
if (i == 8)
throw new Exception("non-standard bps");
// FIXME: For now, only 16-bit encoding is supported
if (_pcm.BitsPerSample != 16)
throw new Exception("non-standard bps");
if (_blocksize == 0)
{

View File

@@ -35,6 +35,15 @@ namespace CUETools.Codecs
return log2i((uint)v);
}
/// <summary>
/// Integer base-2 logarithm of a 64-bit value: narrows the value to its top
/// non-zero byte by halving the search window (32, 16, then 8 bits) and
/// finishes with a lookup in byte_to_log2_table.
/// </summary>
/// <param name="v">Value to measure.</param>
/// <returns>The accumulated shift plus the table entry for the remaining byte.</returns>
public static int log2i(ulong v)
{
    int shift = 0;
    // Anything in the upper 32 bits?
    if ((v >> 32) != 0)
    {
        shift = 32;
        v >>= 32;
    }
    // v now fits in 32 bits; check its upper half.
    if ((v & 0xffff0000) != 0)
    {
        shift += 16;
        v >>= 16;
    }
    // v now fits in 16 bits; check its upper byte.
    if ((v & 0xff00) != 0)
    {
        shift += 8;
        v >>= 8;
    }
    return byte_to_log2_table[v] + shift;
}
public static int log2i(uint v)
{
int n = 0;

View File

@@ -422,14 +422,41 @@ namespace CUETools.Codecs
/// <summary>
/// Interleaves two separate channel buffers into this buffer's byte storage,
/// starting at sample frame <paramref name="pos"/>.
/// </summary>
/// <param name="pos">Index of the first stereo frame to write.</param>
/// <param name="src1">Samples of the first channel (one int per sample).</param>
/// <param name="src2">Samples of the second channel (one int per sample).</param>
/// <param name="n">Number of frames to write; nothing is written when it is not positive.</param>
/// <exception cref="Exception">The buffer is not stereo, or its sample size is neither 16 nor 24 bits.</exception>
/// <remarks>
/// No bounds checking is done here: the caller must make sure the byte storage
/// can hold <paramref name="pos"/> + <paramref name="n"/> frames.
/// </remarks>
unsafe public void Interlace(int pos, int* src1, int* src2, int n)
{
    if (PCM.ChannelCount != 2)
        throw new Exception("Must be stereo");
    if (PCM.BitsPerSample == 16)
    {
        fixed (byte* bs = Bytes)
        {
            // One 32-bit store per frame: first channel in the low half, second in the high half.
            int* res = ((int*)bs) + pos;
            for (int i = n; i > 0; i--)
                *(res++) = (*(src1++) & 0xffff) ^ (*(src2++) << 16);
        }
    }
    else if (PCM.BitsPerSample == 24)
    {
        fixed (byte* bs = Bytes)
        {
            // Two 3-byte samples per frame.
            byte* res = bs + pos * 6;
            for (int i = n; i > 0; i--)
            {
                res = WriteSample24LE(res, *(src1++));
                res = WriteSample24LE(res, *(src2++));
            }
        }
    }
    else
        throw new Exception("Unsupported BPS");
}

/// <summary>
/// Stores the low 24 bits of <paramref name="sample"/> at <paramref name="dst"/>,
/// least significant byte first, and returns the address just past them.
/// </summary>
private static unsafe byte* WriteSample24LE(byte* dst, int sample)
{
    uint bits = (uint)sample;
    dst[0] = (byte)(bits & 0xFF);
    dst[1] = (byte)((bits >> 8) & 0xFF);
    dst[2] = (byte)((bits >> 16) & 0xFF);
    return dst + 3;
}
//public void Clear()
@@ -451,6 +478,7 @@ namespace CUETools.Codecs
short* pOutSamples = (short*)outSamples;
for (int i = 0; i < loopCount; i++)
pOutSamples[i] = (short)pInSamples[i];
//*(pOutSamples++) = (short)*(pInSamples++);
}
}
@@ -465,19 +493,8 @@ namespace CUETools.Codecs
throw new IndexOutOfRangeException();
}
fixed (int* pInSamplesFixed = &inSamples[inSampleOffset, 0])
{
fixed (byte* pOutSamplesFixed = &outSamples[outByteOffset])
{
int* pInSamples = pInSamplesFixed;
short* pOutSamples = (short*)pOutSamplesFixed;
for (int i = 0; i < loopCount; i++)
{
*(pOutSamples++) = (short)*(pInSamples++);
}
}
}
fixed (byte* pOutSamplesFixed = &outSamples[outByteOffset])
FLACSamplesToBytes_16(inSamples, inSampleOffset, pOutSamplesFixed, sampleCount, channelCount);
}
public static unsafe void FLACSamplesToBytes_24(int[,] inSamples, int inSampleOffset,
@@ -917,16 +934,16 @@ namespace CUETools.Codecs
private AudioPCMConfig pcm;
private int _sampleVal;
/// <summary>
/// Creates a generator with an explicit PCM format.
/// </summary>
/// <param name="pcm">PCM configuration stored for this source.</param>
/// <param name="sampleCount">Total number of samples, stored as the source length.</param>
/// <param name="sampleVal">Sample value stored for the generator; how it is emitted is outside this excerpt.</param>
public SilenceGenerator(AudioPCMConfig pcm, long sampleCount, int sampleVal)
{
    this._sampleVal = sampleVal;
    this._sampleOffset = 0;
    this._sampleCount = sampleCount;
    this.pcm = pcm;
}

/// <summary>
/// Creates a generator in the <see cref="AudioPCMConfig.RedBook"/> format.
/// Kept so that callers of the earlier two-argument constructor, which always
/// used that format, keep compiling and behaving as before.
/// </summary>
/// <param name="sampleCount">Total number of samples.</param>
/// <param name="sampleVal">Sample value stored for the generator.</param>
public SilenceGenerator(long sampleCount, int sampleVal)
    : this(AudioPCMConfig.RedBook, sampleCount, sampleVal)
{
}

/// <summary>
/// Creates a generator in the <see cref="AudioPCMConfig.RedBook"/> format with a sample value of 0.
/// </summary>
/// <param name="sampleCount">Total number of samples.</param>
public SilenceGenerator(long sampleCount)
    : this(AudioPCMConfig.RedBook, sampleCount, 0)
{
}
@@ -1091,19 +1108,29 @@ namespace CUETools.Codecs
{
foundFormat = true;
if (_br.ReadUInt16() != 1)
{
throw new Exception("WAVE must be PCM format.");
}
uint fmtTag = _br.ReadUInt16();
int _channelCount = _br.ReadInt16();
int _sampleRate = _br.ReadInt32();
_br.ReadInt32();
_br.ReadInt32(); // bytes per second
int _blockAlign = _br.ReadInt16();
int _bitsPerSample = _br.ReadInt16();
pos += 16;
if (fmtTag == 0xFFFEU && ckSize >= 34) // WAVE_FORMAT_EXTENSIBLE
{
_br.ReadInt16(); // CbSize
_br.ReadInt16(); // ValidBitsPerSample
int channelMask = _br.ReadInt32();
fmtTag = _br.ReadUInt16();
pos += 10;
}
if (fmtTag != 1) // WAVE_FORMAT_PCM
throw new Exception("WAVE format tag not WAVE_FORMAT_PCM.");
pcm = new AudioPCMConfig(_bitsPerSample, _channelCount, _sampleRate);
if (pcm.BlockAlign != _blockAlign)
throw new Exception("WAVE has strange BlockAlign");
pos += 16;
}
else if (ckID == fccData)
{

View File

@@ -23,7 +23,7 @@
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<PlatformTarget>x86</PlatformTarget>
<PlatformTarget>AnyCPU</PlatformTarget>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>

View File

@@ -87,7 +87,7 @@ namespace CUETools.FLACCL.cmd
min_precision = -1, max_precision = -1,
orders_per_window = -1, orders_per_channel = -1,
blocksize = -1;
int input_len = 4096, input_val = 0;
int input_len = 4096, input_val = 0, input_bps = 16, input_ch = 2, input_rate = 44100;
int level = -1, padding = -1, vbr_mode = -1;
bool do_seektable = true;
bool buffered = false;
@@ -136,6 +136,10 @@ namespace CUETools.FLACCL.cmd
input_len = intarg;
else if (args[arg] == "--input-value" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
input_val = intarg;
else if (args[arg] == "--input-bps" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
input_bps = intarg;
else if (args[arg] == "--input-channels" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
input_ch = intarg;
else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
output_file = args[arg];
else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
@@ -211,18 +215,28 @@ namespace CUETools.FLACCL.cmd
}
IAudioSource audioSource;
if (input_file == "-")
audioSource = new WAVReader("", Console.OpenStandardInput());
else if (input_file == "nul")
audioSource = new SilenceGenerator(input_len, input_val);
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".wav")
audioSource = new WAVReader(input_file, null);
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".flac")
audioSource = new FlakeReader(input_file, null);
else
try
{
if (input_file == "-")
audioSource = new WAVReader("", Console.OpenStandardInput());
else if (input_file == "nul")
audioSource = new SilenceGenerator(new AudioPCMConfig(input_bps, input_ch, input_rate), input_len, input_val);
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".wav")
audioSource = new WAVReader(input_file, null);
else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".flac")
audioSource = new FlakeReader(input_file, null);
else
{
Usage();
return 2;
}
}
catch (Exception ex)
{
Usage();
return 2;
Console.WriteLine("");
Console.WriteLine("Error: {0}.", ex.Message);
return 3;
}
if (buffered)
audioSource = new AudioPipe(audioSource, FLACCLWriter.MAX_BLOCKSIZE);

View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="utf-8" ?>
<!-- Runtime configuration for the executable this file is built with. -->
<configuration>
<runtime>
<assemblyBinding xmlns="urn:schemas-microsoft-com:asm.v1">
<!-- Also search the "plugins" subdirectory of the application base directory
     when the runtime resolves assemblies. -->
<probing privatePath="plugins"/>
</assemblyBinding>
</runtime>
</configuration>

View File

@@ -2,7 +2,7 @@
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProductVersion>8.0.50727</ProductVersion>
<ProductVersion>9.0.30729</ProductVersion>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>{2379BAAF-A406-4477-BF53-2D6A326C24C8}</ProjectGuid>
<OutputType>Exe</OutputType>
@@ -19,7 +19,7 @@
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<OutputPath>..\bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
@@ -52,6 +52,9 @@
<Name>CUETools.Codecs</Name>
</ProjectReference>
</ItemGroup>
<ItemGroup>
<None Include="App.config" />
</ItemGroup>
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.

View File

@@ -279,9 +279,10 @@ namespace CUETools.FlakeExe
if (!quiet)
{
Console.Error.Write("\r \r");
Console.WriteLine("Results : {0:0.00}x; {1}",
Console.WriteLine("Results : {0:0.00}x; {2} bytes in {1} seconds;",
audioSource.Position / totalElapsed.TotalSeconds / audioSource.PCM.SampleRate,
totalElapsed
totalElapsed,
flake.TotalSize
);
}
audioSource.Close();