diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index ec6d1a8..434ec7f 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -87,7 +87,7 @@ namespace CUETools.Codecs.FLACCL [SRDescription(typeof(Properties.Resources), "DescriptionDeviceType")] public OpenCLDeviceType DeviceType { get; set; } - int cpu_threads = 1; + int cpu_threads = 0; [DefaultValue(1)] [SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")] public int CPUThreads @@ -214,10 +214,11 @@ namespace CUETools.Codecs.FLACCL { _pcm = pcm; - if (pcm.BitsPerSample != 16) + // FIXME: For now, only 16-bit encoding is supported + if (pcm.BitsPerSample != 16 && pcm.BitsPerSample != 24) throw new Exception("Bits per sample must be 16."); - if (pcm.ChannelCount != 2) - throw new Exception("ChannelCount must be 2."); + //if (pcm.ChannelCount != 2) + // throw new Exception("ChannelCount must be 2."); channels = pcm.ChannelCount; sample_rate = pcm.SampleRate; @@ -288,12 +289,6 @@ namespace CUETools.Codecs.FLACCL if (value as FLACCLWriterSettings == null) throw new Exception("Unsupported options " + value); _settings = value as FLACCLWriterSettings; - if (_settings.DeviceType == OpenCLDeviceType.CPU) - { - _settings.GroupSize = 1; - //_settings.GPUOnly = true; - _settings.MappedMemory = true; - } } } @@ -644,24 +639,28 @@ namespace CUETools.Codecs.FLACCL } } - static unsafe uint calc_optimal_rice_params(int porder, int* parm, uint* sums, uint n, uint pred_order) + static unsafe uint calc_optimal_rice_params(int porder, int* parm, ulong* sums, uint n, uint pred_order, ref int method) { uint part = (1U << porder); uint cnt = (n >> porder) - pred_order; - int k = cnt > 0 ? Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[0] / cnt)) : 0; - uint all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k); + int maxK = method > 0 ? 30 : Flake.MAX_RICE_PARAM; + int k = cnt > 0 ? 
Math.Min(maxK, BitReader.log2i(sums[0] / cnt)) : 0; + int realMaxK0 = k; + ulong all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k); parm[0] = k; cnt = (n >> porder); for (uint i = 1; i < part; i++) { - k = Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[i] / cnt)); + k = Math.Min(maxK, BitReader.log2i(sums[i] / cnt)); + realMaxK0 = Math.Max(realMaxK0, k); all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k); parm[i] = k; } - return all_bits + (4 * part); + method = realMaxK0 > Flake.MAX_RICE_PARAM ? 1 : 0; + return (uint)all_bits + ((4U + (uint)method) * part); } - static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums) + static unsafe void calc_lower_sums(int pmin, int pmax, ulong* sums) { for (int i = pmax - 1; i >= pmin; i--) { @@ -674,12 +673,12 @@ namespace CUETools.Codecs.FLACCL } } - static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) + static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums) { int parts = (1 << pmax); uint* res = data + pred_order; uint cnt = (n >> pmax) - pred_order; - uint sum = 0; + ulong sum = 0; for (uint j = cnt; j > 0; j--) sum += *(res++); sums[0] = sum; @@ -702,18 +701,18 @@ namespace CUETools.Codecs.FLACCL /// /// /// - static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) + static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums) { int parts = (1 << pmax); uint* res = data + pred_order; uint cnt = 18 - pred_order; - uint sum = 0; + ulong sum = 0UL; for (uint j = cnt; j > 0; j--) sum += *(res++); sums[0] = sum; for (int i = 1; i < parts; i++) { - sums[i] = + sums[i] = 0UL + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + @@ -731,18 +730,18 @@ namespace CUETools.Codecs.FLACCL /// /// /// - static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint 
pred_order, uint* sums) + static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums) { int parts = (1 << pmax); uint* res = data + pred_order; uint cnt = 16 - pred_order; - uint sum = 0; + ulong sum = 0UL; for (uint j = cnt; j > 0; j--) sum += *(res++); sums[0] = sum; for (int i = 1; i < parts; i++) { - sums[i] = + sums[i] = 0UL + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + @@ -750,10 +749,10 @@ namespace CUETools.Codecs.FLACCL } } - static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order) + static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order, int max_method) { uint* udata = stackalloc uint[(int)n]; - uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS]; + ulong* sums = stackalloc ulong[(pmax + 1) * Flake.MAX_PARTITIONS]; int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS]; //uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER]; @@ -776,17 +775,21 @@ namespace CUETools.Codecs.FLACCL uint opt_bits = AudioSamples.UINT32_MAX; int opt_porder = pmin; + int opt_method = 0; for (int i = pmin; i <= pmax; i++) { - uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order); + int method = max_method; + uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order, ref method); if (bits <= opt_bits) { opt_bits = bits; opt_porder = i; + opt_method = method; } } rc.porder = opt_porder; + rc.coding_method = opt_method; fixed (int* rparms = rc.rparams) AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder)); @@ -845,8 +848,8 @@ namespace CUETools.Codecs.FLACCL for (int i = pos; i < pos + cnt; i++) { int v = sub.best.residual[i]; - v = (v << 1) ^ (v >> 31); - q += (v >> k); + 
uint uv = (uint)((v << 1) ^ (v >> 31)); + q += (int)(uv >> k); } return (k + 1) * cnt + q; } @@ -857,7 +860,7 @@ namespace CUETools.Codecs.FLACCL int porder = sub.best.rc.porder; int psize = frame.blocksize >> porder; //assert(porder >= 0); - int size = 6 + (4 << porder); + int size = 6 + ((4 + sub.best.rc.coding_method) << porder); size += measure_residual(frame, sub, sub.best.order, psize - sub.best.order, sub.best.rc.rparams[0]); // residual for (int p = 1; p < (1 << porder); p++) @@ -870,13 +873,13 @@ namespace CUETools.Codecs.FLACCL FlacFrame frame = task.frame; // rice-encoded block - frame.writer.writebits(2, 0); + frame.writer.writebits(2, sub.best.rc.coding_method); // partition order int porder = sub.best.rc.porder; //assert(porder >= 0); frame.writer.writebits(4, porder); - if (_settings.GPUOnly && _settings.DoRice) + if (task.UseGPURice) { int len = task.BestResidualTasks[index].size - task.BestResidualTasks[index].headerLen; int pos = task.BestResidualTasks[index].encodingOffset; @@ -901,7 +904,7 @@ namespace CUETools.Codecs.FLACCL for (int p = 0; p < (1 << porder); p++) { int k = sub.best.rc.rparams[p]; - frame.writer.writebits(4, k); + frame.writer.writebits(4 + sub.best.rc.coding_method, k); if (p == 1) res_cnt = psize; int cnt = Math.Min(res_cnt, frame.blocksize - j); frame.writer.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt); @@ -1069,7 +1072,7 @@ namespace CUETools.Codecs.FLACCL calculate_window(task, lpc.window_bartlett, WindowFunction.Bartlett); if (task.nWindowFunctions == 0) throw new Exception("invalid windowfunction"); - if (!_settings.MappedMemory) + if (!task.UseMappedMemory) task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, false, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr); } @@ -1116,6 +1119,7 @@ namespace CUETools.Codecs.FLACCL task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize; 
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; task.ResidualTasks[task.nResidualTasks].wbits = 0; + task.ResidualTasks[task.nResidualTasks].coding_method = PCM.BitsPerSample > 16 ? 1 : 0; task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize; task.nResidualTasks++; } @@ -1131,6 +1135,7 @@ namespace CUETools.Codecs.FLACCL task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize; task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; task.ResidualTasks[task.nResidualTasks].wbits = 0; + task.ResidualTasks[task.nResidualTasks].coding_method = PCM.BitsPerSample > 16 ? 1 : 0; task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize; task.ResidualTasks[task.nResidualTasks].residualOrder = 1; task.ResidualTasks[task.nResidualTasks].shift = 0; @@ -1149,6 +1154,7 @@ namespace CUETools.Codecs.FLACCL task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize; task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; task.ResidualTasks[task.nResidualTasks].wbits = 0; + task.ResidualTasks[task.nResidualTasks].coding_method = PCM.BitsPerSample > 16 ? 
1 : 0; task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize; task.ResidualTasks[task.nResidualTasks].shift = 0; switch (order) @@ -1195,10 +1201,11 @@ namespace CUETools.Codecs.FLACCL if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen) throw new Exception("oops"); - if (!_settings.MappedMemory) - task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr); - if (!_settings.MappedMemory) - task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, false, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr); + if (!task.UseMappedMemory) + { + task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr); + task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, false, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr); + } } unsafe void encode_residual(FLACCLTask task) @@ -1215,7 +1222,7 @@ namespace CUETools.Codecs.FLACCL if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; break; case SubframeType.Fixed: - if (!_settings.GPUOnly) + if (!task.UseGPUOnly) { if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, @@ -1224,7 +1231,7 @@ namespace CUETools.Codecs.FLACCL int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 6; - task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, 
task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order); + task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0); } break; case SubframeType.LPC: @@ -1236,7 +1243,7 @@ namespace CUETools.Codecs.FLACCL #if DEBUG // check size - if (_settings.GPUOnly && !_settings.DoRice) + if (task.UseGPUOnly && !task.UseGPURice) { uint real_size = measure_subframe(task.frame, task.frame.subframes[ch]); if (real_size != task.frame.subframes[ch].best.size) @@ -1244,9 +1251,9 @@ namespace CUETools.Codecs.FLACCL } #endif - if (((csum << task.frame.subframes[ch].obits) >= 1UL << 32) || !_settings.GPUOnly) + if ((((csum << task.frame.subframes[ch].obits) >= 1UL << 32) && PCM.BitsPerSample == 16) || !task.UseGPUOnly) { - if (_settings.GPUOnly && _settings.DoRice) + if (task.UseGPURice) #if DEBUG // throw new Exception("DoRice failed"); break; @@ -1266,11 +1273,11 @@ namespace CUETools.Codecs.FLACCL RiceContext rc1 = task.frame.subframes[ch].best.rc; task.frame.subframes[ch].best.rc = new RiceContext(); #endif - task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order); + task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 
1 : 0); task.frame.subframes[ch].best.size = measure_subframe(task.frame, task.frame.subframes[ch]); #if KJHKJH // check size - if (_settings.GPUOnly && oldsize > task.frame.subframes[ch].best.size) + if (task.UseGPUOnly && oldsize > task.frame.subframes[ch].best.size) throw new Exception("unoptimal size reported"); #endif //if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize && @@ -1337,8 +1344,9 @@ namespace CUETools.Codecs.FLACCL for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++) frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder; - if (_settings.GPUOnly && !_settings.DoRice && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) - //if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) + frame.subframes[ch].best.rc.coding_method = task.BestResidualTasks[index].coding_method; + if (task.UseGPUOnly && !task.UseGPURice && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) + //if (task.UseGPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) { int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder); fixed (int* dstParams = frame.subframes[ch].best.rc.rparams) @@ -1352,7 +1360,7 @@ namespace CUETools.Codecs.FLACCL } else { - if (_settings.GPUOnly && _settings.DoRice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size) + if (task.UseGPURice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size) throw new Exception("size reported incorrectly"); } } @@ -1369,10 +1377,9 @@ namespace CUETools.Codecs.FLACCL /// /// /// - unsafe void 
unpack_samples(FLACCLTask task, int count) + unsafe void unpack_samples_16(FLACCLTask task, byte * srcptr, int count) { - int iFrame = task.frame.frame_number; - short* src = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize; + short* src = (short*)srcptr; switch (task.frame.ch_mode) { @@ -1382,7 +1389,7 @@ namespace CUETools.Codecs.FLACCL int* s = task.frame.subframes[ch].samples; int wbits = (int)task.frame.subframes[ch].wbits; for (int i = 0; i < count; i++) - s[i] = src[i * channels + ch] >>= wbits; + s[i] = src[i * channels + ch] >> wbits; } break; case ChannelMode.LeftRight: @@ -1448,6 +1455,108 @@ namespace CUETools.Codecs.FLACCL } } + /// + /// Copy channel-interleaved input samples into separate subframes + /// + /// + /// + unsafe void unpack_samples_24(FLACCLTask task, byte* srcptr, int count) + { + switch (task.frame.ch_mode) + { + case ChannelMode.NotStereo: + for (int ch = 0; ch < channels; ch++) + { + int* s = task.frame.subframes[ch].samples; + int wbits = (int)task.frame.subframes[ch].wbits; + byte* src = &srcptr[ch * 3]; + for (int i = 0; i < count; i++) + { + s[i] = (((int)src[0] << 8) + ((int)src[1] << 16) + ((int)src[2] << 24)) >> (8 + wbits); + src += PCM.BlockAlign; + } + } + break; + case ChannelMode.LeftRight: + { + int* left = task.frame.subframes[0].samples; + int* right = task.frame.subframes[1].samples; + int lwbits = (int)task.frame.subframes[0].wbits; + int rwbits = (int)task.frame.subframes[1].wbits; + for (int i = 0; i < count; i++) + { + int l = (((int)*(srcptr++) << 8) + ((int)*(srcptr++) << 16) + ((int)*(srcptr++) << 24)) >> 8; + int r = (((int)*(srcptr++) << 8) + ((int)*(srcptr++) << 16) + ((int)*(srcptr++) << 24)) >> 8; + left[i] = l >> lwbits; + right[i] = r >> rwbits; + } + break; + } + case ChannelMode.LeftSide: + { + int* left = task.frame.subframes[0].samples; + int* right = task.frame.subframes[1].samples; + int lwbits = (int)task.frame.subframes[0].wbits; + int rwbits = 
(int)task.frame.subframes[1].wbits; + for (int i = 0; i < count; i++) + { + int l = (((int)*(srcptr++) << 8) + ((int)*(srcptr++) << 16) + ((int)*(srcptr++) << 24)) >> 8; + int r = (((int)*(srcptr++) << 8) + ((int)*(srcptr++) << 16) + ((int)*(srcptr++) << 24)) >> 8; + left[i] = l >> lwbits; + right[i] = (l - r) >> rwbits; + } + break; + } + case ChannelMode.RightSide: + { + int* left = task.frame.subframes[0].samples; + int* right = task.frame.subframes[1].samples; + int lwbits = (int)task.frame.subframes[0].wbits; + int rwbits = (int)task.frame.subframes[1].wbits; + for (int i = 0; i < count; i++) + { + int l = (((int)*(srcptr++) << 8) + ((int)*(srcptr++) << 16) + ((int)*(srcptr++) << 24)) >> 8; + int r = (((int)*(srcptr++) << 8) + ((int)*(srcptr++) << 16) + ((int)*(srcptr++) << 24)) >> 8; + left[i] = (l - r) >> lwbits; + right[i] = r >> rwbits; + } + break; + } + case ChannelMode.MidSide: + { + int* left = task.frame.subframes[0].samples; + int* right = task.frame.subframes[1].samples; + int lwbits = (int)task.frame.subframes[0].wbits; + int rwbits = (int)task.frame.subframes[1].wbits; + for (int i = 0; i < count; i++) + { + int l = (((int)*(srcptr++) << 8) + ((int)*(srcptr++) << 16) + ((int)*(srcptr++) << 24)) >> 8; + int r = (((int)*(srcptr++) << 8) + ((int)*(srcptr++) << 16) + ((int)*(srcptr++) << 24)) >> 8; + left[i] = (l + r) >> (1 + lwbits); + right[i] = (l - r) >> rwbits; + } + break; + } + } + } + + /// + /// Copy channel-interleaved input samples into separate subframes + /// + /// + /// + unsafe void unpack_samples(FLACCLTask task, int count) + { + int iFrame = task.frame.frame_number; + byte* srcptr = ((byte*)task.clSamplesBytesPtr) + iFrame * task.frameSize * PCM.BlockAlign; + if (PCM.BitsPerSample == 16) + unpack_samples_16(task, srcptr, count); + else if (PCM.BitsPerSample == 24) + unpack_samples_24(task, srcptr, count); + else + throw new Exception("Invalid BPS"); + } + unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FLACCLTask 
task, int current_frame_number) { task.frame.InitSize(task.frameSize, eparams.variable_block_size != 0); @@ -1492,8 +1601,8 @@ namespace CUETools.Codecs.FLACCL task.framePos = frame_pos; frame_count += nFrames; frame_pos += nFrames * blocksize; - if (!_settings.MappedMemory) - task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, sizeof(short) * channels * blocksize * nFrames, task.clSamplesBytesPtr); + if (!task.UseMappedMemory) + task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, PCM.BlockAlign * blocksize * nFrames, task.clSamplesBytesPtr); //task.openCLCQ.EnqueueUnmapMemObject(task.clSamplesBytes, task.clSamplesBytes.HostPtr); //task.openCLCQ.EnqueueMapBuffer(task.clSamplesBytes, true, MapFlags.WRITE, 0, task.samplesBufferLen / 2); } @@ -1530,20 +1639,38 @@ namespace CUETools.Codecs.FLACCL { int decoded = task.verify.DecodeFrame(task.frame.writer.Buffer, task.frame.writer_offset, fs); if (decoded != fs || task.verify.Remaining != task.frameSize) - throw new Exception("validation failed! frame size mismatch"); + throw new Exception(string.Format("validation failed! frame size mismatch, iFrame={0}, decoded=={1}, fs=={2}", fn, decoded, fs)); fixed (int* r = task.verify.Samples) { for (int ch = 0; ch < channels; ch++) { - short* res = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize + ch; + byte* res = ((byte*)task.clSamplesBytesPtr) + PCM.BlockAlign * iFrame * task.frameSize + ch * (PCM.BlockAlign / channels); int* smp = r + ch * Flake.MAX_BLOCKSIZE; - for (int i = task.frameSize; i > 0; i--) + int ba = PCM.BlockAlign; + if (PCM.BitsPerSample == 16) { - //if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize)) - if (*res != *(smp++)) - throw new Exception(string.Format("validation failed! 
iFrame={0}, ch={1}", iFrame, ch)); - res += channels; + for (int i = task.frameSize; i > 0; i--) + { + //if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize)) + int ress = *(short*)res; + if (ress != *(smp++)) + throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", fn, ch)); + res += ba; + } } + else if (PCM.BitsPerSample == 24) + { + for (int i = task.frameSize; i > 0; i--) + { + //if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize)) + int ress = (((int)res[0] << 8) + ((int)res[1] << 16) + ((int)res[2] << 24)) >> (8); + if (ress != *(smp++)) + throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", iFrame, ch)); + res += ba; + } + } + else + throw new Exception("Invalid BPS"); } } } @@ -1644,10 +1771,21 @@ namespace CUETools.Codecs.FLACCL } OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType); - this.framesPerTask = (int)OCLMan.Context.Devices[0].MaxComputeUnits * _settings.TaskSize; + this.framesPerTask = (int)OCLMan.Context.Devices[0].MaxComputeUnits * Math.Max(1, _settings.TaskSize / channels); - if (!OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics")) - _settings.GPUOnly = false; + bool UseGPUOnly = _settings.GPUOnly && OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics"); + bool UseGPURice = UseGPUOnly && _settings.DoRice; + + if (_blocksize == 0) + { + if (eparams.block_size == 0) + eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms); + _blocksize = eparams.block_size; + } + else + eparams.block_size = _blocksize; + + int maxBS = 1 << (BitReader.log2i(eparams.block_size - 1) + 1); // The Defines string gets prepended to any and all sources that are compiled // and serve as a convenient way to pass configuration information to the compilation process @@ 
-1655,8 +1793,11 @@ namespace CUETools.Codecs.FLACCL "#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" + "#define GROUP_SIZE " + groupSize.ToString() + "\n" + "#define FLACCL_VERSION \"" + vendor_string + "\"\n" + - (_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") + - (_settings.DoRice ? "#define DO_RICE\n" : "") + + (UseGPUOnly ? "#define DO_PARTITIONS\n" : "") + + (UseGPURice ? "#define DO_RICE\n" : "") + + "#define BITS_PER_SAMPLE " + PCM.BitsPerSample + "\n" + + "#define MAX_BLOCKSIZE " + maxBS + "\n" + + "#define MAX_CHANNELS " + PCM.ChannelCount + "\n" + #if DEBUG "#define DEBUG\n" + #endif @@ -1718,13 +1859,13 @@ namespace CUETools.Codecs.FLACCL if (_IO.CanSeek) first_frame_offset = _IO.Position; - task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize); - task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize); + task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice); + task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice); if (_settings.CPUThreads > 0) { cpu_tasks = new FLACCLTask[_settings.CPUThreads]; for (int i = 0; i < cpu_tasks.Length; i++) - cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize); + cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, this, groupSize, UseGPUOnly, UseGPURice); } inited = true; } @@ -1823,10 +1964,10 @@ namespace CUETools.Codecs.FLACCL public unsafe void do_output_frames(int nFrames) { - send_to_GPU(task1, nFrames, eparams.block_size); - run_GPU_task(task1); if (task2.frameCount > 0) task2.openCLCQ.Finish(); + send_to_GPU(task1, nFrames, eparams.block_size); + run_GPU_task(task1); if (task2.frameCount > 0) { if 
(cpu_tasks != null) @@ -1871,15 +2012,16 @@ namespace CUETools.Codecs.FLACCL { int blocksize = Flake.flac_blocksizes[1]; int target = (samplerate * time_ms) / 1000; - if (eparams.variable_block_size > 0) - { - blocksize = 1024; - while (target >= blocksize) - blocksize <<= 1; - return blocksize >> 1; - } - for (int i = 0; i < Flake.flac_blocksizes.Length; i++) + ////if (eparams.variable_block_size > 0) + ////{ + //// blocksize = 1024; + //// while (target >= blocksize) + //// blocksize <<= 1; + //// return blocksize >> 1; + ////} + + for (int i = 8; i < Flake.flac_blocksizes.Length; i++) if (target >= Flake.flac_blocksizes[i] && Flake.flac_blocksizes[i] > blocksize) { blocksize = Flake.flac_blocksizes[i]; @@ -2052,18 +2194,6 @@ namespace CUETools.Codecs.FLACCL } if (i == 8) throw new Exception("non-standard bps"); - // FIXME: For now, only 16-bit encoding is supported - if (bits_per_sample != 16) - throw new Exception("non-standard bps"); - - if (_blocksize == 0) - { - if (eparams.block_size == 0) - eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms); - _blocksize = eparams.block_size; - } - else - eparams.block_size = _blocksize; // set maximum encoded frame size (if larger, re-encodes in verbatim mode) if (channels == 2) @@ -2332,7 +2462,7 @@ namespace CUETools.Codecs.FLACCL public int type; public int obits; public int blocksize; - public int best_index; + public int coding_method; public int channel; public int residualOffs; public int wbits; @@ -2350,6 +2480,7 @@ namespace CUETools.Codecs.FLACCL public Kernel clStereoDecorr; //public Kernel cudaChannelDecorr; public Kernel clChannelDecorr2; + public Kernel clChannelDecorrX; public Kernel clFindWastedBits; public Kernel clComputeAutocor; public Kernel clComputeLPC; @@ -2428,9 +2559,15 @@ namespace CUETools.Codecs.FLACCL public int groupSize = 128; public int channels, channelsCount; public FLACCLWriter writer; + public bool UseGPUOnly = false; + public bool UseGPURice = false; + public 
bool UseMappedMemory = false; - unsafe public FLACCLTask(Program _openCLProgram, int channelsCount, int channels, uint bits_per_sample, int max_frame_size, FLACCLWriter writer, int groupSize) + unsafe public FLACCLTask(Program _openCLProgram, int channelsCount, int channels, uint bits_per_sample, int max_frame_size, FLACCLWriter writer, int groupSize, bool gpuOnly, bool gpuRice) { + this.UseGPUOnly = gpuOnly; + this.UseGPURice = gpuOnly && gpuRice; + this.UseMappedMemory = writer._settings.MappedMemory || writer._settings.DeviceType == OpenCLDeviceType.CPU; this.groupSize = groupSize; this.channels = channels; this.channelsCount = channelsCount; @@ -2448,9 +2585,9 @@ namespace CUETools.Codecs.FLACCL int MAX_CHANNELSIZE = MAX_FRAMES * writer.eparams.block_size; residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES; bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES; - int samplesBufferLen = sizeof(int) * MAX_CHANNELSIZE * channelsCount; + int samplesBufferLen = writer.PCM.BlockAlign * MAX_CHANNELSIZE * channelsCount; int residualBufferLen = sizeof(int) * MAX_CHANNELSIZE * channels; // need to adjust residualOffset? - int partitionsLen = sizeof(int) * (30 << 8) * channels * MAX_FRAMES; + int partitionsLen = sizeof(int) * ((writer.PCM.BitsPerSample > 16 ? 
31 : 15) * 2 << 8) * channels * MAX_FRAMES; int riceParamsLen = sizeof(int) * (4 << 8) * channels * MAX_FRAMES; int autocorLen = sizeof(float) * (MAX_ORDER + 1) * lpc.MAX_LPC_WINDOWS * channelsCount * MAX_FRAMES; int lpcDataLen = autocorLen * 32; @@ -2459,7 +2596,7 @@ namespace CUETools.Codecs.FLACCL int selectedLen = sizeof(int) * 32 * channelsCount * MAX_FRAMES; int riceLen = sizeof(int) * channels * MAX_CHANNELSIZE; - if (!writer._settings.MappedMemory) + if (!this.UseMappedMemory) { clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen / 2); clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualBufferLen); @@ -2521,7 +2658,7 @@ namespace CUETools.Codecs.FLACCL clAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, autocorLen); clSelectedTasksSecondEstimate = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen); clSelectedTasksBestMethod = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen); - if (writer._settings.GPUOnly) + if (UseGPUOnly) { clPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen); clRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen); @@ -2533,6 +2670,7 @@ namespace CUETools.Codecs.FLACCL clStereoDecorr = openCLProgram.CreateKernel("clStereoDecorr"); //cudaChannelDecorr = openCLProgram.CreateKernel("clChannelDecorr"); clChannelDecorr2 = openCLProgram.CreateKernel("clChannelDecorr2"); + clChannelDecorrX = openCLProgram.CreateKernel("clChannelDecorrX"); clFindWastedBits = openCLProgram.CreateKernel("clFindWastedBits"); clComputeLPC = openCLProgram.CreateKernel("clComputeLPC"); clQuantizeLPC = openCLProgram.CreateKernel("clQuantizeLPC"); @@ -2540,15 +2678,16 @@ namespace CUETools.Codecs.FLACCL clSelectStereoTasks = openCLProgram.CreateKernel("clSelectStereoTasks"); clEstimateResidual = openCLProgram.CreateKernel("clEstimateResidual"); clChooseBestMethod = 
openCLProgram.CreateKernel("clChooseBestMethod"); - if (writer._settings.GPUOnly) + if (UseGPUOnly) { clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual"); clCalcPartition = openCLProgram.CreateKernel("clCalcPartition"); - clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16"); + if (openCLCQ.Device.DeviceType != DeviceType.CPU) + clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16"); clSumPartition = openCLProgram.CreateKernel("clSumPartition"); clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter"); clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder"); - if (writer._settings.DoRice) + if (UseGPURice) { clCalcOutputOffsets = openCLProgram.CreateKernel("clCalcOutputOffsets"); clRiceEncoding = openCLProgram.CreateKernel("clRiceEncoding"); @@ -2586,6 +2725,7 @@ namespace CUETools.Codecs.FLACCL clStereoDecorr.Dispose(); //cudaChannelDecorr.Dispose(); clChannelDecorr2.Dispose(); + clChannelDecorrX.Dispose(); clFindWastedBits.Dispose(); clComputeLPC.Dispose(); clQuantizeLPC.Dispose(); @@ -2593,15 +2733,16 @@ namespace CUETools.Codecs.FLACCL clSelectStereoTasks.Dispose(); clEstimateResidual.Dispose(); clChooseBestMethod.Dispose(); - if (writer._settings.GPUOnly) + if (UseGPUOnly) { clEncodeResidual.Dispose(); clCalcPartition.Dispose(); - clCalcPartition16.Dispose(); + if (openCLCQ.Device.DeviceType != DeviceType.CPU) + clCalcPartition16.Dispose(); clSumPartition.Dispose(); clFindRiceParameter.Dispose(); clFindPartitionOrder.Dispose(); - if (writer._settings.DoRice) + if (UseGPURice) { clCalcOutputOffsets.Dispose(); clRiceEncoding.Dispose(); @@ -2611,7 +2752,7 @@ namespace CUETools.Codecs.FLACCL clRiceParams.Dispose(); } - if (!writer._settings.MappedMemory) + if (!this.UseMappedMemory) { if (clSamplesBytesPtr != IntPtr.Zero) openCLCQ.EnqueueUnmapMemObject(clSamplesBytesPinned, clSamplesBytesPtr); @@ -2701,19 +2842,36 @@ namespace CUETools.Codecs.FLACCL while ((frameSize >> max_porder) < 16 && 
max_porder > 0) this.max_porder--; - if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel - Kernel clChannelDecorr = channels == 2 ? (channelsCount == 4 ? clStereoDecorr : clChannelDecorr2) : null;// cudaChannelDecorr; - // openCLCQ.EnqueueMapBuffer(cudaSamplesBytes //openCLCQ.EnqueueUnmapMemObject(cudaSamplesBytes, cudaSamplesBytes.HostPtr); // issue work to the GPU - clChannelDecorr.SetArgs( - clSamples, - clSamplesBytes, - channelSize / 4); + if (channels == 2) + { + Kernel clChannelDecorr = channelsCount == 4 ? clStereoDecorr : clChannelDecorr2; + int channelSize1 = writer.PCM.BitsPerSample == 16 ? channelSize / 4 : channelSize; + clChannelDecorr.SetArgs( + clSamples, + clSamplesBytes, + channelSize1); - openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, channelSize / 4); + openCLCQ.EnqueueNDRangeKernel( + clChannelDecorr, + 0, + channelSize1); + } + else + { + clChannelDecorrX.SetArgs( + clSamples, + clSamplesBytes, + channelSize); + + openCLCQ.EnqueueNDRangeKernel( + clChannelDecorrX, + 0, + channelSize); + } //openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, (frameSize * frameCount + 3) / 4); if (eparams.do_wasted) @@ -2842,14 +3000,22 @@ namespace CUETools.Codecs.FLACCL 0, channels * frameCount); } - if (writer._settings.GPUOnly) + if (UseGPUOnly) { - if (frameSize >> max_porder == 16) + clEncodeResidual.SetArgs( + clResidual, + clSamples, + clBestResidualTasks); + + openCLCQ.EnqueueNDRangeKernel( + clEncodeResidual, + groupSize, channels * frameCount); + + if ((frameSize >> max_porder == 16) && openCLCQ.Device.DeviceType != DeviceType.CPU) { clCalcPartition16.SetArgs( clPartitions, clResidual, - clSamples, clBestResidualTasks, max_porder); @@ -2859,15 +3025,6 @@ namespace CUETools.Codecs.FLACCL } else { - clEncodeResidual.SetArgs( - clResidual, - clSamples, - clBestResidualTasks); - - openCLCQ.EnqueueNDRangeKernel( - clEncodeResidual, - groupSize, channels * frameCount); - clCalcPartition.SetArgs( 
clPartitions, clResidual, @@ -2895,6 +3052,7 @@ namespace CUETools.Codecs.FLACCL clPartitions, max_porder); + int maxK = writer.PCM.BitsPerSample > 16 ? 30 : Flake.MAX_RICE_PARAM; if (openCLCQ.Device.DeviceType == DeviceType.CPU) openCLCQ.EnqueueNDRangeKernel( clSumPartition, @@ -2904,7 +3062,7 @@ namespace CUETools.Codecs.FLACCL openCLCQ.EnqueueNDRangeKernel( clSumPartition, 128, 1, - (Flake.MAX_RICE_PARAM + 1), + (maxK + 1), channels * frameCount); } @@ -2931,7 +3089,7 @@ namespace CUETools.Codecs.FLACCL groupSize, channels * frameCount); - if (writer._settings.DoRice) + if (UseGPURice) { clCalcOutputOffsets.SetArgs( clResidual, @@ -2960,10 +3118,10 @@ namespace CUETools.Codecs.FLACCL channels * frameCount); } - if (!writer._settings.MappedMemory) + if (!this.UseMappedMemory) { - if (writer._settings.DoRice) - openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, (channels * frameSize * 17 + 128) / 8 * frameCount, clRiceOutputPtr); + if (UseGPURice) + openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, (channels * frameSize * (writer.PCM.BitsPerSample + 1) + 256) / 8 * frameCount, clRiceOutputPtr); else { openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParamsPtr); @@ -2971,7 +3129,7 @@ namespace CUETools.Codecs.FLACCL } } } - if (!writer._settings.MappedMemory) + if (!this.UseMappedMemory) openCLCQ.EnqueueReadBuffer(clBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * frameCount, clBestResidualTasksPtr); } } diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index 8cd66c2..a671445 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -65,6 +65,14 @@ #define WARP_SIZE 32 +#if BITS_PER_SAMPLE > 16 +#define MAX_RICE_PARAM 30 +#define RICE_PARAM_BITS 5 +#else +#define MAX_RICE_PARAM 14 +#define RICE_PARAM_BITS 4 +#endif + typedef enum { Constant = 0, @@ -83,7 +91,7 @@ typedef struct int type; int obits; int blocksize; - int 
best_index; + int coding_method; int channel; int residualOffs; int wbits; @@ -125,6 +133,49 @@ __kernel void clWindowTukey(__global float* window, int windowOffset, float p) } #endif +#if BITS_PER_SAMPLE > 16 +__kernel void clStereoDecorr( + __global int *samples, + __global unsigned char *src, + int offset +) +{ + int pos = get_global_id(0); + int bpos = pos * 6; + int x = (((int)src[bpos] << 8) | ((int)src[bpos+1] << 16) | ((int)src[bpos+2] << 24)) >> 8; + int y = (((int)src[bpos+3] << 8) | ((int)src[bpos+4] << 16) | ((int)src[bpos+5] << 24)) >> 8; + samples[pos] = x; + samples[1 * offset + pos] = y; + samples[2 * offset + pos] = (x + y) >> 1; + samples[3 * offset + pos] = x - y; +} + +__kernel void clChannelDecorr2( + __global int *samples, + __global unsigned char *src, + int offset +) +{ + int pos = get_global_id(0); + int bpos = pos * 6; + samples[pos] = (((int)src[bpos] << 8) | ((int)src[bpos+1] << 16) | ((int)src[bpos+2] << 24)) >> 8; + samples[offset + pos] = (((int)src[bpos+3] << 8) | ((int)src[bpos+4] << 16) | ((int)src[bpos+5] << 24)) >> 8; +} + +__kernel void clChannelDecorrX( + __global int *samples, + __global unsigned char *src, + int offset +) +{ + int pos = get_global_id(0); + for (int ch = 0; ch < MAX_CHANNELS; ch++) + { + int bpos = 3 * (pos * MAX_CHANNELS + ch); + samples[offset * ch + pos] = (((int)src[bpos] << 8) | ((int)src[bpos+1] << 16) | ((int)src[bpos+2] << 24)) >> 8; + } +} +#else __kernel void clStereoDecorr( __global int4 *samples, __global int4 *src, @@ -153,6 +204,21 @@ __kernel void clChannelDecorr2( samples[offset + pos] = s >> 16; } +__kernel void clChannelDecorrX( + __global int *samples, + __global short *src, + int offset +) +{ + int pos = get_global_id(0); + for (int ch = 0; ch < MAX_CHANNELS; ch++) + { + int bpos = pos * MAX_CHANNELS + ch; + samples[offset * ch + pos] = src[bpos]; + } +} +#endif + //__kernel void clChannelDecorr( // int *samples, // short *src, @@ -598,7 +664,11 @@ void clQuantizeLPC( } // choose precision 
//int cbits = max(3, min(10, 5 + (abits >> 1))); // - convert_int_rte(shared.PE[order - 1]) +#if BITS_PER_SAMPLE > 16 + int cbits = max(3, min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits)); +#else int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - abits)); +#endif // calculate shift based on precision and number of leading zeroes in coeffs int shift = max(0,min(15, clz(tmpi) - 18 + cbits)); @@ -749,7 +819,11 @@ void clQuantizeLPC( //SUM32(shared.tmpi,tid,|=); // choose precision //int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - convert_int_rte(shared.PE[order - 1]) +#if BITS_PER_SAMPLE > 16 + int cbits = max(3, min(min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), 15)); +#else int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits)); +#endif // calculate shift based on precision and number of leading zeroes in coeffs int shift = max(0,min(15, clz(shared.maxcoef[i]) - 18 + cbits)); @@ -797,7 +871,6 @@ void clQuantizeLPC( #endif #ifdef FLACCL_CPU - inline int fastclz(int iv) { unsigned int v = (unsigned int)iv; @@ -809,17 +882,44 @@ inline int fastclz(int iv) x += (0 != (v >> x)); return 32 - x; } - -inline int calc_residual(__global int *ptr, int * coefs, int ro) +#else +inline int fastclz(int iv) { - int sum = 0; + return clz(iv); +} +#endif +inline int fastclz64(long iv) +{ + unsigned long v = (unsigned long)iv; + int x = (0 != (v >> 32)) * 32; + return 32 - x + fastclz(v >> x); +} + +#if BITS_PER_SAMPLE > 16 +typedef long residual_t; +#define residual_log(s) 
(63 - fastclz64(s)) +#define convert_bps4 convert_long4 +#define convert_bps_sat convert_int_sat +#define bpsint4 long4 +#else +typedef int residual_t; +#define residual_log(s) (31 - fastclz(s)) +#define convert_bps4 +#define convert_bps_sat +#define bpsint4 int4 +#endif + +#ifdef FLACCL_CPU +inline residual_t calc_residual(__global int *ptr, int * coefs, int ro) +{ + residual_t sum = 0; for (int i = 0; i < ro; i++) - sum += ptr[i] * coefs[i]; + sum += (residual_t) ptr[i] * coefs[i]; return sum; } #define ENCODE_N(cro,action) for (int pos = cro; pos < bs; pos ++) { \ - int t = (data[pos] - (calc_residual(data + pos - cro, task.coefs, cro) >> task.data.shift)) >> task.data.wbits; \ + residual_t t = (data[pos] - (calc_residual(data + pos - cro, task.coefs, cro) >> task.data.shift)) >> task.data.wbits; \ action; \ } #define SWITCH_N(action) \ @@ -861,7 +961,7 @@ void clEstimateResidual( for (int i = 0; i < 1 << EPO; i++) len[i] = 0; -#ifdef AMD +#if defined(AMD) || BITS_PER_SAMPLE > 16 SWITCH_N((t = (t << 1) ^ (t >> 31), len[pos >> (12 - EPO)] += t & 0x7fffff)) #else int4 c0 = vload4(0, &task.coefs[0]); @@ -884,21 +984,19 @@ void clEstimateResidual( int total = 0; for (int i = 0; i < 1 << EPO; i++) { - int res = min(0x7fffff,len[i]); - int k = iclamp(31 - (12 - EPO) - fastclz(res), 0, 14); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64) + int res = len[i]; + int k = iclamp(31 - fastclz(res) - (12 - EPO), 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64) total += (k << (12 - EPO)) + (res >> k); } int partLen = min(0x7ffffff, total) + (bs - ro); int obits = task.data.obits - task.data.wbits; tasks[selectedTask].data.size = min(obits * bs, - task.data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen : - task.data.type == LPC ? ro * obits + 4 + 5 + ro * task.data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen : + task.data.type == Fixed ? ro * obits + 6 + RICE_PARAM_BITS + partLen : + task.data.type == LPC ? 
ro * obits + 4 + 5 + ro * task.data.cbits + 6 + RICE_PARAM_BITS/* << porder */ + partLen : task.data.type == Constant ? obits * select(1, bs, partLen != bs - ro) : obits * bs); } #else - -#define MAX_BLOCKSIZE 4096 #define ESTPARTLOG 5 __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) @@ -1049,7 +1147,7 @@ void clEstimateResidual( // calculate rice partition bit length for every 32 samples barrier(CLK_LOCAL_MEM_FENCE); // Bug: if (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) > GROUP_SIZE - int pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? pl = psum[tid * 2] + psum[tid * 2 + 1] : 0; + int pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? psum[tid * 2] + psum[tid * 2 + 1] : 0; barrier(CLK_LOCAL_MEM_FENCE); // for (int pos = 0; pos < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2; pos += GROUP_SIZE) // { @@ -1060,7 +1158,7 @@ void clEstimateResidual( //if (offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2) // psum[offs] = pl; // } - int k = iclamp(31 - (ESTPARTLOG + 1) - clz(pl), 0, 14); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32) + int k = iclamp(31 - fastclz(pl) - (ESTPARTLOG + 1), 0, MAX_RICE_PARAM); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32) if (tid < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2) psum[tid] = (k << (ESTPARTLOG + 1)) + (pl >> k); barrier(CLK_LOCAL_MEM_FENCE); @@ -1075,8 +1173,8 @@ void clEstimateResidual( int pl = psum[0] + (bs - ro); int obits = task.data.obits - task.data.wbits; int len = min(obits * task.data.blocksize, - task.data.type == Fixed ? task.data.residualOrder * obits + 6 + (4 * 1/2) + pl : - task.data.type == LPC ? task.data.residualOrder * obits + 4 + 5 + task.data.residualOrder * task.data.cbits + 6 + (4 * 1/2)/* << porder */ + pl : + task.data.type == Fixed ? task.data.residualOrder * obits + 6 + RICE_PARAM_BITS + pl : + task.data.type == LPC ? 
task.data.residualOrder * obits + 4 + 5 + task.data.residualOrder * task.data.cbits + 6 + RICE_PARAM_BITS/* << porder */ + pl : task.data.type == Constant ? obits * select(1, task.data.blocksize, pl != task.data.blocksize - task.data.residualOrder) : obits * task.data.blocksize); tasks[selectedTask].data.size = len; @@ -1172,7 +1270,7 @@ void clEncodeResidual( int bs = task.data.blocksize; int ro = task.data.residualOrder; __global int *data = &samples[task.data.samplesOffs]; - SWITCH_N(residual[task.data.residualOffs + pos] = t); + SWITCH_N(residual[task.data.residualOffs + pos] = convert_bps_sat(t)); } #else // get_group_id(0) == task index @@ -1198,12 +1296,10 @@ void clEncodeResidual( barrier(CLK_LOCAL_MEM_FENCE); -#ifdef AMD - int4 cptr0 = vload4(0, &task.coefs[0]); - int4 cptr1 = vload4(1, &task.coefs[0]); + bpsint4 cptr0 = convert_bps4(vload4(0, &task.coefs[0])); + bpsint4 cptr1 = convert_bps4(vload4(1, &task.coefs[0])); #if MAX_ORDER > 8 - int4 cptr2 = vload4(2, &task.coefs[0]); -#endif + bpsint4 cptr2 = convert_bps4(vload4(2, &task.coefs[0])); #endif data[tid] = 0; @@ -1217,33 +1313,24 @@ void clEncodeResidual( // compute residual __local int* dptr = &data[tid + GROUP_SIZE - ro]; - int4 sum -#ifdef AMD - = cptr0 * vload4(0, dptr) - + cptr1 * vload4(1, dptr) -#else - = vload4(0, &task.coefs[0]) * vload4(0, dptr) - + vload4(1, &task.coefs[0]) * vload4(1, dptr) -#endif + bpsint4 sum + = cptr0 * convert_bps4(vload4(0, dptr)) + + cptr1 * convert_bps4(vload4(1, dptr)) #if MAX_ORDER > 8 -#ifdef AMD - + cptr2 * vload4(2, dptr) -#else - + vload4(2, &task.coefs[0]) * vload4(2, dptr) -#endif + + cptr2 * convert_bps4(vload4(2, dptr)) #if MAX_ORDER > 12 - + vload4(3, &task.coefs[0]) * vload4(3, dptr) + + convert_bps4(vload4(3, &task.coefs[0])) * convert_bps4(vload4(3, dptr)) #if MAX_ORDER > 16 - + vload4(4, &task.coefs[0]) * vload4(4, dptr) - + vload4(5, &task.coefs[0]) * vload4(5, dptr) - + vload4(6, &task.coefs[0]) * vload4(6, dptr) - + vload4(7, &task.coefs[0]) * 
vload4(7, dptr) + + convert_bps4(vload4(4, &task.coefs[0])) * convert_bps4(vload4(4, dptr)) + + convert_bps4(vload4(5, &task.coefs[0])) * convert_bps4(vload4(5, dptr)) + + convert_bps4(vload4(6, &task.coefs[0])) * convert_bps4(vload4(6, dptr)) + + convert_bps4(vload4(7, &task.coefs[0])) * convert_bps4(vload4(7, dptr)) #endif #endif #endif ; if (off >= ro && off < bs) - output[task.data.residualOffs + off] = data[tid + GROUP_SIZE] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift); + output[task.data.residualOffs + off] = convert_bps_sat(nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift)); barrier(CLK_LOCAL_MEM_FENCE); data[tid] = nextData; @@ -1254,7 +1341,7 @@ void clEncodeResidual( #ifdef FLACCL_CPU __kernel __attribute__((reqd_work_group_size(1, 1, 1))) void clCalcPartition( - __global int *partition_lengths, + __global ulong *partition_lengths, __global int *residual, __global FLACCLSubframeTask *tasks, int max_porder, // <= 8 @@ -1265,18 +1352,16 @@ void clCalcPartition( int bs = task.data.blocksize; int ro = task.data.residualOrder; //int psize = bs >> max_porder; - __global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1); + __global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1); for (int p = 0; p < (1 << max_porder); p++) - pl[p] = 0; + pl[p] = 0UL; for (int pos = ro; pos < bs; pos ++) { - int t = residual[task.data.residualOffs + pos]; - // overflow protection - t = clamp(t, -0x7fffff, 0x7fffff); + int s = residual[task.data.residualOffs + pos]; // convert to unsigned - t = (t << 1) ^ (t >> 31); + uint t = (s << 1) ^ (s >> 31); pl[pos / psize] += t; } } @@ -1292,15 +1377,15 @@ void clCalcPartition( int psize // == task.blocksize >> max_porder? 
) { - __local int pl[(GROUP_SIZE / 8)][15]; + __local uint pl[(GROUP_SIZE / 16)][MAX_RICE_PARAM + 1]; __local FLACCLSubframeData task; const int tid = get_local_id(0); if (tid < sizeof(task) / sizeof(int)) ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid]; - if (tid < (GROUP_SIZE / 8)) + if (tid < (GROUP_SIZE / 16)) { - for (int k = 0; k <= 14; k++) + for (int k = 0; k <= MAX_RICE_PARAM; k++) pl[tid][k] = 0; } barrier(CLK_LOCAL_MEM_FENCE); @@ -1311,14 +1396,14 @@ void clCalcPartition( { // fetch residual int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0; - // overflow protection - s = iclamp(s, -0x7fffff, 0x7fffff); // convert to unsigned - s = (s << 1) ^ (s >> 31); + uint t = (s << 1) ^ (s >> 31); // calc number of unary bits for each residual sample with each rice paramater - int part = (offs - start) / psize + (tid & 1) * (GROUP_SIZE / 16); - for (int k = 0; k <= 14; k++) - atom_add(&pl[part][k], s >> k); + int part = (offs - start) / psize; + // we must ensure that psize * (t >> k) doesn't overflow; + // i.e. 
t < ((1 << 32) >> (log2(psize) - k)) <= (1 << 32) >> (32 - clz(MAX_BLOCKSIZE) - k) + for (int k = 0; k <= MAX_RICE_PARAM; k++) + atom_add(&pl[part][k], min(t, 0xffffffffU >> max(0, 32 - clz(MAX_BLOCKSIZE) - k)) >> k); //pl[part][k] += s >> k; } barrier(CLK_LOCAL_MEM_FENCE); @@ -1326,141 +1411,79 @@ void clCalcPartition( int part = get_group_id(0) * (GROUP_SIZE / 16) + tid; if (tid < (GROUP_SIZE / 16) && part < (1 << max_porder)) { - for (int k = 0; k <= 14; k++) + for (int k = 0; k <= MAX_RICE_PARAM; k++) { // output length - const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1)); - int plen = pl[tid][k] + pl[tid + (GROUP_SIZE / 16)][k]; - partition_lengths[pos + part] = min(0x7fffff, plen) + (psize - select(0, task.residualOrder, part == 0)) * (k + 1); + const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1)); + uint plen = pl[tid][k]; + partition_lengths[pos + part] = min(0x007fffffU, plen) + (uint)(psize - select(0, task.residualOrder, part == 0)) * (k + 1); // if (get_group_id(1) == 0) //printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1)); } } } -#endif -#ifdef FLACCL_CPU -// get_group_id(0) == task index -__kernel __attribute__((reqd_work_group_size(1, 1, 1))) -void clCalcPartition16( - __global int *partition_lengths, - __global int *residual, - __global int *samples, - __global FLACCLSubframeTask *tasks, - int max_porder // <= 8 - ) -{ - FLACCLSubframeTask task = tasks[get_global_id(0)]; - int bs = task.data.blocksize; - int ro = task.data.residualOrder; - __global int *data = &samples[task.data.samplesOffs]; - __global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_global_id(0); - for (int p = 0; p < (1 << max_porder); p++) - pl[p] = 0; - __global int *rptr = residual + task.data.residualOffs; - SWITCH_N((rptr[pos] = t, pl[pos >> 4] += (t << 1) ^ (t >> 31))); - 
//SWITCH_N((residual[task.data.residualOffs + pos] = t, t = (t << 1) ^ (t >> 31), pl[pos >> 4] += t)); -} -#else -// get_group_id(0) == task index __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) void clCalcPartition16( - __global int *partition_lengths, + __global unsigned int *partition_lengths, __global int *residual, - __global int *samples, __global FLACCLSubframeTask *tasks, int max_porder // <= 8 ) { - __local FLACCLSubframeTask task; - __local int data[GROUP_SIZE * 2]; - __local int res[GROUP_SIZE]; - __local int pl[GROUP_SIZE >> 4][15]; + __local FLACCLSubframeData task; + __local unsigned int res[GROUP_SIZE]; + __local unsigned int pl[GROUP_SIZE >> 4][MAX_RICE_PARAM + 1]; const int tid = get_local_id(0); if (tid < sizeof(task) / sizeof(int)) ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid]; barrier(CLK_LOCAL_MEM_FENCE); - int bs = task.data.blocksize; - int ro = task.data.residualOrder; - int sh = task.data.shift; - - if (tid >= ro && tid < 32) - task.coefs[tid] = 0; - - int k = tid & 15; - int x = tid / 16; + int bs = task.blocksize; + int ro = task.residualOrder; barrier(CLK_LOCAL_MEM_FENCE); - __global int * rptr = &residual[task.data.residualOffs]; - __global int * plptr = &partition_lengths[(15 << (max_porder + 1)) * get_group_id(0) + (k << (max_porder + 1))]; - __local int* dptr = &data[tid + GROUP_SIZE - ro]; - - int4 cptr0 = vload4(0, &task.coefs[0]); - int4 cptr1 = vload4(1, &task.coefs[0]); - int4 cptr2 = vload4(2, &task.coefs[0]); - data[tid] = 0; for (int pos = 0; pos < bs; pos += GROUP_SIZE) { int offs = pos + tid; - // fetch samples - int nextData = offs < bs ? 
samples[task.data.samplesOffs + offs] >> task.data.wbits : 0; - data[tid + GROUP_SIZE] = nextData; - barrier(CLK_LOCAL_MEM_FENCE); - - // compute residual - int4 sum = cptr0 * vload4(0, dptr) -#if MAX_ORDER > 4 - + cptr1 * vload4(1, dptr) -#if MAX_ORDER > 8 - + cptr2 * vload4(2, dptr) -#if MAX_ORDER > 12 - + vload4(3, &task.coefs[0]) * vload4(3, dptr) -#if MAX_ORDER > 16 - + vload4(4, &task.coefs[0]) * vload4(4, dptr) - + vload4(5, &task.coefs[0]) * vload4(5, dptr) - + vload4(6, &task.coefs[0]) * vload4(6, dptr) - + vload4(7, &task.coefs[0]) * vload4(7, dptr) -#endif -#endif -#endif -#endif - ; - int s = select(0, nextData - ((sum.x + sum.y + sum.z + sum.w) >> sh), offs >= ro && offs < bs); - - // output residual - if (offs < bs) - rptr[offs] = s; - - s = iclamp(s, -0x7fffff, 0x7fffff); + // fetch residual + int s = (offs >= ro && offs < bs) ? residual[task.residualOffs + offs] : 0; // convert to unsigned res[tid] = (s << 1) ^ (s >> 31); - // for (int k = 0; k < 15; k++) atom_add(&pl[x][k], s >> k); + barrier(CLK_LOCAL_MEM_FENCE); + + for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16) + { + // calc number of unary bits for each group of 16 residual samples + // with each rice parameter. + int k = k0 + (tid & 15); + int x = tid >> 4; + // we must ensure that psize * (t >> k) doesn't overflow; + // i.e. t < ((1 << 32) >> (log2(16) - k)) <= (1 << 32) >> (4 - k) + uint4 lim = 0xffffffffU >> max(0, 4 - k); + __local uint * chunk = &res[x << 4]; + uint4 rsum = (min(lim,vload4(0,chunk)) >> k) + (min(lim,vload4(1,chunk)) >> k) + (min(lim,vload4(2,chunk)) >> k) + (min(lim,vload4(3,chunk)) >> k); + uint rs = rsum.x + rsum.y + rsum.z + rsum.w; + + // We can safely limit length here to 0x007fffffU, not causing length + // mismatch, because any such length would cause Verbatim frame anyway. 
+ // And this limit protects us from overflows when calculating larger + // partitions, as we can have a maximum of 2^8 partitions, resulting + // in maximum partition length of 0x7fffffffU + change. + if (k <= MAX_RICE_PARAM) pl[x][k] = min(0x007fffffU, rs) + (uint)(16 - select(0, ro, offs < 16)) * (k + 1); + } barrier(CLK_LOCAL_MEM_FENCE); - data[tid] = nextData; - - // calc number of unary bits for each residual sample with each rice paramater - __local int * chunk = &res[x << 4]; - sum = (vload4(0,chunk) >> k) + (vload4(1,chunk) >> k) + (vload4(2,chunk) >> k) + (vload4(3,chunk) >> k); - s = sum.x + sum.y + sum.z + sum.w; - -#if 0 - if (k <= 14 && offs < bs) - plptr[offs >> 4] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1); -#else - if (k <= 14) pl[x][k] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1); - barrier(CLK_LOCAL_MEM_FENCE); - int k1 = tid >> 3, x1 = tid & 7; - if (k1 <= 14 && (pos >> 4) + x1 < (1 << max_porder)) - partition_lengths[(15 << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1]; -#endif - -// if (task.data.blocksize == 16 && x == 0 && k <= 14) -// printf("[%d] = %d = s:%d + %d * (k:%d + 1), ro=%d, offs=%d, lpos=%d\n", k, partition_lengths[lpos], s, (16 - select(0, ro, offs < 16)), k, ro, offs, lpos); + + for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16) + { + int k1 = k0 + (tid >> 3), x1 = tid & 7; + if (k1 <= MAX_RICE_PARAM && (pos >> 4) + x1 < (1 << max_porder)) + partition_lengths[((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1]; + } } } #endif @@ -1471,13 +1494,13 @@ void clCalcPartition16( // get_group_id(1) == task index __kernel __attribute__((reqd_work_group_size(1, 1, 1))) void clSumPartition( - __global int* partition_lengths, + __global ulong* partition_lengths, int max_porder ) { if (get_group_id(0) != 0) // ignore k != 0 return; - __global int * sums = partition_lengths + (1 << 
(max_porder + 1)) * get_group_id(1); + __global ulong * sums = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1); for (int i = max_porder - 1; i >= 0; i--) { for (int j = 0; j < (1 << i); j++) @@ -1496,15 +1519,15 @@ void clSumPartition( // get_group_id(1) == task index __kernel __attribute__((reqd_work_group_size(128, 1, 1))) void clSumPartition( - __global int* partition_lengths, + __global uint* partition_lengths, int max_porder ) { - __local int data[256]; // max_porder <= 8, data length <= 1 << 9. - const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1)); + __local uint data[256]; // max_porder <= 8, data length <= 1 << 9. + const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1)); // fetch partition lengths - int2 pl = get_local_id(0) * 2 < (1 << max_porder) ? vload2(get_local_id(0),&partition_lengths[pos]) : 0; + uint2 pl = get_local_id(0) * 2 < (1 << max_porder) ? vload2(get_local_id(0),&partition_lengths[pos]) : 0; data[get_local_id(0)] = pl.x + pl.y; barrier(CLK_LOCAL_MEM_FENCE); @@ -1512,7 +1535,7 @@ void clSumPartition( int out_pos = (1 << (max_porder - 1)) + get_local_id(0); for (int bs = 1 << (max_porder - 2); bs > 0; bs >>= 1) { - if (get_local_id(0) < bs) data[out_pos] = data[in_pos] + data[in_pos + 1]; + if (get_local_id(0) < bs) data[out_pos] = data[in_pos] + data[in_pos + 1]; in_pos += bs << 1; out_pos += bs; barrier(CLK_LOCAL_MEM_FENCE); @@ -1531,7 +1554,7 @@ __kernel __attribute__((reqd_work_group_size(1, 1, 1))) void clFindRiceParameter( __global FLACCLSubframeTask *tasks, __global int* rice_parameters, - __global int* partition_lengths, + __global ulong* partition_lengths, int max_porder ) { @@ -1541,7 +1564,7 @@ void clFindRiceParameter( //int psize = task->data.blocksize >> max_porder; int bs = task->data.blocksize; int ro = task->data.residualOrder; - __global int* ppl = &partition_lengths[get_group_id(0) << (max_porder + 1)]; 
+ __global ulong* ppl = &partition_lengths[get_group_id(0) << (max_porder + 1)]; __global int* prp = &rice_parameters[get_group_id(0) << (max_porder + 2)]; __global int* pol = prp + (1 << (max_porder + 1)); for (int porder = max_porder; porder >= 0; porder--) @@ -1549,10 +1572,10 @@ void clFindRiceParameter( int pos = (2 << max_porder) - (2 << porder); int fin = pos + (1 << porder); - int pl = ppl[pos]; + ulong pl = ppl[pos]; int ps = (bs >> porder) - ro; - int k = iclamp(31 - fastclz(pl / max(1, ps)), 0, 14); - int plk = ps * (k + 1) + (pl >> k); + int k = iclamp(63 - fastclz64(pl / max(1, ps)), 0, MAX_RICE_PARAM); + int plk = ps * (k + 1) + (int)(pl >> k); // output rice parameter prp[pos] = k; @@ -1564,8 +1587,8 @@ void clFindRiceParameter( for (int offs = pos + 1; offs < fin; offs++) { pl = ppl[offs]; - k = iclamp(31 - fastclz(pl / ps), 0, 14); - plk = ps * (k + 1) + (pl >> k); + k = iclamp(63 - fastclz64(pl / ps), 0, MAX_RICE_PARAM); + plk = ps * (k + 1) + (int)(pl >> k); // output rice parameter prp[offs] = k; @@ -1581,18 +1604,18 @@ __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) void clFindRiceParameter( __global FLACCLSubframeTask *tasks, __global int* rice_parameters, - __global int* partition_lengths, + __global uint* partition_lengths, int max_porder ) { for (int offs = get_local_id(0); offs < (2 << max_porder); offs += GROUP_SIZE) { - const int pos = (15 << (max_porder + 1)) * get_group_id(0) + offs; - int best_l = partition_lengths[pos]; + const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + offs; + uint best_l = partition_lengths[pos]; int best_k = 0; - for (int k = 1; k <= 14; k++) + for (int k = 1; k <= MAX_RICE_PARAM; k++) { - int l = partition_lengths[pos + (k << (max_porder + 1))]; + uint l = partition_lengths[pos + (k << (max_porder + 1))]; best_k = select(best_k, k, l < best_l); best_l = min(best_l, l); } @@ -1630,16 +1653,16 @@ void clFindPartitionOrder( partlen[porder] += rice_parameters[pos + start 
+ offs]; } - int best_length = partlen[0] + 4; + int best_length = partlen[0] + RICE_PARAM_BITS; int best_porder = 0; for (int porder = 1; porder <= max_porder; porder++) { - int length = (4 << porder) + partlen[porder]; + int length = (RICE_PARAM_BITS << porder) + partlen[porder]; best_porder = select(best_porder, porder, length < best_length); best_length = min(best_length, length); } - best_length = (4 << best_porder) + task->data.blocksize - task->data.residualOrder; + best_length = (RICE_PARAM_BITS << best_porder) + task->data.blocksize - task->data.residualOrder; int best_psize = task->data.blocksize >> best_porder; int start = task->data.residualOffs + task->data.residualOrder; int fin = task->data.residualOffs + best_psize; @@ -1704,11 +1727,11 @@ void clFindPartitionOrder( } barrier(CLK_LOCAL_MEM_FENCE); - int best_length = partlen[0] + 4; + int best_length = partlen[0] + RICE_PARAM_BITS; int best_porder = 0; for (int porder = 1; porder <= max_porder; porder++) { - int length = (4 << porder) + partlen[porder]; + int length = (RICE_PARAM_BITS << porder) + partlen[porder]; best_porder = select(best_porder, porder, length < best_length); best_length = min(best_length, length); } @@ -1836,14 +1859,14 @@ void clCalcOutputOffsets( ) { const int channels = 2; - __local FLACCLSubframeData ltasks[2]; - __local volatile int mypos[2]; + __local FLACCLSubframeData ltasks[MAX_CHANNELS]; + __local volatile int mypos[MAX_CHANNELS]; int offset = 0; for (int iFrame = 0; iFrame < frameCount; iFrame++) { if (get_local_id(0) < sizeof(ltasks[0]) / sizeof(int)) - for (int ch = 0; ch < channels; ch++) - ((__local int*)&ltasks[ch])[get_local_id(0)] = ((__global int*)(&tasks[iFrame * channels + ch]))[get_local_id(0)]; + for (int ch = 0; ch < MAX_CHANNELS; ch++) + ((__local int*)&ltasks[ch])[get_local_id(0)] = ((__global int*)(&tasks[iFrame * MAX_CHANNELS + ch]))[get_local_id(0)]; //printf("len_utf8(%d) == %d\n", firstFrame + iFrame, len_utf8(firstFrame + iFrame)); offset += 15 + 1 + 4
+ 4 + 4 + 3 + 1 + len_utf8(firstFrame + iFrame) @@ -1856,18 +1879,18 @@ void clCalcOutputOffsets( // assert (offset % 8) == 0 offset += 8; - if (get_local_id(0) < channels) + if (get_local_id(0) < MAX_CHANNELS) { int ch = get_local_id(0); // Add 64 bits to separate frames if header is too small so they can intersect int mylen = 8 + ltasks[ch].wbits + 64 + ltasks[ch].size; mypos[ch] = mylen; - for (int offset = 1; offset < WARP_SIZE && offset < channels; offset <<= 1) + for (int offset = 1; offset < WARP_SIZE && offset < MAX_CHANNELS; offset <<= 1) if (ch >= offset) mypos[ch] += mypos[ch - offset]; mypos[ch] += offset; - tasks[iFrame * channels + ch].data.encodingOffset = mypos[ch] - ltasks[ch].size + ltasks[ch].headerLen; + tasks[iFrame * MAX_CHANNELS + ch].data.encodingOffset = mypos[ch] - ltasks[ch].size + ltasks[ch].headerLen; } - offset = mypos[channels - 1]; + offset = mypos[MAX_CHANNELS - 1]; offset = (offset + 7) & ~7; offset += 16; } @@ -1909,7 +1932,7 @@ void clRiceEncoding( for (int p = 0; p < (1 << porder); p++) { int k = kptr[p]; - writebits(&bw, 4, k); + writebits(&bw, RICE_PARAM_BITS, k); //if (get_group_id(0) == 0) printf("[%x] ", k); //if (get_group_id(0) == 0) printf("(%x) ", bw.bit_buf); if (p == 1) res_cnt = psize; @@ -1978,7 +2001,7 @@ void clRiceEncoding( flush(&bw); } #else - __local unsigned int data[GROUP_SIZE]; + __local uint data[GROUP_SIZE]; __local volatile int mypos[GROUP_SIZE+1]; #if 0 __local int brp[256]; @@ -2006,12 +2029,12 @@ void clRiceEncoding( int start = task.encodingOffset; int plen = bs >> task.porder; //int plenoffs = 12 - task.porder; - unsigned int remainder = 0U; + uint remainder = 0U; int pos; for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE) { int offs = pos + tid; - int v = residual[task.residualOffs + offs]; + int iv = residual[task.residualOffs + offs]; int part = offs / plen; // >> plenoffs; #if 0 int k = brp[part]; @@ -2019,10 +2042,10 @@ void clRiceEncoding( int k = best_rice_parameters[(get_group_id(0) 
<< max_porder) + part]; #endif int pstart = offs == task.residualOrder || offs == part * plen; - v = (v << 1) ^ (v >> 31); - int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs); + uint v = (iv << 1) ^ (iv >> 31); + int mylen = select(0, (int)(v >> k) + 1 + k + select(0, RICE_PARAM_BITS, pstart), offs >= task.residualOrder && offs < bs); mypos[tid] = mylen; - + // Inclusive scan(+) int lane = (tid & (WARP_SIZE - 1)); for (int offset = 1; offset < WARP_SIZE; offset <<= 1) @@ -2040,7 +2063,8 @@ void clRiceEncoding( mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0); int start32 = start >> 5; start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2]; - + //if (start / 32 - start32 >= GROUP_SIZE - 3) + // tasks[get_group_id(0)].data.size = 1; //if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32) // printf("Oops: %d\n", mypos[tid]); data[tid] = select(0U, remainder, tid == 0); @@ -2052,18 +2076,18 @@ void clRiceEncoding( int kpos = mp - mylen; int kpos0 = (kpos >> 5) - start32; int kpos1 = kpos & 31; - unsigned int kval = (unsigned int)k << 28; - unsigned int kval0 = kval >> kpos1; - unsigned int kval1 = kval << (32 - kpos1); + uint kval = (uint)k << (32 - RICE_PARAM_BITS); + uint kval0 = kval >> kpos1; + uint kval1 = kval << (32 - kpos1); if (kval0) atom_or(&data[kpos0], kval0); if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1); } int qpos = mp - k - 1; int qpos0 = (qpos >> 5) - start32; int qpos1 = qpos & 31; - unsigned int qval = (1U << 31) | ((unsigned int)v << (31 - k)); - unsigned int qval0 = qval >> qpos1; - unsigned int qval1= qval << (32 - qpos1); + uint qval = (1U << 31) | (v << (31 - k)); + uint qval0 = qval >> qpos1; + uint qval1= qval << (32 - qpos1); if (qval0) atom_or(&data[qpos0], qval0); if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1); } @@ -2075,13 +2099,13 @@ void clRiceEncoding( if (pos < bs) { int offs = pos + tid; - int v = offs < bs ? 
residual[task.residualOffs + offs] : 0; + int iv = offs < bs ? residual[task.residualOffs + offs] : 0; int part = offs / plen; // >> plenoffs; //int k = brp[min(255, part)]; int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0; int pstart = offs == task.residualOrder || offs == part * plen; - v = (v << 1) ^ (v >> 31); - int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs); + uint v = (iv << 1) ^ (iv >> 31); + int mylen = select(0, (int)(v >> k) + 1 + k + select(0, RICE_PARAM_BITS, pstart), offs >= task.residualOrder && offs < bs); mypos[tid] = mylen; // Inclusive scan(+) @@ -2113,18 +2137,18 @@ void clRiceEncoding( int kpos = mp - mylen; int kpos0 = (kpos >> 5) - start32; int kpos1 = kpos & 31; - unsigned int kval = (unsigned int)k << 28; - unsigned int kval0 = kval >> kpos1; - unsigned int kval1 = kval << (32 - kpos1); + uint kval = (uint)k << (32 - RICE_PARAM_BITS); + uint kval0 = kval >> kpos1; + uint kval1 = kval << (32 - kpos1); if (kval0) atom_or(&data[kpos0], kval0); if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1); } int qpos = mp - k - 1; int qpos0 = (qpos >> 5) - start32; int qpos1 = qpos & 31; - unsigned int qval = (1U << 31) | ((unsigned int)v << (31 - k)); - unsigned int qval0 = qval >> qpos1; - unsigned int qval1= qval << (32 - qpos1); + uint qval = (1U << 31) | (v << (31 - k)); + uint qval0 = qval >> qpos1; + uint qval1= qval << (32 - qpos1); if (qval0) atom_or(&data[qpos0], qval0); if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1); } diff --git a/CUETools.Codecs.FLAKE/Flake.cs b/CUETools.Codecs.FLAKE/Flake.cs index ab474f9..fd69f5f 100644 --- a/CUETools.Codecs.FLAKE/Flake.cs +++ b/CUETools.Codecs.FLAKE/Flake.cs @@ -83,6 +83,11 @@ namespace CUETools.Codecs.FLAKE /// public int porder; + /// + /// coding method: rice parameters use 4 bits for coding_method 0 and 5 bits for coding_method 1 + /// + public int coding_method; + /// /// Rice parameters /// diff --git 
a/CUETools.Codecs.FLAKE/FlakeReader.cs b/CUETools.Codecs.FLAKE/FlakeReader.cs index cfb0f1e..78d81e5 100644 --- a/CUETools.Codecs.FLAKE/FlakeReader.cs +++ b/CUETools.Codecs.FLAKE/FlakeReader.cs @@ -102,7 +102,7 @@ namespace CUETools.Codecs.FLAKE } _samplesInBuffer = 0; - if (PCM.BitsPerSample != 16 || PCM.ChannelCount != 2 || PCM.SampleRate != 44100) + if ((PCM.BitsPerSample != 16 && PCM.BitsPerSample != 24) || PCM.ChannelCount != 2 || (PCM.SampleRate != 44100 && PCM.SampleRate != 48000)) throw new Exception("invalid flac file"); samplesBuffer = new int[Flake.MAX_BLOCKSIZE * PCM.ChannelCount]; @@ -362,8 +362,9 @@ namespace CUETools.Codecs.FLAKE unsafe void decode_residual(BitReader bitreader, FlacFrame frame, int ch) { // rice-encoded block - uint coding_method = bitreader.readbits(2); // ????? == 0 - if (coding_method != 0 && coding_method != 1) // if 1, then parameter length == 5 bits instead of 4 + // coding method + frame.subframes[ch].best.rc.coding_method = (int)bitreader.readbits(2); // ????? 
== 0 + if (frame.subframes[ch].best.rc.coding_method != 0 && frame.subframes[ch].best.rc.coding_method != 1) throw new Exception("unsupported residual coding"); // partition order frame.subframes[ch].best.rc.porder = (int)bitreader.readbits(4); @@ -372,7 +373,7 @@ namespace CUETools.Codecs.FLAKE int psize = frame.blocksize >> frame.subframes[ch].best.rc.porder; int res_cnt = psize - frame.subframes[ch].best.order; - int rice_len = 4 + (int)coding_method; + int rice_len = 4 + frame.subframes[ch].best.rc.coding_method; // residual int j = frame.subframes[ch].best.order; int* r = frame.subframes[ch].best.residual + j; diff --git a/CUETools.Codecs.FLAKE/FlakeWriter.cs b/CUETools.Codecs.FLAKE/FlakeWriter.cs index cab0e5b..9cbc6a4 100644 --- a/CUETools.Codecs.FLAKE/FlakeWriter.cs +++ b/CUETools.Codecs.FLAKE/FlakeWriter.cs @@ -125,8 +125,8 @@ namespace CUETools.Codecs.FLAKE { _pcm = pcm; - if (_pcm.BitsPerSample != 16) - throw new Exception("Bits per sample must be 16."); + //if (_pcm.BitsPerSample != 16) + // throw new Exception("Bits per sample must be 16."); if (_pcm.ChannelCount != 2) throw new Exception("ChannelCount must be 2."); @@ -571,14 +571,14 @@ namespace CUETools.Codecs.FLAKE samplesInBuffer += block; } - unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize) - { - for (int i = 0; i < blocksize; i++) - { - leftM[i] = (leftS[i] + rightS[i]) >> 1; - rightM[i] = leftS[i] - rightS[i]; - } - } + //unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize) + //{ + // for (int i = 0; i < blocksize; i++) + // { + // leftM[i] = (leftS[i] + rightS[i]) >> 1; + // rightM[i] = leftS[i] - rightS[i]; + // } + //} unsafe void encode_residual_verbatim(int* res, int* smp, uint n) { @@ -638,24 +638,28 @@ namespace CUETools.Codecs.FLAKE } } - static unsafe uint calc_optimal_rice_params(int porder, int* parm, uint* sums, uint n, uint pred_order) + static unsafe uint 
calc_optimal_rice_params(int porder, int* parm, ulong* sums, uint n, uint pred_order, ref int method) { uint part = (1U << porder); uint cnt = (n >> porder) - pred_order; - int k = cnt > 0 ? Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[0] / cnt)) : 0; - uint all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k); + int maxK = method > 0 ? 30 : Flake.MAX_RICE_PARAM; + int k = cnt > 0 ? Math.Min(maxK, BitReader.log2i(sums[0] / cnt)) : 0; + int realMaxK0 = k; + ulong all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k); parm[0] = k; cnt = (n >> porder); for (uint i = 1; i < part; i++) { - k = Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[i] / cnt)); + k = Math.Min(maxK, BitReader.log2i(sums[i] / cnt)); + realMaxK0 = Math.Max(realMaxK0, k); all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k); parm[i] = k; } - return all_bits + (4 * part); + method = realMaxK0 > Flake.MAX_RICE_PARAM ? 1 : 0; + return (uint)all_bits + ((4U + (uint)method) * part); } - static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums) + static unsafe void calc_lower_sums(int pmin, int pmax, ulong* sums) { for (int i = pmax - 1; i >= pmin; i--) { @@ -668,12 +672,12 @@ namespace CUETools.Codecs.FLAKE } } - static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) + static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums) { int parts = (1 << pmax); uint* res = data + pred_order; uint cnt = (n >> pmax) - pred_order; - uint sum = 0; + ulong sum = 0; for (uint j = cnt; j > 0; j--) sum += *(res++); sums[0] = sum; @@ -696,18 +700,18 @@ namespace CUETools.Codecs.FLAKE /// /// /// - static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) + static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums) { int parts = (1 << pmax); uint* res = data + pred_order; uint cnt = 18 - pred_order; - uint sum = 0; + ulong sum = 0; for (uint j = cnt; 
j > 0; j--) sum += *(res++); sums[0] = sum; for (int i = 1; i < parts; i++) { - sums[i] = + sums[i] = 0UL + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + @@ -725,18 +729,18 @@ namespace CUETools.Codecs.FLAKE /// /// /// - static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) + static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums) { int parts = (1 << pmax); uint* res = data + pred_order; uint cnt = 16 - pred_order; - uint sum = 0; + ulong sum = 0; for (uint j = cnt; j > 0; j--) sum += *(res++); sums[0] = sum; for (int i = 1; i < parts; i++) { - sums[i] = + sums[i] = 0UL + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + *(res++) + @@ -744,10 +748,10 @@ namespace CUETools.Codecs.FLAKE } } - static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order) + static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order, int bps) { uint* udata = stackalloc uint[(int)n]; - uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS]; + ulong* sums = stackalloc ulong[(pmax + 1) * Flake.MAX_PARTITIONS]; int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS]; //uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER]; @@ -770,17 +774,21 @@ namespace CUETools.Codecs.FLAKE uint opt_bits = AudioSamples.UINT32_MAX; int opt_porder = pmin; + int opt_method = 0; for (int i = pmin; i <= pmax; i++) { - uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order); + int method = bps > 16 ? 
1 : 0; + uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order, ref method); if (bits <= opt_bits) { opt_bits = bits; opt_porder = i; + opt_method = method; } } rc.porder = opt_porder; + rc.coding_method = opt_method; fixed (int* rparms = rc.rparams) AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder)); @@ -841,7 +849,7 @@ namespace CUETools.Codecs.FLAKE } int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.current.order); int pmin = Math.Min(eparams.min_partition_order, pmax); - uint best_size = calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order); + uint best_size = calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order, PCM.BitsPerSample); // not working //for (int o = 1; o <= frame.current.order; o++) //{ @@ -877,7 +885,7 @@ namespace CUETools.Codecs.FLAKE int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.current.order); int pmin = Math.Min(eparams.min_partition_order, pmax); frame.current.size = (uint)(frame.current.order * frame.subframes[ch].obits) + 6 - + calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order); + + calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order, PCM.BitsPerSample); frame.subframes[ch].done_fixed |= (1U << order); @@ -1054,7 +1062,7 @@ namespace CUETools.Codecs.FLAKE unsafe void output_residual(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) { // rice-encoded block - bitwriter.writebits(2, 0); + bitwriter.writebits(2, sub.best.rc.coding_method); // partition order int porder = sub.best.rc.porder; @@ -1063,13 +1071,14 @@ namespace CUETools.Codecs.FLAKE bitwriter.writebits(4, porder); int res_cnt = psize - 
sub.best.order; + int rice_len = 4 + sub.best.rc.coding_method; // residual int j = sub.best.order; fixed (byte* fixbuf = &frame_buffer[0]) for (int p = 0; p < (1 << porder); p++) { int k = sub.best.rc.rparams[p]; - bitwriter.writebits(4, k); + bitwriter.writebits(rice_len, k); if (p == 1) res_cnt = psize; int cnt = Math.Min(res_cnt, frame.blocksize - j); bitwriter.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt); @@ -1436,6 +1445,9 @@ namespace CUETools.Codecs.FLAKE output_subframes(frame, bitwriter); output_frame_footer(bitwriter); + if (bitwriter.Length >= max_frame_size) + throw new Exception("buffer overflow"); + if (frame_buffer != null) { if (eparams.variable_block_size > 0) @@ -1732,9 +1744,6 @@ namespace CUETools.Codecs.FLAKE } if (i == 8) throw new Exception("non-standard bps"); - // FIXME: For now, only 16-bit encoding is supported - if (_pcm.BitsPerSample != 16) - throw new Exception("non-standard bps"); if (_blocksize == 0) { diff --git a/CUETools.Codecs/BitReader.cs b/CUETools.Codecs/BitReader.cs index 9f0d9da..afb1322 100644 --- a/CUETools.Codecs/BitReader.cs +++ b/CUETools.Codecs/BitReader.cs @@ -35,6 +35,15 @@ namespace CUETools.Codecs return log2i((uint)v); } + public static int log2i(ulong v) + { + int n = 0; + if (0 != (v & 0xffffffff00000000)) { v >>= 32; n += 32; } + if (0 != (v & 0xffff0000)) { v >>= 16; n += 16; } + if (0 != (v & 0xff00)) { v >>= 8; n += 8; } + return n + byte_to_log2_table[v]; + } + public static int log2i(uint v) { int n = 0; diff --git a/CUETools.Codecs/Codecs.cs b/CUETools.Codecs/Codecs.cs index 6614ec5..460d38a 100644 --- a/CUETools.Codecs/Codecs.cs +++ b/CUETools.Codecs/Codecs.cs @@ -422,14 +422,41 @@ namespace CUETools.Codecs unsafe public void Interlace(int pos, int* src1, int* src2, int n) { - if (PCM.ChannelCount != 2 || PCM.BitsPerSample != 16) - throw new Exception(""); - fixed (byte* bs = Bytes) + if (PCM.ChannelCount != 2) + throw new Exception("Must be stereo"); + if (PCM.BitsPerSample == 16) { - 
int* res = ((int*)bs) + pos; - for (int i = n; i > 0; i--) - *(res++) = (*(src1++) & 0xffff) ^ (*(src2++) << 16); + fixed (byte* bs = Bytes) + { + int* res = ((int*)bs) + pos; + for (int i = n; i > 0; i--) + *(res++) = (*(src1++) & 0xffff) ^ (*(src2++) << 16); + } } + else if (PCM.BitsPerSample == 24) + { + fixed (byte* bs = Bytes) + { + byte* res= bs + pos * 6; + for (int i = n; i > 0; i--) + { + uint sample_out = (uint)*(src1++); + *(res++) = (byte)(sample_out & 0xFF); + sample_out >>= 8; + *(res++) = (byte)(sample_out & 0xFF); + sample_out >>= 8; + *(res++) = (byte)(sample_out & 0xFF); + sample_out = (uint)*(src2++); + *(res++) = (byte)(sample_out & 0xFF); + sample_out >>= 8; + *(res++) = (byte)(sample_out & 0xFF); + sample_out >>= 8; + *(res++) = (byte)(sample_out & 0xFF); + } + } + } + else + throw new Exception("Unsupported BPS"); } //public void Clear() @@ -451,6 +478,7 @@ namespace CUETools.Codecs short* pOutSamples = (short*)outSamples; for (int i = 0; i < loopCount; i++) pOutSamples[i] = (short)pInSamples[i]; + //*(pOutSamples++) = (short)*(pInSamples++); } } @@ -465,19 +493,8 @@ namespace CUETools.Codecs throw new IndexOutOfRangeException(); } - fixed (int* pInSamplesFixed = &inSamples[inSampleOffset, 0]) - { - fixed (byte* pOutSamplesFixed = &outSamples[outByteOffset]) - { - int* pInSamples = pInSamplesFixed; - short* pOutSamples = (short*)pOutSamplesFixed; - - for (int i = 0; i < loopCount; i++) - { - *(pOutSamples++) = (short)*(pInSamples++); - } - } - } + fixed (byte* pOutSamplesFixed = &outSamples[outByteOffset]) + FLACSamplesToBytes_16(inSamples, inSampleOffset, pOutSamplesFixed, sampleCount, channelCount); } public static unsafe void FLACSamplesToBytes_24(int[,] inSamples, int inSampleOffset, @@ -917,16 +934,16 @@ namespace CUETools.Codecs private AudioPCMConfig pcm; private int _sampleVal; - public SilenceGenerator(long sampleCount, int sampleVal) + public SilenceGenerator(AudioPCMConfig pcm, long sampleCount, int sampleVal) { - _sampleVal = 
sampleVal; - _sampleOffset = 0; - _sampleCount = sampleCount; - pcm = AudioPCMConfig.RedBook; + this._sampleVal = sampleVal; + this._sampleOffset = 0; + this._sampleCount = sampleCount; + this.pcm = pcm; } public SilenceGenerator(long sampleCount) - : this(sampleCount, 0) + : this(AudioPCMConfig.RedBook, sampleCount, 0) { } @@ -1091,19 +1108,29 @@ namespace CUETools.Codecs { foundFormat = true; - if (_br.ReadUInt16() != 1) - { - throw new Exception("WAVE must be PCM format."); - } + uint fmtTag = _br.ReadUInt16(); int _channelCount = _br.ReadInt16(); int _sampleRate = _br.ReadInt32(); - _br.ReadInt32(); + _br.ReadInt32(); // bytes per second int _blockAlign = _br.ReadInt16(); int _bitsPerSample = _br.ReadInt16(); + pos += 16; + + if (fmtTag == 0xFFFEU && ckSize >= 34) // WAVE_FORMAT_EXTENSIBLE + { + _br.ReadInt16(); // CbSize + _br.ReadInt16(); // ValidBitsPerSample + int channelMask = _br.ReadInt32(); + fmtTag = _br.ReadUInt16(); + pos += 10; + } + + if (fmtTag != 1) // WAVE_FORMAT_PCM + throw new Exception("WAVE format tag not WAVE_FORMAT_PCM."); + pcm = new AudioPCMConfig(_bitsPerSample, _channelCount, _sampleRate); if (pcm.BlockAlign != _blockAlign) throw new Exception("WAVE has strange BlockAlign"); - pos += 16; } else if (ckID == fccData) { diff --git a/CUETools.FLACCL.cmd/CUETools.FLACL.cmd.csproj b/CUETools.FLACCL.cmd/CUETools.FLACL.cmd.csproj index 550563b..a2ebb00 100644 --- a/CUETools.FLACCL.cmd/CUETools.FLACL.cmd.csproj +++ b/CUETools.FLACCL.cmd/CUETools.FLACL.cmd.csproj @@ -23,7 +23,7 @@ DEBUG;TRACE prompt 4 - x86 + AnyCPU pdbonly diff --git a/CUETools.FLACCL.cmd/Program.cs b/CUETools.FLACCL.cmd/Program.cs index 2a2f988..578fc83 100644 --- a/CUETools.FLACCL.cmd/Program.cs +++ b/CUETools.FLACCL.cmd/Program.cs @@ -87,7 +87,7 @@ namespace CUETools.FLACCL.cmd min_precision = -1, max_precision = -1, orders_per_window = -1, orders_per_channel = -1, blocksize = -1; - int input_len = 4096, input_val = 0; + int input_len = 4096, input_val = 0, input_bps = 16, 
input_ch = 2, input_rate = 44100; int level = -1, padding = -1, vbr_mode = -1; bool do_seektable = true; bool buffered = false; @@ -136,6 +136,10 @@ namespace CUETools.FLACCL.cmd input_len = intarg; else if (args[arg] == "--input-value" && ++arg < args.Length && int.TryParse(args[arg], out intarg)) input_val = intarg; + else if (args[arg] == "--input-bps" && ++arg < args.Length && int.TryParse(args[arg], out intarg)) + input_bps = intarg; + else if (args[arg] == "--input-channels" && ++arg < args.Length && int.TryParse(args[arg], out intarg)) + input_ch = intarg; else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length) output_file = args[arg]; else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length) @@ -211,18 +215,28 @@ namespace CUETools.FLACCL.cmd } IAudioSource audioSource; - if (input_file == "-") - audioSource = new WAVReader("", Console.OpenStandardInput()); - else if (input_file == "nul") - audioSource = new SilenceGenerator(input_len, input_val); - else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".wav") - audioSource = new WAVReader(input_file, null); - else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".flac") - audioSource = new FlakeReader(input_file, null); - else + try + { + if (input_file == "-") + audioSource = new WAVReader("", Console.OpenStandardInput()); + else if (input_file == "nul") + audioSource = new SilenceGenerator(new AudioPCMConfig(input_bps, input_ch, input_rate), input_len, input_val); + else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".wav") + audioSource = new WAVReader(input_file, null); + else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".flac") + audioSource = new FlakeReader(input_file, null); + else + { + Usage(); + return 2; + } + } + catch (Exception ex) { Usage(); - return 2; + Console.WriteLine(""); + Console.WriteLine("Error: {0}.", ex.Message); + return 3; } if (buffered) audioSource = new 
AudioPipe(audioSource, FLACCLWriter.MAX_BLOCKSIZE); diff --git a/CUETools.Flake/App.config b/CUETools.Flake/App.config new file mode 100644 index 0000000..c8d467d --- /dev/null +++ b/CUETools.Flake/App.config @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/CUETools.Flake/CUETools.Flake.csproj b/CUETools.Flake/CUETools.Flake.csproj index bd7f938..613e183 100644 --- a/CUETools.Flake/CUETools.Flake.csproj +++ b/CUETools.Flake/CUETools.Flake.csproj @@ -2,7 +2,7 @@ Debug AnyCPU - 8.0.50727 + 9.0.30729 2.0 {2379BAAF-A406-4477-BF53-2D6A326C24C8} Exe @@ -19,7 +19,7 @@ true full false - bin\Debug\ + ..\bin\Debug\ DEBUG;TRACE prompt 4 @@ -52,6 +52,9 @@ CUETools.Codecs + + +