diff --git a/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj b/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj new file mode 100644 index 0000000..1282a58 --- /dev/null +++ b/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj @@ -0,0 +1,91 @@ + + + Debug + AnyCPU + 9.0.30729 + 2.0 + {DFE55765-564C-4B8F-993B-A94C4D1C212E} + Library + Properties + CUETools.Codecs.FLACCL + CUETools.Codecs.FLACCL + + + 2.0 + + + + + true + full + false + ..\bin\Debug\plugins\ + DEBUG;TRACE + prompt + 4 + true + + + pdbonly + true + ..\bin\Release\plugins\ + TRACE + prompt + 4 + true + + + + False + OpenCLNet.dll + + + + + + + + + + True + True + Resources.resx + + + + + {082D6B9E-326E-4D15-9798-EDAE9EDE70A6} + CUETools.Codecs.FLAKE + False + + + {6458A13A-30EF-45A9-9D58-E5031B17BEE2} + CUETools.Codecs + False + + + + + + + + + ResXFileCodeGenerator + Resources.Designer.cs + + + + + + + + + + + \ No newline at end of file diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs new file mode 100644 index 0000000..65b5cf2 --- /dev/null +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -0,0 +1,2473 @@ +/** + * CUETools.FLACCL: FLAC audio encoder using CUDA + * Copyright (c) 2009 Gregory S. Chudov + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +using System; +using System.ComponentModel; +using System.Collections.Generic; +using System.IO; +using System.Security.Cryptography; +using System.Threading; +using System.Text; +using System.Runtime.InteropServices; +using CUETools.Codecs; +using CUETools.Codecs.FLAKE; +using OpenCLNet; + +namespace CUETools.Codecs.FLACCL +{ + public class FLACCLWriterSettings + { + public FLACCLWriterSettings() { DoVerify = false; GPUOnly = false; DoMD5 = true; } + + [DefaultValue(false)] + [DisplayName("Verify")] + [SRDescription(typeof(Properties.Resources), "DoVerifyDescription")] + public bool DoVerify { get; set; } + + [DefaultValue(true)] + [DisplayName("MD5")] + [SRDescription(typeof(Properties.Resources), "DoMD5Description")] + public bool DoMD5 { get; set; } + + [DefaultValue(true)] + [SRDescription(typeof(Properties.Resources), "DescriptionGPUOnly")] + public bool GPUOnly { get; set; } + + int cpu_threads = 1; + [DefaultValue(1)] + [SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")] + public int CPUThreads + { + get + { + return cpu_threads; + } + set + { + if (value < 0 || value > 16) + throw new Exception("CPUThreads must be between 0..16"); + cpu_threads = value; + } + } + } + + [AudioEncoderClass("FLACCL", "flac", true, "0 1 2 3 4 5 6 7 8 9 10 11", "8", 2, typeof(FLACCLWriterSettings))] + //[AudioEncoderClass("FLACCL nonsub", "flac", true, "9 10 11", "9", 1, typeof(FLACCLWriterSettings))] + public class FLACCLWriter : IAudioDest + { + Stream _IO = null; + string _path; + long _position; + + // number of audio channels + // valid values are 1 to 8 + int channels, ch_code; + + // audio sample rate in Hz + int sample_rate, sr_code0, sr_code1; + + // sample size in bits + // only 16-bit is currently supported + uint 
bits_per_sample; + int bps_code; + + // total stream samples + // if 0, stream length is unknown + int sample_count = -1; + + FlakeEncodeParams eparams; + + // maximum frame size in bytes + // this can be used to allocate memory for output + int max_frame_size; + + int frame_count = 0; + int frame_pos = 0; + + long first_frame_offset = 0; + + TimeSpan _userProcessorTime; + + // header bytes + // allocated by flake_encode_init and freed by flake_encode_close + byte[] header; + + float[] windowBuffer; + int samplesInBuffer = 0; + int max_frames = 0; + + int _compressionLevel = 7; + int _blocksize = 0; + int _totalSize = 0; + int _windowsize = 0, _windowcount = 0; + + Crc8 crc8; + Crc16 crc16; + MD5 md5; + + SeekPoint[] seek_table; + int seek_table_offset = -1; + + bool inited = false; + + OpenCLManager OCLMan; + Context openCLContext; + Program openCLProgram; + + FLACCLTask task1; + FLACCLTask task2; + FLACCLTask[] cpu_tasks; + int oldest_cpu_task = 0; + + Mem cudaWindow; + + AudioPCMConfig _pcm; + + public const int MAX_BLOCKSIZE = 4096 * 16; + internal const int maxFrames = 128; + internal const int maxAutocorParts = (MAX_BLOCKSIZE + 255) / 256; + + public FLACCLWriter(string path, Stream IO, AudioPCMConfig pcm) + { + _pcm = pcm; + + if (pcm.BitsPerSample != 16) + throw new Exception("Bits per sample must be 16."); + if (pcm.ChannelCount != 2) + throw new Exception("ChannelCount must be 2."); + + channels = pcm.ChannelCount; + sample_rate = pcm.SampleRate; + bits_per_sample = (uint) pcm.BitsPerSample; + + // flake_validate_params + + _path = path; + _IO = IO; + + windowBuffer = new float[FLACCLWriter.MAX_BLOCKSIZE * lpc.MAX_LPC_WINDOWS]; + + eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly); + eparams.padding_size = 8192; + + crc8 = new Crc8(); + crc16 = new Crc16(); + } + + public FLACCLWriter(string path, AudioPCMConfig pcm) + : this(path, null, pcm) + { + } + + public int TotalSize + { + get + { + return _totalSize; + } + } + + public long 
Padding + { + get + { + return eparams.padding_size; + } + set + { + eparams.padding_size = value; + } + } + + public int CompressionLevel + { + get + { + return _compressionLevel; + } + set + { + if (value < 0 || value > 11) + throw new Exception("unsupported compression level"); + _compressionLevel = value; + eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly); + } + } + + FLACCLWriterSettings _settings = new FLACCLWriterSettings(); + + public object Settings + { + get + { + return _settings; + } + set + { + if (value as FLACCLWriterSettings == null) + throw new Exception("Unsupported options " + value); + _settings = value as FLACCLWriterSettings; + eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly); + } + } + + //[DllImport("kernel32.dll")] + //static extern bool GetThreadTimes(IntPtr hThread, out long lpCreationTime, out long lpExitTime, out long lpKernelTime, out long lpUserTime); + //[DllImport("kernel32.dll")] + //static extern IntPtr GetCurrentThread(); + + void DoClose() + { + if (inited) + { + int nFrames = samplesInBuffer / eparams.block_size; + if (nFrames > 0) + do_output_frames(nFrames); + if (samplesInBuffer > 0) + { + eparams.block_size = samplesInBuffer; + do_output_frames(1); + } + if (task2.frameCount > 0) + { + if (cpu_tasks != null) + { + for (int i = 0; i < cpu_tasks.Length; i++) + { + wait_for_cpu_task(); + FLACCLTask task = cpu_tasks[oldest_cpu_task]; + oldest_cpu_task = (oldest_cpu_task + 1) % cpu_tasks.Length; + if (task.frameCount > 0) + { + write_result(task); + task.frameCount = 0; + } + } + } + task2.openCLCQ.Finish(); // cuda.SynchronizeStream(task2.stream); + process_result(task2); + write_result(task2); + task2.frameCount = 0; + } + + if (_IO.CanSeek) + { + if (sample_count <= 0 && _position != 0) + { + BitWriter bitwriter = new BitWriter(header, 0, 4); + bitwriter.writebits(32, (int)_position); + bitwriter.flush(); + _IO.Position = 22; + _IO.Write(header, 0, 4); + } + + if (md5 != null) + { + 
md5.TransformFinalBlock(new byte[] { 0 }, 0, 0); + _IO.Position = 26; + _IO.Write(md5.Hash, 0, md5.Hash.Length); + } + + if (seek_table != null) + { + _IO.Position = seek_table_offset; + int len = write_seekpoints(header, 0, 0); + _IO.Write(header, 4, len - 4); + } + } + _IO.Close(); + + cudaWindow.Dispose(); + task1.Dispose(); + task2.Dispose(); + if (cpu_tasks != null) + foreach (FLACCLTask task in cpu_tasks) + task.Dispose(); + openCLProgram.Dispose(); + openCLContext.Dispose(); + inited = false; + } + } + + public void Close() + { + DoClose(); + if (sample_count > 0 && _position != sample_count) + throw new Exception(string.Format("Samples written differs from the expected sample count. Expected {0}, got {1}.", sample_count, _position)); + } + + public void Delete() + { + if (inited) + { + _IO.Close(); + cudaWindow.Dispose(); + task1.Dispose(); + task2.Dispose(); + if (cpu_tasks != null) + foreach (FLACCLTask task in cpu_tasks) + task.Dispose(); + openCLProgram.Dispose(); + openCLContext.Dispose(); + inited = false; + } + + if (_path != "") + File.Delete(_path); + } + + public long Position + { + get + { + return _position; + } + } + + public long FinalSampleCount + { + set { sample_count = (int)value; } + } + + public long BlockSize + { + set { + if (value < 256 || value > MAX_BLOCKSIZE ) + throw new Exception("unsupported BlockSize value"); + _blocksize = (int)value; + } + get { return _blocksize == 0 ? eparams.block_size : _blocksize; } + } + + public StereoMethod StereoMethod + { + get { return eparams.do_midside ? 
StereoMethod.Search : StereoMethod.Independent; } + set { eparams.do_midside = value != StereoMethod.Independent; } + } + + public int MinPrecisionSearch + { + get { return eparams.lpc_min_precision_search; } + set + { + if (value < 0 || value > eparams.lpc_max_precision_search) + throw new Exception("unsupported MinPrecisionSearch value"); + eparams.lpc_min_precision_search = value; + } + } + + public int MaxPrecisionSearch + { + get { return eparams.lpc_max_precision_search; } + set + { + if (value < eparams.lpc_min_precision_search || value >= lpc.MAX_LPC_PRECISIONS) + throw new Exception("unsupported MaxPrecisionSearch value"); + eparams.lpc_max_precision_search = value; + } + } + + public WindowFunction WindowFunction + { + get { return eparams.window_function; } + set { eparams.window_function = value; } + } + + public bool DoSeekTable + { + get { return eparams.do_seektable; } + set { eparams.do_seektable = value; } + } + + public int VBRMode + { + get { return eparams.variable_block_size; } + set { eparams.variable_block_size = value; } + } + + public int OrdersPerWindow + { + get + { + return eparams.orders_per_window; + } + set + { + if (value < 0 || value > 32) + throw new Exception("invalid OrdersPerWindow " + value.ToString()); + eparams.orders_per_window = value; + } + } + + public int MinLPCOrder + { + get + { + return eparams.min_prediction_order; + } + set + { + if (value < 1 || value > eparams.max_prediction_order) + throw new Exception("invalid MinLPCOrder " + value.ToString()); + eparams.min_prediction_order = value; + } + } + + public int MaxLPCOrder + { + get + { + return eparams.max_prediction_order; + } + set + { + if (value > lpc.MAX_LPC_ORDER || value < eparams.min_prediction_order) + throw new Exception("invalid MaxLPCOrder " + value.ToString()); + eparams.max_prediction_order = value; + } + } + + public int MinFixedOrder + { + get + { + return eparams.min_fixed_order; + } + set + { + if (value < 0 || value > eparams.max_fixed_order) + 
throw new Exception("invalid MinFixedOrder " + value.ToString()); + eparams.min_fixed_order = value; + } + } + + public int MaxFixedOrder + { + get + { + return eparams.max_fixed_order; + } + set + { + if (value > 4 || value < eparams.min_fixed_order) + throw new Exception("invalid MaxFixedOrder " + value.ToString()); + eparams.max_fixed_order = value; + } + } + + public int MinPartitionOrder + { + get { return eparams.min_partition_order; } + set + { + if (value < 0 || value > eparams.max_partition_order) + throw new Exception("invalid MinPartitionOrder " + value.ToString()); + eparams.min_partition_order = value; + } + } + + public int MaxPartitionOrder + { + get { return eparams.max_partition_order; } + set + { + if (value > 8 || value < eparams.min_partition_order) + throw new Exception("invalid MaxPartitionOrder " + value.ToString()); + eparams.max_partition_order = value; + } + } + + public TimeSpan UserProcessorTime + { + get { return _userProcessorTime; } + } + + public AudioPCMConfig PCM + { + get { return _pcm; } + } + + unsafe void encode_residual_fixed(int* res, int* smp, int n, int order) + { + int i; + int s0, s1, s2; + switch (order) + { + case 0: + AudioSamples.MemCpy(res, smp, n); + return; + case 1: + *(res++) = s1 = *(smp++); + for (i = n - 1; i > 0; i--) + { + s0 = *(smp++); + *(res++) = s0 - s1; + s1 = s0; + } + return; + case 2: + *(res++) = s2 = *(smp++); + *(res++) = s1 = *(smp++); + for (i = n - 2; i > 0; i--) + { + s0 = *(smp++); + *(res++) = s0 - 2 * s1 + s2; + s2 = s1; + s1 = s0; + } + return; + case 3: + res[0] = smp[0]; + res[1] = smp[1]; + res[2] = smp[2]; + for (i = 3; i < n; i++) + { + res[i] = smp[i] - 3 * smp[i - 1] + 3 * smp[i - 2] - smp[i - 3]; + } + return; + case 4: + res[0] = smp[0]; + res[1] = smp[1]; + res[2] = smp[2]; + res[3] = smp[3]; + for (i = 4; i < n; i++) + { + res[i] = smp[i] - 4 * smp[i - 1] + 6 * smp[i - 2] - 4 * smp[i - 3] + smp[i - 4]; + } + return; + default: + return; + } + } + + static unsafe uint 
calc_optimal_rice_params(int porder, int* parm, uint* sums, uint n, uint pred_order) + { + uint part = (1U << porder); + uint cnt = (n >> porder) - pred_order; + int k = cnt > 0 ? Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[0] / cnt)) : 0; + uint all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k); + parm[0] = k; + cnt = (n >> porder); + for (uint i = 1; i < part; i++) + { + k = Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[i] / cnt)); + all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k); + parm[i] = k; + } + return all_bits + (4 * part); + } + + static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums) + { + for (int i = pmax - 1; i >= pmin; i--) + { + for (int j = 0; j < (1 << i); j++) + { + sums[i * Flake.MAX_PARTITIONS + j] = + sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j] + + sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j + 1]; + } + } + } + + static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) + { + int parts = (1 << pmax); + uint* res = data + pred_order; + uint cnt = (n >> pmax) - pred_order; + uint sum = 0; + for (uint j = cnt; j > 0; j--) + sum += *(res++); + sums[0] = sum; + cnt = (n >> pmax); + for (int i = 1; i < parts; i++) + { + sum = 0; + for (uint j = cnt; j > 0; j--) + sum += *(res++); + sums[i] = sum; + } + } + + /// + /// Special case when (n >> pmax) == 18 + /// + /// + /// + /// + /// + /// + /// + static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) + { + int parts = (1 << pmax); + uint* res = data + pred_order; + uint cnt = 18 - pred_order; + uint sum = 0; + for (uint j = cnt; j > 0; j--) + sum += *(res++); + sums[0] = sum; + for (int i = 1; i < parts; i++) + { + sums[i] = + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++); + } + } + + /// + /// Special case when (n >> pmax) == 18 + 
/// + /// + /// + /// + /// + /// + /// + static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) + { + int parts = (1 << pmax); + uint* res = data + pred_order; + uint cnt = 16 - pred_order; + uint sum = 0; + for (uint j = cnt; j > 0; j--) + sum += *(res++); + sums[0] = sum; + for (int i = 1; i < parts; i++) + { + sums[i] = + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++) + + *(res++) + *(res++) + *(res++) + *(res++); + } + } + + static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order) + { + uint* udata = stackalloc uint[(int)n]; + uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS]; + int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS]; + //uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER]; + + //assert(pmin >= 0 && pmin <= Flake.MAX_PARTITION_ORDER); + //assert(pmax >= 0 && pmax <= Flake.MAX_PARTITION_ORDER); + //assert(pmin <= pmax); + + for (uint i = 0; i < n; i++) + udata[i] = (uint)((data[i] << 1) ^ (data[i] >> 31)); + + // sums for highest level + if ((n >> pmax) == 18) + calc_sums18(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS); + else if ((n >> pmax) == 16) + calc_sums16(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS); + else + calc_sums(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS); + // sums for lower levels + calc_lower_sums(pmin, pmax, sums); + + uint opt_bits = AudioSamples.UINT32_MAX; + int opt_porder = pmin; + for (int i = pmin; i <= pmax; i++) + { + uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order); + if (bits <= opt_bits) + { + opt_bits = bits; + opt_porder = i; + } + } + + rc.porder = opt_porder; + fixed (int* rparms = rc.rparams) + AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, 
(1 << opt_porder)); + + return opt_bits; + } + + static int get_max_p_order(int max_porder, int n, int order) + { + int porder = Math.Min(max_porder, BitReader.log2i(n ^ (n - 1))); + if (order > 0) + porder = Math.Min(porder, BitReader.log2i(n / order)); + return porder; + } + + unsafe void output_frame_header(FlacFrame frame) + { + frame.writer.writebits(15, 0x7FFC); + frame.writer.writebits(1, eparams.variable_block_size > 0 ? 1 : 0); + frame.writer.writebits(4, frame.bs_code0); + frame.writer.writebits(4, sr_code0); + if (frame.ch_mode == ChannelMode.NotStereo) + frame.writer.writebits(4, ch_code); + else + frame.writer.writebits(4, (int)frame.ch_mode); + frame.writer.writebits(3, bps_code); + frame.writer.writebits(1, 0); + frame.writer.write_utf8(frame.frame_number); + + // custom block size + if (frame.bs_code1 >= 0) + { + if (frame.bs_code1 < 256) + frame.writer.writebits(8, frame.bs_code1); + else + frame.writer.writebits(16, frame.bs_code1); + } + + // custom sample rate + if (sr_code1 > 0) + { + if (sr_code1 < 256) + frame.writer.writebits(8, sr_code1); + else + frame.writer.writebits(16, sr_code1); + } + + // CRC-8 of frame header + frame.writer.flush(); + byte crc = crc8.ComputeChecksum(frame.writer.Buffer, frame.writer_offset, frame.writer.Length - frame.writer_offset); + frame.writer.writebits(8, crc); + } + + unsafe void output_residual(FlacFrame frame, FlacSubframeInfo sub) + { + // rice-encoded block + frame.writer.writebits(2, 0); + + // partition order + int porder = sub.best.rc.porder; + int psize = frame.blocksize >> porder; + //assert(porder >= 0); + frame.writer.writebits(4, porder); + int res_cnt = psize - sub.best.order; + + // residual + int j = sub.best.order; + fixed (byte* fixbuf = frame.writer.Buffer) + for (int p = 0; p < (1 << porder); p++) + { + int k = sub.best.rc.rparams[p]; + frame.writer.writebits(4, k); + if (p == 1) res_cnt = psize; + int cnt = Math.Min(res_cnt, frame.blocksize - j); + 
frame.writer.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt); + j += cnt; + } + } + + unsafe void + output_subframe_constant(FlacFrame frame, FlacSubframeInfo sub) + { + frame.writer.writebits_signed(sub.obits, sub.samples[0]); + } + + unsafe void + output_subframe_verbatim(FlacFrame frame, FlacSubframeInfo sub) + { + int n = frame.blocksize; + for (int i = 0; i < n; i++) + frame.writer.writebits_signed(sub.obits, sub.samples[i]); + // Don't use residual here, because we don't copy samples to residual for verbatim frames. + } + + unsafe void + output_subframe_fixed(FlacFrame frame, FlacSubframeInfo sub) + { + // warm-up samples + for (int i = 0; i < sub.best.order; i++) + frame.writer.writebits_signed(sub.obits, sub.samples[i]); + + // residual + output_residual(frame, sub); + } + + unsafe void + output_subframe_lpc(FlacFrame frame, FlacSubframeInfo sub) + { + // warm-up samples + for (int i = 0; i < sub.best.order; i++) + frame.writer.writebits_signed(sub.obits, sub.samples[i]); + + // LPC coefficients + frame.writer.writebits(4, sub.best.cbits - 1); + frame.writer.writebits_signed(5, sub.best.shift); + for (int i = 0; i < sub.best.order; i++) + frame.writer.writebits_signed(sub.best.cbits, sub.best.coefs[i]); + + // residual + output_residual(frame, sub); + } + + unsafe void output_subframes(FlacFrame frame) + { + for (int ch = 0; ch < channels; ch++) + { + FlacSubframeInfo sub = frame.subframes[ch]; + // subframe header + int type_code = (int) sub.best.type; + if (sub.best.type == SubframeType.Fixed) + type_code |= sub.best.order; + if (sub.best.type == SubframeType.LPC) + type_code |= sub.best.order - 1; + frame.writer.writebits(1, 0); + frame.writer.writebits(6, type_code); + frame.writer.writebits(1, sub.wbits != 0 ? 
1 : 0); + if (sub.wbits > 0) + frame.writer.writebits((int)sub.wbits, 1); + + //if (frame_writer.Length >= frame_buffer.Length) + // throw new Exception("buffer overflow"); + + // subframe + switch (sub.best.type) + { + case SubframeType.Constant: + output_subframe_constant(frame, sub); + break; + case SubframeType.Verbatim: + output_subframe_verbatim(frame, sub); + break; + case SubframeType.Fixed: + output_subframe_fixed(frame, sub); + break; + case SubframeType.LPC: + output_subframe_lpc(frame, sub); + break; + } + //if (frame_writer.Length >= frame_buffer.Length) + // throw new Exception("buffer overflow"); + } + } + + void output_frame_footer(FlacFrame frame) + { + frame.writer.flush(); + ushort crc = crc16.ComputeChecksum(frame.writer.Buffer, frame.writer_offset, frame.writer.Length - frame.writer_offset); + frame.writer.writebits(16, crc); + frame.writer.flush(); + } + + unsafe delegate void window_function(float* window, int size); + + unsafe void calculate_window(float* window, window_function func, WindowFunction flag) + { + if ((eparams.window_function & flag) == 0 || _windowcount == lpc.MAX_LPC_WINDOWS) + return; + + func(window + _windowcount * _windowsize, _windowsize); + //int sz = _windowsize; + //float* pos = window + _windowcount * FLACCLWriter.MAX_BLOCKSIZE * 2; + //do + //{ + // func(pos, sz); + // if ((sz & 1) != 0) + // break; + // pos += sz; + // sz >>= 1; + //} while (sz >= 32); + _windowcount++; + } + + unsafe void initializeSubframeTasks(int blocksize, int channelsCount, int nFrames, FLACCLTask task) + { + task.nResidualTasks = 0; + task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window); + task.nResidualTasksPerChannel = _windowcount * task.nTasksPerWindow + 1 + (eparams.do_constant ? 
1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order; + if (task.nResidualTasksPerChannel >= 4) + task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7; + task.nAutocorTasksPerChannel = _windowcount; + for (int iFrame = 0; iFrame < nFrames; iFrame++) + { + for (int ch = 0; ch < channelsCount; ch++) + { + for (int iWindow = 0; iWindow < _windowcount; iWindow++) + { + // LPC tasks + for (int order = 0; order < task.nTasksPerWindow; order++) + { + task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.LPC; + task.ResidualTasks[task.nResidualTasks].channel = ch; + task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); + task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits; + task.ResidualTasks[task.nResidualTasks].blocksize = blocksize; + task.ResidualTasks[task.nResidualTasks].residualOrder = order + 1; + task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize; + task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; + task.nResidualTasks++; + } + } + // Constant frames + if (eparams.do_constant) + { + task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Constant; + task.ResidualTasks[task.nResidualTasks].channel = ch; + task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 
1 : 0); + task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits; + task.ResidualTasks[task.nResidualTasks].blocksize = blocksize; + task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize; + task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; + task.ResidualTasks[task.nResidualTasks].residualOrder = 1; + task.ResidualTasks[task.nResidualTasks].shift = 0; + task.ResidualTasks[task.nResidualTasks].coefs[0] = 1; + task.nResidualTasks++; + } + // Fixed prediction + for (int order = eparams.min_fixed_order; order <= eparams.max_fixed_order; order++) + { + task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Fixed; + task.ResidualTasks[task.nResidualTasks].channel = ch; + task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); + task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits; + task.ResidualTasks[task.nResidualTasks].blocksize = blocksize; + task.ResidualTasks[task.nResidualTasks].residualOrder = order; + task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize; + task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; + task.ResidualTasks[task.nResidualTasks].shift = 0; + switch (order) + { + case 0: + break; + case 1: + task.ResidualTasks[task.nResidualTasks].coefs[0] = 1; + break; + case 2: + task.ResidualTasks[task.nResidualTasks].coefs[1] = 2; + task.ResidualTasks[task.nResidualTasks].coefs[0] = -1; + break; + case 3: + task.ResidualTasks[task.nResidualTasks].coefs[2] = 3; + task.ResidualTasks[task.nResidualTasks].coefs[1] = -3; + task.ResidualTasks[task.nResidualTasks].coefs[0] = 1; + break; + case 4: + task.ResidualTasks[task.nResidualTasks].coefs[3] = 4; + task.ResidualTasks[task.nResidualTasks].coefs[2] = -6; 
+ task.ResidualTasks[task.nResidualTasks].coefs[1] = 4; + task.ResidualTasks[task.nResidualTasks].coefs[0] = -1; + break; + } + task.nResidualTasks++; + } + // Filler + while ((task.nResidualTasks % task.nResidualTasksPerChannel) != 0) + { + task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Verbatim; + task.ResidualTasks[task.nResidualTasks].channel = ch; + task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); + task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits; + task.ResidualTasks[task.nResidualTasks].blocksize = blocksize; + task.ResidualTasks[task.nResidualTasks].residualOrder = 0; + task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize; + task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; + task.ResidualTasks[task.nResidualTasks].shift = 0; + task.nResidualTasks++; + } + } + } + if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen) + throw new Exception("oops"); + task.openCLCQ.EnqueueWriteBuffer(task.cudaResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.residualTasksPtr.AddrOfPinnedObject()); + task.openCLCQ.EnqueueBarrier(); + + task.frameSize = blocksize; + } + + unsafe void encode_residual(FLACCLTask task) + { + bool unpacked = false; + unpack_samples(task, Math.Min(32, task.frameSize)); + for (int ch = 0; ch < channels; ch++) + { + switch (task.frame.subframes[ch].best.type) + { + case SubframeType.Constant: + break; + case SubframeType.Verbatim: + if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; + break; + case SubframeType.Fixed: + // if (!_settings.GPUOnly) + { + if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; + encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, + task.frame.blocksize, 
task.frame.subframes[ch].best.order); + + int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); + int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); + uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 6; + task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order); + } + break; + case SubframeType.LPC: + fixed (int* coefs = task.frame.subframes[ch].best.coefs) + { + ulong csum = 0; + for (int i = task.frame.subframes[ch].best.order; i > 0; i--) + csum += (ulong)Math.Abs(coefs[i - 1]); + // if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly) + { + if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; + if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32) + lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift); + else + lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift); + int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); + int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); + uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 4 + 5 + (uint)task.frame.subframes[ch].best.order * (uint)task.frame.subframes[ch].best.cbits + 6; + //uint oldsize = task.frame.subframes[ch].best.size; + task.frame.subframes[ch].best.size = bits + 
calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order); + //if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize && + // oldsize <= task.frame.subframes[ch].obits * (uint)task.frame.blocksize) + // throw new Exception("oops"); + } + } + break; + } + if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * task.frame.blocksize) + { +#if DEBUG + throw new Exception("larger than verbatim"); +#endif + task.frame.subframes[ch].best.type = SubframeType.Verbatim; + task.frame.subframes[ch].best.size = (uint)(task.frame.subframes[ch].obits * task.frame.blocksize); + if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; + } + } + } + + unsafe void select_best_methods(FlacFrame frame, int channelsCount, int iFrame, FLACCLTask task) + { + if (channelsCount == 4 && channels == 2) + { + if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1) + frame.ch_mode = ChannelMode.LeftRight; + else if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3) + frame.ch_mode = ChannelMode.LeftSide; + else if (task.BestResidualTasks[iFrame * 2].channel == 3 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1) + frame.ch_mode = ChannelMode.RightSide; + else if (task.BestResidualTasks[iFrame * 2].channel == 2 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3) + frame.ch_mode = ChannelMode.MidSide; + else + throw new Exception("internal error: invalid stereo mode"); + frame.SwapSubframes(0, task.BestResidualTasks[iFrame * 2].channel); + frame.SwapSubframes(1, task.BestResidualTasks[iFrame * 2 + 1].channel); + } + else + frame.ch_mode = channels != 2 ? 
ChannelMode.NotStereo : ChannelMode.LeftRight; + + for (int ch = 0; ch < channels; ch++) + { + int index = ch + iFrame * channels; + frame.subframes[ch].best.residual = ((int*)task.residualBufferPtr.AddrOfPinnedObject()) + task.BestResidualTasks[index].residualOffs; + frame.subframes[ch].best.type = SubframeType.Verbatim; + frame.subframes[ch].best.size = (uint)(frame.subframes[ch].obits * frame.blocksize); + frame.subframes[ch].wbits = 0; + + if (task.BestResidualTasks[index].size < 0) + throw new Exception("internal error"); + if (frame.blocksize > Math.Max(4, eparams.max_prediction_order) && frame.subframes[ch].best.size > task.BestResidualTasks[index].size) + { + frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type; + frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size; + frame.subframes[ch].best.order = task.BestResidualTasks[index].residualOrder; + frame.subframes[ch].best.cbits = task.BestResidualTasks[index].cbits; + frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift; + frame.subframes[ch].obits -= task.BestResidualTasks[index].wbits; + frame.subframes[ch].wbits = task.BestResidualTasks[index].wbits; + frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder; + for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++) + frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; + //if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) + //{ + // int* riceParams = ((int*)task.bestRiceParamsPtr.AddrOfPinnedObject()) + (index << task.max_porder); + // fixed (int* dstParams = frame.subframes[ch].best.rc.rparams) + // AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder)); + // //for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++) + // // frame.subframes[ch].best.rc.rparams[i] = 
riceParams[i]; + //} + } + } + } + + unsafe void estimate_residual(FLACCLTask task, int channelsCount) + { + if (task.frameSize <= 4) + return; + + //int autocorPartSize = (2 * 256 - eparams.max_prediction_order) & ~15; + int autocorPartSize = 32 * 7; + int autocorPartCount = (task.frameSize + autocorPartSize - 1) / autocorPartSize; + if (autocorPartCount > maxAutocorParts) + throw new Exception("internal error"); + + int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order); + int calcPartitionPartSize = task.frameSize >> max_porder; + while (calcPartitionPartSize < 16 && max_porder > 0) + { + calcPartitionPartSize <<= 1; + max_porder--; + } + int calcPartitionPartCount = (calcPartitionPartSize >= 128) ? 1 : (256 / calcPartitionPartSize); + + if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel + Kernel cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : null;// task.cudaChannelDecorr; + //Kernel cudaCalcPartition = calcPartitionPartSize >= 128 ? task.cudaCalcLargePartition : calcPartitionPartSize == 16 && task.frameSize >= 256 ? 
task.cudaCalcPartition16 : task.cudaCalcPartition; + + cudaChannelDecorr.SetArg(0, task.cudaSamples); + cudaChannelDecorr.SetArg(1, task.cudaSamplesBytes); + cudaChannelDecorr.SetArg(2, (uint)MAX_BLOCKSIZE); + + task.cudaComputeLPC.SetArg(0, task.cudaResidualTasks); + task.cudaComputeLPC.SetArg(1, (uint)task.nResidualTasksPerChannel); + task.cudaComputeLPC.SetArg(2, task.cudaAutocorOutput); + task.cudaComputeLPC.SetArg(3, (uint)eparams.max_prediction_order); + task.cudaComputeLPC.SetArg(4, task.cudaLPCData); + task.cudaComputeLPC.SetArg(5, (uint)_windowcount); + task.cudaComputeLPC.SetArg(6, (uint)autocorPartCount); + + //task.cudaComputeLPCLattice.SetArg(0, task.cudaResidualTasks); + //task.cudaComputeLPCLattice.SetArg(1, (uint)task.nResidualTasksPerChannel); + //task.cudaComputeLPCLattice.SetArg(2, task.cudaSamples); + //task.cudaComputeLPCLattice.SetArg(3, (uint)_windowcount); + //task.cudaComputeLPCLattice.SetArg(4, (uint)eparams.max_prediction_order); + //task.cudaComputeLPCLattice.SetArg(5, task.cudaLPCData); + //cuda.SetFunctionBlockShape(task.cudaComputeLPCLattice, 256, 1, 1); + + task.cudaQuantizeLPC.SetArg(0, task.cudaResidualTasks); + task.cudaQuantizeLPC.SetArg(1, (uint)task.nResidualTasksPerChannel); + task.cudaQuantizeLPC.SetArg(2, (uint)task.nTasksPerWindow); + task.cudaQuantizeLPC.SetArg(3, task.cudaLPCData); + task.cudaQuantizeLPC.SetArg(4, (uint)eparams.max_prediction_order); + task.cudaQuantizeLPC.SetArg(5, (uint)eparams.lpc_min_precision_search); + task.cudaQuantizeLPC.SetArg(6, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search)); + + task.cudaCopyBestMethod.SetArg(0, task.cudaBestResidualTasks); + task.cudaCopyBestMethod.SetArg(1, task.cudaResidualTasks); + task.cudaCopyBestMethod.SetArg(2, (uint)task.nResidualTasksPerChannel); + + task.cudaCopyBestMethodStereo.SetArg(0, task.cudaBestResidualTasks); + task.cudaCopyBestMethodStereo.SetArg(1, task.cudaResidualTasks); + task.cudaCopyBestMethodStereo.SetArg(2, 
(uint)task.nResidualTasksPerChannel); + + //task.cudaEncodeResidual.SetArg(0, task.cudaResidual); + //task.cudaEncodeResidual.SetArg(1, task.cudaSamples); + //task.cudaEncodeResidual.SetArg(2, task.cudaBestResidualTasks); + //cuda.SetFunctionBlockShape(task.cudaEncodeResidual, residualPartSize, 1, 1); + + //cudaCalcPartition.SetArg(0, task.cudaPartitions); + //cudaCalcPartition.SetArg(1, task.cudaResidual); + //cudaCalcPartition.SetArg(2, task.cudaSamples); + //cudaCalcPartition.SetArg(3, task.cudaBestResidualTasks); + //cudaCalcPartition.SetArg(4, (uint)max_porder); + //cudaCalcPartition.SetArg(5, (uint)calcPartitionPartSize); + //cudaCalcPartition.SetArg(6, (uint)calcPartitionPartCount); + //cuda.SetFunctionBlockShape(cudaCalcPartition, 16, 16, 1); + + //task.cudaSumPartition.SetArg(0, task.cudaPartitions); + //task.cudaSumPartition.SetArg(1, (uint)max_porder); + //cuda.SetFunctionBlockShape(task.cudaSumPartition, Math.Max(32, 1 << (max_porder - 1)), 1, 1); + + //task.cudaFindRiceParameter.SetArg(0, task.cudaRiceParams); + //task.cudaFindRiceParameter.SetArg(1, task.cudaPartitions); + //task.cudaFindRiceParameter.SetArg(2, (uint)max_porder); + //cuda.SetFunctionBlockShape(task.cudaFindRiceParameter, 32, 8, 1); + + //task.cudaFindPartitionOrder.SetArg(0, task.cudaBestRiceParams); + //task.cudaFindPartitionOrder.SetArg(1, task.cudaBestResidualTasks); + //task.cudaFindPartitionOrder.SetArg(2, task.cudaRiceParams); + //task.cudaFindPartitionOrder.SetArg(3, (uint)max_porder); + //cuda.SetFunctionBlockShape(task.cudaFindPartitionOrder, 256, 1, 1); + + + // issue work to the GPU + task.openCLCQ.EnqueueBarrier(); + task.openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 1, null, new int[] { task.frameCount * task.frameSize }, null ); + //task.openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 1, null, new int[] { 64 * 128 }, new int[] { 128 }); + //cuda.SetFunctionBlockShape(cudaChannelDecorr, 256, 1, 1); + //cuda.LaunchAsync(cudaChannelDecorr, (task.frameCount * 
task.frameSize + 255) / 256, channels == 2 ? 1 : channels, task.stream); + + if (eparams.do_wasted) + { + task.openCLCQ.EnqueueBarrier(); + task.EnqueueFindWasted(channelsCount); + } + + // geometry??? + task.openCLCQ.EnqueueBarrier(); + task.EnqueueComputeAutocor(autocorPartCount, channelsCount, cudaWindow, eparams.max_prediction_order); + + //float* autoc = stackalloc float[1024]; + //task.openCLCQ.EnqueueBarrier(); + //task.openCLCQ.EnqueueReadBuffer(task.cudaAutocorOutput, true, 0, sizeof(float) * 1024, (IntPtr)autoc); + + task.openCLCQ.EnqueueBarrier(); + task.openCLCQ.EnqueueNDRangeKernel(task.cudaComputeLPC, 2, null, new int[] { task.nAutocorTasksPerChannel * 32, channelsCount * task.frameCount }, new int[] { 32, 1 }); + //cuda.SetFunctionBlockShape(task.cudaComputeLPC, 32, 1, 1); + + //float* lpcs = stackalloc float[1024]; + //task.openCLCQ.EnqueueBarrier(); + //task.openCLCQ.EnqueueReadBuffer(task.cudaLPCData, true, 0, sizeof(float) * 1024, (IntPtr)lpcs); + + task.openCLCQ.EnqueueBarrier(); + task.openCLCQ.EnqueueNDRangeKernel(task.cudaQuantizeLPC, 2, null, new int[] { task.nAutocorTasksPerChannel * 32, channelsCount * task.frameCount * 4 }, new int[] { 32, 4 }); + //cuda.SetFunctionBlockShape(task.cudaQuantizeLPC, 32, 4, 1); + + task.openCLCQ.EnqueueBarrier(); + task.EnqueueEstimateResidual(channelsCount, eparams.max_prediction_order); + + //int* rr = stackalloc int[1024]; + //task.openCLCQ.EnqueueBarrier(); + //task.openCLCQ.EnqueueReadBuffer(task.cudaResidualOutput, true, 0, sizeof(int) * 1024, (IntPtr)rr); + + task.openCLCQ.EnqueueBarrier(); + task.EnqueueChooseBestMethod(channelsCount); + + task.openCLCQ.EnqueueBarrier(); + if (channels == 2 && channelsCount == 4) + task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethodStereo, 2, null, new int[] { 64, task.frameCount }, new int[] { 64, 1 }); + //cuda.SetFunctionBlockShape(task.cudaCopyBestMethodStereo, 64, 1, 1); + else + task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethod, 2, null, new 
int[] { 64, channels * task.frameCount }, new int[] { 64, 1 }); + //cuda.SetFunctionBlockShape(task.cudaCopyBestMethod, 64, 1, 1); + //if (_settings.GPUOnly) + //{ + // int bsz = calcPartitionPartCount * calcPartitionPartSize; + // if (cudaCalcPartition.Pointer == task.cudaCalcLargePartition.Pointer) + // cuda.LaunchAsync(task.cudaEncodeResidual, residualPartCount, channels * task.frameCount, task.stream); + // cuda.LaunchAsync(cudaCalcPartition, (task.frameSize + bsz - 1) / bsz, channels * task.frameCount, task.stream); + // if (max_porder > 0) + // cuda.LaunchAsync(task.cudaSumPartition, Flake.MAX_RICE_PARAM + 1, channels * task.frameCount, task.stream); + // cuda.LaunchAsync(task.cudaFindRiceParameter, ((2 << max_porder) + 31) / 32, channels * task.frameCount, task.stream); + // //if (max_porder > 0) // need to run even if max_porder==0 just to calculate the final frame size + // cuda.LaunchAsync(task.cudaFindPartitionOrder, 1, channels * task.frameCount, task.stream); + // cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * MAX_BLOCKSIZE * channels), task.stream); + // cuda.CopyDeviceToHostAsync(task.cudaBestRiceParams, task.bestRiceParamsPtr, (uint)(sizeof(int) * (1 << max_porder) * channels * task.frameCount), task.stream); + // task.max_porder = max_porder; + //} + task.openCLCQ.EnqueueBarrier(); + task.openCLCQ.EnqueueReadBuffer(task.cudaBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * task.frameCount, task.bestResidualTasksPtr.AddrOfPinnedObject()); + //task.openCLCQ.EnqueueBarrier(); + //task.openCLCQ.EnqueueReadBuffer(task.cudaResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.residualTasksPtr.AddrOfPinnedObject()); + //task.openCLCQ.EnqueueBarrier(); + } + + /// + /// Copy channel-interleaved input samples into separate subframes + /// + /// + /// + unsafe void unpack_samples(FLACCLTask task, int count) + { + int iFrame = task.frame.frame_number; + short* src = 
((short*)task.samplesBytesPtr.AddrOfPinnedObject()) + iFrame * channels * task.frameSize; + + switch (task.frame.ch_mode) + { + case ChannelMode.NotStereo: + for (int ch = 0; ch < channels; ch++) + { + int* s = task.frame.subframes[ch].samples; + int wbits = (int)task.frame.subframes[ch].wbits; + for (int i = 0; i < count; i++) + s[i] = src[i * channels + ch] >>= wbits; + } + break; + case ChannelMode.LeftRight: + { + int* left = task.frame.subframes[0].samples; + int* right = task.frame.subframes[1].samples; + int lwbits = (int)task.frame.subframes[0].wbits; + int rwbits = (int)task.frame.subframes[1].wbits; + for (int i = 0; i < count; i++) + { + int l = *(src++); + int r = *(src++); + left[i] = l >> lwbits; + right[i] = r >> rwbits; + } + break; + } + case ChannelMode.LeftSide: + { + int* left = task.frame.subframes[0].samples; + int* right = task.frame.subframes[1].samples; + int lwbits = (int)task.frame.subframes[0].wbits; + int rwbits = (int)task.frame.subframes[1].wbits; + for (int i = 0; i < count; i++) + { + int l = *(src++); + int r = *(src++); + left[i] = l >> lwbits; + right[i] = (l - r) >> rwbits; + } + break; + } + case ChannelMode.RightSide: + { + int* left = task.frame.subframes[0].samples; + int* right = task.frame.subframes[1].samples; + int lwbits = (int)task.frame.subframes[0].wbits; + int rwbits = (int)task.frame.subframes[1].wbits; + for (int i = 0; i < count; i++) + { + int l = *(src++); + int r = *(src++); + left[i] = (l - r) >> lwbits; + right[i] = r >> rwbits; + } + break; + } + case ChannelMode.MidSide: + { + int* left = task.frame.subframes[0].samples; + int* right = task.frame.subframes[1].samples; + int lwbits = (int)task.frame.subframes[0].wbits; + int rwbits = (int)task.frame.subframes[1].wbits; + for (int i = 0; i < count; i++) + { + int l = *(src++); + int r = *(src++); + left[i] = (l + r) >> (1 + lwbits); + right[i] = (l - r) >> rwbits; + } + break; + } + } + } + + unsafe int encode_frame(bool doMidside, int channelCount, int 
iFrame, FLACCLTask task, int current_frame_number) + { + task.frame.InitSize(task.frameSize, eparams.variable_block_size != 0); + task.frame.frame_number = iFrame; + task.frame.ch_mode = ChannelMode.NotStereo; + + fixed (int* smp = task.samplesBuffer) + { + for (int ch = 0; ch < channelCount; ch++) + task.frame.subframes[ch].Init( + smp + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize, + ((int*)task.residualBufferPtr.AddrOfPinnedObject()) + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize, + _pcm.BitsPerSample + (doMidside && ch == 3 ? 1 : 0), 0); + + select_best_methods(task.frame, channelCount, iFrame, task); + //unpack_samples(task); + encode_residual(task); + + //task.frame.writer.Reset(); + task.frame.frame_number = current_frame_number; + task.frame.writer_offset = task.frame.writer.Length; + + output_frame_header(task.frame); + output_subframes(task.frame); + output_frame_footer(task.frame); + if (task.frame.writer.Length - task.frame.writer_offset >= max_frame_size) + throw new Exception("buffer overflow"); + + return task.frame.writer.Length - task.frame.writer_offset; + } + } + + unsafe void send_to_GPU(FLACCLTask task, int nFrames, int blocksize) + { + bool doMidside = channels == 2 && eparams.do_midside; + int channelsCount = doMidside ? 2 * channels : channels; + if (blocksize != task.frameSize) + task.nResidualTasks = 0; + task.frameCount = nFrames; + task.frameSize = blocksize; + task.frameNumber = eparams.variable_block_size > 0 ? frame_pos : frame_count; + task.framePos = frame_pos; + frame_count += nFrames; + frame_pos += nFrames * blocksize; + task.openCLCQ.EnqueueWriteBuffer(task.cudaSamplesBytes, false, 0, sizeof(short) * channels * blocksize * nFrames, task.samplesBytesPtr.AddrOfPinnedObject()); + task.openCLCQ.EnqueueBarrier(); + } + + unsafe void run_GPU_task(FLACCLTask task) + { + bool doMidside = channels == 2 && eparams.do_midside; + int channelsCount = doMidside ? 
2 * channels : channels; + + if (task.frameSize != _windowsize && task.frameSize > 4) + fixed (float* window = windowBuffer) + { + _windowsize = task.frameSize; + _windowcount = 0; + calculate_window(window, lpc.window_welch, WindowFunction.Welch); + calculate_window(window, lpc.window_flattop, WindowFunction.Flattop); + calculate_window(window, lpc.window_tukey, WindowFunction.Tukey); + calculate_window(window, lpc.window_hann, WindowFunction.Hann); + calculate_window(window, lpc.window_bartlett, WindowFunction.Bartlett); + if (_windowcount == 0) + throw new Exception("invalid windowfunction"); + task.openCLCQ.EnqueueWriteBuffer(cudaWindow, true, 0, sizeof(float) * windowBuffer.Length, (IntPtr)window); + task.openCLCQ.EnqueueBarrier(); + } + if (task.nResidualTasks == 0) + initializeSubframeTasks(task.frameSize, channelsCount, max_frames, task); + + estimate_residual(task, channelsCount); + } + + unsafe void process_result(FLACCLTask task) + { + bool doMidside = channels == 2 && eparams.do_midside; + int channelCount = doMidside ? 2 * channels : channels; + + long iSample = 0; + long iByte = 0; + task.frame.writer.Reset(); + task.frame.writer_offset = 0; + for (int iFrame = 0; iFrame < task.frameCount; iFrame++) + { + //if (0 != eparams.variable_block_size && 0 == (task.blocksize & 7) && task.blocksize >= 128) + // fs = encode_frame_vbs(); + //else + int fn = task.frameNumber + (eparams.variable_block_size > 0 ? (int)iSample : iFrame); + int fs = encode_frame(doMidside, channelCount, iFrame, task, fn); + + if (task.verify != null) + { + int decoded = task.verify.DecodeFrame(task.frame.writer.Buffer, task.frame.writer_offset, fs); + if (decoded != fs || task.verify.Remaining != task.frameSize) + throw new Exception("validation failed! 
frame size mismatch"); + fixed (int* r = task.verify.Samples) + { + for (int ch = 0; ch < channels; ch++) + { + short* res = ((short*)task.samplesBytesPtr.AddrOfPinnedObject()) + iFrame * channels * task.frameSize + ch; + int* smp = r + ch * Flake.MAX_BLOCKSIZE; + for (int i = task.frameSize; i > 0; i--) + { + //if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize)) + if (*res != *(smp++)) + throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", iFrame, ch)); + res += channels; + } + } + } + } + + if (seek_table != null && _IO.CanSeek) + { + for (int sp = 0; sp < seek_table.Length; sp++) + { + if (seek_table[sp].framesize != 0) + continue; + if (seek_table[sp].number >= task.framePos + iSample + task.frameSize) + break; + if (seek_table[sp].number >= task.framePos + iSample) + { + seek_table[sp].number = task.framePos + iSample; + seek_table[sp].offset = iByte; + seek_table[sp].framesize = task.frameSize; + } + } + } + + //Array.Copy(task.frame.buffer, 0, task.outputBuffer, iByte, fs); + + iSample += task.frameSize; + iByte += fs; + } + task.outputSize = (int)iByte; + if (iByte != task.frame.writer.Length) + throw new Exception("invalid length"); + } + + unsafe void write_result(FLACCLTask task) + { + int iSample = task.frameSize * task.frameCount; + + if (seek_table != null && _IO.CanSeek) + for (int sp = 0; sp < seek_table.Length; sp++) + { + if (seek_table[sp].number >= task.framePos + iSample) + break; + if (seek_table[sp].number >= task.framePos) + seek_table[sp].offset += _IO.Position - first_frame_offset; + } + _IO.Write(task.outputBuffer, 0, task.outputSize); + _position += iSample; + _totalSize += task.outputSize; + } + + public unsafe void InitTasks() + { + bool doMidside = channels == 2 && eparams.do_midside; + int channelCount = doMidside ? 
2 * channels : channels; + + if (!inited) + { + if (OpenCL.NumberOfPlatforms < 1) + throw new Exception("no opencl platforms found"); + + OCLMan = new OpenCLManager(); + // Attempt to save binaries after compilation, as well as load precompiled binaries + // to avoid compilation. Usually you'll want this to be true. + OCLMan.AttemptUseBinaries = false; // true; + // Attempt to compile sources. This should probably be true for almost all projects. + // Setting it to false means that when you attempt to compile "mysource.cl", it will + // only scan the precompiled binary directory for a binary corresponding to a source + // with that name. There's a further restriction that the compiled binary also has to + // use the same Defines and BuildOptions + OCLMan.AttemptUseSource = true; + // Binary and source paths + // This is where we store our sources and where compiled binaries are placed + //OCLMan.BinaryPath = @"OpenCL\bin"; + //OCLMan.SourcePath = @"OpenCL\src"; + // If true, RequireImageSupport will filter out any devices without image support + // In this project we don't need image support though, so we set it to false + OCLMan.RequireImageSupport = false; + // The Defines string gets prepended to any and all sources that are compiled + // and serve as a convenient way to pass configuration information to the compilation process + OCLMan.Defines = "#define MAX_ORDER " + eparams.max_prediction_order.ToString(); + // The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc + OCLMan.BuildOptions = ""; + + OCLMan.CreateDefaultContext(0, DeviceType.GPU); + + openCLContext = OCLMan.Context; + //try + //{ + // openCLProgram = OCLMan.CompileFile("flac.cl"); + //} + //catch (OpenCLBuildException ex) + //{ + // string buildLog = ex.BuildLogs[0]; + // throw ex; + //} + using (Stream kernel = GetType().Assembly.GetManifestResourceStream(GetType(), "flac.cl")) + using (StreamReader sr = new StreamReader(kernel)) + { + try + { + 
openCLProgram = OCLMan.CompileSource(sr.ReadToEnd()); ; + } + catch (OpenCLBuildException ex) + { + string buildLog = ex.BuildLogs[0]; + throw ex; + } + } +#if TTTTKJHSKJH + var openCLPlatform = OpenCL.GetPlatform(0); + openCLContext = openCLPlatform.CreateDefaultContext(); + using (Stream kernel = GetType().Assembly.GetManifestResourceStream(GetType(), "flac.cl")) + using (StreamReader sr = new StreamReader(kernel)) + openCLProgram = openCLContext.CreateProgramWithSource(sr.ReadToEnd()); + try + { + openCLProgram.Build(); + } + catch (OpenCLException) + { + string buildLog = openCLProgram.GetBuildLog(openCLProgram.Devices[0]); + throw; + } +#endif + + if (_IO == null) + _IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read); + int header_size = flake_encode_init(); + _IO.Write(header, 0, header_size); + if (_IO.CanSeek) + first_frame_offset = _IO.Position; + + task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify); + task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify); + if (_settings.CPUThreads > 0) + { + cpu_tasks = new FLACCLTask[_settings.CPUThreads]; + for (int i = 0; i < cpu_tasks.Length; i++) + cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify); + } + cudaWindow = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY, sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS); + + inited = true; + } + } + + public unsafe void Write(AudioBuffer buff) + { + InitTasks(); + buff.Prepare(this); + int pos = 0; + while (pos < buff.Length) + { + int block = Math.Min(buff.Length - pos, eparams.block_size * max_frames - samplesInBuffer); + + fixed (byte* buf = buff.Bytes) + AudioSamples.MemCpy(((byte*)task1.samplesBytesPtr.AddrOfPinnedObject()) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign); + + 
            samplesInBuffer += block;
            pos += block;

            // Flush complete frame groups to the GPU pipeline as soon as a full
            // batch of max_frames blocks has accumulated.
            int nFrames = samplesInBuffer / eparams.block_size;
            if (nFrames >= max_frames)
                do_output_frames(nFrames);
        }
        if (md5 != null)
            md5.TransformBlock(buff.Bytes, 0, buff.ByteLength, null, 0);
    }

    /// <summary>
    /// Blocks until the oldest CPU worker task has finished (or faulted),
    /// rethrowing any exception captured on the worker thread.
    /// Returns immediately if the worker thread was never started.
    /// </summary>
    public void wait_for_cpu_task()
    {
        FLACCLTask task = cpu_tasks[oldest_cpu_task];
        if (task.workThread == null)
            return;
        lock (task)
        {
            // task.done / task.exception are only touched under lock(task);
            // the worker pulses the same monitor when either changes.
            while (!task.done && task.exception == null)
                Monitor.Wait(task);
            if (task.exception != null)
                throw task.exception;
        }
    }

    /// <summary>
    /// Worker-thread loop: waits for a task to be marked not-done, runs
    /// process_result on it, then marks it done and pulses waiters.
    /// Exits when task.exit is set; any exception is stored on the task
    /// for wait_for_cpu_task to rethrow.
    /// </summary>
    public void cpu_task_thread(object param)
    {
        FLACCLTask task = param as FLACCLTask;
        try
        {
            while (true)
            {
                lock (task)
                {
                    while (task.done && !task.exit)
                        Monitor.Wait(task);
                    if (task.exit)
                        return;
                }
                // process_result is intentionally run outside the lock so the
                // producer can inspect/queue state while encoding proceeds.
                process_result(task);
                lock (task)
                {
                    task.done = true;
                    Monitor.Pulse(task);
                }
            }
        }
        catch (Exception ex)
        {
            lock (task)
            {
                // Hand the failure to the waiting producer thread.
                task.exception = ex;
                Monitor.Pulse(task);
            }
        }
    }

    /// <summary>
    /// Kicks the oldest CPU worker: lazily starts its background thread on
    /// first use, otherwise clears task.done and pulses the worker loop.
    /// </summary>
    public void start_cpu_task()
    {
        FLACCLTask task = cpu_tasks[oldest_cpu_task];
        if (task.workThread == null)
        {
            task.done = false;
            task.exit = false;
            task.workThread = new Thread(cpu_task_thread);
            task.workThread.IsBackground = true;
            //task.workThread.Priority = ThreadPriority.BelowNormal;
            task.workThread.Start(task);
        }
        else
        {
            lock (task)
            {
                task.done = false;
                Monitor.Pulse(task);
            }
        }
    }

    /// <summary>
    /// Pipelined frame output: uploads task1 to the GPU and launches its
    /// kernels while the previous batch (task2) is finished, post-processed
    /// (on a CPU worker when configured) and written out; then swaps the
    /// two GPU tasks. Leftover samples are carried over into the next task.
    /// </summary>
    public unsafe void do_output_frames(int nFrames)
    {
        send_to_GPU(task1, nFrames, eparams.block_size);
        if (task2.frameCount > 0)
            task2.openCLCQ.Finish();
        run_GPU_task(task1);
        if (task2.frameCount > 0)
        {
            if (cpu_tasks != null)
            {
                // Rotate task2 into the CPU worker ring: wait for the oldest
                // worker, swap its task with task2, restart it, then write out
                // whatever the worker had completed.
                wait_for_cpu_task();

                FLACCLTask ttmp = cpu_tasks[oldest_cpu_task];
                cpu_tasks[oldest_cpu_task] = task2;
                task2 = ttmp;

                start_cpu_task();

                oldest_cpu_task = (oldest_cpu_task + 1) % cpu_tasks.Length;

                if (task2.frameCount > 0)
                    write_result(task2);
            }
            else
            {
                process_result(task2);
                write_result(task2);
            }
        }
        int bs = eparams.block_size *
nFrames; + samplesInBuffer -= bs; + if (samplesInBuffer > 0) + AudioSamples.MemCpy( + ((byte*)task2.samplesBytesPtr.AddrOfPinnedObject()), + ((byte*)task1.samplesBytesPtr.AddrOfPinnedObject()) + bs * _pcm.BlockAlign, + samplesInBuffer * _pcm.BlockAlign); + FLACCLTask tmp = task1; + task1 = task2; + task2 = tmp; + task1.frameCount = 0; + } + + public string Path { get { return _path; } } + + public static readonly string vendor_string = "FLACCL#.91"; + + int select_blocksize(int samplerate, int time_ms) + { + int blocksize = Flake.flac_blocksizes[1]; + int target = (samplerate * time_ms) / 1000; + if (eparams.variable_block_size > 0) + { + blocksize = 1024; + while (target >= blocksize) + blocksize <<= 1; + return blocksize >> 1; + } + + for (int i = 0; i < Flake.flac_blocksizes.Length; i++) + if (target >= Flake.flac_blocksizes[i] && Flake.flac_blocksizes[i] > blocksize) + { + blocksize = Flake.flac_blocksizes[i]; + } + return blocksize; + } + + void write_streaminfo(byte[] header, int pos, int last) + { + Array.Clear(header, pos, 38); + BitWriter bitwriter = new BitWriter(header, pos, 38); + + // metadata header + bitwriter.writebits(1, last); + bitwriter.writebits(7, (int)MetadataType.StreamInfo); + bitwriter.writebits(24, 34); + + if (eparams.variable_block_size > 0) + bitwriter.writebits(16, 0); + else + bitwriter.writebits(16, eparams.block_size); + + bitwriter.writebits(16, eparams.block_size); + bitwriter.writebits(24, 0); + bitwriter.writebits(24, max_frame_size); + bitwriter.writebits(20, sample_rate); + bitwriter.writebits(3, channels - 1); + bitwriter.writebits(5, bits_per_sample - 1); + + // total samples + if (sample_count > 0) + { + bitwriter.writebits(4, 0); + bitwriter.writebits(32, sample_count); + } + else + { + bitwriter.writebits(4, 0); + bitwriter.writebits(32, 0); + } + bitwriter.flush(); + } + + /** + * Write vorbis comment metadata block to byte array. + * Just writes the vendor string for now. 
     */
    /// <summary>
    /// Writes a VORBIS_COMMENT metadata block (vendor string only, zero user
    /// comments) at <paramref name="pos"/> and returns the number of bytes
    /// written (vendor_len + 12: 4 header + 4 length + vendor + 4 comment count).
    /// </summary>
    int write_vorbis_comment(byte[] comment, int pos, int last)
    {
        BitWriter bitwriter = new BitWriter(comment, pos, 4);
        Encoding enc = new ASCIIEncoding();
        int vendor_len = enc.GetBytes(vendor_string, 0, vendor_string.Length, comment, pos + 8);

        // metadata header
        bitwriter.writebits(1, last);
        bitwriter.writebits(7, (int)MetadataType.VorbisComment);
        // block body = 4-byte vendor length + vendor string + 4-byte comment count
        bitwriter.writebits(24, vendor_len + 8);

        // vendor string length, little-endian per the Vorbis comment spec
        comment[pos + 4] = (byte)(vendor_len & 0xFF);
        comment[pos + 5] = (byte)((vendor_len >> 8) & 0xFF);
        comment[pos + 6] = (byte)((vendor_len >> 16) & 0xFF);
        comment[pos + 7] = (byte)((vendor_len >> 24) & 0xFF);
        // user comment list length = 0 (little-endian)
        comment[pos + 8 + vendor_len] = 0;
        comment[pos + 9 + vendor_len] = 0;
        comment[pos + 10 + vendor_len] = 0;
        comment[pos + 11 + vendor_len] = 0;
        bitwriter.flush();
        return vendor_len + 12;
    }

    /// <summary>
    /// Writes the SEEKTABLE metadata block from seek_table at
    /// <paramref name="pos"/>, records seek_table_offset (so the table can be
    /// patched after encoding), and returns the bytes written
    /// (4 header + 18 per seek point).
    /// </summary>
    int write_seekpoints(byte[] header, int pos, int last)
    {
        seek_table_offset = pos + 4;

        BitWriter bitwriter = new BitWriter(header, pos, 4 + 18 * seek_table.Length);

        // metadata header
        bitwriter.writebits(1, last);
        bitwriter.writebits(7, (int)MetadataType.Seektable);
        bitwriter.writebits(24, 18 * seek_table.Length);
        for (int i = 0; i < seek_table.Length; i++)
        {
            bitwriter.writebits64(Flake.FLAC__STREAM_METADATA_SEEKPOINT_SAMPLE_NUMBER_LEN, (ulong)seek_table[i].number);
            bitwriter.writebits64(Flake.FLAC__STREAM_METADATA_SEEKPOINT_STREAM_OFFSET_LEN, (ulong)seek_table[i].offset);
            bitwriter.writebits(Flake.FLAC__STREAM_METADATA_SEEKPOINT_FRAME_SAMPLES_LEN, seek_table[i].framesize);
        }
        bitwriter.flush();
        return 4 + 18 * seek_table.Length;
    }

    /**
     * Write padding metadata block to byte array.
     */
    /// <summary>
    /// Writes a PADDING metadata block header at <paramref name="pos"/> and
    /// returns the total block size (padlen + 4). The padding body itself is
    /// left as the zeroed bytes already present in the array.
    /// </summary>
    int
    write_padding(byte[] padding, int pos, int last, long padlen)
    {
        BitWriter bitwriter = new BitWriter(padding, pos, 4);

        // metadata header
        bitwriter.writebits(1, last);
        bitwriter.writebits(7, (int)MetadataType.Padding);
        bitwriter.writebits(24, (int)padlen);

        // NOTE(review): unlike write_streaminfo/write_vorbis_comment/
        // write_seekpoints, there is no bitwriter.flush() here - verify the
        // BitWriter emits the 4 header bytes without it, otherwise the
        // padding header never reaches the array.
        return (int)padlen + 4;
    }

    /// <summary>
    /// Assembles the full FLAC header into the 'header' buffer:
    /// "fLaC" marker, STREAMINFO, optional SEEKTABLE, VORBIS_COMMENT and
    /// optional PADDING. Returns the total header length in bytes.
    /// </summary>
    int write_headers()
    {
        int header_size = 0;
        int last = 0;

        // stream marker ("fLaC")
        header[0] = 0x66;
        header[1] = 0x4C;
        header[2] = 0x61;
        header[3] = 0x43;
        header_size += 4;

        // streaminfo
        write_streaminfo(header, header_size, last);
        header_size += 38;

        // seek table
        if (_IO.CanSeek && seek_table != null)
            header_size += write_seekpoints(header, header_size, last);

        // vorbis comment; marked as the last block only when no padding follows
        if (eparams.padding_size == 0) last = 1;
        header_size += write_vorbis_comment(header, header_size, last);

        // padding
        if (eparams.padding_size > 0)
        {
            last = 1;
            header_size += write_padding(header, header_size, last, eparams.padding_size);
        }

        return header_size;
    }

    /// <summary>
    /// Validates stream parameters (sample rate / bit depth must be standard,
    /// 16-bit only for now), chooses the block size, builds the seek table
    /// and header, and sets up MD5. Returns the header length in bytes.
    /// </summary>
    int flake_encode_init()
    {
        int i, header_len;

        //if(flake_validate_params(s) < 0)

        ch_code = channels - 1;

        // find samplerate in table
        for (i = 4; i < 12; i++)
        {
            if (sample_rate == Flake.flac_samplerates[i])
            {
                sr_code0 = i;
                break;
            }
        }

        // if not in table, samplerate is non-standard
        if (i == 12)
            throw new Exception("non-standard samplerate");

        for (i = 1; i < 8; i++)
        {
            if (bits_per_sample == Flake.flac_bitdepths[i])
            {
                bps_code = i;
                break;
            }
        }
        if (i == 8)
            throw new Exception("non-standard bps");
        // FIXME: For now, only 16-bit encoding is supported
        if (bits_per_sample != 16)
            throw new Exception("non-standard bps");

        // explicit _blocksize wins; otherwise derive from block_time_ms
        if (_blocksize == 0)
        {
            if (eparams.block_size == 0)
                eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms);
            _blocksize = eparams.block_size;
        }
        else
            eparams.block_size = _blocksize;

        max_frames = Math.Min(maxFrames, FLACCLWriter.MAX_BLOCKSIZE /
eparams.block_size); + + // set maximum encoded frame size (if larger, re-encodes in verbatim mode) + if (channels == 2) + max_frame_size = 16 + ((eparams.block_size * (int)(bits_per_sample + bits_per_sample + 1) + 7) >> 3); + else + max_frame_size = 16 + ((eparams.block_size * channels * (int)bits_per_sample + 7) >> 3); + + if (_IO.CanSeek && eparams.do_seektable && sample_count > 0) + { + int seek_points_distance = sample_rate * 10; + int num_seek_points = 1 + sample_count / seek_points_distance; // 1 seek point per 10 seconds + if (sample_count % seek_points_distance == 0) + num_seek_points--; + seek_table = new SeekPoint[num_seek_points]; + for (int sp = 0; sp < num_seek_points; sp++) + { + seek_table[sp].framesize = 0; + seek_table[sp].offset = 0; + seek_table[sp].number = sp * seek_points_distance; + } + } + + // output header bytes + header = new byte[eparams.padding_size + 1024 + (seek_table == null ? 0 : seek_table.Length * 18)]; + header_len = write_headers(); + + // initialize CRC & MD5 + if (_IO.CanSeek && _settings.DoMD5) + md5 = new MD5CryptoServiceProvider(); + + return header_len; + } + } + + struct FlakeEncodeParams + { + // compression quality + // set by user prior to calling flake_encode_init + // standard values are 0 to 8 + // 0 is lower compression, faster encoding + // 8 is higher compression, slower encoding + // extended values 9 to 12 are slower and/or use + // higher prediction orders + public int compression; + + // stereo decorrelation method + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. 
+ // valid values are 0 to 2 + // 0 = independent L+R channels + // 1 = mid-side encoding + public bool do_midside; + + // block size in samples + // set by the user prior to calling flake_encode_init + // if set to 0, a block size is chosen based on block_time_ms + // can also be changed by user before encoding a frame + public int block_size; + + // block time in milliseconds + // set by the user prior to calling flake_encode_init + // used to calculate block_size based on sample rate + // can also be changed by user before encoding a frame + public int block_time_ms; + + // padding size in bytes + // set by the user prior to calling flake_encode_init + // if set to less than 0, defaults to 4096 + public long padding_size; + + // minimum LPC order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 1 to 32 + public int min_prediction_order; + + // maximum LPC order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 1 to 32 + public int max_prediction_order; + + public int orders_per_window; + + // minimum fixed prediction order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 4 + public int min_fixed_order; + + // maximum fixed prediction order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 4 + public int max_fixed_order; + + // minimum partition order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 8 + public int min_partition_order; + + // maximum partition order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. 
+		// valid values are 0 to 8
+		public int max_partition_order;
+
+		// whether to use variable block sizes
+		// set by user prior to calling flake_encode_init
+		// 0 = fixed block size
+		// 1 = variable block size
+		public int variable_block_size;
+
+		// whether to try various lpc_precisions
+		// 0 - use only one precision
+		// 1 - try two precisions
+		public int lpc_max_precision_search;
+
+		public int lpc_min_precision_search;
+
+		// whether to detect and strip wasted (shared low-order zero) bits
+		public bool do_wasted;
+
+		// whether to try CONSTANT subframes
+		public bool do_constant;
+
+		// apodization window(s) applied before autocorrelation
+		public WindowFunction window_function;
+
+		// whether to emit a SEEKTABLE metadata block
+		public bool do_seektable;
+
+		/// <summary>
+		/// Initializes all encoding parameters for compression level <paramref name="lvl"/>.
+		/// Valid levels are 0..12 plus the special value 99.
+		/// </summary>
+		/// <param name="lvl">Requested compression level.</param>
+		/// <param name="encode_on_cpu">False when the GPU does all stages; in that case
+		/// the partition search is widened (max_partition_order = 8).</param>
+		/// <returns>0 on success, -1 if the level is out of range.</returns>
+		public int flake_set_defaults(int lvl, bool encode_on_cpu)
+		{
+			compression = lvl;
+
+			if ((lvl < 0 || lvl > 12) && (lvl != 99))
+			{
+				return -1;
+			}
+
+			// baseline parameters; the switch below applies per-level overrides
+			window_function = WindowFunction.Flattop | WindowFunction.Tukey;
+			do_midside = true;
+			block_size = 0;
+			block_time_ms = 100;
+			min_fixed_order = 0;
+			max_fixed_order = 4;
+			min_prediction_order = 1;
+			max_prediction_order = 12;
+			min_partition_order = 0;
+			max_partition_order = 6;
+			variable_block_size = 0;
+			lpc_min_precision_search = 0;
+			lpc_max_precision_search = 0;
+			do_seektable = true;
+			do_wasted = true;
+			do_constant = true;
+
+			// per-level overrides of the baseline above
+			switch (lvl)
+			{
+				case 0:
+					do_constant = false;
+					do_wasted = false;
+					do_midside = false;
+					orders_per_window = 1;
+					max_partition_order = 4;
+					max_prediction_order = 7;
+					min_fixed_order = 2;
+					max_fixed_order = 2;
+					break;
+				case 1:
+					do_wasted = false;
+					do_midside = false;
+					window_function = WindowFunction.Bartlett;
+					orders_per_window = 1;
+					max_prediction_order = 12;
+					max_partition_order = 4;
+					break;
+				case 2:
+					do_constant = false;
+					window_function = WindowFunction.Bartlett;
+					// NOTE(review): min_fixed_order (3) > max_fixed_order (2) leaves an
+					// empty fixed-order range -- looks inverted; confirm intended.
+					min_fixed_order = 3;
+					max_fixed_order = 2;
+					orders_per_window = 1;
+					max_prediction_order = 7;
+					max_partition_order = 4;
+					break;
+				case 3:
+					window_function = WindowFunction.Bartlett;
+					min_fixed_order = 2;
+					max_fixed_order = 2;
+					orders_per_window = 6;
max_prediction_order = 7;
+					max_partition_order = 4;
+					break;
+				case 4:
+					min_fixed_order = 2;
+					max_fixed_order = 2;
+					orders_per_window = 3;
+					max_prediction_order = 8;
+					max_partition_order = 4;
+					break;
+				case 5:
+					do_constant = false;
+					min_fixed_order = 2;
+					max_fixed_order = 2;
+					orders_per_window = 1;
+					break;
+				case 6:
+					min_fixed_order = 2;
+					max_fixed_order = 2;
+					orders_per_window = 3;
+					break;
+				case 7:
+					min_fixed_order = 2;
+					max_fixed_order = 2;
+					orders_per_window = 7;
+					break;
+				case 8:
+					orders_per_window = 12;
+					break;
+				// levels 9..11 allow the maximum LPC order (32)
+				case 9:
+					min_fixed_order = 2;
+					max_fixed_order = 2;
+					orders_per_window = 3;
+					max_prediction_order = 32;
+					break;
+				case 10:
+					min_fixed_order = 2;
+					max_fixed_order = 2;
+					orders_per_window = 7;
+					max_prediction_order = 32;
+					break;
+				case 11:
+					min_fixed_order = 2;
+					max_fixed_order = 2;
+					orders_per_window = 11;
+					max_prediction_order = 32;
+					break;
+			}
+
+			// the GPU-only path searches the full partition-order range
+			if (!encode_on_cpu)
+				max_partition_order = 8;
+
+			return 0;
+		}
+	}
+
+	// Per-subframe task descriptor exchanged with the OpenCL kernels.
+	// Field order and size must stay bit-compatible with FLACCLSubframeData /
+	// FLACCLSubframeTask declared in flac.cl: the host accesses the pinned
+	// buffers through the ResidualTasks/BestResidualTasks pointer properties.
+	unsafe struct FLACCLSubframeTask
+	{
+		public int residualOrder;      // prediction order, <= 32
+		public int samplesOffs;        // offset of this channel's samples in the samples buffer
+		public int shift;
+		public int cbits;
+		public int size;
+		public int type;               // SubframeType value (Constant/Verbatim/Fixed/LPC)
+		public int obits;
+		public int blocksize;
+		public int best_index;
+		public int channel;
+		public int residualOffs;
+		public int wbits;              // wasted bits, computed by cudaFindWastedBits
+		public int abits;              // significant magnitude bits after removing wbits
+		public int porder;             // Rice partition order
+		public fixed int reserved[2];
+		public fixed int coefs[32];    // quantized LPC coefficients
+	};
+
+	// Encapsulates the OpenCL state for one encoding task: command queue,
+	// compiled kernels, device buffers and pinned host buffers, plus the
+	// optional worker thread and verification decoder.
+	internal class FLACCLTask
+	{
+		Program openCLProgram;
+		public CommandQueue openCLCQ;
+		public Kernel cudaStereoDecorr;
+		//public Kernel cudaChannelDecorr;
+		public Kernel cudaChannelDecorr2;
+		public Kernel cudaFindWastedBits;
+		public Kernel cudaComputeAutocor;
+		public Kernel cudaComputeLPC;
+		//public Kernel cudaComputeLPCLattice;
+		public Kernel cudaQuantizeLPC;
+		public Kernel cudaEstimateResidual;
+		public Kernel cudaChooseBestMethod;
+		public Kernel cudaCopyBestMethod;
+		public Kernel cudaCopyBestMethodStereo;
+		//public Kernel cudaEncodeResidual;
+		//public Kernel
cudaCalcPartition; + //public Kernel cudaCalcPartition16; + //public Kernel cudaCalcLargePartition; + //public Kernel cudaSumPartition; + //public Kernel cudaFindRiceParameter; + //public Kernel cudaFindPartitionOrder; + public Mem cudaSamplesBytes; + public Mem cudaSamples; + public Mem cudaLPCData; + public Mem cudaResidual; + public Mem cudaPartitions; + public Mem cudaRiceParams; + public Mem cudaBestRiceParams; + public Mem cudaAutocorOutput; + public Mem cudaResidualTasks; + public Mem cudaResidualOutput; + public Mem cudaBestResidualTasks; + public GCHandle samplesBytesPtr; + public GCHandle residualBufferPtr; + public GCHandle bestRiceParamsPtr; + public GCHandle residualTasksPtr; + public GCHandle bestResidualTasksPtr; + public int[] samplesBuffer; + public byte[] outputBuffer; + public int outputSize = 0; + public int frameSize = 0; + public int frameCount = 0; + public int frameNumber = 0; + public int framePos = 0; + public FlacFrame frame; + public int residualTasksLen; + public int bestResidualTasksLen; + public int samplesBufferLen; + public int nResidualTasks = 0; + public int nResidualTasksPerChannel = 0; + public int nTasksPerWindow = 0; + public int nAutocorTasksPerChannel = 0; + //public int max_porder = 0; + + public FlakeReader verify; + + public Thread workThread = null; + public Exception exception = null; + public bool done = false; + public bool exit = false; + + unsafe public FLACCLTask(Program _openCLProgram, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify) + { + openCLProgram = _openCLProgram; + Device[] openCLDevices = openCLProgram.Context.Platform.QueryDevices(DeviceType.GPU); + openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], CommandQueueProperties.PROFILING_ENABLE); + + residualTasksLen = sizeof(FLACCLSubframeTask) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames; + bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channelCount * 
FLACCLWriter.maxFrames; + samplesBufferLen = sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channelCount; + int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FLACCLWriter.maxFrames; + int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FLACCLWriter.maxFrames; + int lpcDataLen = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelCount * FLACCLWriter.maxFrames; + + cudaSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY | MemFlags.ALLOC_HOST_PTR, (uint)samplesBufferLen / 2); + cudaSamples = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen); + cudaResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen); + cudaLPCData = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, lpcDataLen); + cudaPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen); + cudaRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen); + cudaBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4); + cudaAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FLACCLWriter.maxAutocorParts + FLACCLWriter.maxFrames)); + cudaResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen); + cudaBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen); + cudaResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FLACCLWriter.maxResidualParts*/ * FLACCLWriter.maxFrames); + + samplesBytesPtr = GCHandle.Alloc(new byte[samplesBufferLen / 2], GCHandleType.Pinned); + residualBufferPtr = GCHandle.Alloc(new byte[samplesBufferLen], GCHandleType.Pinned); + 
bestRiceParamsPtr = GCHandle.Alloc(new byte[riceParamsLen / 4], GCHandleType.Pinned); + residualTasksPtr = GCHandle.Alloc(new byte[residualTasksLen], GCHandleType.Pinned); + bestResidualTasksPtr = GCHandle.Alloc(new byte[bestResidualTasksLen], GCHandleType.Pinned); + + cudaComputeAutocor = openCLProgram.CreateKernel("cudaComputeAutocor"); + cudaStereoDecorr = openCLProgram.CreateKernel("cudaStereoDecorr"); + //cudaChannelDecorr = openCLProgram.CreateKernel("cudaChannelDecorr"); + cudaChannelDecorr2 = openCLProgram.CreateKernel("cudaChannelDecorr2"); + cudaFindWastedBits = openCLProgram.CreateKernel("cudaFindWastedBits"); + cudaComputeLPC = openCLProgram.CreateKernel("cudaComputeLPC"); + cudaQuantizeLPC = openCLProgram.CreateKernel("cudaQuantizeLPC"); + //cudaComputeLPCLattice = openCLProgram.CreateKernel("cudaComputeLPCLattice"); + cudaEstimateResidual = openCLProgram.CreateKernel("cudaEstimateResidual"); + cudaChooseBestMethod = openCLProgram.CreateKernel("cudaChooseBestMethod"); + cudaCopyBestMethod = openCLProgram.CreateKernel("cudaCopyBestMethod"); + cudaCopyBestMethodStereo = openCLProgram.CreateKernel("cudaCopyBestMethodStereo"); + //cudaEncodeResidual = openCLProgram.CreateKernel("cudaEncodeResidual"); + //cudaCalcPartition = openCLProgram.CreateKernel("cudaCalcPartition"); + //cudaCalcPartition16 = openCLProgram.CreateKernel("cudaCalcPartition16"); + //cudaCalcLargePartition = openCLProgram.CreateKernel("cudaCalcLargePartition"); + //cudaSumPartition = openCLProgram.CreateKernel("cudaSumPartition"); + //cudaFindRiceParameter = openCLProgram.CreateKernel("cudaFindRiceParameter"); + //cudaFindPartitionOrder = openCLProgram.CreateKernel("cudaFindPartitionOrder"); + + samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelCount]; + outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1]; + frame = new FlacFrame(channelCount); + frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length); + + if (do_verify) + { + verify = new 
FlakeReader(new AudioPCMConfig((int)bits_per_sample, channels, 44100)); + verify.DoCRC = false; + } + } + + public void Dispose() + { + if (workThread != null) + { + lock (this) + { + exit = true; + Monitor.Pulse(this); + } + workThread.Join(); + workThread = null; + } + + cudaComputeAutocor.Dispose(); + cudaStereoDecorr.Dispose(); + //cudaChannelDecorr.Dispose(); + cudaChannelDecorr2.Dispose(); + cudaFindWastedBits.Dispose(); + cudaComputeLPC.Dispose(); + cudaQuantizeLPC.Dispose(); + //cudaComputeLPCLattice.Dispose(); + cudaEstimateResidual.Dispose(); + cudaChooseBestMethod.Dispose(); + cudaCopyBestMethod.Dispose(); + cudaCopyBestMethodStereo.Dispose(); + //cudaEncodeResidual.Dispose(); + //cudaCalcPartition.Dispose(); + //cudaCalcPartition16.Dispose(); + //cudaCalcLargePartition.Dispose(); + //cudaSumPartition.Dispose(); + //cudaFindRiceParameter.Dispose(); + //cudaFindPartitionOrder.Dispose(); + + cudaSamples.Dispose(); + cudaSamplesBytes.Dispose(); + cudaLPCData.Dispose(); + cudaResidual.Dispose(); + cudaPartitions.Dispose(); + cudaAutocorOutput.Dispose(); + cudaResidualTasks.Dispose(); + cudaResidualOutput.Dispose(); + cudaBestResidualTasks.Dispose(); + + samplesBytesPtr.Free(); + residualBufferPtr.Free(); + bestRiceParamsPtr.Free(); + residualTasksPtr.Free(); + bestResidualTasksPtr.Free(); + + openCLCQ.Dispose(); + } + + public void EnqueueFindWasted(int channelsCount) + { + cudaFindWastedBits.SetArg(0, cudaResidualTasks); + cudaFindWastedBits.SetArg(1, cudaSamples); + cudaFindWastedBits.SetArg(2, nResidualTasksPerChannel); + + int workX = 128; // 256 + int grpX = frameCount * channelsCount; + //openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 128 }, new int[] { 128 }); + //openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 128 }, null); + openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * workX }, new int[] { workX }); + + //openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new 
int[] { 256 * 128 }, new int[] { 128 }); + //openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * workX }, null); + //cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1); + //cuda.LaunchAsync(task.cudaFindWastedBits, channelsCount * task.frameCount, 1, task.stream); + } + + public void EnqueueComputeAutocor(int autocorPartCount, int channelsCount, Mem cudaWindow, int max_prediction_order) + { + cudaComputeAutocor.SetArg(0, cudaAutocorOutput); + cudaComputeAutocor.SetArg(1, cudaSamples); + cudaComputeAutocor.SetArg(2, cudaWindow); + cudaComputeAutocor.SetArg(3, cudaResidualTasks); + cudaComputeAutocor.SetArg(4, max_prediction_order); + cudaComputeAutocor.SetArg(5, (uint)nAutocorTasksPerChannel - 1); + cudaComputeAutocor.SetArg(6, (uint)nResidualTasksPerChannel); + int workX = autocorPartCount; + int workY = nAutocorTasksPerChannel * channelsCount * frameCount; + int ws = 32; + int wy = 4; + openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * ws, workY * wy }, new int[] { ws, wy }); + //openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 3, null, new int[] { workX * ws, workY, max_prediction_order + 1 }, new int[] { ws, 1, 1 }); + + //cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1); + //cuda.LaunchAsync(task.cudaComputeAutocor, autocorPartCount, task.nAutocorTasksPerChannel * channelsCount * task.frameCount, task.stream); + } + + public void EnqueueEstimateResidual(int channelsCount, int max_prediction_order) + { + cudaEstimateResidual.SetArg(0, cudaResidualOutput); + cudaEstimateResidual.SetArg(1, cudaSamples); + cudaEstimateResidual.SetArg(2, cudaResidualTasks); + + int threads_x = 128; + int workX = threads_x; + int workY = nResidualTasksPerChannel * channelsCount * frameCount; + + openCLCQ.EnqueueNDRangeKernel(cudaEstimateResidual, 2, null, new int[] { workX, workY }, new int[] { threads_x, 1 }); + } + + public void EnqueueChooseBestMethod(int channelsCount) + { + 
cudaChooseBestMethod.SetArg(0, cudaResidualTasks); + cudaChooseBestMethod.SetArg(1, cudaResidualOutput); + cudaChooseBestMethod.SetArg(2, (uint)nResidualTasksPerChannel); + + int threadsY = 4; + + openCLCQ.EnqueueNDRangeKernel(cudaChooseBestMethod, 2, null, new int[] { 32, channelsCount * frameCount * threadsY }, new int[] { 32, threadsY }); + //cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 8, 1); + } + + public unsafe FLACCLSubframeTask* ResidualTasks + { + get + { + return (FLACCLSubframeTask*)residualTasksPtr.AddrOfPinnedObject(); + } + } + + public unsafe FLACCLSubframeTask* BestResidualTasks + { + get + { + return (FLACCLSubframeTask*)bestResidualTasksPtr.AddrOfPinnedObject(); + } + } + } +} diff --git a/CUETools.Codecs.FLACCL/OpenCLNet.dll b/CUETools.Codecs.FLACCL/OpenCLNet.dll new file mode 100644 index 0000000..b6bf0cc Binary files /dev/null and b/CUETools.Codecs.FLACCL/OpenCLNet.dll differ diff --git a/CUETools.Codecs.FLACCL/OpenCLNet.pdb b/CUETools.Codecs.FLACCL/OpenCLNet.pdb new file mode 100644 index 0000000..e3f941b Binary files /dev/null and b/CUETools.Codecs.FLACCL/OpenCLNet.pdb differ diff --git a/CUETools.Codecs.FLACCL/Properties/AssemblyInfo.cs b/CUETools.Codecs.FLACCL/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..83e2da4 --- /dev/null +++ b/CUETools.Codecs.FLACCL/Properties/AssemblyInfo.cs @@ -0,0 +1,35 @@ +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("CUETools.Codecs.FLACCL")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("")] +[assembly: AssemblyProduct("CUETools.Codecs.FLACCL")] +[assembly: AssemblyCopyright("Copyright © 2009-2010 Gregory S. 
Chudov")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. +[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("b28ffece-6c89-426b-b227-e647b435cc3d")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Revision and Build Numbers +// by using the '*' as shown below: +[assembly: AssemblyVersion("2.0.9.0")] +[assembly: AssemblyFileVersion("2.0.9.0")] diff --git a/CUETools.Codecs.FLACCL/Properties/Resources.Designer.cs b/CUETools.Codecs.FLACCL/Properties/Resources.Designer.cs new file mode 100644 index 0000000..7b3d40a --- /dev/null +++ b/CUETools.Codecs.FLACCL/Properties/Resources.Designer.cs @@ -0,0 +1,117 @@ +//------------------------------------------------------------------------------ +// +// This code was generated by a tool. +// Runtime Version:2.0.50727.4200 +// +// Changes to this file may cause incorrect behavior and will be lost if +// the code is regenerated. +// +//------------------------------------------------------------------------------ + +namespace CUETools.Codecs.FLACCL.Properties { + using System; + + + /// + /// A strongly-typed resource class, for looking up localized strings, etc. + /// + // This class was auto-generated by the StronglyTypedResourceBuilder + // class via a tool like ResGen or Visual Studio. + // To add or remove a member, edit your .ResX file then rerun ResGen + // with the /str option, or rebuild your VS project. 
+ [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "2.0.0.0")] + [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] + [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] + internal class Resources { + + private static global::System.Resources.ResourceManager resourceMan; + + private static global::System.Globalization.CultureInfo resourceCulture; + + [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")] + internal Resources() { + } + + /// + /// Returns the cached ResourceManager instance used by this class. + /// + [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] + internal static global::System.Resources.ResourceManager ResourceManager { + get { + if (object.ReferenceEquals(resourceMan, null)) { + global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("CUETools.Codecs.FLACCL.Properties.Resources", typeof(Resources).Assembly); + resourceMan = temp; + } + return resourceMan; + } + } + + /// + /// Overrides the current thread's CurrentUICulture property for all + /// resource lookups using this strongly typed resource class. + /// + [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] + internal static global::System.Globalization.CultureInfo Culture { + get { + return resourceCulture; + } + set { + resourceCulture = value; + } + } + + /// + /// Looks up a localized string similar to Use additional CPU threads. + /// + internal static string DescriptionCPUThreads { + get { + return ResourceManager.GetString("DescriptionCPUThreads", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Use GPU on all stages. 
+ /// + internal static string DescriptionGPUOnly { + get { + return ResourceManager.GetString("DescriptionGPUOnly", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Calculate MD5 hash for audio stream. + /// + internal static string DoMD5Description { + get { + return ResourceManager.GetString("DoMD5Description", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Decode each frame and compare with original. + /// + internal static string DoVerifyDescription { + get { + return ResourceManager.GetString("DoVerifyDescription", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Samples written differs from the expected sample count. + /// + internal static string ExceptionSampleCount { + get { + return ResourceManager.GetString("ExceptionSampleCount", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Validation failed. + /// + internal static string ExceptionValidationFailed { + get { + return ResourceManager.GetString("ExceptionValidationFailed", resourceCulture); + } + } + } +} diff --git a/CUETools.Codecs.FLACCL/Properties/Resources.resx b/CUETools.Codecs.FLACCL/Properties/Resources.resx new file mode 100644 index 0000000..1579dd9 --- /dev/null +++ b/CUETools.Codecs.FLACCL/Properties/Resources.resx @@ -0,0 +1,138 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + text/microsoft-resx + + + 2.0 + + + System.Resources.ResXResourceReader, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + + System.Resources.ResXResourceWriter, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + + Use additional CPU threads + + + Use GPU on all stages + + + Calculate MD5 hash for audio stream + + + Decode each frame and compare with original + + + Samples written differs from the expected sample count + + + Validation failed + + \ No newline 
at end of file diff --git a/CUETools.Codecs.FLACCL/Properties/Resources.ru-RU.resx b/CUETools.Codecs.FLACCL/Properties/Resources.ru-RU.resx new file mode 100644 index 0000000..5f87a8e --- /dev/null +++ b/CUETools.Codecs.FLACCL/Properties/Resources.ru-RU.resx @@ -0,0 +1,138 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + text/microsoft-resx + + + 2.0 + + + System.Resources.ResXResourceReader, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + + System.Resources.ResXResourceWriter, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + + Использовать дополнительные потоки + + + Использовать GPU на всех стадиях + + + Вычислять MD5-хеш аудиопотока + + + Декодировать каждый блок и сравнивать с оригиналом + + + Количество записанных сэмплов отличается от ожидавшегося + + + Ошибка верификации + + \ No newline at end of file diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl new file mode 100644 index 0000000..6c30b50 --- /dev/null +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -0,0 +1,1275 @@ +/** + * CUETools.FLACCL: FLAC audio encoder using OpenCL + * Copyright (c) 2009 Gregory S. Chudov + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _FLACCL_KERNEL_H_
+#define _FLACCL_KERNEL_H_
+
+// FLAC subframe coding methods.
+typedef enum
+{
+    Constant = 0,
+    Verbatim = 1,
+    Fixed = 8,
+    LPC = 32
+} SubframeType;
+
+// Per-subframe task descriptor; must stay bit-compatible with the
+// unsafe struct FLACCLSubframeTask on the C# host side.
+typedef struct
+{
+    int residualOrder; // <= 32
+    int samplesOffs;
+    int shift;
+    int cbits;
+    int size;
+    int type;
+    int obits;
+    int blocksize;
+    int best_index;
+    int channel;
+    int residualOffs;
+    int wbits;
+    int abits;
+    int porder;
+    int reserved[2];
+} FLACCLSubframeData;
+
+typedef struct
+{
+    FLACCLSubframeData data;
+    union
+    {
+	int coefs[32]; // fixme: should be short?
+	int4 coefs4[8];
+    };
+} FLACCLSubframeTask;
+
+// De-interleaves 16-bit stereo input into four int planes of 'offset'
+// samples each: 0 = left, 1 = right, 2 = mid ((L+R)>>1), 3 = side (L-R).
+__kernel void cudaStereoDecorr(
+    __global int *samples,
+    __global short2 *src,
+    int offset
+)
+{
+    int pos = get_global_id(0);
+    if (pos < offset)
+    {
+	short2 s = src[pos];
+	samples[pos] = s.x;
+	samples[1 * offset + pos] = s.y;
+	samples[2 * offset + pos] = (s.x + s.y) >> 1;
+	samples[3 * offset + pos] = s.x - s.y;
+    }
+}
+
+// De-interleaves 16-bit stereo input into two int planes (no mid/side).
+__kernel void cudaChannelDecorr2(
+    __global int *samples,
+    __global short2 *src,
+    int offset
+)
+{
+    int pos = get_global_id(0);
+    if (pos < offset)
+    {
+	short2 s = src[pos];
+	samples[pos] = s.x;
+	samples[1 * offset + pos] = s.y;
+    }
+}
+
+//__kernel void cudaChannelDecorr(
+//    int *samples,
+//    short *src,
+//    int offset
+//)
+//{
+//    int pos = get_global_id(0);
+//    if (pos < offset)
+//	samples[get_group_id(1) * offset + pos] = src[pos * get_num_groups(1) + get_group_id(1)];
+//}
+
+// 1-based index of the least significant set bit; 0 when a == 0
+// (OpenCL defines clz(0) == 32).
+// NOTE(review): the macro argument 'a' is not parenthesized -- only pass
+// simple expressions.
+#define __ffs(a) (32 - clz(a & (-a)))
+//#define __ffs(a) (33 - clz(~a & (a - 1)))
+
+// One 128-item work-group per (frame, channel): OR-reduces the samples to
+// find wasted (shared low-order zero) bits, then writes the resulting
+// wbits/abits into every task of that channel.
+__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
+void cudaFindWastedBits(
+    __global FLACCLSubframeTask *tasks,
+    __global int *samples,
+    int tasksPerChannel
+)
+{
+    __local volatile int wbits[128];
+    __local volatile int abits[128];
+    __local
FLACCLSubframeData task; + + int tid = get_local_id(0); + if (tid < sizeof(task) / sizeof(int)) + ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0) * tasksPerChannel].data))[tid]; + barrier(CLK_LOCAL_MEM_FENCE); + + int w = 0, a = 0; + for (int pos = 0; pos < task.blocksize; pos += get_local_size(0)) + { + int smp = pos + tid < task.blocksize ? samples[task.samplesOffs + pos + tid] : 0; + w |= smp; + a |= smp ^ (smp >> 31); + } + wbits[tid] = w; + abits[tid] = a; + barrier(CLK_LOCAL_MEM_FENCE); + //atom_or(shared.wbits, shared.wbits[tid]); + //atom_or(shared.abits, shared.abits[tid]); + //SUM256(shared.wbits, tid, |=); + //SUM256(shared.abits, tid, |=); + //SUM128(wbits, tid, |=); + //SUM128(abits, tid, |=); + + for (int s = get_local_size(0) / 2; s > 0; s >>= 1) + { + if (tid < s) + { + wbits[tid] |= wbits[tid + s]; + abits[tid] |= abits[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (tid == 0) + task.wbits = max(0,__ffs(wbits[0]) - 1); + if (tid == 0) + task.abits = 32 - clz(abits[0]) - task.wbits; + // if (tid == 0) + //task.wbits = get_num_groups(0); + // if (tid == 0) + //task.abits = get_local_size(0); + + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < tasksPerChannel) + tasks[get_group_id(0) * tasksPerChannel + tid].data.wbits = task.wbits; + if (tid < tasksPerChannel) + tasks[get_group_id(0) * tasksPerChannel + tid].data.abits = task.abits; +} + +//__kernel __attribute__((reqd_work_group_size(32, 4, 1))) +//void cudaComputeAutocor( +// __global float *output, +// __global const int *samples, +// __global const float *window, +// __global FLACCLSubframeTask *tasks, +// const int max_order, // should be <= 32 +// const int windowCount, // windows (log2: 0,1) +// const int taskCount // tasks per block +//) +//{ +// __local struct { +// float data[256]; +// volatile float product[128]; +// FLACCLSubframeData task; +// volatile int dataPos; +// volatile int dataLen; +// } shared; +// const int tid = get_local_id(0) + get_local_id(1) * 
32; +// // fetch task data +// if (tid < sizeof(shared.task) / sizeof(int)) +// ((__local int*)&shared.task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid]; +// if (tid == 0) +// { +// shared.dataPos = get_group_id(0) * 7 * 32; +// shared.dataLen = min(shared.task.blocksize - shared.dataPos, 7 * 32 + max_order); +// } +// barrier(CLK_LOCAL_MEM_FENCE); +// +// // fetch samples +// shared.data[tid] = tid < shared.dataLen ? samples[tid] * window[tid]: 0.0f; +// int tid2 = tid + 128; +// shared.data[tid2] = tid2 < shared.dataLen ? samples[tid2] * window[tid2]: 0.0f; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// for (int lag = 0; lag <= max_order; lag ++) +// { +// if (lag <= 12) +// shared.product[tid] = 0.0f; +// barrier(CLK_LOCAL_MEM_FENCE); +// } +// barrier(CLK_LOCAL_MEM_FENCE); +// if (tid <= max_order) +// output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.product[tid]; +//} + +__kernel __attribute__((reqd_work_group_size(32, 4, 1))) +void cudaComputeAutocor( + __global float *output, + __global const int *samples, + __global const float *window, + __global FLACCLSubframeTask *tasks, + const int max_order, // should be <= 32 + const int windowCount, // windows (log2: 0,1) + const int taskCount // tasks per block +) +{ + __local struct { + float data[256]; + volatile float product[128]; + FLACCLSubframeData task; + volatile float result[33]; + volatile int dataPos; + volatile int dataLen; + volatile int windowOffs; + volatile int samplesOffs; + //volatile int resultOffs; + } shared; + const int tid = get_local_id(0) + get_local_id(1) * 32; + // fetch task data + if (tid < sizeof(shared.task) / sizeof(int)) + ((__local int*)&shared.task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid]; + if (tid == 0) + { + shared.dataPos = get_group_id(0) * 7 * 32; + shared.windowOffs = (get_group_id(1) & ((1 << windowCount)-1)) * shared.task.blocksize + 
shared.dataPos; // (completes the assignment split across the mangled line above)
		shared.samplesOffs = shared.task.samplesOffs + shared.dataPos;
		// Clamp the fetch length to what is left of the block:
		// 7*32 windowed samples per group plus max_order warm-up samples.
		shared.dataLen = min(shared.task.blocksize - shared.dataPos, 7 * 32 + max_order);
	}
	//if (tid == 32)
	//shared.resultOffs = __mul24(get_group_id(0) + __mul24(get_group_id(1), get_num_groups(0)), max_order + 1);
	barrier(CLK_LOCAL_MEM_FENCE);

	// fetch samples (already multiplied by the analysis window; out-of-range lanes load 0)
	shared.data[tid] = tid < shared.dataLen ? samples[shared.samplesOffs + tid] * window[shared.windowOffs + tid]: 0.0f;
	int tid2 = tid + 128;
	shared.data[tid2] = tid2 < shared.dataLen ? samples[shared.samplesOffs + tid2] * window[shared.windowOffs + tid2]: 0.0f;
	barrier(CLK_LOCAL_MEM_FENCE);

	// Each work-item owns 7 consecutive windowed samples starting at `ptr`;
	// for every lag it accumulates data[i] * data[i + lag] over its 7 samples.
	const int ptr = get_local_id(0) * 7;
	//if (get_local_id(1) == 0) for (int lag = 0; lag <= max_order; lag ++)
	//for (int lag = get_local_id(1); lag <= max_order; lag += get_local_size(1))
	for (int lag0 = 0; lag0 <= max_order; lag0 += get_local_size(1))
	{
		////const int productLen = min(shared.task.blocksize - get_group_id(0) * partSize - lag, partSize);
		const int lag = lag0 + get_local_id(1);
		const int ptr2 = ptr + lag;
		shared.product[tid] =
			shared.data[ptr + 0] * shared.data[ptr2 + 0] +
			shared.data[ptr + 1] * shared.data[ptr2 + 1] +
			shared.data[ptr + 2] * shared.data[ptr2 + 2] +
			shared.data[ptr + 3] * shared.data[ptr2 + 3] +
			shared.data[ptr + 4] * shared.data[ptr2 + 4] +
			shared.data[ptr + 5] * shared.data[ptr2 + 5] +
			shared.data[ptr + 6] * shared.data[ptr2 + 6];
		barrier(CLK_LOCAL_MEM_FENCE);
		// Tree reduction over the 32 lanes of dimension 0; the final pair
		// is folded in by lane 0 when the result is stored below.
		for (int l = 16; l > 1; l >>= 1)
		{
			if (get_local_id(0) < l)
				shared.product[tid] = shared.product[tid] + shared.product[tid + l];
			barrier(CLK_LOCAL_MEM_FENCE);
		}
		// return results
		if (get_local_id(0) == 0 && lag <= max_order)
			shared.result[lag] = shared.product[tid] + shared.product[tid + 1];
		barrier(CLK_LOCAL_MEM_FENCE);
	}
	// One partial autocorrelation vector (lags 0..max_order) per work-group.
	if (tid <= max_order)
		output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.result[tid];
	//output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.product[tid];
	//output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.windowOffs;
}

/*
 * cudaComputeLPC: one 32-lane work-group per (window, task group) pair.
 * Sums the per-part autocorrelation estimates produced by the kernel above,
 * then derives LPC coefficients for every order 1..max_order using a
 * parallel Schur recursion (for reflection coefficients and prediction
 * error) combined with Levinson-Durbin (for the coefficients themselves).
 * Output layout in `lpcs`: 32 floats per order (coefficients, negated),
 * followed by one row of per-order prediction error estimates.
 */
__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
void cudaComputeLPC(
    __global FLACCLSubframeTask *tasks,
    int taskCount, // tasks per block
    __global float*autoc,
    int max_order, // should be <= 32
    __global float *lpcs,
    int windowCount,
    int partCount
)
{
	__local struct {
		FLACCLSubframeData task;
		volatile float parts[32];
		volatile float ldr[32];
		volatile float gen1[32];
		volatile float error[32];
		volatile float autoc[33];
		volatile int lpcOffs;
		volatile int autocOffs;
	} shared;
	const int tid = get_local_id(0);// + get_local_id(1) * 32;

	// fetch task data (one int per lane)
	if (tid < sizeof(shared.task) / sizeof(int))
		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
	if (tid == 0)
	{
		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (max_order + 1) * 32;
		shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) * partCount;
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	// add up autocorrelation parts

	//	for (int order = get_local_id(0); order <= max_order; order += 32)
	//	{
	//float sum = 0.0f;
	//for (int pos = 0; pos < partCount; pos++)
	//	sum += autoc[shared.autocOffs + pos * (max_order + 1) + order];
	//shared.autoc[order] = sum;
	//	}

	// For each lag, every lane strides over the partial sums, then a tree
	// reduction collapses the 32 per-lane accumulators into shared.autoc[order].
	for (int order = 0; order <= max_order; order ++)
	{
		float part = 0.0f;
		for (int pos = get_local_id(0); pos < partCount; pos += get_local_size(0))
			part += autoc[shared.autocOffs + pos * (max_order + 1) + order];
		shared.parts[tid] = part;
		barrier(CLK_LOCAL_MEM_FENCE);
		for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
		{
			if (get_local_id(0) < l)
				shared.parts[tid] += shared.parts[tid + l];
			barrier(CLK_LOCAL_MEM_FENCE);
		}
		if (get_local_id(0) == 0)
			shared.autoc[order] = shared.parts[tid] + shared.parts[tid + 1];
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	// Compute LPC using Schur and Levinson-Durbin recursion
	float gen0 = shared.gen1[get_local_id(0)] = shared.autoc[get_local_id(0)+1];
	shared.ldr[get_local_id(0)] = 0.0f;
	float error = shared.autoc[0];
	barrier(CLK_LOCAL_MEM_FENCE);
	for (int order = 0; order < max_order; order++)
	{
		// Schur recursion
		float reff = -shared.gen1[0] / error;
		error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
		// NOTE(review): gen1 is computed into a register first and written back
		// after the barrier so the parallel read-then-update does not race.
		float gen1;
		if (get_local_id(0) < max_order - 1 - order)
		{
			gen1 = shared.gen1[get_local_id(0) + 1] + reff * gen0;
			gen0 += shared.gen1[get_local_id(0) + 1] * reff;
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		if (get_local_id(0) < max_order - 1 - order)
			shared.gen1[get_local_id(0)] = gen1;

		// Store prediction error
		if (get_local_id(0) == 0)
			shared.error[order] = error;

		// Levinson-Durbin recursion
		float ldr =
			select(0.0f, reff * shared.ldr[order - 1 - get_local_id(0)], get_local_id(0) < order) +
			select(0.0f, reff, get_local_id(0) == order);
		barrier(CLK_LOCAL_MEM_FENCE);
		shared.ldr[get_local_id(0)] += ldr;
		barrier(CLK_LOCAL_MEM_FENCE);

		// Output coeffs (negated, 32-float stride per order)
		if (get_local_id(0) <= order)
			lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)];
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	// Output prediction error estimates
	if (get_local_id(0) < max_order)
		lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
}

//__kernel void cudaComputeLPCLattice(
//	FLACCLSubframeTask *tasks,
//	const int taskCount, // tasks per block
//	const int *samples,
//	const int windowCount,
//	const int max_order, // should be <= 12
//	float*lpcs
//)
//{
//	__local struct {
//		volatile FLACCLSubframeData task;
//		volatile float F[512];
//		volatile float arp[32];
//		volatile float tmp[256];
//		volatile float error[32];
//		volatile int lpcOffs;
//
} shared; +// +// // fetch task data +// if (get_local_id(0) < sizeof(shared.task) / sizeof(int)) +// ((int*)&shared.task)[get_local_id(0)] = ((int*)(tasks + taskCount * get_group_id(1)))[get_local_id(0)]; +// if (get_local_id(0) == 0) +// shared.lpcOffs = __mul24(__mul24(get_group_id(1) + 1, windowCount) - 1, max_order + 1) * 32; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// // F = samples; B = samples +// float s1 = get_local_id(0) < shared.task.blocksize ? (samples[shared.task.samplesOffs + get_local_id(0)]) / 32768.0f : 0.0f; +// float s2 = get_local_id(0) + 256 < shared.task.blocksize ? (samples[shared.task.samplesOffs + get_local_id(0) + 256]) / 32768.0f : 0.0f; +// shared.F[get_local_id(0)] = s1; +// shared.F[get_local_id(0) + 256] = s2; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// shared.tmp[get_local_id(0)] = FSQR(s1) + FSQR(s2); +// barrier(CLK_LOCAL_MEM_FENCE); +// SUM256(shared.tmp, get_local_id(0), +=); +// barrier(CLK_LOCAL_MEM_FENCE); +// float DEN = shared.tmp[0]; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// for (int order = 0; order < max_order; order++) +// { +// // reff = F(order+1:frameSize) * B(1:frameSize-order)' / DEN +// int idxF = get_local_id(0) + order + 1; +// int idxF2 = idxF + 256; +// +// shared.tmp[get_local_id(0)] = idxF < shared.task.blocksize ? shared.F[idxF] * s1 : 0.0f; +// shared.tmp[get_local_id(0)] += idxF2 < shared.task.blocksize ? 
shared.F[idxF2] * s2 : 0.0f; +// barrier(CLK_LOCAL_MEM_FENCE); +// SUM256(shared.tmp, get_local_id(0), +=); +// barrier(CLK_LOCAL_MEM_FENCE); +// float reff = shared.tmp[0] / DEN; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// // arp(order) = rc(order) = reff +// if (get_local_id(0) == 0) +// shared.arp[order] = reff; +// //shared.rc[order - 1] = shared.lpc[order - 1][order - 1] = reff; +// +// // Levinson-Durbin recursion +// // arp(1:order-1) = arp(1:order-1) - reff * arp(order-1:-1:1) +// if (get_local_id(0) < order) +// shared.arp[get_local_id(0)] = shared.arp[get_local_id(0)] - reff * shared.arp[order - 1 - get_local_id(0)]; +// +// // Output coeffs +// if (get_local_id(0) <= order) +// lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = shared.arp[order - get_local_id(0)]; +// +// // F1 = F(order+1:frameSize) - reff * B(1:frameSize-order) +// // B(1:frameSize-order) = B(1:frameSize-order) - reff * F(order+1:frameSize) +// // F(order+1:frameSize) = F1 +// if (idxF < shared.task.blocksize) +// { +// float f1 = shared.F[idxF]; +// shared.F[idxF] -= reff * s1; +// s1 -= reff * f1; +// } +// if (idxF2 < shared.task.blocksize) +// { +// float f2 = shared.F[idxF2]; +// shared.F[idxF2] -= reff * s2; +// s2 -= reff * f2; +// } +// +// // DEN = F(order+1:frameSize) * F(order+1:frameSize)' + B(1:frameSize-order) * B(1:frameSize-order)' (BURG) +// shared.tmp[get_local_id(0)] = (idxF + 1 < shared.task.blocksize ? FSQR(shared.F[idxF]) + FSQR(s1) : 0); +// shared.tmp[get_local_id(0)] += (idxF2 + 1 < shared.task.blocksize ? 
//	FSQR(shared.F[idxF2]) + FSQR(s2) : 0); // (continuation of the commented-out line split by the mangled paste above)
//	barrier(CLK_LOCAL_MEM_FENCE);
//	SUM256(shared.tmp, get_local_id(0), +=);
//	barrier(CLK_LOCAL_MEM_FENCE);
//	DEN = shared.tmp[0] / 2;
//	// shared.PE[order-1] = shared.tmp[0] / 2 / (frameSize - order + 1);
//	if (get_local_id(0) == 0)
//		shared.error[order] = DEN / (shared.task.blocksize - order);
//	barrier(CLK_LOCAL_MEM_FENCE);
//	}
//
//	// Output prediction error estimates
//	if (get_local_id(0) < max_order)
//		lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
//}

/*
 * cudaQuantizeLPC: 32x4 work-items per group.
 * Ranks the candidate LPC orders by an Akaike-style criterion (bitonic sort
 * over 64 error/index slots, performed by row 0), then quantizes the
 * coefficients of the best orders at several precisions, writing
 * shift/cbits/residualOrder/coefs into the per-candidate output tasks.
 */
__kernel __attribute__((reqd_work_group_size(32, 4, 1)))
void cudaQuantizeLPC(
    __global FLACCLSubframeTask *tasks,
    int taskCount, // tasks per block
    int taskCountLPC, // tasks per set of coeffs (<= 32)
    __global float*lpcs,
    int max_order, // should be <= 32
    int minprecision,
    int precisions
)
{
	__local struct {
		FLACCLSubframeData task;
		volatile int tmpi[128];
		volatile int index[64];
		volatile float error[64];
		volatile int lpcOffs;
	} shared;

	const int tid = get_local_id(0) + get_local_id(1) * 32;

	// fetch task data
	if (tid < sizeof(shared.task) / sizeof(int))
		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
	if (tid == 0)
		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) * 32;
	barrier(CLK_LOCAL_MEM_FENCE);

	if (get_local_id(1) == 0)
	{
		// Initialize all 64 slots with large error values so that slots
		// beyond max_order sort to the end of the ascending order.
		shared.index[get_local_id(0)] = min(max_order - 1, get_local_id(0));
		shared.error[get_local_id(0)] = shared.task.blocksize * 64 + get_local_id(0);
		shared.index[32 + get_local_id(0)] = min(max_order - 1, get_local_id(0));
		shared.error[32 + get_local_id(0)] = shared.task.blocksize * 64 + get_local_id(0);

		// Select best orders based on Akaike's Criteria

		// Load prediction error estimates
		if (get_local_id(0) < max_order)
			shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)]) + get_local_id(0) * 5.12f * log(shared.task.blocksize);
			//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
	}

	barrier(CLK_LOCAL_MEM_FENCE);

	// Sort using bitonic sort
	// Only row 0 (get_local_id(1) == 0) compares/swaps; the other three rows
	// merely participate in the barriers.
	for(int size = 2; size < 64; size <<= 1){
		//Bitonic merge
		int ddd = (get_local_id(0) & (size / 2)) == 0;
		for(int stride = size / 2; stride > 0; stride >>= 1){
			int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
			float e0, e1;
			int i0, i1;
			if (get_local_id(1) == 0)
			{
				e0 = shared.error[pos];
				e1 = shared.error[pos + stride];
				i0 = shared.index[pos];
				i1 = shared.index[pos + stride];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			if ((e0 >= e1) == ddd && get_local_id(1) == 0)
			{
				shared.error[pos] = e1;
				shared.error[pos + stride] = e0;
				shared.index[pos] = i1;
				shared.index[pos + stride] = i0;
			}
			barrier(CLK_LOCAL_MEM_FENCE);
		}
	}

	//ddd == dir for the last bitonic merge step
	{
		for(int stride = 32; stride > 0; stride >>= 1){
			//barrier(CLK_LOCAL_MEM_FENCE);
			int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
			float e0, e1;
			int i0, i1;
			if (get_local_id(1) == 0)
			{
				e0 = shared.error[pos];
				e1 = shared.error[pos + stride];
				i0 = shared.index[pos];
				i1 = shared.index[pos + stride];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			if (e0 >= e1 && get_local_id(1) == 0)
			{
				shared.error[pos] = e1;
				shared.error[pos + stride] = e0;
				shared.index[pos] = i1;
				shared.index[pos + stride] = i0;
			}
			barrier(CLK_LOCAL_MEM_FENCE);
		}
	}

	// Quantization
	// Each of the 4 rows (get_local_id(1)) handles a different candidate task;
	// lanes 0..31 each hold one coefficient of that candidate's order.
	for (int ii = 0; ii < taskCountLPC; ii += get_local_size(1))
	{
		int i = ii + get_local_id(1);
		int order = shared.index[i >> precisions];
		float lpc = get_local_id(0) <= order ? lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] : 0.0f;
		// get 15 bits of each coeff
		int coef = convert_int_rte(lpc * (1 << 15));
		// remove sign bits
		shared.tmpi[tid] = coef ^ (coef >> 31);
		barrier(CLK_LOCAL_MEM_FENCE);
		// OR reduction
		for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
		{
			if (get_local_id(0) < l)
				shared.tmpi[tid] |= shared.tmpi[tid + l];
			barrier(CLK_LOCAL_MEM_FENCE);
		}
		//SUM32(shared.tmpi,tid,|=);
		// choose precision
		//int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
		int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits));
		// calculate shift based on precision and number of leading zeroes in coeffs
		int shift = max(0,min(15, clz(shared.tmpi[get_local_id(1) * 32] | shared.tmpi[get_local_id(1) * 32 + 1]) - 18 + cbits));

		//cbits = 13;
		//shift = 15;

		//if (shared.task.abits + 32 - clz(order) < shift
		//int shift = max(0,min(15, (shared.task.abits >> 2) - 14 + clz(shared.tmpi[get_local_id(0) & ~31]) + ((32 - clz(order))>>1)));
		// quantize coeffs with given shift
		// NOTE(review): upper clamp bound is 1 << (cbits - 1); the largest
		// representable signed value is (1 << (cbits - 1)) - 1 — confirm the
		// subsequent cbits recomputation below is what keeps this in range.
		coef = convert_int_rte(clamp(lpc * (1 << shift), -1 << (cbits - 1), 1 << (cbits - 1)));
		// error correction
		//shared.tmp[get_local_id(0)] = (get_local_id(0) != 0) * (shared.arp[get_local_id(0) - 1]*(1 << shared.task.shift) - shared.task.coefs[get_local_id(0) - 1]);
		//shared.task.coefs[get_local_id(0)] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, convert_int_rte((shared.arp[get_local_id(0)]) * (1 << shared.task.shift) + shared.tmp[get_local_id(0)])));
		// remove sign bits
		shared.tmpi[tid] = coef ^ (coef >> 31);
		barrier(CLK_LOCAL_MEM_FENCE);
		// OR reduction
		for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
		{
			if (get_local_id(0) < l)
				shared.tmpi[tid] |= shared.tmpi[tid + l];
			barrier(CLK_LOCAL_MEM_FENCE);
		}
		//SUM32(shared.tmpi,tid,|=);
		// calculate actual number of bits (+1 for sign)
		cbits = 1 + 32 - clz(shared.tmpi[get_local_id(1) * 32] | shared.tmpi[get_local_id(1) * 32 + 1]);

		// output shift, cbits and output coeffs
		if (i < taskCountLPC)
		{
			int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
			if (get_local_id(0) == 0)
				tasks[taskNo].data.shift = shift;
			if (get_local_id(0) == 0)
				tasks[taskNo].data.cbits = cbits;
			if (get_local_id(0) == 0)
				tasks[taskNo].data.residualOrder = order + 1;
			if (get_local_id(0) <= order)
				tasks[taskNo].coefs[get_local_id(0)] = coef;
		}
	}
}

/*
 * cudaEstimateResidual: estimates the encoded size of one candidate subframe.
 * Runs the LPC predictor in floating point over the samples (coefficients
 * pre-divided by 1 << shift and packed into float4 vectors for dot()),
 * accumulates |residual| per work-item, converts each accumulator to an
 * approximate Rice bit count, and reduces the counts across the group into
 * output[get_group_id(1)].
 */
__kernel __attribute__(( vec_type_hint (int4)))
void cudaEstimateResidual(
    __global int*output,
    __global int*samples,
    __global FLACCLSubframeTask *tasks
    )
{
	__local float data[128 * 2];
	__local int residual[128];
	__local FLACCLSubframeTask task;
	__local float4 coefsf4[8];

	const int tid = get_local_id(0);
	if (tid < sizeof(task)/sizeof(int))
		((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
	// NOTE(review): this fence flag is GLOBAL although `task` lives in local
	// memory — confirm CLK_LOCAL_MEM_FENCE was not intended here.
	barrier(CLK_GLOBAL_MEM_FENCE);

	int ro = task.data.residualOrder;
	int bs = task.data.blocksize;
	float res = 0;

	// Coefficients beyond the order are zeroed so the unconditional dot()
	// chain below is safe.
	if (tid < 32)
		((__local float *)&coefsf4[0])[tid] = select(0.0f, ((float)task.coefs[tid]) / (1 << task.data.shift), tid < ro);
	data[tid] = tid < bs ? (float)(samples[task.data.samplesOffs + tid] >> task.data.wbits) : 0.0f;
	for (int pos = 0; pos < bs; pos += get_local_size(0))
	{
		// fetch samples (double-buffered: next tile is loaded while the
		// current tile is consumed)
		float nextData = pos + tid + get_local_size(0) < bs ? (float)(samples[task.data.samplesOffs + pos + tid + get_local_size(0)] >> task.data.wbits) : 0.0f;
		data[tid + get_local_size(0)] = nextData;
		barrier(CLK_LOCAL_MEM_FENCE);

		// compute residual
		__local float4 * dptr = (__local float4 *)&data[tid];
		float sumf = data[tid + ro] -
			( dot(dptr[0], coefsf4[0])
			+ dot(dptr[1], coefsf4[1])
#if MAX_ORDER > 8
			+ dot(dptr[2], coefsf4[2])
#if MAX_ORDER > 12
			+ dot(dptr[3], coefsf4[3])
#if MAX_ORDER > 16
			+ dot(dptr[4], coefsf4[4])
			+ dot(dptr[5], coefsf4[5])
			+ dot(dptr[6], coefsf4[6])
			+ dot(dptr[7], coefsf4[7])
#endif
#endif
#endif
			);
		//residual[tid] = sum;

		// Accumulate |residual|, capped at 0x7fffff — presumably to bound the
		// float accumulator; confirm against the host-side size estimate.
		res += select(0.0f, min(fabs(sumf), (float)0x7fffff), pos + tid + ro < bs);
		barrier(CLK_LOCAL_MEM_FENCE);

		//int k = min(33 - clz(sum), 14);
		//res += select(0, 1 + k, pos + tid + ro < bs);

		//sum = residual[tid] + residual[tid + 1] + residual[tid + 2] + residual[tid + 3]
		//	+ residual[tid + 4] + residual[tid + 5] + residual[tid + 6] + residual[tid + 7];
		//int k = clamp(29 - clz(sum), 0, 14);
		//res += select(0, 8 * (k + 1) + (sum >> k), pos + tid + ro < bs && !(tid & 7));

		data[tid] = nextData;
	}

	// Convert this work-item's |residual| sum into an approximate Rice-coded
	// bit count using an estimated parameter k, then sum across the group.
	int residualLen = (bs - ro) / get_local_size(0) + select(0, 1, tid < (bs - ro) % get_local_size(0));
	int k = clamp(convert_int_rtn(log2((res + 0.000001f) / (residualLen + 0.000001f))), 0, 14);
	residual[tid] = residualLen * (k + 1) + (convert_int_rtz(res) >> k);

	barrier(CLK_LOCAL_MEM_FENCE);
	for (int l = get_local_size(0) / 2; l > 0; l >>= 1)
	{
		if (tid < l)
			residual[tid] += residual[tid + l];
		barrier(CLK_LOCAL_MEM_FENCE);
	}
	if (tid == 0)
		output[get_group_id(1)] = residual[0];
}

/*
 * cudaChooseBestMethod: combines the residual size estimates with the
 * per-subframe-type header overhead to compute a total bit size for every
 * candidate encoding of a frame, stores the sizes back into the tasks, and
 * records the index of the smallest one in tasks[...].data.best_index.
 */
__kernel void cudaChooseBestMethod(
    __global FLACCLSubframeTask *tasks,
    __global int *residual,
    int taskCount
    )
{
	__local struct {
		volatile int index[128];
		volatile int length[256];
		volatile FLACCLSubframeTask task[8];
	} shared;
	const int tid = get_local_id(0) + get_local_id(1) * 32;

shared.length[tid] = 0x7fffffff; // sentinel so unused slots lose the argmin below
	shared.index[tid] = tid;
	for (int task = 0; task < taskCount; task += get_local_size(1))
		if (task + get_local_id(1) < taskCount)
		{
			// fetch task data
			((__local int*)&shared.task[get_local_id(1)])[get_local_id(0)] =
				((__global int*)(tasks + task + get_local_id(1) + taskCount * get_group_id(1)))[get_local_id(0)];

			barrier(CLK_LOCAL_MEM_FENCE);

			if (get_local_id(0) == 0)
			{
				// fetch part sum
				int partLen = residual[task + get_local_id(1) + taskCount * get_group_id(1)];
				//// calculate part size
				//int residualLen = shared.task[get_local_id(1)].data.blocksize - shared.task[get_local_id(1)].data.residualOrder;
				//residualLen = residualLen * (shared.task[get_local_id(1)].data.type != Constant || psum != 0);
				//// calculate rice parameter
				//int k = max(0, min(14, convert_int_rtz(log2((psum + 0.000001f) / (residualLen + 0.000001f) + 0.5f))));
				//// calculate part bit length
				//int partLen = residualLen * (k + 1) + (psum >> k);

				// Total size per subframe type (Fixed / LPC / Constant),
				// never larger than the verbatim encoding obits * blocksize.
				int obits = shared.task[get_local_id(1)].data.obits - shared.task[get_local_id(1)].data.wbits;
				shared.length[task + get_local_id(1)] =
					min(obits * shared.task[get_local_id(1)].data.blocksize,
						shared.task[get_local_id(1)].data.type == Fixed ? shared.task[get_local_id(1)].data.residualOrder * obits + 6 + (4 * 1/2) + partLen :
						shared.task[get_local_id(1)].data.type == LPC ? shared.task[get_local_id(1)].data.residualOrder * obits + 4 + 5 + shared.task[get_local_id(1)].data.residualOrder * shared.task[get_local_id(1)].data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
						shared.task[get_local_id(1)].data.type == Constant ? obits * (1 + shared.task[get_local_id(1)].data.blocksize * (partLen != 0)) :
						obits * shared.task[get_local_id(1)].data.blocksize);
			}
		}
	//shared.index[get_local_id(0)] = get_local_id(0);
	//shared.length[get_local_id(0)] = (get_local_id(0) < taskCount) ? tasks[get_local_id(0) + taskCount * get_group_id(1)].size : 0x7fffffff;

	barrier(CLK_LOCAL_MEM_FENCE);
	if (tid < taskCount)
		tasks[tid + taskCount * get_group_id(1)].data.size = shared.length[tid];

	// Parallel argmin over the candidate sizes: index follows the smaller length.
	int l1 = shared.length[tid];
	for (int sh = 8; sh > 0; sh --)
	{
		if (tid + (1 << sh) < get_local_size(0) * get_local_size(1))
		{
			int l2 = shared.length[tid + (1 << sh)];
			shared.index[tid] = shared.index[tid + ((l2 < l1) << sh)];
			shared.length[tid] = l1 = min(l1, l2);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
	}
	if (tid == 0)
		tasks[taskCount * get_group_id(1)].data.best_index = taskCount * get_group_id(1) + shared.index[shared.length[1] < shared.length[0]];
}

/*
 * cudaCopyBestMethod: copies the winning candidate task of each frame
 * (as chosen by cudaChooseBestMethod) into the compacted output array.
 */
__kernel void cudaCopyBestMethod(
    __global FLACCLSubframeTask *tasks_out,
    __global FLACCLSubframeTask *tasks,
    int count
    )
{
	__local int best_index;
	if (get_local_id(0) == 0)
		best_index = tasks[count * get_group_id(1)].data.best_index;
	barrier(CLK_LOCAL_MEM_FENCE);
	if (get_local_id(0) < sizeof(FLACCLSubframeTask)/sizeof(int))
		((__global int*)(tasks_out + get_group_id(1)))[get_local_id(0)] = ((__global int*)(tasks + best_index))[get_local_id(0)];
}

/*
 * cudaCopyBestMethodStereo: picks the cheapest stereo decomposition for a
 * frame by comparing the summed sizes of the four channel candidates, then
 * copies the two chosen tasks into the output (residualOffs is restored from
 * candidates 0 and 1 so the output channels keep their own residual buffers).
 * NOTE(review): channel indices appear to be 0=left, 1=right, 2=mid, 3=side —
 * confirm against the host-side task layout.
 */
__kernel void cudaCopyBestMethodStereo(
    __global FLACCLSubframeTask *tasks_out,
    __global FLACCLSubframeTask *tasks,
    int count
    )
{
	__local struct {
		int best_index[4];
		int best_size[4];
		int lr_index[2];
	} shared;
	if (get_local_id(0) < 4)
		shared.best_index[get_local_id(0)] = tasks[count * (get_group_id(1) * 4 + get_local_id(0))].data.best_index;
	barrier(CLK_LOCAL_MEM_FENCE);
	if (get_local_id(0) < 4)
		shared.best_size[get_local_id(0)] = tasks[shared.best_index[get_local_id(0)]].data.size;
	barrier(CLK_LOCAL_MEM_FENCE);
	if (get_local_id(0) == 0)
	{
		int bitsBest = shared.best_size[2] + shared.best_size[3]; // MidSide
		shared.lr_index[0] = shared.best_index[2];
		shared.lr_index[1] = shared.best_index[3];
		if (bitsBest > shared.best_size[3] + shared.best_size[1]) // RightSide
		{
			bitsBest = shared.best_size[3] + shared.best_size[1];
			shared.lr_index[0] = shared.best_index[3];
			shared.lr_index[1] = shared.best_index[1];
		}
		if (bitsBest > shared.best_size[0] + shared.best_size[3]) // LeftSide
		{
			bitsBest = shared.best_size[0] + shared.best_size[3];
			shared.lr_index[0] = shared.best_index[0];
			shared.lr_index[1] = shared.best_index[3];
		}
		if (bitsBest > shared.best_size[0] + shared.best_size[1]) // LeftRight
		{
			bitsBest = shared.best_size[0] + shared.best_size[1];
			shared.lr_index[0] = shared.best_index[0];
			shared.lr_index[1] = shared.best_index[1];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	if (get_local_id(0) < sizeof(FLACCLSubframeTask)/sizeof(int))
		((__global int*)(tasks_out + 2 * get_group_id(1)))[get_local_id(0)] = ((__global int*)(tasks + shared.lr_index[0]))[get_local_id(0)];
	if (get_local_id(0) == 0)
		tasks_out[2 * get_group_id(1)].data.residualOffs = tasks[shared.best_index[0]].data.residualOffs;
	if (get_local_id(0) < sizeof(FLACCLSubframeTask)/sizeof(int))
		((__global int*)(tasks_out + 2 * get_group_id(1) + 1))[get_local_id(0)] = ((__global int*)(tasks + shared.lr_index[1]))[get_local_id(0)];
	if (get_local_id(0) == 0)
		tasks_out[2 * get_group_id(1) + 1].data.residualOffs = tasks[shared.best_index[1]].data.residualOffs;
}

//__kernel void cudaEncodeResidual(
//	int*output,
//	int*samples,
//	FLACCLSubframeTask *tasks
//	)
//{
//	__local struct {
//		int data[256 + 32];
//		FLACCLSubframeTask task;
//	} shared;
//	const int tid = get_local_id(0);
//	if (get_local_id(0) < sizeof(shared.task) / sizeof(int))
//		((int*)&shared.task)[get_local_id(0)] = ((int*)(&tasks[get_group_id(1)]))[get_local_id(0)];
//	barrier(CLK_LOCAL_MEM_FENCE);
//	const int partSize = get_local_size(0);
//	const int pos = get_group_id(0) * partSize;
//	const int dataLen = min(shared.task.data.blocksize - pos, partSize + shared.task.data.residualOrder);
//
//	// fetch samples
//	shared.data[tid] = tid < dataLen ?
samples[shared.task.data.samplesOffs + pos + tid] >> shared.task.data.wbits : 0; +// if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task.data.samplesOffs + pos + tid + partSize] >> shared.task.data.wbits : 0; +// const int residualLen = max(0,min(shared.task.data.blocksize - pos - shared.task.data.residualOrder, partSize)); +// +// barrier(CLK_LOCAL_MEM_FENCE); +// // compute residual +// int sum = 0; +// for (int c = 0; c < shared.task.data.residualOrder; c++) +// sum += __mul24(shared.data[tid + c], shared.task.coefs[c]); +// barrier(CLK_LOCAL_MEM_FENCE); +// shared.data[tid + shared.task.data.residualOrder] -= (sum >> shared.task.data.shift); +// barrier(CLK_LOCAL_MEM_FENCE); +// if (tid >= shared.task.data.residualOrder && tid < residualLen + shared.task.data.residualOrder) +// output[shared.task.data.residualOffs + pos + tid] = shared.data[tid]; +// if (tid + 256 < residualLen + shared.task.data.residualOrder) +// output[shared.task.data.residualOffs + pos + tid + 256] = shared.data[tid + 256]; +//} +// +//__kernel void cudaCalcPartition( +// int* partition_lengths, +// int* residual, +// int* samples, +// FLACCLSubframeTask *tasks, +// int max_porder, // <= 8 +// int psize, // == (shared.task.data.blocksize >> max_porder), < 256 +// int parts_per_block // == 256 / psize, > 0, <= 16 +// ) +//{ +// __local struct { +// int data[256+32]; +// FLACCLSubframeTask task; +// } shared; +// const int tid = get_local_id(0) + (get_local_id(1) << 4); +// if (tid < sizeof(shared.task) / sizeof(int)) +// ((int*)&shared.task)[tid] = ((int*)(&tasks[get_group_id(1)]))[tid]; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// const int parts = min(parts_per_block, (1 << max_porder) - get_group_id(0) * parts_per_block); +// const int offs = get_group_id(0) * psize * parts_per_block + tid; +// +// // fetch samples +// if (tid < 32) shared.data[tid] = min(offs, tid + shared.task.data.residualOrder) >= 32 ? 
samples[shared.task.data.samplesOffs + offs - 32] >> shared.task.data.wbits : 0; +// shared.data[32 + tid] = tid < parts * psize ? samples[shared.task.data.samplesOffs + offs] >> shared.task.data.wbits : 0; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// // compute residual +// int s = 0; +// for (int c = -shared.task.data.residualOrder; c < 0; c++) +// s += __mul24(shared.data[32 + tid + c], shared.task.coefs[shared.task.data.residualOrder + c]); +// s = shared.data[32 + tid] - (s >> shared.task.data.shift); +// +// if (offs >= shared.task.data.residualOrder && tid < parts * psize) +// residual[shared.task.data.residualOffs + offs] = s; +// else +// s = 0; +// +// // convert to unsigned +// s = min(0xfffff, (s << 1) ^ (s >> 31)); +// +// //barrier(CLK_LOCAL_MEM_FENCE); +// //shared.data[tid] = s; +// //barrier(CLK_LOCAL_MEM_FENCE); +// +// //shared.data[tid] = (shared.data[tid] & (0x0000ffff << (tid & 16))) | (((shared.data[tid ^ 16] & (0x0000ffff << (tid & 16))) << (~tid & 16)) >> (tid & 16)); +// //shared.data[tid] = (shared.data[tid] & (0x00ff00ff << (tid & 8))) | (((shared.data[tid ^ 8] & (0x00ff00ff << (tid & 8))) << (~tid & 8)) >> (tid & 8)); +// //shared.data[tid] = (shared.data[tid] & (0x0f0f0f0f << (tid & 4))) | (((shared.data[tid ^ 4] & (0x0f0f0f0f << (tid & 4))) << (~tid & 4)) >> (tid & 4)); +// //shared.data[tid] = (shared.data[tid] & (0x33333333 << (tid & 2))) | (((shared.data[tid ^ 2] & (0x33333333 << (tid & 2))) << (~tid & 2)) >> (tid & 2)); +// //shared.data[tid] = (shared.data[tid] & (0x55555555 << (tid & 1))) | (((shared.data[tid ^ 1] & (0x55555555 << (tid & 1))) << (~tid & 1)) >> (tid & 1)); +// //shared.data[tid] = __popc(shared.data[tid]); +// +// barrier(CLK_LOCAL_MEM_FENCE); +// shared.data[tid + (tid / psize)] = s; +// //shared.data[tid] = s; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// s = (psize - shared.task.data.residualOrder * (get_local_id(0) + get_group_id(0) == 0)) * (get_local_id(1) + 1); +// int dpos = __mul24(get_local_id(0), psize + 1); 
+// //int dpos = __mul24(get_local_id(0), psize); +// // calc number of unary bits for part get_local_id(0) with rice paramater get_local_id(1) +//#pragma unroll 0 +// for (int i = 0; i < psize; i++) +// s += shared.data[dpos + i] >> get_local_id(1); +// +// // output length +// const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_local_id(1) << (max_porder + 1)); +// if (get_local_id(1) <= 14 && get_local_id(0) < parts) +// partition_lengths[pos + get_group_id(0) * parts_per_block + get_local_id(0)] = s; +//} +// +//__kernel void cudaCalcPartition16( +// int* partition_lengths, +// int* residual, +// int* samples, +// FLACCLSubframeTask *tasks, +// int max_porder, // <= 8 +// int psize, // == 16 +// int parts_per_block // == 16 +// ) +//{ +// __local struct { +// int data[256+32]; +// FLACCLSubframeTask task; +// } shared; +// const int tid = get_local_id(0) + (get_local_id(1) << 4); +// if (tid < sizeof(shared.task) / sizeof(int)) +// ((int*)&shared.task)[tid] = ((int*)(&tasks[get_group_id(1)]))[tid]; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// const int offs = (get_group_id(0) << 8) + tid; +// +// // fetch samples +// if (tid < 32) shared.data[tid] = min(offs, tid + shared.task.data.residualOrder) >= 32 ? 
samples[shared.task.data.samplesOffs + offs - 32] >> shared.task.data.wbits : 0; +// shared.data[32 + tid] = samples[shared.task.data.samplesOffs + offs] >> shared.task.data.wbits; +// // if (tid < 32 && tid >= shared.task.data.residualOrder) +// //shared.task.coefs[tid] = 0; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// // compute residual +// int s = 0; +// for (int c = -shared.task.data.residualOrder; c < 0; c++) +// s += __mul24(shared.data[32 + tid + c], shared.task.coefs[shared.task.data.residualOrder + c]); +// // int spos = 32 + tid - shared.task.data.residualOrder; +// // int s= +// //__mul24(shared.data[spos + 0], shared.task.coefs[0]) + __mul24(shared.data[spos + 1], shared.task.coefs[1]) + +// //__mul24(shared.data[spos + 2], shared.task.coefs[2]) + __mul24(shared.data[spos + 3], shared.task.coefs[3]) + +// //__mul24(shared.data[spos + 4], shared.task.coefs[4]) + __mul24(shared.data[spos + 5], shared.task.coefs[5]) + +// //__mul24(shared.data[spos + 6], shared.task.coefs[6]) + __mul24(shared.data[spos + 7], shared.task.coefs[7]) + +// //__mul24(shared.data[spos + 8], shared.task.coefs[8]) + __mul24(shared.data[spos + 9], shared.task.coefs[9]) + +// //__mul24(shared.data[spos + 10], shared.task.coefs[10]) + __mul24(shared.data[spos + 11], shared.task.coefs[11]) + +// //__mul24(shared.data[spos + 12], shared.task.coefs[12]) + __mul24(shared.data[spos + 13], shared.task.coefs[13]) + +// //__mul24(shared.data[spos + 14], shared.task.coefs[14]) + __mul24(shared.data[spos + 15], shared.task.coefs[15]); +// s = shared.data[32 + tid] - (s >> shared.task.data.shift); +// +// if (get_group_id(0) != 0 || tid >= shared.task.data.residualOrder) +// residual[shared.task.data.residualOffs + (get_group_id(0) << 8) + tid] = s; +// else +// s = 0; +// +// // convert to unsigned +// s = min(0xfffff, (s << 1) ^ (s >> 31)); +// barrier(CLK_LOCAL_MEM_FENCE); +// shared.data[tid + get_local_id(1)] = s; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// // calc number of unary bits for 
part get_local_id(0) with rice paramater get_local_id(1) +// int dpos = __mul24(get_local_id(0), 17); +// int sum = +// (shared.data[dpos + 0] >> get_local_id(1)) + (shared.data[dpos + 1] >> get_local_id(1)) + +// (shared.data[dpos + 2] >> get_local_id(1)) + (shared.data[dpos + 3] >> get_local_id(1)) + +// (shared.data[dpos + 4] >> get_local_id(1)) + (shared.data[dpos + 5] >> get_local_id(1)) + +// (shared.data[dpos + 6] >> get_local_id(1)) + (shared.data[dpos + 7] >> get_local_id(1)) + +// (shared.data[dpos + 8] >> get_local_id(1)) + (shared.data[dpos + 9] >> get_local_id(1)) + +// (shared.data[dpos + 10] >> get_local_id(1)) + (shared.data[dpos + 11] >> get_local_id(1)) + +// (shared.data[dpos + 12] >> get_local_id(1)) + (shared.data[dpos + 13] >> get_local_id(1)) + +// (shared.data[dpos + 14] >> get_local_id(1)) + (shared.data[dpos + 15] >> get_local_id(1)); +// +// // output length +// const int pos = ((15 * get_group_id(1) + get_local_id(1)) << (max_porder + 1)) + (get_group_id(0) << 4) + get_local_id(0); +// if (get_local_id(1) <= 14) +// partition_lengths[pos] = sum + (16 - shared.task.data.residualOrder * (get_local_id(0) + get_group_id(0) == 0)) * (get_local_id(1) + 1); +//} +// +//__kernel void cudaCalcLargePartition( +// int* partition_lengths, +// int* residual, +// int* samples, +// FLACCLSubframeTask *tasks, +// int max_porder, // <= 8 +// int psize, // == >= 128 +// int parts_per_block // == 1 +// ) +//{ +// __local struct { +// int data[256]; +// volatile int length[256]; +// FLACCLSubframeTask task; +// } shared; +// const int tid = get_local_id(0) + (get_local_id(1) << 4); +// if (tid < sizeof(shared.task) / sizeof(int)) +// ((int*)&shared.task)[tid] = ((int*)(&tasks[get_group_id(1)]))[tid]; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// int sum = 0; +// for (int pos = 0; pos < psize; pos += 256) +// { +// // fetch residual +// int offs = get_group_id(0) * psize + pos + tid; +// int s = (offs >= shared.task.data.residualOrder && pos + tid < psize) ? 
residual[shared.task.data.residualOffs + offs] : 0; +// // convert to unsigned +// shared.data[tid] = min(0xfffff, (s << 1) ^ (s >> 31)); +// barrier(CLK_LOCAL_MEM_FENCE); +// +// // calc number of unary bits for each residual sample with each rice paramater +//#pragma unroll 0 +// for (int i = get_local_id(0); i < min(psize,256); i += 16) +// // for sample (i + get_local_id(0)) with this rice paramater (get_local_id(1)) +// sum += shared.data[i] >> get_local_id(1); +// barrier(CLK_LOCAL_MEM_FENCE); +// } +// shared.length[tid] = min(0xfffff,sum); +// SUM16(shared.length,tid,+=); +// +// // output length +// const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_local_id(1) << (max_porder + 1)); +// if (get_local_id(1) <= 14 && get_local_id(0) == 0) +// partition_lengths[pos + get_group_id(0)] = min(0xfffff,shared.length[tid]) + (psize - shared.task.data.residualOrder * (get_group_id(0) == 0)) * (get_local_id(1) + 1); +//} +// +//// Sums partition lengths for a certain k == get_group_id(0) +//// Requires 128 threads +//__kernel void cudaSumPartition( +// int* partition_lengths, +// int max_porder +// ) +//{ +// __local struct { +// volatile int data[512+32]; // max_porder <= 8, data length <= 1 << 9. +// } shared; +// +// const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1)); +// +// // fetch partition lengths +// shared.data[get_local_id(0)] = get_local_id(0) < (1 << max_porder) ? partition_lengths[pos + get_local_id(0)] : 0; +// shared.data[get_local_size(0) + get_local_id(0)] = get_local_size(0) + get_local_id(0) < (1 << max_porder) ? 
partition_lengths[pos + get_local_size(0) + get_local_id(0)] : 0; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// int in_pos = (get_local_id(0) << 1); +// int out_pos = (1 << max_porder) + get_local_id(0); +// int bs; +// for (bs = 1 << (max_porder - 1); bs > 32; bs >>= 1) +// { +// if (get_local_id(0) < bs) shared.data[out_pos] = shared.data[in_pos] + shared.data[in_pos + 1]; +// in_pos += bs << 1; +// out_pos += bs; +// barrier(CLK_LOCAL_MEM_FENCE); +// } +// if (get_local_id(0) < 32) +// for (; bs > 0; bs >>= 1) +// { +// shared.data[out_pos] = shared.data[in_pos] + shared.data[in_pos + 1]; +// in_pos += bs << 1; +// out_pos += bs; +// } +// barrier(CLK_LOCAL_MEM_FENCE); +// if (get_local_id(0) < (1 << max_porder)) +// partition_lengths[pos + (1 << max_porder) + get_local_id(0)] = shared.data[(1 << max_porder) + get_local_id(0)]; +// if (get_local_size(0) + get_local_id(0) < (1 << max_porder)) +// partition_lengths[pos + (1 << max_porder) + get_local_size(0) + get_local_id(0)] = shared.data[(1 << max_porder) + get_local_size(0) + get_local_id(0)]; +//} +// +//// Finds optimal rice parameter for up to 16 partitions at a time. +//// Requires 16x16 threads +//__kernel void cudaFindRiceParameter( +// int* rice_parameters, +// int* partition_lengths, +// int max_porder +// ) +//{ +// __local struct { +// volatile int length[256]; +// volatile int index[256]; +// } shared; +// const int tid = get_local_id(0) + (get_local_id(1) << 5); +// const int parts = min(32, 2 << max_porder); +// const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_local_id(1) << (max_porder + 1)); +// +// // read length for 32 partitions +// int l1 = (get_local_id(0) < parts) ? partition_lengths[pos + get_group_id(0) * 32 + get_local_id(0)] : 0xffffff; +// int l2 = (get_local_id(1) + 8 <= 14 && get_local_id(0) < parts) ? 
partition_lengths[pos + (8 << (max_porder + 1)) + get_group_id(0) * 32 + get_local_id(0)] : 0xffffff; +// // find best rice parameter +// shared.index[tid] = get_local_id(1) + ((l2 < l1) << 3); +// shared.length[tid] = l1 = min(l1, l2); +// barrier(CLK_LOCAL_MEM_FENCE); +//#pragma unroll 3 +// for (int sh = 7; sh >= 5; sh --) +// { +// if (tid < (1 << sh)) +// { +// l2 = shared.length[tid + (1 << sh)]; +// shared.index[tid] = shared.index[tid + ((l2 < l1) << sh)]; +// shared.length[tid] = l1 = min(l1, l2); +// } +// barrier(CLK_LOCAL_MEM_FENCE); +// } +// if (tid < parts) +// { +// // output rice parameter +// rice_parameters[(get_group_id(1) << (max_porder + 2)) + get_group_id(0) * parts + tid] = shared.index[tid]; +// // output length +// rice_parameters[(get_group_id(1) << (max_porder + 2)) + (1 << (max_porder + 1)) + get_group_id(0) * parts + tid] = shared.length[tid]; +// } +//} +// +//__kernel void cudaFindPartitionOrder( +// int* best_rice_parameters, +// FLACCLSubframeTask *tasks, +// int* rice_parameters, +// int max_porder +// ) +//{ +// __local struct { +// int data[512]; +// volatile int tmp[256]; +// int length[32]; +// int index[32]; +// //char4 ch[64]; +// FLACCLSubframeTask task; +// } shared; +// const int pos = (get_group_id(1) << (max_porder + 2)) + (2 << max_porder); +// if (get_local_id(0) < sizeof(shared.task) / sizeof(int)) +// ((int*)&shared.task)[get_local_id(0)] = ((int*)(&tasks[get_group_id(1)]))[get_local_id(0)]; +// // fetch partition lengths +// shared.data[get_local_id(0)] = get_local_id(0) < (2 << max_porder) ? rice_parameters[pos + get_local_id(0)] : 0; +// shared.data[get_local_id(0) + 256] = get_local_id(0) + 256 < (2 << max_porder) ? 
rice_parameters[pos + 256 + get_local_id(0)] : 0; +// barrier(CLK_LOCAL_MEM_FENCE); +// +// for (int porder = max_porder; porder >= 0; porder--) +// { +// shared.tmp[get_local_id(0)] = (get_local_id(0) < (1 << porder)) * shared.data[(2 << max_porder) - (2 << porder) + get_local_id(0)]; +// barrier(CLK_LOCAL_MEM_FENCE); +// SUM256(shared.tmp, get_local_id(0), +=); +// if (get_local_id(0) == 0) +// shared.length[porder] = shared.tmp[0] + (4 << porder); +// barrier(CLK_LOCAL_MEM_FENCE); +// } +// +// if (get_local_id(0) < 32) +// { +// shared.index[get_local_id(0)] = get_local_id(0); +// if (get_local_id(0) > max_porder) +// shared.length[get_local_id(0)] = 0xfffffff; +// int l1 = shared.length[get_local_id(0)]; +// #pragma unroll 4 +// for (int sh = 3; sh >= 0; sh --) +// { +// int l2 = shared.length[get_local_id(0) + (1 << sh)]; +// shared.index[get_local_id(0)] = shared.index[get_local_id(0) + ((l2 < l1) << sh)]; +// shared.length[get_local_id(0)] = l1 = min(l1, l2); +// } +// if (get_local_id(0) == 0) +// tasks[get_group_id(1)].data.porder = shared.index[0]; +// if (get_local_id(0) == 0) +// { +// int obits = shared.task.data.obits - shared.task.data.wbits; +// tasks[get_group_id(1)].data.size = +// shared.task.data.type == Fixed ? shared.task.data.residualOrder * obits + 6 + l1 : +// shared.task.data.type == LPC ? shared.task.data.residualOrder * obits + 6 + l1 + 4 + 5 + shared.task.data.residualOrder * shared.task.data.cbits : +// shared.task.data.type == Constant ? obits : obits * shared.task.data.blocksize; +// } +// } +// barrier(CLK_LOCAL_MEM_FENCE); +// int porder = shared.index[0]; +// if (get_local_id(0) < (1 << porder)) +// best_rice_parameters[(get_group_id(1) << max_porder) + get_local_id(0)] = rice_parameters[pos - (2 << porder) + get_local_id(0)]; +// // FIXME: should be bytes? 
+// // if (get_local_id(0) < (1 << porder)) +// //shared.tmp[get_local_id(0)] = rice_parameters[pos - (2 << porder) + get_local_id(0)]; +// // barrier(CLK_LOCAL_MEM_FENCE); +// // if (get_local_id(0) < max(1, (1 << porder) >> 2)) +// // { +// //char4 ch; +// //ch.x = shared.tmp[(get_local_id(0) << 2)]; +// //ch.y = shared.tmp[(get_local_id(0) << 2) + 1]; +// //ch.z = shared.tmp[(get_local_id(0) << 2) + 2]; +// //ch.w = shared.tmp[(get_local_id(0) << 2) + 3]; +// //shared.ch[get_local_id(0)] = ch +// // } +// // barrier(CLK_LOCAL_MEM_FENCE); +// // if (get_local_id(0) < max(1, (1 << porder) >> 2)) +// //best_rice_parameters[(get_group_id(1) << max_porder) + get_local_id(0)] = shared.ch[get_local_id(0)]; +//} +// +//#endif +// +//#if 0 +// if (get_local_id(0) < order) +// { +// for (int i = 0; i < order; i++) +// if (get_local_id(0) >= i) +// sum[get_local_id(0) - i] += coefs[get_local_id(0)] * sample[order - i - 1]; +// for (int i = order; i < blocksize; i++) +// { +// if (!get_local_id(0)) sample[order + i] = s = residual[order + i] + (sum[order + i] >> shift); +// sum[get_local_id(0) + i + 1] += coefs[get_local_id(0)] * s; +// } +// } +//#endif +#endif