diff --git a/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj b/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj
new file mode 100644
index 0000000..1282a58
--- /dev/null
+++ b/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj
@@ -0,0 +1,91 @@
+
+
+ Debug
+ AnyCPU
+ 9.0.30729
+ 2.0
+ {DFE55765-564C-4B8F-993B-A94C4D1C212E}
+ Library
+ Properties
+ CUETools.Codecs.FLACCL
+ CUETools.Codecs.FLACCL
+
+
+ 2.0
+
+
+
+
+ true
+ full
+ false
+ ..\bin\Debug\plugins\
+ DEBUG;TRACE
+ prompt
+ 4
+ true
+
+
+ pdbonly
+ true
+ ..\bin\Release\plugins\
+ TRACE
+ prompt
+ 4
+ true
+
+
+
+ False
+ OpenCLNet.dll
+
+
+
+
+
+
+
+
+
+ True
+ True
+ Resources.resx
+
+
+
+
+ {082D6B9E-326E-4D15-9798-EDAE9EDE70A6}
+ CUETools.Codecs.FLAKE
+ False
+
+
+ {6458A13A-30EF-45A9-9D58-E5031B17BEE2}
+ CUETools.Codecs
+ False
+
+
+
+
+
+
+
+
+ ResXFileCodeGenerator
+ Resources.Designer.cs
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
new file mode 100644
index 0000000..65b5cf2
--- /dev/null
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -0,0 +1,2473 @@
+/**
+ * CUETools.FLACCL: FLAC audio encoder using OpenCL
+ * Copyright (c) 2009 Gregory S. Chudov
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+using System;
+using System.ComponentModel;
+using System.Collections.Generic;
+using System.IO;
+using System.Security.Cryptography;
+using System.Threading;
+using System.Text;
+using System.Runtime.InteropServices;
+using CUETools.Codecs;
+using CUETools.Codecs.FLAKE;
+using OpenCLNet;
+
+namespace CUETools.Codecs.FLACCL
+{
+ public class FLACCLWriterSettings
+ {
+ public FLACCLWriterSettings() { DoVerify = false; GPUOnly = false; DoMD5 = true; }
+
+ [DefaultValue(false)]
+ [DisplayName("Verify")]
+ [SRDescription(typeof(Properties.Resources), "DoVerifyDescription")]
+ public bool DoVerify { get; set; }
+
+ [DefaultValue(true)]
+ [DisplayName("MD5")]
+ [SRDescription(typeof(Properties.Resources), "DoMD5Description")]
+ public bool DoMD5 { get; set; }
+
+ [DefaultValue(true)]
+ [SRDescription(typeof(Properties.Resources), "DescriptionGPUOnly")]
+ public bool GPUOnly { get; set; }
+
+ int cpu_threads = 1;
+ [DefaultValue(1)]
+ [SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
+ public int CPUThreads
+ {
+ get
+ {
+ return cpu_threads;
+ }
+ set
+ {
+ if (value < 0 || value > 16)
+ throw new Exception("CPUThreads must be between 0..16");
+ cpu_threads = value;
+ }
+ }
+ }
+
+ [AudioEncoderClass("FLACCL", "flac", true, "0 1 2 3 4 5 6 7 8 9 10 11", "8", 2, typeof(FLACCLWriterSettings))]
+ //[AudioEncoderClass("FLACCL nonsub", "flac", true, "9 10 11", "9", 1, typeof(FLACCLWriterSettings))]
+ public class FLACCLWriter : IAudioDest
+ {
+ Stream _IO = null;
+ string _path;
+ long _position;
+
+ // number of audio channels
+ // valid values are 1 to 8
+ int channels, ch_code;
+
+ // audio sample rate in Hz
+ int sample_rate, sr_code0, sr_code1;
+
+ // sample size in bits
+ // only 16-bit is currently supported
+ uint bits_per_sample;
+ int bps_code;
+
+ // total stream samples
+ // if 0, stream length is unknown
+ int sample_count = -1;
+
+ FlakeEncodeParams eparams;
+
+ // maximum frame size in bytes
+ // this can be used to allocate memory for output
+ int max_frame_size;
+
+ int frame_count = 0;
+ int frame_pos = 0;
+
+ long first_frame_offset = 0;
+
+ TimeSpan _userProcessorTime;
+
+ // header bytes
+ // allocated by flake_encode_init and freed by flake_encode_close
+ byte[] header;
+
+ float[] windowBuffer;
+ int samplesInBuffer = 0;
+ int max_frames = 0;
+
+ int _compressionLevel = 7;
+ int _blocksize = 0;
+ int _totalSize = 0;
+ int _windowsize = 0, _windowcount = 0;
+
+ Crc8 crc8;
+ Crc16 crc16;
+ MD5 md5;
+
+ SeekPoint[] seek_table;
+ int seek_table_offset = -1;
+
+ bool inited = false;
+
+ OpenCLManager OCLMan;
+ Context openCLContext;
+ Program openCLProgram;
+
+ FLACCLTask task1;
+ FLACCLTask task2;
+ FLACCLTask[] cpu_tasks;
+ int oldest_cpu_task = 0;
+
+ Mem cudaWindow;
+
+ AudioPCMConfig _pcm;
+
+ public const int MAX_BLOCKSIZE = 4096 * 16;
+ internal const int maxFrames = 128;
+ internal const int maxAutocorParts = (MAX_BLOCKSIZE + 255) / 256;
+
+ public FLACCLWriter(string path, Stream IO, AudioPCMConfig pcm)
+ {
+ _pcm = pcm;
+
+ if (pcm.BitsPerSample != 16)
+ throw new Exception("Bits per sample must be 16.");
+ if (pcm.ChannelCount != 2)
+ throw new Exception("ChannelCount must be 2.");
+
+ channels = pcm.ChannelCount;
+ sample_rate = pcm.SampleRate;
+ bits_per_sample = (uint) pcm.BitsPerSample;
+
+ // flake_validate_params
+
+ _path = path;
+ _IO = IO;
+
+ windowBuffer = new float[FLACCLWriter.MAX_BLOCKSIZE * lpc.MAX_LPC_WINDOWS];
+
+ eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
+ eparams.padding_size = 8192;
+
+ crc8 = new Crc8();
+ crc16 = new Crc16();
+ }
+
+ public FLACCLWriter(string path, AudioPCMConfig pcm)
+ : this(path, null, pcm)
+ {
+ }
+
+ public int TotalSize
+ {
+ get
+ {
+ return _totalSize;
+ }
+ }
+
+ public long Padding
+ {
+ get
+ {
+ return eparams.padding_size;
+ }
+ set
+ {
+ eparams.padding_size = value;
+ }
+ }
+
+ public int CompressionLevel
+ {
+ get
+ {
+ return _compressionLevel;
+ }
+ set
+ {
+ if (value < 0 || value > 11)
+ throw new Exception("unsupported compression level");
+ _compressionLevel = value;
+ eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
+ }
+ }
+
+ FLACCLWriterSettings _settings = new FLACCLWriterSettings();
+
+ public object Settings
+ {
+ get
+ {
+ return _settings;
+ }
+ set
+ {
+ if (value as FLACCLWriterSettings == null)
+ throw new Exception("Unsupported options " + value);
+ _settings = value as FLACCLWriterSettings;
+ eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
+ }
+ }
+
+ //[DllImport("kernel32.dll")]
+ //static extern bool GetThreadTimes(IntPtr hThread, out long lpCreationTime, out long lpExitTime, out long lpKernelTime, out long lpUserTime);
+ //[DllImport("kernel32.dll")]
+ //static extern IntPtr GetCurrentThread();
+
+ void DoClose()
+ {
+ if (inited)
+ {
+ int nFrames = samplesInBuffer / eparams.block_size;
+ if (nFrames > 0)
+ do_output_frames(nFrames);
+ if (samplesInBuffer > 0)
+ {
+ eparams.block_size = samplesInBuffer;
+ do_output_frames(1);
+ }
+ if (task2.frameCount > 0)
+ {
+ if (cpu_tasks != null)
+ {
+ for (int i = 0; i < cpu_tasks.Length; i++)
+ {
+ wait_for_cpu_task();
+ FLACCLTask task = cpu_tasks[oldest_cpu_task];
+ oldest_cpu_task = (oldest_cpu_task + 1) % cpu_tasks.Length;
+ if (task.frameCount > 0)
+ {
+ write_result(task);
+ task.frameCount = 0;
+ }
+ }
+ }
+ task2.openCLCQ.Finish(); // cuda.SynchronizeStream(task2.stream);
+ process_result(task2);
+ write_result(task2);
+ task2.frameCount = 0;
+ }
+
+ if (_IO.CanSeek)
+ {
+ if (sample_count <= 0 && _position != 0)
+ {
+ BitWriter bitwriter = new BitWriter(header, 0, 4);
+ bitwriter.writebits(32, (int)_position);
+ bitwriter.flush();
+ _IO.Position = 22;
+ _IO.Write(header, 0, 4);
+ }
+
+ if (md5 != null)
+ {
+ md5.TransformFinalBlock(new byte[] { 0 }, 0, 0);
+ _IO.Position = 26;
+ _IO.Write(md5.Hash, 0, md5.Hash.Length);
+ }
+
+ if (seek_table != null)
+ {
+ _IO.Position = seek_table_offset;
+ int len = write_seekpoints(header, 0, 0);
+ _IO.Write(header, 4, len - 4);
+ }
+ }
+ _IO.Close();
+
+ cudaWindow.Dispose();
+ task1.Dispose();
+ task2.Dispose();
+ if (cpu_tasks != null)
+ foreach (FLACCLTask task in cpu_tasks)
+ task.Dispose();
+ openCLProgram.Dispose();
+ openCLContext.Dispose();
+ inited = false;
+ }
+ }
+
+ public void Close()
+ {
+ DoClose();
+ if (sample_count > 0 && _position != sample_count)
+ throw new Exception(string.Format("Samples written differs from the expected sample count. Expected {0}, got {1}.", sample_count, _position));
+ }
+
+ public void Delete()
+ {
+ if (inited)
+ {
+ _IO.Close();
+ cudaWindow.Dispose();
+ task1.Dispose();
+ task2.Dispose();
+ if (cpu_tasks != null)
+ foreach (FLACCLTask task in cpu_tasks)
+ task.Dispose();
+ openCLProgram.Dispose();
+ openCLContext.Dispose();
+ inited = false;
+ }
+
+ if (_path != "")
+ File.Delete(_path);
+ }
+
+ public long Position
+ {
+ get
+ {
+ return _position;
+ }
+ }
+
+ public long FinalSampleCount
+ {
+ set { sample_count = (int)value; }
+ }
+
+ public long BlockSize
+ {
+ set {
+ if (value < 256 || value > MAX_BLOCKSIZE )
+ throw new Exception("unsupported BlockSize value");
+ _blocksize = (int)value;
+ }
+ get { return _blocksize == 0 ? eparams.block_size : _blocksize; }
+ }
+
+ public StereoMethod StereoMethod
+ {
+ get { return eparams.do_midside ? StereoMethod.Search : StereoMethod.Independent; }
+ set { eparams.do_midside = value != StereoMethod.Independent; }
+ }
+
+ public int MinPrecisionSearch
+ {
+ get { return eparams.lpc_min_precision_search; }
+ set
+ {
+ if (value < 0 || value > eparams.lpc_max_precision_search)
+ throw new Exception("unsupported MinPrecisionSearch value");
+ eparams.lpc_min_precision_search = value;
+ }
+ }
+
+ public int MaxPrecisionSearch
+ {
+ get { return eparams.lpc_max_precision_search; }
+ set
+ {
+ if (value < eparams.lpc_min_precision_search || value >= lpc.MAX_LPC_PRECISIONS)
+ throw new Exception("unsupported MaxPrecisionSearch value");
+ eparams.lpc_max_precision_search = value;
+ }
+ }
+
+ public WindowFunction WindowFunction
+ {
+ get { return eparams.window_function; }
+ set { eparams.window_function = value; }
+ }
+
+ public bool DoSeekTable
+ {
+ get { return eparams.do_seektable; }
+ set { eparams.do_seektable = value; }
+ }
+
+ public int VBRMode
+ {
+ get { return eparams.variable_block_size; }
+ set { eparams.variable_block_size = value; }
+ }
+
+ public int OrdersPerWindow
+ {
+ get
+ {
+ return eparams.orders_per_window;
+ }
+ set
+ {
+ if (value < 0 || value > 32)
+ throw new Exception("invalid OrdersPerWindow " + value.ToString());
+ eparams.orders_per_window = value;
+ }
+ }
+
+ public int MinLPCOrder
+ {
+ get
+ {
+ return eparams.min_prediction_order;
+ }
+ set
+ {
+ if (value < 1 || value > eparams.max_prediction_order)
+ throw new Exception("invalid MinLPCOrder " + value.ToString());
+ eparams.min_prediction_order = value;
+ }
+ }
+
+ public int MaxLPCOrder
+ {
+ get
+ {
+ return eparams.max_prediction_order;
+ }
+ set
+ {
+ if (value > lpc.MAX_LPC_ORDER || value < eparams.min_prediction_order)
+ throw new Exception("invalid MaxLPCOrder " + value.ToString());
+ eparams.max_prediction_order = value;
+ }
+ }
+
+ public int MinFixedOrder
+ {
+ get
+ {
+ return eparams.min_fixed_order;
+ }
+ set
+ {
+ if (value < 0 || value > eparams.max_fixed_order)
+ throw new Exception("invalid MinFixedOrder " + value.ToString());
+ eparams.min_fixed_order = value;
+ }
+ }
+
+ public int MaxFixedOrder
+ {
+ get
+ {
+ return eparams.max_fixed_order;
+ }
+ set
+ {
+ if (value > 4 || value < eparams.min_fixed_order)
+ throw new Exception("invalid MaxFixedOrder " + value.ToString());
+ eparams.max_fixed_order = value;
+ }
+ }
+
+ public int MinPartitionOrder
+ {
+ get { return eparams.min_partition_order; }
+ set
+ {
+ if (value < 0 || value > eparams.max_partition_order)
+ throw new Exception("invalid MinPartitionOrder " + value.ToString());
+ eparams.min_partition_order = value;
+ }
+ }
+
+ public int MaxPartitionOrder
+ {
+ get { return eparams.max_partition_order; }
+ set
+ {
+ if (value > 8 || value < eparams.min_partition_order)
+ throw new Exception("invalid MaxPartitionOrder " + value.ToString());
+ eparams.max_partition_order = value;
+ }
+ }
+
+ public TimeSpan UserProcessorTime
+ {
+ get { return _userProcessorTime; }
+ }
+
+ public AudioPCMConfig PCM
+ {
+ get { return _pcm; }
+ }
+
+ unsafe void encode_residual_fixed(int* res, int* smp, int n, int order)
+ {
+ int i;
+ int s0, s1, s2;
+ switch (order)
+ {
+ case 0:
+ AudioSamples.MemCpy(res, smp, n);
+ return;
+ case 1:
+ *(res++) = s1 = *(smp++);
+ for (i = n - 1; i > 0; i--)
+ {
+ s0 = *(smp++);
+ *(res++) = s0 - s1;
+ s1 = s0;
+ }
+ return;
+ case 2:
+ *(res++) = s2 = *(smp++);
+ *(res++) = s1 = *(smp++);
+ for (i = n - 2; i > 0; i--)
+ {
+ s0 = *(smp++);
+ *(res++) = s0 - 2 * s1 + s2;
+ s2 = s1;
+ s1 = s0;
+ }
+ return;
+ case 3:
+ res[0] = smp[0];
+ res[1] = smp[1];
+ res[2] = smp[2];
+ for (i = 3; i < n; i++)
+ {
+ res[i] = smp[i] - 3 * smp[i - 1] + 3 * smp[i - 2] - smp[i - 3];
+ }
+ return;
+ case 4:
+ res[0] = smp[0];
+ res[1] = smp[1];
+ res[2] = smp[2];
+ res[3] = smp[3];
+ for (i = 4; i < n; i++)
+ {
+ res[i] = smp[i] - 4 * smp[i - 1] + 6 * smp[i - 2] - 4 * smp[i - 3] + smp[i - 4];
+ }
+ return;
+ default:
+ return;
+ }
+ }
+
+ static unsafe uint calc_optimal_rice_params(int porder, int* parm, uint* sums, uint n, uint pred_order)
+ {
+ uint part = (1U << porder);
+ uint cnt = (n >> porder) - pred_order;
+ int k = cnt > 0 ? Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[0] / cnt)) : 0;
+ uint all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k);
+ parm[0] = k;
+ cnt = (n >> porder);
+ for (uint i = 1; i < part; i++)
+ {
+ k = Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[i] / cnt));
+ all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k);
+ parm[i] = k;
+ }
+ return all_bits + (4 * part);
+ }
+
+ static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums)
+ {
+ for (int i = pmax - 1; i >= pmin; i--)
+ {
+ for (int j = 0; j < (1 << i); j++)
+ {
+ sums[i * Flake.MAX_PARTITIONS + j] =
+ sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j] +
+ sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j + 1];
+ }
+ }
+ }
+
+ static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
+ {
+ int parts = (1 << pmax);
+ uint* res = data + pred_order;
+ uint cnt = (n >> pmax) - pred_order;
+ uint sum = 0;
+ for (uint j = cnt; j > 0; j--)
+ sum += *(res++);
+ sums[0] = sum;
+ cnt = (n >> pmax);
+ for (int i = 1; i < parts; i++)
+ {
+ sum = 0;
+ for (uint j = cnt; j > 0; j--)
+ sum += *(res++);
+ sums[i] = sum;
+ }
+ }
+
+ ///
+ /// Special case when (n >> pmax) == 18
+ ///
+ ///
+ ///
+ ///
+ ///
+ ///
+ ///
+ static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
+ {
+ int parts = (1 << pmax);
+ uint* res = data + pred_order;
+ uint cnt = 18 - pred_order;
+ uint sum = 0;
+ for (uint j = cnt; j > 0; j--)
+ sum += *(res++);
+ sums[0] = sum;
+ for (int i = 1; i < parts; i++)
+ {
+ sums[i] =
+ *(res++) + *(res++) + *(res++) + *(res++) +
+ *(res++) + *(res++) + *(res++) + *(res++) +
+ *(res++) + *(res++) + *(res++) + *(res++) +
+ *(res++) + *(res++) + *(res++) + *(res++) +
+ *(res++) + *(res++);
+ }
+ }
+
+ ///
+ /// Special case when (n >> pmax) == 16
+ ///
+ ///
+ ///
+ ///
+ ///
+ ///
+ ///
+ static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
+ {
+ int parts = (1 << pmax);
+ uint* res = data + pred_order;
+ uint cnt = 16 - pred_order;
+ uint sum = 0;
+ for (uint j = cnt; j > 0; j--)
+ sum += *(res++);
+ sums[0] = sum;
+ for (int i = 1; i < parts; i++)
+ {
+ sums[i] =
+ *(res++) + *(res++) + *(res++) + *(res++) +
+ *(res++) + *(res++) + *(res++) + *(res++) +
+ *(res++) + *(res++) + *(res++) + *(res++) +
+ *(res++) + *(res++) + *(res++) + *(res++);
+ }
+ }
+
+ static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order)
+ {
+ uint* udata = stackalloc uint[(int)n];
+ uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
+ int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS];
+ //uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
+
+ //assert(pmin >= 0 && pmin <= Flake.MAX_PARTITION_ORDER);
+ //assert(pmax >= 0 && pmax <= Flake.MAX_PARTITION_ORDER);
+ //assert(pmin <= pmax);
+
+ for (uint i = 0; i < n; i++)
+ udata[i] = (uint)((data[i] << 1) ^ (data[i] >> 31));
+
+ // sums for highest level
+ if ((n >> pmax) == 18)
+ calc_sums18(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
+ else if ((n >> pmax) == 16)
+ calc_sums16(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
+ else
+ calc_sums(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
+ // sums for lower levels
+ calc_lower_sums(pmin, pmax, sums);
+
+ uint opt_bits = AudioSamples.UINT32_MAX;
+ int opt_porder = pmin;
+ for (int i = pmin; i <= pmax; i++)
+ {
+ uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order);
+ if (bits <= opt_bits)
+ {
+ opt_bits = bits;
+ opt_porder = i;
+ }
+ }
+
+ rc.porder = opt_porder;
+ fixed (int* rparms = rc.rparams)
+ AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder));
+
+ return opt_bits;
+ }
+
+ static int get_max_p_order(int max_porder, int n, int order)
+ {
+ int porder = Math.Min(max_porder, BitReader.log2i(n ^ (n - 1)));
+ if (order > 0)
+ porder = Math.Min(porder, BitReader.log2i(n / order));
+ return porder;
+ }
+
+ unsafe void output_frame_header(FlacFrame frame)
+ {
+ frame.writer.writebits(15, 0x7FFC);
+ frame.writer.writebits(1, eparams.variable_block_size > 0 ? 1 : 0);
+ frame.writer.writebits(4, frame.bs_code0);
+ frame.writer.writebits(4, sr_code0);
+ if (frame.ch_mode == ChannelMode.NotStereo)
+ frame.writer.writebits(4, ch_code);
+ else
+ frame.writer.writebits(4, (int)frame.ch_mode);
+ frame.writer.writebits(3, bps_code);
+ frame.writer.writebits(1, 0);
+ frame.writer.write_utf8(frame.frame_number);
+
+ // custom block size
+ if (frame.bs_code1 >= 0)
+ {
+ if (frame.bs_code1 < 256)
+ frame.writer.writebits(8, frame.bs_code1);
+ else
+ frame.writer.writebits(16, frame.bs_code1);
+ }
+
+ // custom sample rate
+ if (sr_code1 > 0)
+ {
+ if (sr_code1 < 256)
+ frame.writer.writebits(8, sr_code1);
+ else
+ frame.writer.writebits(16, sr_code1);
+ }
+
+ // CRC-8 of frame header
+ frame.writer.flush();
+ byte crc = crc8.ComputeChecksum(frame.writer.Buffer, frame.writer_offset, frame.writer.Length - frame.writer_offset);
+ frame.writer.writebits(8, crc);
+ }
+
+ unsafe void output_residual(FlacFrame frame, FlacSubframeInfo sub)
+ {
+ // rice-encoded block
+ frame.writer.writebits(2, 0);
+
+ // partition order
+ int porder = sub.best.rc.porder;
+ int psize = frame.blocksize >> porder;
+ //assert(porder >= 0);
+ frame.writer.writebits(4, porder);
+ int res_cnt = psize - sub.best.order;
+
+ // residual
+ int j = sub.best.order;
+ fixed (byte* fixbuf = frame.writer.Buffer)
+ for (int p = 0; p < (1 << porder); p++)
+ {
+ int k = sub.best.rc.rparams[p];
+ frame.writer.writebits(4, k);
+ if (p == 1) res_cnt = psize;
+ int cnt = Math.Min(res_cnt, frame.blocksize - j);
+ frame.writer.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt);
+ j += cnt;
+ }
+ }
+
+ unsafe void
+ output_subframe_constant(FlacFrame frame, FlacSubframeInfo sub)
+ {
+ frame.writer.writebits_signed(sub.obits, sub.samples[0]);
+ }
+
+ unsafe void
+ output_subframe_verbatim(FlacFrame frame, FlacSubframeInfo sub)
+ {
+ int n = frame.blocksize;
+ for (int i = 0; i < n; i++)
+ frame.writer.writebits_signed(sub.obits, sub.samples[i]);
+ // Don't use residual here, because we don't copy samples to residual for verbatim frames.
+ }
+
+ unsafe void
+ output_subframe_fixed(FlacFrame frame, FlacSubframeInfo sub)
+ {
+ // warm-up samples
+ for (int i = 0; i < sub.best.order; i++)
+ frame.writer.writebits_signed(sub.obits, sub.samples[i]);
+
+ // residual
+ output_residual(frame, sub);
+ }
+
+ unsafe void
+ output_subframe_lpc(FlacFrame frame, FlacSubframeInfo sub)
+ {
+ // warm-up samples
+ for (int i = 0; i < sub.best.order; i++)
+ frame.writer.writebits_signed(sub.obits, sub.samples[i]);
+
+ // LPC coefficients
+ frame.writer.writebits(4, sub.best.cbits - 1);
+ frame.writer.writebits_signed(5, sub.best.shift);
+ for (int i = 0; i < sub.best.order; i++)
+ frame.writer.writebits_signed(sub.best.cbits, sub.best.coefs[i]);
+
+ // residual
+ output_residual(frame, sub);
+ }
+
+ unsafe void output_subframes(FlacFrame frame)
+ {
+ for (int ch = 0; ch < channels; ch++)
+ {
+ FlacSubframeInfo sub = frame.subframes[ch];
+ // subframe header
+ int type_code = (int) sub.best.type;
+ if (sub.best.type == SubframeType.Fixed)
+ type_code |= sub.best.order;
+ if (sub.best.type == SubframeType.LPC)
+ type_code |= sub.best.order - 1;
+ frame.writer.writebits(1, 0);
+ frame.writer.writebits(6, type_code);
+ frame.writer.writebits(1, sub.wbits != 0 ? 1 : 0);
+ if (sub.wbits > 0)
+ frame.writer.writebits((int)sub.wbits, 1);
+
+ //if (frame_writer.Length >= frame_buffer.Length)
+ // throw new Exception("buffer overflow");
+
+ // subframe
+ switch (sub.best.type)
+ {
+ case SubframeType.Constant:
+ output_subframe_constant(frame, sub);
+ break;
+ case SubframeType.Verbatim:
+ output_subframe_verbatim(frame, sub);
+ break;
+ case SubframeType.Fixed:
+ output_subframe_fixed(frame, sub);
+ break;
+ case SubframeType.LPC:
+ output_subframe_lpc(frame, sub);
+ break;
+ }
+ //if (frame_writer.Length >= frame_buffer.Length)
+ // throw new Exception("buffer overflow");
+ }
+ }
+
+ void output_frame_footer(FlacFrame frame)
+ {
+ frame.writer.flush();
+ ushort crc = crc16.ComputeChecksum(frame.writer.Buffer, frame.writer_offset, frame.writer.Length - frame.writer_offset);
+ frame.writer.writebits(16, crc);
+ frame.writer.flush();
+ }
+
+ unsafe delegate void window_function(float* window, int size);
+
+ unsafe void calculate_window(float* window, window_function func, WindowFunction flag)
+ {
+ if ((eparams.window_function & flag) == 0 || _windowcount == lpc.MAX_LPC_WINDOWS)
+ return;
+
+ func(window + _windowcount * _windowsize, _windowsize);
+ //int sz = _windowsize;
+ //float* pos = window + _windowcount * FLACCLWriter.MAX_BLOCKSIZE * 2;
+ //do
+ //{
+ // func(pos, sz);
+ // if ((sz & 1) != 0)
+ // break;
+ // pos += sz;
+ // sz >>= 1;
+ //} while (sz >= 32);
+ _windowcount++;
+ }
+
+ unsafe void initializeSubframeTasks(int blocksize, int channelsCount, int nFrames, FLACCLTask task)
+ {
+ task.nResidualTasks = 0;
+ task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window);
+ task.nResidualTasksPerChannel = _windowcount * task.nTasksPerWindow + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order;
+ if (task.nResidualTasksPerChannel >= 4)
+ task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7;
+ task.nAutocorTasksPerChannel = _windowcount;
+ for (int iFrame = 0; iFrame < nFrames; iFrame++)
+ {
+ for (int ch = 0; ch < channelsCount; ch++)
+ {
+ for (int iWindow = 0; iWindow < _windowcount; iWindow++)
+ {
+ // LPC tasks
+ for (int order = 0; order < task.nTasksPerWindow; order++)
+ {
+ task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.LPC;
+ task.ResidualTasks[task.nResidualTasks].channel = ch;
+ task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
+ task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
+ task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
+ task.ResidualTasks[task.nResidualTasks].residualOrder = order + 1;
+ task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize;
+ task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
+ task.nResidualTasks++;
+ }
+ }
+ // Constant frames
+ if (eparams.do_constant)
+ {
+ task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Constant;
+ task.ResidualTasks[task.nResidualTasks].channel = ch;
+ task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
+ task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
+ task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
+ task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize;
+ task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
+ task.ResidualTasks[task.nResidualTasks].residualOrder = 1;
+ task.ResidualTasks[task.nResidualTasks].shift = 0;
+ task.ResidualTasks[task.nResidualTasks].coefs[0] = 1;
+ task.nResidualTasks++;
+ }
+ // Fixed prediction
+ for (int order = eparams.min_fixed_order; order <= eparams.max_fixed_order; order++)
+ {
+ task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Fixed;
+ task.ResidualTasks[task.nResidualTasks].channel = ch;
+ task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
+ task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
+ task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
+ task.ResidualTasks[task.nResidualTasks].residualOrder = order;
+ task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize;
+ task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
+ task.ResidualTasks[task.nResidualTasks].shift = 0;
+ switch (order)
+ {
+ case 0:
+ break;
+ case 1:
+ task.ResidualTasks[task.nResidualTasks].coefs[0] = 1;
+ break;
+ case 2:
+ task.ResidualTasks[task.nResidualTasks].coefs[1] = 2;
+ task.ResidualTasks[task.nResidualTasks].coefs[0] = -1;
+ break;
+ case 3:
+ task.ResidualTasks[task.nResidualTasks].coefs[2] = 3;
+ task.ResidualTasks[task.nResidualTasks].coefs[1] = -3;
+ task.ResidualTasks[task.nResidualTasks].coefs[0] = 1;
+ break;
+ case 4:
+ task.ResidualTasks[task.nResidualTasks].coefs[3] = 4;
+ task.ResidualTasks[task.nResidualTasks].coefs[2] = -6;
+ task.ResidualTasks[task.nResidualTasks].coefs[1] = 4;
+ task.ResidualTasks[task.nResidualTasks].coefs[0] = -1;
+ break;
+ }
+ task.nResidualTasks++;
+ }
+ // Filler
+ while ((task.nResidualTasks % task.nResidualTasksPerChannel) != 0)
+ {
+ task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Verbatim;
+ task.ResidualTasks[task.nResidualTasks].channel = ch;
+ task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
+ task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
+ task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
+ task.ResidualTasks[task.nResidualTasks].residualOrder = 0;
+ task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize;
+ task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
+ task.ResidualTasks[task.nResidualTasks].shift = 0;
+ task.nResidualTasks++;
+ }
+ }
+ }
+ if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen)
+ throw new Exception("oops");
+ task.openCLCQ.EnqueueWriteBuffer(task.cudaResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.residualTasksPtr.AddrOfPinnedObject());
+ task.openCLCQ.EnqueueBarrier();
+
+ task.frameSize = blocksize;
+ }
+
+ unsafe void encode_residual(FLACCLTask task)
+ {
+ bool unpacked = false;
+ unpack_samples(task, Math.Min(32, task.frameSize));
+ for (int ch = 0; ch < channels; ch++)
+ {
+ switch (task.frame.subframes[ch].best.type)
+ {
+ case SubframeType.Constant:
+ break;
+ case SubframeType.Verbatim:
+ if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
+ break;
+ case SubframeType.Fixed:
+ // if (!_settings.GPUOnly)
+ {
+ if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
+ encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples,
+ task.frame.blocksize, task.frame.subframes[ch].best.order);
+
+ int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
+ int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
+ uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 6;
+ task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order);
+ }
+ break;
+ case SubframeType.LPC:
+ fixed (int* coefs = task.frame.subframes[ch].best.coefs)
+ {
+ ulong csum = 0;
+ for (int i = task.frame.subframes[ch].best.order; i > 0; i--)
+ csum += (ulong)Math.Abs(coefs[i - 1]);
+ // if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly)
+ {
+ if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
+ if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32)
+ lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
+ else
+ lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
+ int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
+ int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
+ uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 4 + 5 + (uint)task.frame.subframes[ch].best.order * (uint)task.frame.subframes[ch].best.cbits + 6;
+ //uint oldsize = task.frame.subframes[ch].best.size;
+ task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order);
+ //if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize &&
+ // oldsize <= task.frame.subframes[ch].obits * (uint)task.frame.blocksize)
+ // throw new Exception("oops");
+ }
+ }
+ break;
+ }
+ if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * task.frame.blocksize)
+ {
+#if DEBUG
+ throw new Exception("larger than verbatim");
+#endif
+ task.frame.subframes[ch].best.type = SubframeType.Verbatim;
+ task.frame.subframes[ch].best.size = (uint)(task.frame.subframes[ch].obits * task.frame.blocksize);
+ if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
+ }
+ }
+ }
+
// Transfers the GPU's per-channel choice of best prediction method into the
// managed FlacFrame, and derives the frame's stereo mode from which
// decorrelated channels the GPU selected for this frame.
unsafe void select_best_methods(FlacFrame frame, int channelsCount, int iFrame, FLACCLTask task)
{
    if (channelsCount == 4 && channels == 2)
    {
        // With mid/side decorrelation the GPU evaluates 4 virtual channels.
        // The pairings below imply channel codes 0=left, 1=right, 2=mid, 3=side
        // (NOTE(review): inferred from these combinations — confirm against flac.cl).
        if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
            frame.ch_mode = ChannelMode.LeftRight;
        else if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3)
            frame.ch_mode = ChannelMode.LeftSide;
        else if (task.BestResidualTasks[iFrame * 2].channel == 3 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
            frame.ch_mode = ChannelMode.RightSide;
        else if (task.BestResidualTasks[iFrame * 2].channel == 2 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3)
            frame.ch_mode = ChannelMode.MidSide;
        else
            throw new Exception("internal error: invalid stereo mode");
        // Reorder subframes so indices 0/1 match the two channels the GPU chose.
        frame.SwapSubframes(0, task.BestResidualTasks[iFrame * 2].channel);
        frame.SwapSubframes(1, task.BestResidualTasks[iFrame * 2 + 1].channel);
    }
    else
        frame.ch_mode = channels != 2 ? ChannelMode.NotStereo : ChannelMode.LeftRight;

    for (int ch = 0; ch < channels; ch++)
    {
        int index = ch + iFrame * channels;
        // Default every subframe to verbatim; only adopt the GPU's pick when it
        // is strictly smaller than storing the raw samples.
        frame.subframes[ch].best.residual = ((int*)task.residualBufferPtr.AddrOfPinnedObject()) + task.BestResidualTasks[index].residualOffs;
        frame.subframes[ch].best.type = SubframeType.Verbatim;
        frame.subframes[ch].best.size = (uint)(frame.subframes[ch].obits * frame.blocksize);
        frame.subframes[ch].wbits = 0;

        if (task.BestResidualTasks[index].size < 0)
            throw new Exception("internal error");
        if (frame.blocksize > Math.Max(4, eparams.max_prediction_order) && frame.subframes[ch].best.size > task.BestResidualTasks[index].size)
        {
            frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type;
            frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size;
            frame.subframes[ch].best.order = task.BestResidualTasks[index].residualOrder;
            frame.subframes[ch].best.cbits = task.BestResidualTasks[index].cbits;
            frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift;
            // Wasted bits reduce the effective sample width of this subframe.
            frame.subframes[ch].obits -= task.BestResidualTasks[index].wbits;
            frame.subframes[ch].wbits = task.BestResidualTasks[index].wbits;
            frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
            // Coefficients are copied in reverse order relative to how the GPU
            // task stores them.
            for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
                frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
        }
    }
}
+
// Launches the full GPU estimation pipeline for the frames currently held by
// the task: channel decorrelation, optional wasted-bits scan, windowed
// autocorrelation, LPC computation and quantization, residual estimation,
// best-method selection, and finally a non-blocking read-back of the winning
// subframe tasks into bestResidualTasksPtr. The caller is responsible for
// waiting on the command queue (see do_output_frames) before consuming results.
unsafe void estimate_residual(FLACCLTask task, int channelsCount)
{
    // Frames of 4 samples or fewer are always stored verbatim; nothing to estimate.
    if (task.frameSize <= 4)
        return;

    // Split the autocorrelation over fixed-size partial sums; each partial is
    // later reduced on the GPU.
    //int autocorPartSize = (2 * 256 - eparams.max_prediction_order) & ~15;
    int autocorPartSize = 32 * 7;
    int autocorPartCount = (task.frameSize + autocorPartSize - 1) / autocorPartSize;
    if (autocorPartCount > maxAutocorParts)
        throw new Exception("internal error");

    // Clamp the rice partition order so each partition holds at least 16 samples.
    int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
    int calcPartitionPartSize = task.frameSize >> max_porder;
    while (calcPartitionPartSize < 16 && max_porder > 0)
    {
        calcPartitionPartSize <<= 1;
        max_porder--;
    }
    int calcPartitionPartCount = (calcPartitionPartSize >= 128) ? 1 : (256 / calcPartitionPartSize);

    // Only stereo input is supported by this path so far.
    if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel
    Kernel cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : null;// task.cudaChannelDecorr;
    //Kernel cudaCalcPartition = calcPartitionPartSize >= 128 ? task.cudaCalcLargePartition : calcPartitionPartSize == 16 && task.frameSize >= 256 ? task.cudaCalcPartition16 : task.cudaCalcPartition;

    // ---- kernel argument setup (no work is issued yet) ----

    cudaChannelDecorr.SetArg(0, task.cudaSamples);
    cudaChannelDecorr.SetArg(1, task.cudaSamplesBytes);
    cudaChannelDecorr.SetArg(2, (uint)MAX_BLOCKSIZE);

    task.cudaComputeLPC.SetArg(0, task.cudaResidualTasks);
    task.cudaComputeLPC.SetArg(1, (uint)task.nResidualTasksPerChannel);
    task.cudaComputeLPC.SetArg(2, task.cudaAutocorOutput);
    task.cudaComputeLPC.SetArg(3, (uint)eparams.max_prediction_order);
    task.cudaComputeLPC.SetArg(4, task.cudaLPCData);
    task.cudaComputeLPC.SetArg(5, (uint)_windowcount);
    task.cudaComputeLPC.SetArg(6, (uint)autocorPartCount);

    //task.cudaComputeLPCLattice.SetArg(0, task.cudaResidualTasks);
    //task.cudaComputeLPCLattice.SetArg(1, (uint)task.nResidualTasksPerChannel);
    //task.cudaComputeLPCLattice.SetArg(2, task.cudaSamples);
    //task.cudaComputeLPCLattice.SetArg(3, (uint)_windowcount);
    //task.cudaComputeLPCLattice.SetArg(4, (uint)eparams.max_prediction_order);
    //task.cudaComputeLPCLattice.SetArg(5, task.cudaLPCData);
    //cuda.SetFunctionBlockShape(task.cudaComputeLPCLattice, 256, 1, 1);

    task.cudaQuantizeLPC.SetArg(0, task.cudaResidualTasks);
    task.cudaQuantizeLPC.SetArg(1, (uint)task.nResidualTasksPerChannel);
    task.cudaQuantizeLPC.SetArg(2, (uint)task.nTasksPerWindow);
    task.cudaQuantizeLPC.SetArg(3, task.cudaLPCData);
    task.cudaQuantizeLPC.SetArg(4, (uint)eparams.max_prediction_order);
    task.cudaQuantizeLPC.SetArg(5, (uint)eparams.lpc_min_precision_search);
    task.cudaQuantizeLPC.SetArg(6, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search));

    task.cudaCopyBestMethod.SetArg(0, task.cudaBestResidualTasks);
    task.cudaCopyBestMethod.SetArg(1, task.cudaResidualTasks);
    task.cudaCopyBestMethod.SetArg(2, (uint)task.nResidualTasksPerChannel);

    task.cudaCopyBestMethodStereo.SetArg(0, task.cudaBestResidualTasks);
    task.cudaCopyBestMethodStereo.SetArg(1, task.cudaResidualTasks);
    task.cudaCopyBestMethodStereo.SetArg(2, (uint)task.nResidualTasksPerChannel);

    //task.cudaEncodeResidual.SetArg(0, task.cudaResidual);
    //task.cudaEncodeResidual.SetArg(1, task.cudaSamples);
    //task.cudaEncodeResidual.SetArg(2, task.cudaBestResidualTasks);
    //cuda.SetFunctionBlockShape(task.cudaEncodeResidual, residualPartSize, 1, 1);

    //cudaCalcPartition.SetArg(0, task.cudaPartitions);
    //cudaCalcPartition.SetArg(1, task.cudaResidual);
    //cudaCalcPartition.SetArg(2, task.cudaSamples);
    //cudaCalcPartition.SetArg(3, task.cudaBestResidualTasks);
    //cudaCalcPartition.SetArg(4, (uint)max_porder);
    //cudaCalcPartition.SetArg(5, (uint)calcPartitionPartSize);
    //cudaCalcPartition.SetArg(6, (uint)calcPartitionPartCount);
    //cuda.SetFunctionBlockShape(cudaCalcPartition, 16, 16, 1);

    //task.cudaSumPartition.SetArg(0, task.cudaPartitions);
    //task.cudaSumPartition.SetArg(1, (uint)max_porder);
    //cuda.SetFunctionBlockShape(task.cudaSumPartition, Math.Max(32, 1 << (max_porder - 1)), 1, 1);

    //task.cudaFindRiceParameter.SetArg(0, task.cudaRiceParams);
    //task.cudaFindRiceParameter.SetArg(1, task.cudaPartitions);
    //task.cudaFindRiceParameter.SetArg(2, (uint)max_porder);
    //cuda.SetFunctionBlockShape(task.cudaFindRiceParameter, 32, 8, 1);

    //task.cudaFindPartitionOrder.SetArg(0, task.cudaBestRiceParams);
    //task.cudaFindPartitionOrder.SetArg(1, task.cudaBestResidualTasks);
    //task.cudaFindPartitionOrder.SetArg(2, task.cudaRiceParams);
    //task.cudaFindPartitionOrder.SetArg(3, (uint)max_porder);
    //cuda.SetFunctionBlockShape(task.cudaFindPartitionOrder, 256, 1, 1);


    // ---- issue work to the GPU; a barrier separates each dependent stage ----
    task.openCLCQ.EnqueueBarrier();
    task.openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 1, null, new int[] { task.frameCount * task.frameSize }, null);
    //task.openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 1, null, new int[] { 64 * 128 }, new int[] { 128 });
    //cuda.SetFunctionBlockShape(cudaChannelDecorr, 256, 1, 1);
    //cuda.LaunchAsync(cudaChannelDecorr, (task.frameCount * task.frameSize + 255) / 256, channels == 2 ? 1 : channels, task.stream);

    if (eparams.do_wasted)
    {
        task.openCLCQ.EnqueueBarrier();
        task.EnqueueFindWasted(channelsCount);
    }

    // geometry???
    task.openCLCQ.EnqueueBarrier();
    task.EnqueueComputeAutocor(autocorPartCount, channelsCount, cudaWindow, eparams.max_prediction_order);

    //float* autoc = stackalloc float[1024];
    //task.openCLCQ.EnqueueBarrier();
    //task.openCLCQ.EnqueueReadBuffer(task.cudaAutocorOutput, true, 0, sizeof(float) * 1024, (IntPtr)autoc);

    task.openCLCQ.EnqueueBarrier();
    task.openCLCQ.EnqueueNDRangeKernel(task.cudaComputeLPC, 2, null, new int[] { task.nAutocorTasksPerChannel * 32, channelsCount * task.frameCount }, new int[] { 32, 1 });
    //cuda.SetFunctionBlockShape(task.cudaComputeLPC, 32, 1, 1);

    //float* lpcs = stackalloc float[1024];
    //task.openCLCQ.EnqueueBarrier();
    //task.openCLCQ.EnqueueReadBuffer(task.cudaLPCData, true, 0, sizeof(float) * 1024, (IntPtr)lpcs);

    task.openCLCQ.EnqueueBarrier();
    task.openCLCQ.EnqueueNDRangeKernel(task.cudaQuantizeLPC, 2, null, new int[] { task.nAutocorTasksPerChannel * 32, channelsCount * task.frameCount * 4 }, new int[] { 32, 4 });
    //cuda.SetFunctionBlockShape(task.cudaQuantizeLPC, 32, 4, 1);

    task.openCLCQ.EnqueueBarrier();
    task.EnqueueEstimateResidual(channelsCount, eparams.max_prediction_order);

    //int* rr = stackalloc int[1024];
    //task.openCLCQ.EnqueueBarrier();
    //task.openCLCQ.EnqueueReadBuffer(task.cudaResidualOutput, true, 0, sizeof(int) * 1024, (IntPtr)rr);

    task.openCLCQ.EnqueueBarrier();
    task.EnqueueChooseBestMethod(channelsCount);

    task.openCLCQ.EnqueueBarrier();
    // Stereo input with mid/side candidates uses the 4-channel copy kernel.
    if (channels == 2 && channelsCount == 4)
        task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethodStereo, 2, null, new int[] { 64, task.frameCount }, new int[] { 64, 1 });
    //cuda.SetFunctionBlockShape(task.cudaCopyBestMethodStereo, 64, 1, 1);
    else
        task.openCLCQ.EnqueueNDRangeKernel(task.cudaCopyBestMethod, 2, null, new int[] { 64, channels * task.frameCount }, new int[] { 64, 1 });
    //cuda.SetFunctionBlockShape(task.cudaCopyBestMethod, 64, 1, 1);
    //if (_settings.GPUOnly)
    //{
    //    int bsz = calcPartitionPartCount * calcPartitionPartSize;
    //    if (cudaCalcPartition.Pointer == task.cudaCalcLargePartition.Pointer)
    //        cuda.LaunchAsync(task.cudaEncodeResidual, residualPartCount, channels * task.frameCount, task.stream);
    //    cuda.LaunchAsync(cudaCalcPartition, (task.frameSize + bsz - 1) / bsz, channels * task.frameCount, task.stream);
    //    if (max_porder > 0)
    //        cuda.LaunchAsync(task.cudaSumPartition, Flake.MAX_RICE_PARAM + 1, channels * task.frameCount, task.stream);
    //    cuda.LaunchAsync(task.cudaFindRiceParameter, ((2 << max_porder) + 31) / 32, channels * task.frameCount, task.stream);
    //    //if (max_porder > 0) // need to run even if max_porder==0 just to calculate the final frame size
    //    cuda.LaunchAsync(task.cudaFindPartitionOrder, 1, channels * task.frameCount, task.stream);
    //    cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * MAX_BLOCKSIZE * channels), task.stream);
    //    cuda.CopyDeviceToHostAsync(task.cudaBestRiceParams, task.bestRiceParamsPtr, (uint)(sizeof(int) * (1 << max_porder) * channels * task.frameCount), task.stream);
    //    task.max_porder = max_porder;
    //}
    task.openCLCQ.EnqueueBarrier();
    // Non-blocking read-back; completion is awaited later via openCLCQ.Finish().
    task.openCLCQ.EnqueueReadBuffer(task.cudaBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * task.frameCount, task.bestResidualTasksPtr.AddrOfPinnedObject());
    //task.openCLCQ.EnqueueBarrier();
    //task.openCLCQ.EnqueueReadBuffer(task.cudaResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.residualTasksPtr.AddrOfPinnedObject());
    //task.openCLCQ.EnqueueBarrier();
}
+
/// <summary>
/// Copy channel-interleaved 16-bit input samples into the current frame's
/// per-channel sample buffers, applying the selected stereo decorrelation
/// and shifting out each subframe's wasted bits.
/// </summary>
/// <param name="task">Task whose current frame's subframes receive the samples.</param>
/// <param name="count">Number of samples per channel to unpack.</param>
unsafe void unpack_samples(FLACCLTask task, int count)
{
    int iFrame = task.frame.frame_number;
    short* src = ((short*)task.samplesBytesPtr.AddrOfPinnedObject()) + iFrame * channels * task.frameSize;

    switch (task.frame.ch_mode)
    {
        case ChannelMode.NotStereo:
            for (int ch = 0; ch < channels; ch++)
            {
                int* s = task.frame.subframes[ch].samples;
                int wbits = (int)task.frame.subframes[ch].wbits;
                for (int i = 0; i < count; i++)
                    // BUGFIX: was "src[...] >>= wbits", which compound-assigned
                    // back into the pinned source buffer, corrupting the input
                    // later used by verification in process_result. A plain
                    // shift (as in every other case below) leaves src intact.
                    s[i] = src[i * channels + ch] >> wbits;
            }
            break;
        case ChannelMode.LeftRight:
            {
                int* left = task.frame.subframes[0].samples;
                int* right = task.frame.subframes[1].samples;
                int lwbits = (int)task.frame.subframes[0].wbits;
                int rwbits = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = *(src++);
                    int r = *(src++);
                    left[i] = l >> lwbits;
                    right[i] = r >> rwbits;
                }
                break;
            }
        case ChannelMode.LeftSide:
            {
                int* left = task.frame.subframes[0].samples;
                int* right = task.frame.subframes[1].samples;
                int lwbits = (int)task.frame.subframes[0].wbits;
                int rwbits = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = *(src++);
                    int r = *(src++);
                    left[i] = l >> lwbits;
                    // Side channel is the left/right difference.
                    right[i] = (l - r) >> rwbits;
                }
                break;
            }
        case ChannelMode.RightSide:
            {
                int* left = task.frame.subframes[0].samples;
                int* right = task.frame.subframes[1].samples;
                int lwbits = (int)task.frame.subframes[0].wbits;
                int rwbits = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = *(src++);
                    int r = *(src++);
                    left[i] = (l - r) >> lwbits;
                    right[i] = r >> rwbits;
                }
                break;
            }
        case ChannelMode.MidSide:
            {
                int* left = task.frame.subframes[0].samples;
                int* right = task.frame.subframes[1].samples;
                int lwbits = (int)task.frame.subframes[0].wbits;
                int rwbits = (int)task.frame.subframes[1].wbits;
                for (int i = 0; i < count; i++)
                {
                    int l = *(src++);
                    int r = *(src++);
                    // Mid is the average (the extra >> 1 is folded into the shift).
                    left[i] = (l + r) >> (1 + lwbits);
                    right[i] = (l - r) >> rwbits;
                }
                break;
            }
    }
}
+
// Encodes one frame on the CPU from the GPU-selected methods: initializes the
// subframes over the pinned sample/residual buffers, adopts the best methods,
// finishes residual coding, and writes the framed bitstream to the task's
// writer. Returns the encoded frame length in bytes.
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FLACCLTask task, int current_frame_number)
{
    task.frame.InitSize(task.frameSize, eparams.variable_block_size != 0);
    task.frame.frame_number = iFrame;
    task.frame.ch_mode = ChannelMode.NotStereo;

    fixed (int* smp = task.samplesBuffer)
    {
        for (int ch = 0; ch < channelCount; ch++)
            task.frame.subframes[ch].Init(
                smp + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize,
                ((int*)task.residualBufferPtr.AddrOfPinnedObject()) + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize,
                // Channel 3 (the side channel when doing mid/side) needs one extra bit.
                _pcm.BitsPerSample + (doMidside && ch == 3 ? 1 : 0), 0);

        select_best_methods(task.frame, channelCount, iFrame, task);
        //unpack_samples(task);
        encode_residual(task);

        // Switch to the externally visible frame number before serializing.
        //task.frame.writer.Reset();
        task.frame.frame_number = current_frame_number;
        task.frame.writer_offset = task.frame.writer.Length;

        output_frame_header(task.frame);
        output_subframes(task.frame);
        output_frame_footer(task.frame);
        // max_frame_size bounds the verbatim encoding, so exceeding it means a bug.
        if (task.frame.writer.Length - task.frame.writer_offset >= max_frame_size)
            throw new Exception("buffer overflow");

        return task.frame.writer.Length - task.frame.writer_offset;
    }
}
+
// Records the frame layout for this batch and starts an asynchronous upload of
// the interleaved 16-bit samples from the task's pinned buffer to the device.
unsafe void send_to_GPU(FLACCLTask task, int nFrames, int blocksize)
{
    bool doMidside = channels == 2 && eparams.do_midside;
    int channelsCount = doMidside ? 2 * channels : channels;
    // A block size change invalidates the per-task subframe layout; forcing
    // nResidualTasks to 0 makes run_GPU_task re-initialize it.
    if (blocksize != task.frameSize)
        task.nResidualTasks = 0;
    task.frameCount = nFrames;
    task.frameSize = blocksize;
    // With variable block size frames are numbered by sample position.
    task.frameNumber = eparams.variable_block_size > 0 ? frame_pos : frame_count;
    task.framePos = frame_pos;
    frame_count += nFrames;
    frame_pos += nFrames * blocksize;
    // Non-blocking write; ordering with later kernels is enforced by the barrier.
    task.openCLCQ.EnqueueWriteBuffer(task.cudaSamplesBytes, false, 0, sizeof(short) * channels * blocksize * nFrames, task.samplesBytesPtr.AddrOfPinnedObject());
    task.openCLCQ.EnqueueBarrier();
}
+
// Prepares per-frame-size state (window functions, subframe task layout) and
// launches the GPU estimation pipeline for the task's current batch of frames.
unsafe void run_GPU_task(FLACCLTask task)
{
    bool doMidside = channels == 2 && eparams.do_midside;
    int channelsCount = doMidside ? 2 * channels : channels;

    // Recompute and upload the LPC window functions whenever the frame size
    // changes (each calculate_window call increments _windowcount if enabled).
    if (task.frameSize != _windowsize && task.frameSize > 4)
        fixed (float* window = windowBuffer)
        {
            _windowsize = task.frameSize;
            _windowcount = 0;
            calculate_window(window, lpc.window_welch, WindowFunction.Welch);
            calculate_window(window, lpc.window_flattop, WindowFunction.Flattop);
            calculate_window(window, lpc.window_tukey, WindowFunction.Tukey);
            calculate_window(window, lpc.window_hann, WindowFunction.Hann);
            calculate_window(window, lpc.window_bartlett, WindowFunction.Bartlett);
            if (_windowcount == 0)
                throw new Exception("invalid windowfunction");
            // Blocking write: windowBuffer is reused once this returns.
            task.openCLCQ.EnqueueWriteBuffer(cudaWindow, true, 0, sizeof(float) * windowBuffer.Length, (IntPtr)window);
            task.openCLCQ.EnqueueBarrier();
        }
    // Layout was invalidated by send_to_GPU when the block size changed.
    if (task.nResidualTasks == 0)
        initializeSubframeTasks(task.frameSize, channelsCount, max_frames, task);

    estimate_residual(task, channelsCount);
}
+
// CPU half of the pipeline: serializes every frame of the finished GPU batch,
// optionally verifies each encoded frame by decoding it and comparing against
// the original interleaved input, and fills in seek table entries that fall
// inside this batch. On return task.outputSize holds the total encoded bytes.
unsafe void process_result(FLACCLTask task)
{
    bool doMidside = channels == 2 && eparams.do_midside;
    int channelCount = doMidside ? 2 * channels : channels;

    long iSample = 0;
    long iByte = 0;
    task.frame.writer.Reset();
    task.frame.writer_offset = 0;
    for (int iFrame = 0; iFrame < task.frameCount; iFrame++)
    {
        //if (0 != eparams.variable_block_size && 0 == (task.blocksize & 7) && task.blocksize >= 128)
        //    fs = encode_frame_vbs();
        //else
        int fn = task.frameNumber + (eparams.variable_block_size > 0 ? (int)iSample : iFrame);
        int fs = encode_frame(doMidside, channelCount, iFrame, task, fn);

        if (task.verify != null)
        {
            // Decode the frame we just wrote and check byte count and samples.
            int decoded = task.verify.DecodeFrame(task.frame.writer.Buffer, task.frame.writer_offset, fs);
            if (decoded != fs || task.verify.Remaining != task.frameSize)
                throw new Exception("validation failed! frame size mismatch");
            fixed (int* r = task.verify.Samples)
            {
                for (int ch = 0; ch < channels; ch++)
                {
                    // Compare decoded samples against the original interleaved input.
                    short* res = ((short*)task.samplesBytesPtr.AddrOfPinnedObject()) + iFrame * channels * task.frameSize + ch;
                    int* smp = r + ch * Flake.MAX_BLOCKSIZE;
                    for (int i = task.frameSize; i > 0; i--)
                    {
                        //if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
                        if (*res != *(smp++))
                            throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", iFrame, ch));
                        res += channels;
                    }
                }
            }
        }

        if (seek_table != null && _IO.CanSeek)
        {
            // Snap each pending seek point inside this frame to the frame start;
            // offsets are still batch-relative and get rebased in write_result.
            for (int sp = 0; sp < seek_table.Length; sp++)
            {
                if (seek_table[sp].framesize != 0)
                    continue;
                if (seek_table[sp].number >= task.framePos + iSample + task.frameSize)
                    break;
                if (seek_table[sp].number >= task.framePos + iSample)
                {
                    seek_table[sp].number = task.framePos + iSample;
                    seek_table[sp].offset = iByte;
                    seek_table[sp].framesize = task.frameSize;
                }
            }
        }

        //Array.Copy(task.frame.buffer, 0, task.outputBuffer, iByte, fs);

        iSample += task.frameSize;
        iByte += fs;
    }
    task.outputSize = (int)iByte;
    if (iByte != task.frame.writer.Length)
        throw new Exception("invalid length");
}
+
+ unsafe void write_result(FLACCLTask task)
+ {
+ int iSample = task.frameSize * task.frameCount;
+
+ if (seek_table != null && _IO.CanSeek)
+ for (int sp = 0; sp < seek_table.Length; sp++)
+ {
+ if (seek_table[sp].number >= task.framePos + iSample)
+ break;
+ if (seek_table[sp].number >= task.framePos)
+ seek_table[sp].offset += _IO.Position - first_frame_offset;
+ }
+ _IO.Write(task.outputBuffer, 0, task.outputSize);
+ _position += iSample;
+ _totalSize += task.outputSize;
+ }
+
/// <summary>
/// One-time initialization: creates the OpenCL context, compiles the embedded
/// kernel source, writes the stream header, and allocates the GPU/CPU task
/// objects. Idempotent — does nothing after the first successful call.
/// </summary>
public unsafe void InitTasks()
{
    bool doMidside = channels == 2 && eparams.do_midside;
    int channelCount = doMidside ? 2 * channels : channels;

    if (!inited)
    {
        if (OpenCL.NumberOfPlatforms < 1)
            throw new Exception("no opencl platforms found");

        OCLMan = new OpenCLManager();
        // Attempt to save binaries after compilation, as well as load precompiled binaries
        // to avoid compilation. Usually you'll want this to be true.
        OCLMan.AttemptUseBinaries = false; // true;
        // Attempt to compile sources. This should probably be true for almost all projects.
        // Setting it to false means that when you attempt to compile "mysource.cl", it will
        // only scan the precompiled binary directory for a binary corresponding to a source
        // with that name. There's a further restriction that the compiled binary also has to
        // use the same Defines and BuildOptions
        OCLMan.AttemptUseSource = true;
        // If true, RequireImageSupport will filter out any devices without image support.
        // In this project we don't need image support though, so we set it to false.
        OCLMan.RequireImageSupport = false;
        // The Defines string gets prepended to any and all sources that are compiled;
        // used here to pass the maximum LPC order to the kernel source.
        OCLMan.Defines = "#define MAX_ORDER " + eparams.max_prediction_order.ToString();
        // The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
        OCLMan.BuildOptions = "";

        OCLMan.CreateDefaultContext(0, DeviceType.GPU);

        openCLContext = OCLMan.Context;

        // Compile the kernel source embedded as an assembly resource.
        using (Stream kernel = GetType().Assembly.GetManifestResourceStream(GetType(), "flac.cl"))
        using (StreamReader sr = new StreamReader(kernel))
        {
            try
            {
                openCLProgram = OCLMan.CompileSource(sr.ReadToEnd());
            }
            catch (OpenCLBuildException ex)
            {
                // Kept in a local so the build log is easy to inspect in a debugger.
                string buildLog = ex.BuildLogs[0];
                // "throw;" (not "throw ex;") preserves the original stack trace.
                throw;
            }
        }

        if (_IO == null)
            _IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
        int header_size = flake_encode_init();
        _IO.Write(header, 0, header_size);
        if (_IO.CanSeek)
            first_frame_offset = _IO.Position;

        // Two tasks are double-buffered for the GPU pipeline; optional extra
        // tasks feed the CPU worker threads.
        task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
        task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
        if (_settings.CPUThreads > 0)
        {
            cpu_tasks = new FLACCLTask[_settings.CPUThreads];
            for (int i = 0; i < cpu_tasks.Length; i++)
                cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
        }
        cudaWindow = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY, sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);

        inited = true;
    }
}
+
// Public encode entry point: appends the incoming interleaved audio bytes to
// task1's pinned staging buffer and flushes whole groups of frames through the
// GPU pipeline whenever enough samples have accumulated. Also feeds the MD5
// running hash with the raw input bytes.
public unsafe void Write(AudioBuffer buff)
{
    InitTasks();
    buff.Prepare(this);
    int pos = 0;
    while (pos < buff.Length)
    {
        // Take as much as fits into the current group of max_frames blocks.
        int block = Math.Min(buff.Length - pos, eparams.block_size * max_frames - samplesInBuffer);

        fixed (byte* buf = buff.Bytes)
            AudioSamples.MemCpy(((byte*)task1.samplesBytesPtr.AddrOfPinnedObject()) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign);

        samplesInBuffer += block;
        pos += block;

        int nFrames = samplesInBuffer / eparams.block_size;
        if (nFrames >= max_frames)
            do_output_frames(nFrames);
    }
    if (md5 != null)
        md5.TransformBlock(buff.Bytes, 0, buff.ByteLength, null, 0);
}
+
// Blocks until the oldest CPU worker task has either finished or failed.
// A worker failure is rethrown on the calling thread.
public void wait_for_cpu_task()
{
    FLACCLTask task = cpu_tasks[oldest_cpu_task];
    // The worker thread is created lazily; nothing to wait for before first use.
    if (task.workThread == null)
        return;
    lock (task)
    {
        while (!task.done && task.exception == null)
            Monitor.Wait(task);
        if (task.exception != null)
            // NOTE(review): rethrowing the stored exception object resets its
            // original (worker-thread) stack trace; acceptable here since the
            // exception type/message is what matters to the caller.
            throw task.exception;
    }
}
+
// Body of a CPU worker thread. Waits until start_cpu_task marks its task as
// having work (done == false), runs process_result, then signals completion.
// Exits when task.exit is set; any exception is stored on the task and
// rethrown by wait_for_cpu_task on the coordinating thread.
public void cpu_task_thread(object param)
{
    FLACCLTask task = param as FLACCLTask;
    try
    {
        while (true)
        {
            lock (task)
            {
                // Sleep while there is no pending work and no shutdown request.
                while (task.done && !task.exit)
                    Monitor.Wait(task);
                if (task.exit)
                    return;
            }
            // Run outside the lock: encoding is the expensive part.
            process_result(task);
            lock (task)
            {
                task.done = true;
                Monitor.Pulse(task);
            }
        }
    }
    catch (Exception ex)
    {
        // Publish the failure so the waiting thread can observe and rethrow it.
        lock (task)
        {
            task.exception = ex;
            Monitor.Pulse(task);
        }
    }
}
+
// Dispatches the oldest CPU task slot: spins up its worker thread on first
// use, otherwise wakes the already-running worker to pick up the new work.
public void start_cpu_task()
{
    FLACCLTask pending = cpu_tasks[oldest_cpu_task];
    if (pending.workThread != null)
    {
        // Worker already exists: flag new work and signal it.
        lock (pending)
        {
            pending.done = false;
            Monitor.Pulse(pending);
        }
        return;
    }

    // First use of this slot: create a background worker bound to the task.
    pending.done = false;
    pending.exit = false;
    Thread worker = new Thread(cpu_task_thread);
    worker.IsBackground = true;
    pending.workThread = worker;
    worker.Start(pending);
}
+
// Drives the double-buffered pipeline for one group of frames:
// uploads task1's samples, waits for task2's earlier GPU work, launches
// task1's GPU work, then disposes of task2's finished batch either via the
// CPU worker ring (if configured) or inline. Finally carries any leftover
// samples over and swaps task1/task2.
public unsafe void do_output_frames(int nFrames)
{
    send_to_GPU(task1, nFrames, eparams.block_size);
    // Make sure task2's previously enqueued GPU work (including the read-back
    // of its best residual tasks) has completed before we consume it.
    if (task2.frameCount > 0)
        task2.openCLCQ.Finish();
    run_GPU_task(task1);
    if (task2.frameCount > 0)
    {
        if (cpu_tasks != null)
        {
            // Rotate task2 into the CPU worker ring: take out the oldest
            // finished task, hand task2 to its worker, and write out the
            // batch we just took (if it carried one).
            wait_for_cpu_task();

            FLACCLTask ttmp = cpu_tasks[oldest_cpu_task];
            cpu_tasks[oldest_cpu_task] = task2;
            task2 = ttmp;

            start_cpu_task();

            oldest_cpu_task = (oldest_cpu_task + 1) % cpu_tasks.Length;

            if (task2.frameCount > 0)
                write_result(task2);
        }
        else
        {
            process_result(task2);
            write_result(task2);
        }
    }
    int bs = eparams.block_size * nFrames;
    samplesInBuffer -= bs;
    // Partial trailing samples move to the buffer that becomes task1 after the swap.
    if (samplesInBuffer > 0)
        AudioSamples.MemCpy(
            ((byte*)task2.samplesBytesPtr.AddrOfPinnedObject()),
            ((byte*)task1.samplesBytesPtr.AddrOfPinnedObject()) + bs * _pcm.BlockAlign,
            samplesInBuffer * _pcm.BlockAlign);
    FLACCLTask tmp = task1;
    task1 = task2;
    task2 = tmp;
    task1.frameCount = 0;
}
+
// Output file path as supplied to the encoder.
public string Path { get { return _path; } }

// Encoder vendor string written into the Vorbis comment metadata block.
public static readonly string vendor_string = "FLACCL#.91";
+
// Chooses a frame block size for the given sample rate and target frame
// duration. In variable-block-size mode, returns the largest power of two
// (>= 512) not exceeding the target; otherwise returns the largest standard
// FLAC block size not exceeding the target.
int select_blocksize(int samplerate, int time_ms)
{
    // Target frame length in samples.
    int target = (samplerate * time_ms) / 1000;

    if (eparams.variable_block_size > 0)
    {
        // Double past the target, then halve back to the last power of two
        // that fit.
        int bs = 1024;
        while (bs <= target)
            bs <<= 1;
        return bs >> 1;
    }

    // Scan the standard table for the largest entry that still fits.
    int best = Flake.flac_blocksizes[1];
    foreach (int candidate in Flake.flac_blocksizes)
        if (candidate > best && candidate <= target)
            best = candidate;
    return best;
}
+
// Writes the 38-byte STREAMINFO metadata block (4-byte header + 34-byte body)
// into the header buffer at pos. The 16 MD5 bytes at the end of the body are
// left zeroed here — presumably patched when the stream is finalized (confirm
// against the close/finish path, which is outside this chunk).
void write_streaminfo(byte[] header, int pos, int last)
{
    Array.Clear(header, pos, 38);
    BitWriter bitwriter = new BitWriter(header, pos, 38);

    // metadata header
    bitwriter.writebits(1, last);
    bitwriter.writebits(7, (int)MetadataType.StreamInfo);
    bitwriter.writebits(24, 34);

    // Min block size: 0 when using variable block sizes.
    // NOTE(review): the FLAC spec requires min blocksize >= 16 — confirm this
    // is intentional for the variable-block-size mode.
    if (eparams.variable_block_size > 0)
        bitwriter.writebits(16, 0);
    else
        bitwriter.writebits(16, eparams.block_size);

    // Max block size, min frame size (0 = unknown), max frame size.
    bitwriter.writebits(16, eparams.block_size);
    bitwriter.writebits(24, 0);
    bitwriter.writebits(24, max_frame_size);
    bitwriter.writebits(20, sample_rate);
    bitwriter.writebits(3, channels - 1);
    bitwriter.writebits(5, bits_per_sample - 1);

    // total samples: 36-bit field written as 4 high bits (always 0 here,
    // counts above 2^32 are not produced) plus 32 low bits; 0 when unknown.
    if (sample_count > 0)
    {
        bitwriter.writebits(4, 0);
        bitwriter.writebits(32, sample_count);
    }
    else
    {
        bitwriter.writebits(4, 0);
        bitwriter.writebits(32, 0);
    }
    bitwriter.flush();
}
+
/**
 * Write a VORBIS_COMMENT metadata block to the byte array.
 * Only the vendor string is written; the user-comment count is zero.
 * Returns the total number of bytes written:
 * 4 (block header) + 4 (vendor length) + vendor + 4 (comment count).
 */
int write_vorbis_comment(byte[] comment, int pos, int last)
{
    BitWriter bitwriter = new BitWriter(comment, pos, 4);
    Encoding enc = new ASCIIEncoding();
    int vendor_len = enc.GetBytes(vendor_string, 0, vendor_string.Length, comment, pos + 8);

    // metadata header; block body length is vendor length field + vendor + count.
    bitwriter.writebits(1, last);
    bitwriter.writebits(7, (int)MetadataType.VorbisComment);
    bitwriter.writebits(24, vendor_len + 8);

    // Vendor string length, little-endian per the Vorbis comment spec.
    comment[pos + 4] = (byte)(vendor_len & 0xFF);
    comment[pos + 5] = (byte)((vendor_len >> 8) & 0xFF);
    comment[pos + 6] = (byte)((vendor_len >> 16) & 0xFF);
    comment[pos + 7] = (byte)((vendor_len >> 24) & 0xFF);
    // User comment list length: zero (little-endian 32-bit).
    comment[pos + 8 + vendor_len] = 0;
    comment[pos + 9 + vendor_len] = 0;
    comment[pos + 10 + vendor_len] = 0;
    comment[pos + 11 + vendor_len] = 0;
    bitwriter.flush();
    return vendor_len + 12;
}
+
// Writes the SEEKTABLE metadata block (18 bytes per seek point) and remembers
// its file offset so the table can be rewritten with real values later.
// Returns the number of bytes written.
int write_seekpoints(byte[] header, int pos, int last)
{
    // Offset of the table body, used to patch the placeholders on close.
    seek_table_offset = pos + 4;

    BitWriter bitwriter = new BitWriter(header, pos, 4 + 18 * seek_table.Length);

    // metadata header
    bitwriter.writebits(1, last);
    bitwriter.writebits(7, (int)MetadataType.Seektable);
    bitwriter.writebits(24, 18 * seek_table.Length);
    for (int i = 0; i < seek_table.Length; i++)
    {
        bitwriter.writebits64(Flake.FLAC__STREAM_METADATA_SEEKPOINT_SAMPLE_NUMBER_LEN, (ulong)seek_table[i].number);
        bitwriter.writebits64(Flake.FLAC__STREAM_METADATA_SEEKPOINT_STREAM_OFFSET_LEN, (ulong)seek_table[i].offset);
        bitwriter.writebits(Flake.FLAC__STREAM_METADATA_SEEKPOINT_FRAME_SAMPLES_LEN, seek_table[i].framesize);
    }
    bitwriter.flush();
    return 4 + 18 * seek_table.Length;
}
+
/**
 * Write a PADDING metadata block header to the byte array.
 * Only the 4-byte header is written; the padlen payload bytes are left
 * as-is (the header buffer is expected to be zero-initialized).
 * Returns the total block size: padlen + 4.
 */
int
write_padding(byte[] padding, int pos, int last, long padlen)
{
    BitWriter bitwriter = new BitWriter(padding, pos, 4);

    // metadata header
    bitwriter.writebits(1, last);
    bitwriter.writebits(7, (int)MetadataType.Padding);
    bitwriter.writebits(24, (int)padlen);
    // Flush so the header bytes actually land in the buffer — every other
    // metadata writer in this file flushes; without it the BitWriter may
    // still be holding the bits in its cache. (No-op if already flushed.)
    bitwriter.flush();

    return (int)padlen + 4;
}
+
// Assembles the complete stream header into the 'header' buffer:
// "fLaC" marker, STREAMINFO, optional SEEKTABLE, VORBIS_COMMENT, and
// optional PADDING. Returns the total header length in bytes.
int write_headers()
{
    int header_size = 0;
    int last = 0;

    // stream marker "fLaC"
    header[0] = 0x66;
    header[1] = 0x4C;
    header[2] = 0x61;
    header[3] = 0x43;
    header_size += 4;

    // streaminfo (always 38 bytes including its header)
    write_streaminfo(header, header_size, last);
    header_size += 38;

    // seek table (only when the output is seekable and a table was allocated)
    if (_IO.CanSeek && seek_table != null)
        header_size += write_seekpoints(header, header_size, last);

    // vorbis comment; it is the final block when no padding follows
    if (eparams.padding_size == 0) last = 1;
    header_size += write_vorbis_comment(header, header_size, last);

    // padding is always the last metadata block when present
    if (eparams.padding_size > 0)
    {
        last = 1;
        header_size += write_padding(header, header_size, last, eparams.padding_size);
    }

    return header_size;
}
+
// Validates stream parameters, chooses the block size and frame limits,
// allocates the seek table and header buffer, writes the headers into the
// buffer and initializes MD5. Returns the header length in bytes.
int flake_encode_init()
{
    int i, header_len;

    //if(flake_validate_params(s) < 0)

    ch_code = channels - 1;

    // find samplerate in table (indices 4..11 of flac_samplerates hold the
    // standard rates encodable in the frame header)
    for (i = 4; i < 12; i++)
    {
        if (sample_rate == Flake.flac_samplerates[i])
        {
            sr_code0 = i;
            break;
        }
    }

    // if not in table, samplerate is non-standard
    if (i == 12)
        throw new Exception("non-standard samplerate");

    // find bits-per-sample code in the standard table
    for (i = 1; i < 8; i++)
    {
        if (bits_per_sample == Flake.flac_bitdepths[i])
        {
            bps_code = i;
            break;
        }
    }
    if (i == 8)
        throw new Exception("non-standard bps");
    // FIXME: For now, only 16-bit encoding is supported
    if (bits_per_sample != 16)
        throw new Exception("non-standard bps");

    // block size: user-supplied, or derived from the target frame duration
    if (_blocksize == 0)
    {
        if (eparams.block_size == 0)
            eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms);
        _blocksize = eparams.block_size;
    }
    else
        eparams.block_size = _blocksize;

    // number of frames processed per GPU batch, bounded by buffer capacity
    max_frames = Math.Min(maxFrames, FLACCLWriter.MAX_BLOCKSIZE / eparams.block_size);

    // set maximum encoded frame size (if larger, re-encodes in verbatim mode);
    // the stereo bound accounts for one extra bit in the side channel
    if (channels == 2)
        max_frame_size = 16 + ((eparams.block_size * (int)(bits_per_sample + bits_per_sample + 1) + 7) >> 3);
    else
        max_frame_size = 16 + ((eparams.block_size * channels * (int)bits_per_sample + 7) >> 3);

    // one seek point per 10 seconds of audio
    if (_IO.CanSeek && eparams.do_seektable && sample_count > 0)
    {
        int seek_points_distance = sample_rate * 10;
        int num_seek_points = 1 + sample_count / seek_points_distance; // 1 seek point per 10 seconds
        // avoid a trailing point exactly at the end of the stream
        if (sample_count % seek_points_distance == 0)
            num_seek_points--;
        seek_table = new SeekPoint[num_seek_points];
        for (int sp = 0; sp < num_seek_points; sp++)
        {
            seek_table[sp].framesize = 0;
            seek_table[sp].offset = 0;
            seek_table[sp].number = sp * seek_points_distance;
        }
    }

    // output header bytes (sized for all metadata blocks plus padding)
    header = new byte[eparams.padding_size + 1024 + (seek_table == null ? 0 : seek_table.Length * 18)];
    header_len = write_headers();

    // initialize CRC & MD5
    if (_IO.CanSeek && _settings.DoMD5)
        md5 = new MD5CryptoServiceProvider();

    return header_len;
}
+ }
+
+ // Encoder tuning parameters for the FLACCL encoder, mirroring the Flake
+ // encoder's option set. Populate via flake_set_defaults() from a
+ // compression level; individual fields may be tweaked afterwards.
+ struct FlakeEncodeParams
+ {
+ // compression quality
+ // set by user prior to calling flake_encode_init
+ // standard values are 0 to 8
+ // 0 is lower compression, faster encoding
+ // 8 is higher compression, slower encoding
+ // extended values 9 to 12 are slower and/or use
+ // higher prediction orders
+ public int compression;
+
+ // stereo decorrelation
+ // set by user prior to calling flake_encode_init
+ // false = independent L+R channels only
+ // true = allow mid-side encoding
+ // (NOTE(review): an earlier comment listed int values 0..2, inherited
+ // from Flake's C code; here it is a plain bool toggle)
+ public bool do_midside;
+
+ // block size in samples
+ // set by the user prior to calling flake_encode_init
+ // if set to 0, a block size is chosen based on block_time_ms
+ // can also be changed by user before encoding a frame
+ public int block_size;
+
+ // block time in milliseconds
+ // set by the user prior to calling flake_encode_init
+ // used to calculate block_size based on sample rate
+ // can also be changed by user before encoding a frame
+ public int block_time_ms;
+
+ // padding size in bytes
+ // set by the user prior to calling flake_encode_init
+ // if set to less than 0, defaults to 4096
+ public long padding_size;
+
+ // minimum LPC order
+ // set by user prior to calling flake_encode_init
+ // if set to less than 0, it is chosen based on compression.
+ // valid values are 1 to 32
+ public int min_prediction_order;
+
+ // maximum LPC order
+ // set by user prior to calling flake_encode_init
+ // if set to less than 0, it is chosen based on compression.
+ // valid values are 1 to 32
+ public int max_prediction_order;
+
+ // number of LPC orders evaluated per analysis window;
+ // set per compression level by flake_set_defaults below
+ public int orders_per_window;
+
+ // minimum fixed prediction order
+ // set by user prior to calling flake_encode_init
+ // if set to less than 0, it is chosen based on compression.
+ // valid values are 0 to 4
+ public int min_fixed_order;
+
+ // maximum fixed prediction order
+ // set by user prior to calling flake_encode_init
+ // if set to less than 0, it is chosen based on compression.
+ // valid values are 0 to 4
+ public int max_fixed_order;
+
+ // minimum partition order
+ // set by user prior to calling flake_encode_init
+ // if set to less than 0, it is chosen based on compression.
+ // valid values are 0 to 8
+ public int min_partition_order;
+
+ // maximum partition order
+ // set by user prior to calling flake_encode_init
+ // if set to less than 0, it is chosen based on compression.
+ // valid values are 0 to 8
+ public int max_partition_order;
+
+ // whether to use variable block sizes
+ // set by user prior to calling flake_encode_init
+ // 0 = fixed block size
+ // 1 = variable block size
+ public int variable_block_size;
+
+ // whether to try various lpc_precisions
+ // 0 - use only one precision
+ // 1 - try two precisions
+ public int lpc_max_precision_search;
+
+ // lower bound of the LPC precision search range
+ public int lpc_min_precision_search;
+
+ // whether to detect and remove wasted (always-zero low-order) bits
+ public bool do_wasted;
+
+ // whether to consider CONSTANT subframes
+ public bool do_constant;
+
+ // bitmask of analysis window functions to apply
+ public WindowFunction window_function;
+
+ // whether to emit a SEEKTABLE metadata block
+ public bool do_seektable;
+
+ // Initializes all parameters from a compression level (0..12, or the
+ // special level 99). Returns 0 on success, -1 for an invalid level.
+ // encode_on_cpu == false raises max_partition_order to 8 for the GPU path.
+ public int flake_set_defaults(int lvl, bool encode_on_cpu)
+ {
+ compression = lvl;
+
+ if ((lvl < 0 || lvl > 12) && (lvl != 99))
+ {
+ return -1;
+ }
+
+ // baseline parameters; the switch below applies per-level overrides
+ // (NOTE(review): the original comments disagreed, calling this block
+ // "level 5 params" and the switch "differences from level 7")
+ window_function = WindowFunction.Flattop | WindowFunction.Tukey;
+ do_midside = true;
+ block_size = 0;
+ block_time_ms = 100;
+ min_fixed_order = 0;
+ max_fixed_order = 4;
+ min_prediction_order = 1;
+ max_prediction_order = 12;
+ min_partition_order = 0;
+ max_partition_order = 6;
+ variable_block_size = 0;
+ lpc_min_precision_search = 0;
+ lpc_max_precision_search = 0;
+ do_seektable = true;
+ do_wasted = true;
+ do_constant = true;
+
+ // per-level overrides of the baseline above
+ switch (lvl)
+ {
+ case 0:
+ do_constant = false;
+ do_wasted = false;
+ do_midside = false;
+ orders_per_window = 1;
+ max_partition_order = 4;
+ max_prediction_order = 7;
+ min_fixed_order = 2;
+ max_fixed_order = 2;
+ break;
+ case 1:
+ do_wasted = false;
+ do_midside = false;
+ window_function = WindowFunction.Bartlett;
+ orders_per_window = 1;
+ max_prediction_order = 12;
+ max_partition_order = 4;
+ break;
+ case 2:
+ do_constant = false;
+ window_function = WindowFunction.Bartlett;
+ // NOTE(review): min_fixed_order (3) > max_fixed_order (2) leaves an
+ // empty fixed-order search range — verify this is intentional
+ min_fixed_order = 3;
+ max_fixed_order = 2;
+ orders_per_window = 1;
+ max_prediction_order = 7;
+ max_partition_order = 4;
+ break;
+ case 3:
+ window_function = WindowFunction.Bartlett;
+ min_fixed_order = 2;
+ max_fixed_order = 2;
+ orders_per_window = 6;
+ max_prediction_order = 7;
+ max_partition_order = 4;
+ break;
+ case 4:
+ min_fixed_order = 2;
+ max_fixed_order = 2;
+ orders_per_window = 3;
+ max_prediction_order = 8;
+ max_partition_order = 4;
+ break;
+ case 5:
+ do_constant = false;
+ min_fixed_order = 2;
+ max_fixed_order = 2;
+ orders_per_window = 1;
+ break;
+ case 6:
+ min_fixed_order = 2;
+ max_fixed_order = 2;
+ orders_per_window = 3;
+ break;
+ case 7:
+ min_fixed_order = 2;
+ max_fixed_order = 2;
+ orders_per_window = 7;
+ break;
+ case 8:
+ orders_per_window = 12;
+ break;
+ case 9:
+ min_fixed_order = 2;
+ max_fixed_order = 2;
+ orders_per_window = 3;
+ max_prediction_order = 32;
+ break;
+ case 10:
+ min_fixed_order = 2;
+ max_fixed_order = 2;
+ orders_per_window = 7;
+ max_prediction_order = 32;
+ break;
+ case 11:
+ min_fixed_order = 2;
+ max_fixed_order = 2;
+ orders_per_window = 11;
+ max_prediction_order = 32;
+ break;
+ }
+
+ // NOTE(review): levels 12 and 99 have no case above, so orders_per_window
+ // keeps its prior value (0 in a freshly zeroed struct) — confirm intended
+ if (!encode_on_cpu)
+ max_partition_order = 8;
+
+ return 0;
+ }
+ }
+
+ // Host-side mirror of the FLACCLSubframeTask struct declared in flac.cl.
+ // NOTE(review): field order and total size (16 ints of header + 32 coefs)
+ // must stay byte-for-byte in sync with the GPU-side definition — this
+ // struct is copied to/from device buffers verbatim. Verify both sides
+ // whenever either is edited.
+ unsafe struct FLACCLSubframeTask
+ {
+ public int residualOrder;
+ public int samplesOffs;
+ public int shift;
+ public int cbits;
+ public int size;
+ public int type;
+ public int obits;
+ public int blocksize;
+ public int best_index;
+ public int channel;
+ public int residualOffs;
+ public int wbits;
+ public int abits;
+ public int porder;
+ // padding to keep the header a multiple of 16 bytes, matching flac.cl
+ public fixed int reserved[2];
+ // quantized prediction coefficients
+ public fixed int coefs[32];
+ };
+
+ // Per-worker OpenCL encoding context: owns one command queue, the compiled
+ // kernels, the device buffers, the pinned host staging buffers and an
+ // optional decoding verifier, plus the background worker thread.
+ // Dispose() must be called to release device memory and pinned handles.
+ internal class FLACCLTask
+ {
+ Program openCLProgram;
+ public CommandQueue openCLCQ;
+ public Kernel cudaStereoDecorr;
+ //public Kernel cudaChannelDecorr;
+ public Kernel cudaChannelDecorr2;
+ public Kernel cudaFindWastedBits;
+ public Kernel cudaComputeAutocor;
+ public Kernel cudaComputeLPC;
+ //public Kernel cudaComputeLPCLattice;
+ public Kernel cudaQuantizeLPC;
+ public Kernel cudaEstimateResidual;
+ public Kernel cudaChooseBestMethod;
+ public Kernel cudaCopyBestMethod;
+ public Kernel cudaCopyBestMethodStereo;
+ //public Kernel cudaEncodeResidual;
+ //public Kernel cudaCalcPartition;
+ //public Kernel cudaCalcPartition16;
+ //public Kernel cudaCalcLargePartition;
+ //public Kernel cudaSumPartition;
+ //public Kernel cudaFindRiceParameter;
+ //public Kernel cudaFindPartitionOrder;
+ public Mem cudaSamplesBytes;
+ public Mem cudaSamples;
+ public Mem cudaLPCData;
+ public Mem cudaResidual;
+ public Mem cudaPartitions;
+ public Mem cudaRiceParams;
+ public Mem cudaBestRiceParams;
+ public Mem cudaAutocorOutput;
+ public Mem cudaResidualTasks;
+ public Mem cudaResidualOutput;
+ public Mem cudaBestResidualTasks;
+ // Pinned host buffers used for host<->device transfers; freed in Dispose().
+ public GCHandle samplesBytesPtr;
+ public GCHandle residualBufferPtr;
+ public GCHandle bestRiceParamsPtr;
+ public GCHandle residualTasksPtr;
+ public GCHandle bestResidualTasksPtr;
+ public int[] samplesBuffer;
+ public byte[] outputBuffer;
+ public int outputSize = 0;
+ public int frameSize = 0;
+ public int frameCount = 0;
+ public int frameNumber = 0;
+ public int framePos = 0;
+ public FlacFrame frame;
+ public int residualTasksLen;
+ public int bestResidualTasksLen;
+ public int samplesBufferLen;
+ public int nResidualTasks = 0;
+ public int nResidualTasksPerChannel = 0;
+ public int nTasksPerWindow = 0;
+ public int nAutocorTasksPerChannel = 0;
+ //public int max_porder = 0;
+
+ // Optional verification decoder (created when do_verify is set).
+ public FlakeReader verify;
+
+ public Thread workThread = null;
+ public Exception exception = null;
+ public bool done = false;
+ public bool exit = false;
+
+ // Allocates all device and pinned host buffers sized for the worst case
+ // of FLACCLWriter.maxFrames frames of channelCount channels, and creates
+ // every kernel from the pre-built program.
+ unsafe public FLACCLTask(Program _openCLProgram, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify)
+ {
+ openCLProgram = _openCLProgram;
+ // NOTE(review): always selects the first GPU device; multi-GPU systems
+ // will only ever use device 0 — confirm this is acceptable.
+ Device[] openCLDevices = openCLProgram.Context.Platform.QueryDevices(DeviceType.GPU);
+ openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], CommandQueueProperties.PROFILING_ENABLE);
+
+ residualTasksLen = sizeof(FLACCLSubframeTask) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames;
+ bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channelCount * FLACCLWriter.maxFrames;
+ samplesBufferLen = sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channelCount;
+ int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FLACCLWriter.maxFrames;
+ int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FLACCLWriter.maxFrames;
+ int lpcDataLen = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelCount * FLACCLWriter.maxFrames;
+
+ // Device buffers. Every Mem created here must be released in Dispose().
+ cudaSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY | MemFlags.ALLOC_HOST_PTR, (uint)samplesBufferLen / 2);
+ cudaSamples = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen);
+ cudaResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen);
+ cudaLPCData = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, lpcDataLen);
+ cudaPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen);
+ cudaRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen);
+ cudaBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4);
+ cudaAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FLACCLWriter.maxAutocorParts + FLACCLWriter.maxFrames));
+ cudaResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen);
+ cudaBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen);
+ cudaResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FLACCLWriter.maxResidualParts*/ * FLACCLWriter.maxFrames);
+
+ // Pinned host staging buffers (must be freed in Dispose()).
+ samplesBytesPtr = GCHandle.Alloc(new byte[samplesBufferLen / 2], GCHandleType.Pinned);
+ residualBufferPtr = GCHandle.Alloc(new byte[samplesBufferLen], GCHandleType.Pinned);
+ bestRiceParamsPtr = GCHandle.Alloc(new byte[riceParamsLen / 4], GCHandleType.Pinned);
+ residualTasksPtr = GCHandle.Alloc(new byte[residualTasksLen], GCHandleType.Pinned);
+ bestResidualTasksPtr = GCHandle.Alloc(new byte[bestResidualTasksLen], GCHandleType.Pinned);
+
+ cudaComputeAutocor = openCLProgram.CreateKernel("cudaComputeAutocor");
+ cudaStereoDecorr = openCLProgram.CreateKernel("cudaStereoDecorr");
+ //cudaChannelDecorr = openCLProgram.CreateKernel("cudaChannelDecorr");
+ cudaChannelDecorr2 = openCLProgram.CreateKernel("cudaChannelDecorr2");
+ cudaFindWastedBits = openCLProgram.CreateKernel("cudaFindWastedBits");
+ cudaComputeLPC = openCLProgram.CreateKernel("cudaComputeLPC");
+ cudaQuantizeLPC = openCLProgram.CreateKernel("cudaQuantizeLPC");
+ //cudaComputeLPCLattice = openCLProgram.CreateKernel("cudaComputeLPCLattice");
+ cudaEstimateResidual = openCLProgram.CreateKernel("cudaEstimateResidual");
+ cudaChooseBestMethod = openCLProgram.CreateKernel("cudaChooseBestMethod");
+ cudaCopyBestMethod = openCLProgram.CreateKernel("cudaCopyBestMethod");
+ cudaCopyBestMethodStereo = openCLProgram.CreateKernel("cudaCopyBestMethodStereo");
+ //cudaEncodeResidual = openCLProgram.CreateKernel("cudaEncodeResidual");
+ //cudaCalcPartition = openCLProgram.CreateKernel("cudaCalcPartition");
+ //cudaCalcPartition16 = openCLProgram.CreateKernel("cudaCalcPartition16");
+ //cudaCalcLargePartition = openCLProgram.CreateKernel("cudaCalcLargePartition");
+ //cudaSumPartition = openCLProgram.CreateKernel("cudaSumPartition");
+ //cudaFindRiceParameter = openCLProgram.CreateKernel("cudaFindRiceParameter");
+ //cudaFindPartitionOrder = openCLProgram.CreateKernel("cudaFindPartitionOrder");
+
+ samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelCount];
+ outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1];
+ frame = new FlacFrame(channelCount);
+ frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length);
+
+ if (do_verify)
+ {
+ // NOTE(review): sample rate is hard-coded to 44100 for the verifier;
+ // it only compares decoded samples, but confirm this is safe for
+ // streams at other rates.
+ verify = new FlakeReader(new AudioPCMConfig((int)bits_per_sample, channels, 44100));
+ verify.DoCRC = false;
+ }
+ }
+
+ // Shuts down the worker thread, then releases all kernels, device
+ // buffers, pinned host memory and the command queue. Call exactly once.
+ public void Dispose()
+ {
+ if (workThread != null)
+ {
+ // NOTE(review): lock (this) is normally discouraged, but the worker
+ // thread (created elsewhere) synchronizes on this same instance, so
+ // it is kept as-is to match that external code.
+ lock (this)
+ {
+ exit = true;
+ Monitor.Pulse(this);
+ }
+ workThread.Join();
+ workThread = null;
+ }
+
+ cudaComputeAutocor.Dispose();
+ cudaStereoDecorr.Dispose();
+ //cudaChannelDecorr.Dispose();
+ cudaChannelDecorr2.Dispose();
+ cudaFindWastedBits.Dispose();
+ cudaComputeLPC.Dispose();
+ cudaQuantizeLPC.Dispose();
+ //cudaComputeLPCLattice.Dispose();
+ cudaEstimateResidual.Dispose();
+ cudaChooseBestMethod.Dispose();
+ cudaCopyBestMethod.Dispose();
+ cudaCopyBestMethodStereo.Dispose();
+ //cudaEncodeResidual.Dispose();
+ //cudaCalcPartition.Dispose();
+ //cudaCalcPartition16.Dispose();
+ //cudaCalcLargePartition.Dispose();
+ //cudaSumPartition.Dispose();
+ //cudaFindRiceParameter.Dispose();
+ //cudaFindPartitionOrder.Dispose();
+
+ cudaSamples.Dispose();
+ cudaSamplesBytes.Dispose();
+ cudaLPCData.Dispose();
+ cudaResidual.Dispose();
+ cudaPartitions.Dispose();
+ // FIX: cudaRiceParams and cudaBestRiceParams were allocated in the
+ // constructor but never released here, leaking device memory on every
+ // task teardown.
+ cudaRiceParams.Dispose();
+ cudaBestRiceParams.Dispose();
+ cudaAutocorOutput.Dispose();
+ cudaResidualTasks.Dispose();
+ cudaResidualOutput.Dispose();
+ cudaBestResidualTasks.Dispose();
+
+ samplesBytesPtr.Free();
+ residualBufferPtr.Free();
+ bestRiceParamsPtr.Free();
+ residualTasksPtr.Free();
+ bestResidualTasksPtr.Free();
+
+ openCLCQ.Dispose();
+ }
+
+ // Enqueues the wasted-bits scan: one 128-work-item group per
+ // (frame, channel) pair.
+ public void EnqueueFindWasted(int channelsCount)
+ {
+ cudaFindWastedBits.SetArg(0, cudaResidualTasks);
+ cudaFindWastedBits.SetArg(1, cudaSamples);
+ cudaFindWastedBits.SetArg(2, nResidualTasksPerChannel);
+
+ int workX = 128; // must match reqd_work_group_size in flac.cl
+ int grpX = frameCount * channelsCount;
+ //openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 128 }, new int[] { 128 });
+ //openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 128 }, null);
+ openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * workX }, new int[] { workX });
+
+ //openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 256 * 128 }, new int[] { 128 });
+ //openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * workX }, null);
+ //cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1);
+ //cuda.LaunchAsync(task.cudaFindWastedBits, channelsCount * task.frameCount, 1, task.stream);
+ }
+
+ // Enqueues the windowed autocorrelation kernel over all parts, windows,
+ // channels and frames of the current batch.
+ public void EnqueueComputeAutocor(int autocorPartCount, int channelsCount, Mem cudaWindow, int max_prediction_order)
+ {
+ cudaComputeAutocor.SetArg(0, cudaAutocorOutput);
+ cudaComputeAutocor.SetArg(1, cudaSamples);
+ cudaComputeAutocor.SetArg(2, cudaWindow);
+ cudaComputeAutocor.SetArg(3, cudaResidualTasks);
+ cudaComputeAutocor.SetArg(4, max_prediction_order);
+ cudaComputeAutocor.SetArg(5, (uint)nAutocorTasksPerChannel - 1);
+ cudaComputeAutocor.SetArg(6, (uint)nResidualTasksPerChannel);
+ int workX = autocorPartCount;
+ int workY = nAutocorTasksPerChannel * channelsCount * frameCount;
+ int ws = 32; // local size must match reqd_work_group_size(32, 4, 1)
+ int wy = 4;
+ openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * ws, workY * wy }, new int[] { ws, wy });
+ //openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 3, null, new int[] { workX * ws, workY, max_prediction_order + 1 }, new int[] { ws, 1, 1 });
+
+ //cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1);
+ //cuda.LaunchAsync(task.cudaComputeAutocor, autocorPartCount, task.nAutocorTasksPerChannel * channelsCount * task.frameCount, task.stream);
+ }
+
+ // Enqueues the residual-size estimation kernel: one 128-thread group per
+ // residual task.
+ public void EnqueueEstimateResidual(int channelsCount, int max_prediction_order)
+ {
+ cudaEstimateResidual.SetArg(0, cudaResidualOutput);
+ cudaEstimateResidual.SetArg(1, cudaSamples);
+ cudaEstimateResidual.SetArg(2, cudaResidualTasks);
+
+ int threads_x = 128;
+ int workX = threads_x;
+ int workY = nResidualTasksPerChannel * channelsCount * frameCount;
+
+ openCLCQ.EnqueueNDRangeKernel(cudaEstimateResidual, 2, null, new int[] { workX, workY }, new int[] { threads_x, 1 });
+ }
+
+ // Enqueues selection of the cheapest prediction method per channel/frame.
+ public void EnqueueChooseBestMethod(int channelsCount)
+ {
+ cudaChooseBestMethod.SetArg(0, cudaResidualTasks);
+ cudaChooseBestMethod.SetArg(1, cudaResidualOutput);
+ cudaChooseBestMethod.SetArg(2, (uint)nResidualTasksPerChannel);
+
+ int threadsY = 4;
+
+ openCLCQ.EnqueueNDRangeKernel(cudaChooseBestMethod, 2, null, new int[] { 32, channelsCount * frameCount * threadsY }, new int[] { 32, threadsY });
+ //cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 8, 1);
+ }
+
+ // Typed view of the pinned residual-task staging buffer.
+ public unsafe FLACCLSubframeTask* ResidualTasks
+ {
+ get
+ {
+ return (FLACCLSubframeTask*)residualTasksPtr.AddrOfPinnedObject();
+ }
+ }
+
+ // Typed view of the pinned best-residual-task staging buffer.
+ public unsafe FLACCLSubframeTask* BestResidualTasks
+ {
+ get
+ {
+ return (FLACCLSubframeTask*)bestResidualTasksPtr.AddrOfPinnedObject();
+ }
+ }
+ }
+}
diff --git a/CUETools.Codecs.FLACCL/OpenCLNet.dll b/CUETools.Codecs.FLACCL/OpenCLNet.dll
new file mode 100644
index 0000000..b6bf0cc
Binary files /dev/null and b/CUETools.Codecs.FLACCL/OpenCLNet.dll differ
diff --git a/CUETools.Codecs.FLACCL/OpenCLNet.pdb b/CUETools.Codecs.FLACCL/OpenCLNet.pdb
new file mode 100644
index 0000000..e3f941b
Binary files /dev/null and b/CUETools.Codecs.FLACCL/OpenCLNet.pdb differ
diff --git a/CUETools.Codecs.FLACCL/Properties/AssemblyInfo.cs b/CUETools.Codecs.FLACCL/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..83e2da4
--- /dev/null
+++ b/CUETools.Codecs.FLACCL/Properties/AssemblyInfo.cs
@@ -0,0 +1,35 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("CUETools.Codecs.FLACCL")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("CUETools.Codecs.FLACCL")]
+[assembly: AssemblyCopyright("Copyright © 2009-2010 Gregory S. Chudov")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components. If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("b28ffece-6c89-426b-b227-e647b435cc3d")]
+
+// Version information for an assembly consists of the following four values:
+//
+// Major Version
+// Minor Version
+// Build Number
+// Revision
+//
+// You can specify all the values or you can default the Revision and Build Numbers
+// by using the '*' as shown below:
+[assembly: AssemblyVersion("2.0.9.0")]
+[assembly: AssemblyFileVersion("2.0.9.0")]
diff --git a/CUETools.Codecs.FLACCL/Properties/Resources.Designer.cs b/CUETools.Codecs.FLACCL/Properties/Resources.Designer.cs
new file mode 100644
index 0000000..7b3d40a
--- /dev/null
+++ b/CUETools.Codecs.FLACCL/Properties/Resources.Designer.cs
@@ -0,0 +1,117 @@
+//------------------------------------------------------------------------------
+//
+// This code was generated by a tool.
+// Runtime Version:2.0.50727.4200
+//
+// Changes to this file may cause incorrect behavior and will be lost if
+// the code is regenerated.
+//
+//------------------------------------------------------------------------------
+
+namespace CUETools.Codecs.FLACCL.Properties {
+ using System;
+
+
+ ///
+ /// A strongly-typed resource class, for looking up localized strings, etc.
+ ///
+ // This class was auto-generated by the StronglyTypedResourceBuilder
+ // class via a tool like ResGen or Visual Studio.
+ // To add or remove a member, edit your .ResX file then rerun ResGen
+ // with the /str option, or rebuild your VS project.
+ [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "2.0.0.0")]
+ [global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
+ [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
+ internal class Resources {
+
+ private static global::System.Resources.ResourceManager resourceMan;
+
+ private static global::System.Globalization.CultureInfo resourceCulture;
+
+ [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")]
+ internal Resources() {
+ }
+
+ ///
+ /// Returns the cached ResourceManager instance used by this class.
+ ///
+ [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
+ internal static global::System.Resources.ResourceManager ResourceManager {
+ get {
+ if (object.ReferenceEquals(resourceMan, null)) {
+ global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("CUETools.Codecs.FLACCL.Properties.Resources", typeof(Resources).Assembly);
+ resourceMan = temp;
+ }
+ return resourceMan;
+ }
+ }
+
+ ///
+ /// Overrides the current thread's CurrentUICulture property for all
+ /// resource lookups using this strongly typed resource class.
+ ///
+ [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
+ internal static global::System.Globalization.CultureInfo Culture {
+ get {
+ return resourceCulture;
+ }
+ set {
+ resourceCulture = value;
+ }
+ }
+
+ ///
+ /// Looks up a localized string similar to Use additional CPU threads.
+ ///
+ internal static string DescriptionCPUThreads {
+ get {
+ return ResourceManager.GetString("DescriptionCPUThreads", resourceCulture);
+ }
+ }
+
+ ///
+ /// Looks up a localized string similar to Use GPU on all stages.
+ ///
+ internal static string DescriptionGPUOnly {
+ get {
+ return ResourceManager.GetString("DescriptionGPUOnly", resourceCulture);
+ }
+ }
+
+ ///
+ /// Looks up a localized string similar to Calculate MD5 hash for audio stream.
+ ///
+ internal static string DoMD5Description {
+ get {
+ return ResourceManager.GetString("DoMD5Description", resourceCulture);
+ }
+ }
+
+ ///
+ /// Looks up a localized string similar to Decode each frame and compare with original.
+ ///
+ internal static string DoVerifyDescription {
+ get {
+ return ResourceManager.GetString("DoVerifyDescription", resourceCulture);
+ }
+ }
+
+ ///
+ /// Looks up a localized string similar to Samples written differs from the expected sample count.
+ ///
+ internal static string ExceptionSampleCount {
+ get {
+ return ResourceManager.GetString("ExceptionSampleCount", resourceCulture);
+ }
+ }
+
+ ///
+ /// Looks up a localized string similar to Validation failed.
+ ///
+ internal static string ExceptionValidationFailed {
+ get {
+ return ResourceManager.GetString("ExceptionValidationFailed", resourceCulture);
+ }
+ }
+ }
+}
diff --git a/CUETools.Codecs.FLACCL/Properties/Resources.resx b/CUETools.Codecs.FLACCL/Properties/Resources.resx
new file mode 100644
index 0000000..1579dd9
--- /dev/null
+++ b/CUETools.Codecs.FLACCL/Properties/Resources.resx
@@ -0,0 +1,138 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ text/microsoft-resx
+
+
+ 2.0
+
+
+ System.Resources.ResXResourceReader, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089
+
+
+ System.Resources.ResXResourceWriter, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089
+
+
+ Use additional CPU threads
+
+
+ Use GPU on all stages
+
+
+ Calculate MD5 hash for audio stream
+
+
+ Decode each frame and compare with original
+
+
+ Samples written differs from the expected sample count
+
+
+ Validation failed
+
+
\ No newline at end of file
diff --git a/CUETools.Codecs.FLACCL/Properties/Resources.ru-RU.resx b/CUETools.Codecs.FLACCL/Properties/Resources.ru-RU.resx
new file mode 100644
index 0000000..5f87a8e
--- /dev/null
+++ b/CUETools.Codecs.FLACCL/Properties/Resources.ru-RU.resx
@@ -0,0 +1,138 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ text/microsoft-resx
+
+
+ 2.0
+
+
+ System.Resources.ResXResourceReader, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089
+
+
+ System.Resources.ResXResourceWriter, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089
+
+
+ Использовать дополнительные потоки
+
+
+ Использовать GPU на всех стадиях
+
+
+ Вычислять MD5-хеш аудиопотока
+
+
+ Декодировать каждый блок и сравнивать с оригиналом
+
+
+ Количество записанных сэмплов отличается от ожидавшегося
+
+
+ Ошибка верификации
+
+
\ No newline at end of file
diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl
new file mode 100644
index 0000000..6c30b50
--- /dev/null
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -0,0 +1,1275 @@
+/**
+ * CUETools.FLACCL: FLAC audio encoder using OpenCL
+ * Copyright (c) 2009 Gregory S. Chudov
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _FLACCL_KERNEL_H_
+#define _FLACCL_KERNEL_H_
+
+// Subframe coding methods.
+// NOTE(review): the numeric values (0, 1, 8, 32) appear chosen to match the
+// FLAC subframe-header type codes — confirm against the bitstream writer.
+typedef enum
+{
+ Constant = 0,
+ Verbatim = 1,
+ Fixed = 8,
+ LPC = 32
+} SubframeType;
+
+// Per-subframe task header shared between host and device.
+// NOTE(review): layout must stay byte-for-byte in sync with the C#
+// FLACCLSubframeTask struct on the host side (14 fields + 2 reserved ints,
+// then 32 coefs) — verify both definitions when changing either.
+typedef struct
+{
+ int residualOrder; // <= 32
+ int samplesOffs; // offset of this channel's samples in the samples buffer
+ int shift;
+ int cbits;
+ int size;
+ int type;
+ int obits;
+ int blocksize;
+ int best_index;
+ int channel;
+ int residualOffs;
+ int wbits; // wasted bits (see cudaFindWastedBits)
+ int abits; // significant magnitude bits (see cudaFindWastedBits)
+ int porder;
+ int reserved[2]; // pads the header to 16 ints
+} FLACCLSubframeData;
+
+// Task header plus prediction coefficients; the anonymous union lets
+// kernels access the coefficients either as 32 scalar ints or as 8 int4
+// vectors (same storage, vectorized loads).
+typedef struct
+{
+ FLACCLSubframeData data;
+ union
+ {
+ int coefs[32]; // fixme: should be short?
+ int4 coefs4[8];
+ };
+} FLACCLSubframeTask;
+
+// Builds the four candidate channel buffers from interleaved 16-bit stereo
+// input: [0]=left, [1]=right, [2]=mid=(L+R)>>1, [3]=side=L-R.
+// 'offset' is the per-channel sample count; it both strides the output
+// planes and bounds the write for the last, partially-filled work-group.
+__kernel void cudaStereoDecorr(
+ __global int *samples,
+ __global short2 *src,
+ int offset
+)
+{
+ int pos = get_global_id(0);
+ if (pos < offset)
+ {
+ short2 s = src[pos];
+ samples[pos] = s.x;
+ samples[1 * offset + pos] = s.y;
+ samples[2 * offset + pos] = (s.x + s.y) >> 1;
+ samples[3 * offset + pos] = s.x - s.y;
+ }
+}
+
+// De-interleaves 16-bit stereo input into two independent channel planes
+// (left, right) without computing mid/side candidates. 'offset' is the
+// per-channel sample count and bounds the write.
+__kernel void cudaChannelDecorr2(
+ __global int *samples,
+ __global short2 *src,
+ int offset
+)
+{
+ int pos = get_global_id(0);
+ if (pos < offset)
+ {
+ short2 s = src[pos];
+ samples[pos] = s.x;
+ samples[1 * offset + pos] = s.y;
+ }
+}
+
+//__kernel void cudaChannelDecorr(
+// int *samples,
+// short *src,
+// int offset
+//)
+//{
+// int pos = get_global_id(0);
+// if (pos < offset)
+// samples[get_group_id(1) * offset + pos] = src[pos * get_num_groups(1) + get_group_id(1)];
+//}
+
+// 1-based position of the least significant set bit (0 when a == 0):
+// a & (-a) isolates the lowest set bit, clz counts its leading zeros.
+#define __ffs(a) (32 - clz(a & (-a)))
+//#define __ffs(a) (33 - clz(~a & (a - 1)))
+
+// For one (frame, channel) pair per work-group, OR-reduces all samples of
+// the block to compute:
+// wbits - count of low-order bits that are zero in every sample
+// ("wasted bits", removable before prediction), and
+// abits - remaining significant magnitude bits after wbits removal.
+// Both values are then broadcast into every task of that channel.
+__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
+void cudaFindWastedBits(
+ __global FLACCLSubframeTask *tasks,
+ __global int *samples,
+ int tasksPerChannel
+)
+{
+ __local volatile int wbits[128];
+ __local volatile int abits[128];
+ __local FLACCLSubframeData task;
+
+ // cooperatively load the shared task header, one int per work-item
+ int tid = get_local_id(0);
+ if (tid < sizeof(task) / sizeof(int))
+ ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0) * tasksPerChannel].data))[tid];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // strided pass over the block: w ORs raw samples (reveals common trailing
+ // zeros), a ORs magnitudes (smp ^ (smp >> 31) == |smp| - (smp < 0),
+ // an abs-like fold for two's complement)
+ int w = 0, a = 0;
+ for (int pos = 0; pos < task.blocksize; pos += get_local_size(0))
+ {
+ int smp = pos + tid < task.blocksize ? samples[task.samplesOffs + pos + tid] : 0;
+ w |= smp;
+ a |= smp ^ (smp >> 31);
+ }
+ wbits[tid] = w;
+ abits[tid] = a;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ //atom_or(shared.wbits, shared.wbits[tid]);
+ //atom_or(shared.abits, shared.abits[tid]);
+ //SUM256(shared.wbits, tid, |=);
+ //SUM256(shared.abits, tid, |=);
+ //SUM128(wbits, tid, |=);
+ //SUM128(abits, tid, |=);
+
+ // tree OR-reduction of the 128 partial results into element 0
+ for (int s = get_local_size(0) / 2; s > 0; s >>= 1)
+ {
+ if (tid < s)
+ {
+ wbits[tid] |= wbits[tid + s];
+ abits[tid] |= abits[tid + s];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ // thread 0 finalizes both values (same thread, so no barrier needed
+ // between the two writes)
+ if (tid == 0)
+ task.wbits = max(0,__ffs(wbits[0]) - 1);
+ if (tid == 0)
+ task.abits = 32 - clz(abits[0]) - task.wbits;
+ // if (tid == 0)
+ //task.wbits = get_num_groups(0);
+ // if (tid == 0)
+ //task.abits = get_local_size(0);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // broadcast results into every task of this channel
+ // NOTE(review): assumes tasksPerChannel <= 128 (the local size) — confirm
+ if (tid < tasksPerChannel)
+ tasks[get_group_id(0) * tasksPerChannel + tid].data.wbits = task.wbits;
+ if (tid < tasksPerChannel)
+ tasks[get_group_id(0) * tasksPerChannel + tid].data.abits = task.abits;
+}
+
+//__kernel __attribute__((reqd_work_group_size(32, 4, 1)))
+//void cudaComputeAutocor(
+// __global float *output,
+// __global const int *samples,
+// __global const float *window,
+// __global FLACCLSubframeTask *tasks,
+// const int max_order, // should be <= 32
+// const int windowCount, // windows (log2: 0,1)
+// const int taskCount // tasks per block
+//)
+//{
+// __local struct {
+// float data[256];
+// volatile float product[128];
+// FLACCLSubframeData task;
+// volatile int dataPos;
+// volatile int dataLen;
+// } shared;
+// const int tid = get_local_id(0) + get_local_id(1) * 32;
+// // fetch task data
+// if (tid < sizeof(shared.task) / sizeof(int))
+// ((__local int*)&shared.task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
+// if (tid == 0)
+// {
+// shared.dataPos = get_group_id(0) * 7 * 32;
+// shared.dataLen = min(shared.task.blocksize - shared.dataPos, 7 * 32 + max_order);
+// }
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// // fetch samples
+// shared.data[tid] = tid < shared.dataLen ? samples[tid] * window[tid]: 0.0f;
+// int tid2 = tid + 128;
+// shared.data[tid2] = tid2 < shared.dataLen ? samples[tid2] * window[tid2]: 0.0f;
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// for (int lag = 0; lag <= max_order; lag ++)
+// {
+// if (lag <= 12)
+// shared.product[tid] = 0.0f;
+// barrier(CLK_LOCAL_MEM_FENCE);
+// }
+// barrier(CLK_LOCAL_MEM_FENCE);
+// if (tid <= max_order)
+// output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.product[tid];
+//}
+
+// Computes partial windowed-autocorrelation sums. Each work-group handles
+// one 224-sample (7*32) slice of one (task, window) pair: samples are
+// multiplied by the window, products for lags 0..max_order are accumulated,
+// and the per-slice results are written to 'output' for a later summation
+// stage across slices.
+__kernel __attribute__((reqd_work_group_size(32, 4, 1)))
+void cudaComputeAutocor(
+ __global float *output,
+ __global const int *samples,
+ __global const float *window,
+ __global FLACCLSubframeTask *tasks,
+ const int max_order, // should be <= 32
+ const int windowCount, // windows (log2: 0,1)
+ const int taskCount // tasks per block
+)
+{
+ __local struct {
+ float data[256];
+ volatile float product[128];
+ FLACCLSubframeData task;
+ volatile float result[33];
+ volatile int dataPos;
+ volatile int dataLen;
+ volatile int windowOffs;
+ volatile int samplesOffs;
+ //volatile int resultOffs;
+ } shared;
+ const int tid = get_local_id(0) + get_local_id(1) * 32;
+ // fetch task data
+ // (group_id(1) encodes both the task index and, in its low windowCount
+ // bits, the window index)
+ if (tid < sizeof(shared.task) / sizeof(int))
+ ((__local int*)&shared.task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
+ if (tid == 0)
+ {
+ // this group's slice: 7*32 samples, plus max_order extra for the lags
+ shared.dataPos = get_group_id(0) * 7 * 32;
+ shared.windowOffs = (get_group_id(1) & ((1 << windowCount)-1)) * shared.task.blocksize + shared.dataPos;
+ shared.samplesOffs = shared.task.samplesOffs + shared.dataPos;
+ shared.dataLen = min(shared.task.blocksize - shared.dataPos, 7 * 32 + max_order);
+ }
+ //if (tid == 32)
+ //shared.resultOffs = __mul24(get_group_id(0) + __mul24(get_group_id(1), get_num_groups(0)), max_order + 1);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // fetch samples, pre-multiplied by the window; zero-pad past dataLen
+ shared.data[tid] = tid < shared.dataLen ? samples[shared.samplesOffs + tid] * window[shared.windowOffs + tid]: 0.0f;
+ int tid2 = tid + 128;
+ shared.data[tid2] = tid2 < shared.dataLen ? samples[shared.samplesOffs + tid2] * window[shared.windowOffs + tid2]: 0.0f;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // each of the 32 x-threads owns 7 consecutive products; the 4 y-threads
+ // cover 4 lags per iteration
+ const int ptr = get_local_id(0) * 7;
+ //if (get_local_id(1) == 0) for (int lag = 0; lag <= max_order; lag ++)
+ //for (int lag = get_local_id(1); lag <= max_order; lag += get_local_size(1))
+ for (int lag0 = 0; lag0 <= max_order; lag0 += get_local_size(1))
+ {
+ ////const int productLen = min(shared.task.blocksize - get_group_id(0) * partSize - lag, partSize);
+ const int lag = lag0 + get_local_id(1);
+ const int ptr2 = ptr + lag;
+ shared.product[tid] =
+ shared.data[ptr + 0] * shared.data[ptr2 + 0] +
+ shared.data[ptr + 1] * shared.data[ptr2 + 1] +
+ shared.data[ptr + 2] * shared.data[ptr2 + 2] +
+ shared.data[ptr + 3] * shared.data[ptr2 + 3] +
+ shared.data[ptr + 4] * shared.data[ptr2 + 4] +
+ shared.data[ptr + 5] * shared.data[ptr2 + 5] +
+ shared.data[ptr + 6] * shared.data[ptr2 + 6];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ // tree sum-reduction over the 32 x-threads for this lag
+ for (int l = 16; l > 1; l >>= 1)
+ {
+ if (get_local_id(0) < l)
+ shared.product[tid] = shared.product[tid] + shared.product[tid + l];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ // return results
+ if (get_local_id(0) == 0 && lag <= max_order)
+ shared.result[lag] = shared.product[tid] + shared.product[tid + 1];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ if (tid <= max_order)
+ output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.result[tid];
+ //output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.product[tid];
+ //output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.windowOffs;
+}
+
+// Sums the per-slice autocorrelation rows produced by cudaComputeAutocor,
+// then runs the Schur recursion (reflection coefficients and per-order
+// prediction error) together with the Levinson-Durbin recursion to produce
+// LPC coefficients for every order 1..max_order.
+// Work-group: 32x1.  get_group_id(0) = window index, get_group_id(1) = task.
+// Output layout in `lpcs`: 32 floats per order (coefficients stored
+// reversed), followed by one row of per-order prediction-error estimates at
+// offset max_order * 32.
+__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
+void cudaComputeLPC(
+    __global FLACCLSubframeTask *tasks,
+    int taskCount, // tasks per block
+    __global float*autoc,
+    int max_order, // should be <= 32
+    __global float *lpcs,
+    int windowCount,
+    int partCount
+)
+{
+    __local struct {
+        FLACCLSubframeData task;
+        volatile float parts[32];  // per-thread partial sums for one lag
+        volatile float ldr[32];    // Levinson-Durbin coefficient vector
+        volatile float gen1[32];   // Schur generator (second row)
+        volatile float error[32];  // prediction error per order
+        volatile float autoc[33];  // summed autocorrelation, lags 0..max_order
+        volatile int lpcOffs;
+        volatile int autocOffs;
+    } shared;
+    const int tid = get_local_id(0);// + get_local_id(1) * 32;
+
+    // fetch task data
+    if (tid < sizeof(shared.task) / sizeof(int))
+        ((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
+    if (tid == 0)
+    {
+        shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (max_order + 1) * 32;
+        shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) * partCount;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // add up autocorrelation parts: one strided load + tree reduction per lag
+    for (int order = 0; order <= max_order; order ++)
+    {
+        float part = 0.0f;
+        for (int pos = get_local_id(0); pos < partCount; pos += get_local_size(0))
+            part += autoc[shared.autocOffs + pos * (max_order + 1) + order];
+        shared.parts[tid] = part;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
+        {
+            if (get_local_id(0) < l)
+                shared.parts[tid] += shared.parts[tid + l];
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        if (get_local_id(0) == 0)
+            shared.autoc[order] = shared.parts[tid] + shared.parts[tid + 1];
+        // BUGFIX: without this barrier the next iteration's unconditional
+        // write to shared.parts[tid] can race with thread 0's read of
+        // parts[0]/parts[1] just above; the original relied on implicit
+        // warp-synchronous execution, which OpenCL does not guarantee.
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Compute LPC using Schur and Levinson-Durbin recursion
+    float gen0 = shared.gen1[get_local_id(0)] = shared.autoc[get_local_id(0)+1];
+    shared.ldr[get_local_id(0)] = 0.0f;
+    float error = shared.autoc[0];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int order = 0; order < max_order; order++)
+    {
+        // Schur recursion
+        float reff = -shared.gen1[0] / error;
+        error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
+        float gen1;
+        if (get_local_id(0) < max_order - 1 - order)
+        {
+            gen1 = shared.gen1[get_local_id(0) + 1] + reff * gen0;
+            gen0 += shared.gen1[get_local_id(0) + 1] * reff;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (get_local_id(0) < max_order - 1 - order)
+            shared.gen1[get_local_id(0)] = gen1;
+
+        // Store prediction error
+        if (get_local_id(0) == 0)
+            shared.error[order] = error;
+
+        // Levinson-Durbin recursion
+        // NOTE(review): select() evaluates both operands, so for lanes with
+        // get_local_id(0) >= order the index (order - 1 - lid) goes negative
+        // and reads before shared.ldr; the value is discarded, but the
+        // out-of-bounds read is formally undefined — confirm on targets.
+        float ldr =
+            select(0.0f, reff * shared.ldr[order - 1 - get_local_id(0)], get_local_id(0) < order) +
+            select(0.0f, reff, get_local_id(0) == order);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        shared.ldr[get_local_id(0)] += ldr;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Output coeffs (reversed: coef[i] = -ldr[order - i])
+        if (get_local_id(0) <= order)
+            lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Output prediction error estimates
+    if (get_local_id(0) < max_order)
+        lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
+}
+
+//__kernel void cudaComputeLPCLattice(
+// FLACCLSubframeTask *tasks,
+// const int taskCount, // tasks per block
+// const int *samples,
+// const int windowCount,
+// const int max_order, // should be <= 12
+// float*lpcs
+//)
+//{
+// __local struct {
+// volatile FLACCLSubframeData task;
+// volatile float F[512];
+// volatile float arp[32];
+// volatile float tmp[256];
+// volatile float error[32];
+// volatile int lpcOffs;
+// } shared;
+//
+// // fetch task data
+// if (get_local_id(0) < sizeof(shared.task) / sizeof(int))
+// ((int*)&shared.task)[get_local_id(0)] = ((int*)(tasks + taskCount * get_group_id(1)))[get_local_id(0)];
+// if (get_local_id(0) == 0)
+// shared.lpcOffs = __mul24(__mul24(get_group_id(1) + 1, windowCount) - 1, max_order + 1) * 32;
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// // F = samples; B = samples
+// float s1 = get_local_id(0) < shared.task.blocksize ? (samples[shared.task.samplesOffs + get_local_id(0)]) / 32768.0f : 0.0f;
+// float s2 = get_local_id(0) + 256 < shared.task.blocksize ? (samples[shared.task.samplesOffs + get_local_id(0) + 256]) / 32768.0f : 0.0f;
+// shared.F[get_local_id(0)] = s1;
+// shared.F[get_local_id(0) + 256] = s2;
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// shared.tmp[get_local_id(0)] = FSQR(s1) + FSQR(s2);
+// barrier(CLK_LOCAL_MEM_FENCE);
+// SUM256(shared.tmp, get_local_id(0), +=);
+// barrier(CLK_LOCAL_MEM_FENCE);
+// float DEN = shared.tmp[0];
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// for (int order = 0; order < max_order; order++)
+// {
+// // reff = F(order+1:frameSize) * B(1:frameSize-order)' / DEN
+// int idxF = get_local_id(0) + order + 1;
+// int idxF2 = idxF + 256;
+//
+// shared.tmp[get_local_id(0)] = idxF < shared.task.blocksize ? shared.F[idxF] * s1 : 0.0f;
+// shared.tmp[get_local_id(0)] += idxF2 < shared.task.blocksize ? shared.F[idxF2] * s2 : 0.0f;
+// barrier(CLK_LOCAL_MEM_FENCE);
+// SUM256(shared.tmp, get_local_id(0), +=);
+// barrier(CLK_LOCAL_MEM_FENCE);
+// float reff = shared.tmp[0] / DEN;
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// // arp(order) = rc(order) = reff
+// if (get_local_id(0) == 0)
+// shared.arp[order] = reff;
+// //shared.rc[order - 1] = shared.lpc[order - 1][order - 1] = reff;
+//
+// // Levinson-Durbin recursion
+// // arp(1:order-1) = arp(1:order-1) - reff * arp(order-1:-1:1)
+// if (get_local_id(0) < order)
+// shared.arp[get_local_id(0)] = shared.arp[get_local_id(0)] - reff * shared.arp[order - 1 - get_local_id(0)];
+//
+// // Output coeffs
+// if (get_local_id(0) <= order)
+// lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = shared.arp[order - get_local_id(0)];
+//
+// // F1 = F(order+1:frameSize) - reff * B(1:frameSize-order)
+// // B(1:frameSize-order) = B(1:frameSize-order) - reff * F(order+1:frameSize)
+// // F(order+1:frameSize) = F1
+// if (idxF < shared.task.blocksize)
+// {
+// float f1 = shared.F[idxF];
+// shared.F[idxF] -= reff * s1;
+// s1 -= reff * f1;
+// }
+// if (idxF2 < shared.task.blocksize)
+// {
+// float f2 = shared.F[idxF2];
+// shared.F[idxF2] -= reff * s2;
+// s2 -= reff * f2;
+// }
+//
+// // DEN = F(order+1:frameSize) * F(order+1:frameSize)' + B(1:frameSize-order) * B(1:frameSize-order)' (BURG)
+// shared.tmp[get_local_id(0)] = (idxF + 1 < shared.task.blocksize ? FSQR(shared.F[idxF]) + FSQR(s1) : 0);
+// shared.tmp[get_local_id(0)] += (idxF2 + 1 < shared.task.blocksize ? FSQR(shared.F[idxF2]) + FSQR(s2) : 0);
+// barrier(CLK_LOCAL_MEM_FENCE);
+// SUM256(shared.tmp, get_local_id(0), +=);
+// barrier(CLK_LOCAL_MEM_FENCE);
+// DEN = shared.tmp[0] / 2;
+// // shared.PE[order-1] = shared.tmp[0] / 2 / (frameSize - order + 1);
+// if (get_local_id(0) == 0)
+// shared.error[order] = DEN / (shared.task.blocksize - order);
+// barrier(CLK_LOCAL_MEM_FENCE);
+// }
+//
+// // Output prediction error estimates
+// if (get_local_id(0) < max_order)
+// lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
+//}
+
+// Selects candidate LPC orders and quantizes their coefficients.
+// Work-group 32x4: get_group_id(0) = window index, get_group_id(1) = task
+// set.  Step 1: score every order with an Akaike-style criterion and sort
+// the (score, order) pairs ascending with a bitonic sort over 64 slots
+// (unused slots hold large distinct sentinels so they sink to the bottom).
+// Step 2: for each of the taskCountLPC output tasks, quantize the
+// coefficients of the selected order at one of 2^precisions precision
+// variants, then write shift, cbits, residualOrder and the quantized
+// coefficients into the task.
+__kernel __attribute__((reqd_work_group_size(32, 4, 1)))
+void cudaQuantizeLPC(
+    __global FLACCLSubframeTask *tasks,
+    int taskCount, // tasks per block
+    int taskCountLPC, // tasks per set of coeffs (<= 32)
+    __global float*lpcs,
+    int max_order, // should be <= 32
+    int minprecision,
+    int precisions
+)
+{
+    __local struct {
+        FLACCLSubframeData task;
+        volatile int tmpi[128];   // per-row OR-reduction scratch for coeff magnitudes
+        volatile int index[64];   // candidate order per sort slot
+        volatile float error[64]; // criterion score per sort slot
+        volatile int lpcOffs;
+    } shared;
+
+    const int tid = get_local_id(0) + get_local_id(1) * 32;
+
+    // fetch task data
+    if (tid < sizeof(shared.task) / sizeof(int))
+        ((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
+    if (tid == 0)
+        shared.lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) * 32;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (get_local_id(1) == 0)
+    {
+        // fill all 64 sort slots with large, distinct sentinel scores so
+        // unused slots always sort after real candidates
+        shared.index[get_local_id(0)] = min(max_order - 1, get_local_id(0));
+        shared.error[get_local_id(0)] = shared.task.blocksize * 64 + get_local_id(0);
+        shared.index[32 + get_local_id(0)] = min(max_order - 1, get_local_id(0));
+        shared.error[32 + get_local_id(0)] = shared.task.blocksize * 64 + get_local_id(0);
+
+        // Select best orders based on Akaike's Criteria
+
+        // Load prediction error estimates; score = blocksize * ln(error)
+        // plus a per-order penalty term
+        if (get_local_id(0) < max_order)
+            shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)]) + get_local_id(0) * 5.12f * log(shared.task.blocksize);
+            //shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Sort using bitonic sort.  Only row get_local_id(1) == 0 moves data;
+    // all rows execute the same loop so the barriers stay uniform.
+    for(int size = 2; size < 64; size <<= 1){
+        //Bitonic merge
+        int ddd = (get_local_id(0) & (size / 2)) == 0;
+        for(int stride = size / 2; stride > 0; stride >>= 1){
+            int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            float e0, e1;
+            int i0, i1;
+            if (get_local_id(1) == 0)
+            {
+                e0 = shared.error[pos];
+                e1 = shared.error[pos + stride];
+                i0 = shared.index[pos];
+                i1 = shared.index[pos + stride];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if ((e0 >= e1) == ddd && get_local_id(1) == 0)
+            {
+                shared.error[pos] = e1;
+                shared.error[pos + stride] = e0;
+                shared.index[pos] = i1;
+                shared.index[pos + stride] = i0;
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+    }
+
+    //ddd == dir for the last bitonic merge step (final ascending merge)
+    {
+        for(int stride = 32; stride > 0; stride >>= 1){
+            //barrier(CLK_LOCAL_MEM_FENCE);
+            int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+            float e0, e1;
+            int i0, i1;
+            if (get_local_id(1) == 0)
+            {
+                e0 = shared.error[pos];
+                e1 = shared.error[pos + stride];
+                i0 = shared.index[pos];
+                i1 = shared.index[pos + stride];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if (e0 >= e1 && get_local_id(1) == 0)
+            {
+                shared.error[pos] = e1;
+                shared.error[pos + stride] = e0;
+                shared.index[pos] = i1;
+                shared.index[pos + stride] = i0;
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+    }
+
+    // Quantization: 4 output tasks per pass (one per row get_local_id(1))
+    for (int ii = 0; ii < taskCountLPC; ii += get_local_size(1))
+    {
+        int i = ii + get_local_id(1);
+        // task i uses the (i >> precisions)-th best order; the low
+        // `precisions` bits of i select a precision variant
+        int order = shared.index[i >> precisions];
+        float lpc = get_local_id(0) <= order ? lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] : 0.0f;
+        // get 15 bits of each coeff
+        int coef = convert_int_rte(lpc * (1 << 15));
+        // remove sign bits
+        shared.tmpi[tid] = coef ^ (coef >> 31);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // OR reduction (per 32-lane row) to find the largest magnitude
+        for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
+        {
+            if (get_local_id(0) < l)
+                shared.tmpi[tid] |= shared.tmpi[tid + l];
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        //SUM32(shared.tmpi,tid,|=);
+        // choose precision
+        //int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
+        int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits));
+        // calculate shift based on precision and number of leading zeroes in coeffs
+        int shift = max(0,min(15, clz(shared.tmpi[get_local_id(1) * 32] | shared.tmpi[get_local_id(1) * 32 + 1]) - 18 + cbits));
+
+        //cbits = 13;
+        //shift = 15;
+
+        //if (shared.task.abits + 32 - clz(order) < shift
+        //int shift = max(0,min(15, (shared.task.abits >> 2) - 14 + clz(shared.tmpi[get_local_id(0) & ~31]) + ((32 - clz(order))>>1)));
+        // quantize coeffs with given shift
+        coef = convert_int_rte(clamp(lpc * (1 << shift), -1 << (cbits - 1), 1 << (cbits - 1)));
+        // error correction
+        //shared.tmp[get_local_id(0)] = (get_local_id(0) != 0) * (shared.arp[get_local_id(0) - 1]*(1 << shared.task.shift) - shared.task.coefs[get_local_id(0) - 1]);
+        //shared.task.coefs[get_local_id(0)] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, convert_int_rte((shared.arp[get_local_id(0)]) * (1 << shared.task.shift) + shared.tmp[get_local_id(0)])));
+        // remove sign bits
+        shared.tmpi[tid] = coef ^ (coef >> 31);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // OR reduction over the quantized magnitudes
+        for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
+        {
+            if (get_local_id(0) < l)
+                shared.tmpi[tid] |= shared.tmpi[tid + l];
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        //SUM32(shared.tmpi,tid,|=);
+        // calculate actual number of bits (+1 for sign)
+        cbits = 1 + 32 - clz(shared.tmpi[get_local_id(1) * 32] | shared.tmpi[get_local_id(1) * 32 + 1]);
+
+        // output shift, cbits and output coeffs
+        if (i < taskCountLPC)
+        {
+            int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
+            if (get_local_id(0) == 0)
+                tasks[taskNo].data.shift = shift;
+            if (get_local_id(0) == 0)
+                tasks[taskNo].data.cbits = cbits;
+            if (get_local_id(0) == 0)
+                tasks[taskNo].data.residualOrder = order + 1;
+            if (get_local_id(0) <= order)
+                tasks[taskNo].coefs[get_local_id(0)] = coef;
+        }
+    }
+}
+
+// Estimates the encoded residual size for one candidate task: runs the
+// predictor (in single-precision float, coefficients pre-divided by the
+// quantization shift) over the whole block, accumulates |residual| per
+// thread, derives an approximate Rice parameter and bit count per thread,
+// then tree-reduces and writes one int estimate per task.
+// get_group_id(1) = task index.
+__kernel __attribute__(( vec_type_hint (int4)))
+void cudaEstimateResidual(
+    __global int*output,
+    __global int*samples,
+    __global FLACCLSubframeTask *tasks
+    )
+{
+    __local float data[128 * 2];      // sliding window: current chunk + lookahead chunk
+    __local int residual[128];        // per-thread bit-count estimates for the final reduction
+    __local FLACCLSubframeTask task;
+    __local float4 coefsf4[8];        // up to 32 coeffs as 8 float4s, scaled by 1 / (1 << shift)
+
+    const int tid = get_local_id(0);
+    if (tid < sizeof(task)/sizeof(int))
+        ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
+    // BUGFIX: `task` lives in __local memory, so publishing it to the other
+    // work-items requires a local memory fence; the original used
+    // CLK_GLOBAL_MEM_FENCE, which does not order __local writes.
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int ro = task.data.residualOrder;
+    int bs = task.data.blocksize;
+    float res = 0;
+
+    // load coefficients scaled down by the shift; zero beyond the order
+    if (tid < 32)
+        ((__local float *)&coefsf4[0])[tid] = select(0.0f, ((float)task.coefs[tid]) / (1 << task.data.shift), tid < ro);
+    data[tid] = tid < bs ? (float)(samples[task.data.samplesOffs + tid] >> task.data.wbits) : 0.0f;
+    for (int pos = 0; pos < bs; pos += get_local_size(0))
+    {
+        // fetch the next chunk of samples into the second half of the window
+        float nextData = pos + tid + get_local_size(0) < bs ? (float)(samples[task.data.samplesOffs + pos + tid + get_local_size(0)] >> task.data.wbits) : 0.0f;
+        data[tid + get_local_size(0)] = nextData;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // compute residual = sample - predicted
+        // NOTE(review): this casts &data[tid] to __local float4* at an
+        // arbitrary (unaligned) offset; strict OpenCL requires vload4 for
+        // unaligned vector loads — confirm the target compilers accept this.
+        __local float4 * dptr = (__local float4 *)&data[tid];
+        float sumf = data[tid + ro] -
+            ( dot(dptr[0], coefsf4[0])
+            + dot(dptr[1], coefsf4[1])
+#if MAX_ORDER > 8
+            + dot(dptr[2], coefsf4[2])
+#if MAX_ORDER > 12
+            + dot(dptr[3], coefsf4[3])
+#if MAX_ORDER > 16
+            + dot(dptr[4], coefsf4[4])
+            + dot(dptr[5], coefsf4[5])
+            + dot(dptr[6], coefsf4[6])
+            + dot(dptr[7], coefsf4[7])
+#endif
+#endif
+#endif
+            );
+
+        // accumulate |residual| (capped) only for in-range samples
+        res += select(0.0f, min(fabs(sumf), (float)0x7fffff), pos + tid + ro < bs);
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // slide the window: the lookahead chunk becomes the current chunk
+        data[tid] = nextData;
+    }
+
+    // this thread's share of the residual samples (strided partitioning)
+    int residualLen = (bs - ro) / get_local_size(0) + select(0, 1, tid < (bs - ro) % get_local_size(0));
+    // approximate Rice parameter from the mean |residual|, then the bit count
+    int k = clamp(convert_int_rtn(log2((res + 0.000001f) / (residualLen + 0.000001f))), 0, 14);
+    residual[tid] = residualLen * (k + 1) + (convert_int_rtz(res) >> k);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // tree-reduce the per-thread bit-count estimates
+    for (int l = get_local_size(0) / 2; l > 0; l >>= 1)
+    {
+        if (tid < l)
+            residual[tid] += residual[tid + l];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if (tid == 0)
+        output[get_group_id(1)] = residual[0];
+}
+
+// Computes the estimated encoded size of every candidate task of a frame
+// (Constant / Verbatim / Fixed / LPC headers plus the residual estimate from
+// cudaEstimateResidual), stores each size in the task, then min-reduces to
+// record the winning task index in the first task's best_index.
+// get_group_id(1) = frame index; taskCount candidates per frame.
+__kernel void cudaChooseBestMethod(
+    __global FLACCLSubframeTask *tasks,
+    __global int *residual,
+    int taskCount
+    )
+{
+    __local struct {
+        volatile int index[128];      // candidate index per slot (for the min-reduction)
+        volatile int length[256];     // estimated size per slot; 0x7fffffff = unused
+        volatile FLACCLSubframeTask task[8]; // one task header per row get_local_id(1)
+    } shared;
+    const int tid = get_local_id(0) + get_local_id(1) * 32;
+
+    shared.length[tid] = 0x7fffffff;
+    shared.index[tid] = tid;
+    for (int task = 0; task < taskCount; task += get_local_size(1))
+        if (task + get_local_id(1) < taskCount)
+        {
+            // fetch task data
+            ((__local int*)&shared.task[get_local_id(1)])[get_local_id(0)] =
+                ((__global int*)(tasks + task + get_local_id(1) + taskCount * get_group_id(1)))[get_local_id(0)];
+
+            // NOTE(review): this barrier sits inside a condition that can
+            // diverge between rows when taskCount is not a multiple of
+            // get_local_size(1) — formally undefined in OpenCL; confirm the
+            // host always launches with taskCount a multiple of the row count.
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            if (get_local_id(0) == 0)
+            {
+                // fetch part sum (residual bit estimate for this candidate)
+                int partLen = residual[task + get_local_id(1) + taskCount * get_group_id(1)];
+                //// calculate part size
+                //int residualLen = shared.task[get_local_id(1)].data.blocksize - shared.task[get_local_id(1)].data.residualOrder;
+                //residualLen = residualLen * (shared.task[get_local_id(1)].data.type != Constant || psum != 0);
+                //// calculate rice parameter
+                //int k = max(0, min(14, convert_int_rtz(log2((psum + 0.000001f) / (residualLen + 0.000001f) + 0.5f))));
+                //// calculate part bit length
+                //int partLen = residualLen * (k + 1) + (psum >> k);
+
+                // total size = header + warmup samples + coeffs (LPC only)
+                // + residual, never worse than a verbatim subframe
+                int obits = shared.task[get_local_id(1)].data.obits - shared.task[get_local_id(1)].data.wbits;
+                shared.length[task + get_local_id(1)] =
+                    min(obits * shared.task[get_local_id(1)].data.blocksize,
+                        shared.task[get_local_id(1)].data.type == Fixed ? shared.task[get_local_id(1)].data.residualOrder * obits + 6 + (4 * 1/2) + partLen :
+                        shared.task[get_local_id(1)].data.type == LPC ? shared.task[get_local_id(1)].data.residualOrder * obits + 4 + 5 + shared.task[get_local_id(1)].data.residualOrder * shared.task[get_local_id(1)].data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
+                        shared.task[get_local_id(1)].data.type == Constant ? obits * (1 + shared.task[get_local_id(1)].data.blocksize * (partLen != 0)) :
+                        obits * shared.task[get_local_id(1)].data.blocksize);
+            }
+        }
+    //shared.index[get_local_id(0)] = get_local_id(0);
+    //shared.length[get_local_id(0)] = (get_local_id(0) < taskCount) ? tasks[get_local_id(0) + taskCount * get_group_id(1)].size : 0x7fffffff;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // write the computed size back into each task
+    if (tid < taskCount)
+        tasks[tid + taskCount * get_group_id(1)].data.size = shared.length[tid];
+
+    // min-reduction over (length, index); finishes with the two smallest
+    // candidates in slots 0 and 1
+    // NOTE(review): each step reads shared.length[tid + (1 << sh)] while the
+    // owning thread may be writing it in the same step (barrier comes after)
+    // — appears to rely on warp-synchronous execution; verify on targets.
+    int l1 = shared.length[tid];
+    for (int sh = 8; sh > 0; sh --)
+    {
+        if (tid + (1 << sh) < get_local_size(0) * get_local_size(1))
+        {
+            int l2 = shared.length[tid + (1 << sh)];
+            shared.index[tid] = shared.index[tid + ((l2 < l1) << sh)];
+            shared.length[tid] = l1 = min(l1, l2);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if (tid == 0)
+        tasks[taskCount * get_group_id(1)].data.best_index = taskCount * get_group_id(1) + shared.index[shared.length[1] < shared.length[0]];
+}
+
+// Copies the winning candidate task of each frame (as recorded in the first
+// task's best_index by cudaChooseBestMethod) into the output task array.
+// get_group_id(1) = frame index; count = candidates per frame.
+__kernel void cudaCopyBestMethod(
+    __global FLACCLSubframeTask *tasks_out,
+    __global FLACCLSubframeTask *tasks,
+    int count
+    )
+{
+    __local int best_index;
+    const int tid = get_local_id(0);
+    // one work-item looks up which candidate won for this frame
+    if (tid == 0)
+        best_index = tasks[count * get_group_id(1)].data.best_index;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // copy the winning task into the output slot, one int per work-item
+    __global int *src = (__global int*)(tasks + best_index);
+    __global int *dst = (__global int*)(tasks_out + get_group_id(1));
+    if (tid < sizeof(FLACCLSubframeTask)/sizeof(int))
+        dst[tid] = src[tid];
+}
+
+// Picks the cheapest stereo decorrelation mode for a frame and copies the
+// winning pair of candidate tasks to the output.  The four candidate groups
+// are indexed 0..3; judging by the pairings below, slot 0 = left, 1 = right,
+// 2 = mid, 3 = side (confirm against the host-side task layout).  Ties are
+// broken by evaluation order: MidSide, RightSide, LeftSide, LeftRight.
+// get_group_id(1) = frame index; count = candidates per channel group.
+__kernel void cudaCopyBestMethodStereo(
+    __global FLACCLSubframeTask *tasks_out,
+    __global FLACCLSubframeTask *tasks,
+    int count
+    )
+{
+    __local struct {
+        int best_index[4]; // winning task index per channel group
+        int best_size[4];  // its estimated size in bits
+        int lr_index[2];   // task indices chosen for the two output channels
+    } shared;
+    // fetch each group's winner and its size
+    if (get_local_id(0) < 4)
+        shared.best_index[get_local_id(0)] = tasks[count * (get_group_id(1) * 4 + get_local_id(0))].data.best_index;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (get_local_id(0) < 4)
+        shared.best_size[get_local_id(0)] = tasks[shared.best_index[get_local_id(0)]].data.size;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // compare the four mode costs in a fixed order (strict > keeps earlier
+    // modes on ties)
+    if (get_local_id(0) == 0)
+    {
+        int bitsBest = shared.best_size[2] + shared.best_size[3]; // MidSide
+        shared.lr_index[0] = shared.best_index[2];
+        shared.lr_index[1] = shared.best_index[3];
+        if (bitsBest > shared.best_size[3] + shared.best_size[1]) // RightSide
+        {
+            bitsBest = shared.best_size[3] + shared.best_size[1];
+            shared.lr_index[0] = shared.best_index[3];
+            shared.lr_index[1] = shared.best_index[1];
+        }
+        if (bitsBest > shared.best_size[0] + shared.best_size[3]) // LeftSide
+        {
+            bitsBest = shared.best_size[0] + shared.best_size[3];
+            shared.lr_index[0] = shared.best_index[0];
+            shared.lr_index[1] = shared.best_index[3];
+        }
+        if (bitsBest > shared.best_size[0] + shared.best_size[1]) // LeftRight
+        {
+            bitsBest = shared.best_size[0] + shared.best_size[1];
+            shared.lr_index[0] = shared.best_index[0];
+            shared.lr_index[1] = shared.best_index[1];
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // copy the two winning tasks out, then restore each output channel's
+    // residualOffs from the plain left/right candidates so the residual
+    // buffers land in the expected per-channel locations
+    if (get_local_id(0) < sizeof(FLACCLSubframeTask)/sizeof(int))
+        ((__global int*)(tasks_out + 2 * get_group_id(1)))[get_local_id(0)] = ((__global int*)(tasks + shared.lr_index[0]))[get_local_id(0)];
+    if (get_local_id(0) == 0)
+        tasks_out[2 * get_group_id(1)].data.residualOffs = tasks[shared.best_index[0]].data.residualOffs;
+    if (get_local_id(0) < sizeof(FLACCLSubframeTask)/sizeof(int))
+        ((__global int*)(tasks_out + 2 * get_group_id(1) + 1))[get_local_id(0)] = ((__global int*)(tasks + shared.lr_index[1]))[get_local_id(0)];
+    if (get_local_id(0) == 0)
+        tasks_out[2 * get_group_id(1) + 1].data.residualOffs = tasks[shared.best_index[1]].data.residualOffs;
+}
+
+//__kernel void cudaEncodeResidual(
+// int*output,
+// int*samples,
+// FLACCLSubframeTask *tasks
+// )
+//{
+// __local struct {
+// int data[256 + 32];
+// FLACCLSubframeTask task;
+// } shared;
+// const int tid = get_local_id(0);
+// if (get_local_id(0) < sizeof(shared.task) / sizeof(int))
+// ((int*)&shared.task)[get_local_id(0)] = ((int*)(&tasks[get_group_id(1)]))[get_local_id(0)];
+// barrier(CLK_LOCAL_MEM_FENCE);
+// const int partSize = get_local_size(0);
+// const int pos = get_group_id(0) * partSize;
+// const int dataLen = min(shared.task.data.blocksize - pos, partSize + shared.task.data.residualOrder);
+//
+// // fetch samples
+// shared.data[tid] = tid < dataLen ? samples[shared.task.data.samplesOffs + pos + tid] >> shared.task.data.wbits : 0;
+// if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task.data.samplesOffs + pos + tid + partSize] >> shared.task.data.wbits : 0;
+// const int residualLen = max(0,min(shared.task.data.blocksize - pos - shared.task.data.residualOrder, partSize));
+//
+// barrier(CLK_LOCAL_MEM_FENCE);
+// // compute residual
+// int sum = 0;
+// for (int c = 0; c < shared.task.data.residualOrder; c++)
+// sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
+// barrier(CLK_LOCAL_MEM_FENCE);
+// shared.data[tid + shared.task.data.residualOrder] -= (sum >> shared.task.data.shift);
+// barrier(CLK_LOCAL_MEM_FENCE);
+// if (tid >= shared.task.data.residualOrder && tid < residualLen + shared.task.data.residualOrder)
+// output[shared.task.data.residualOffs + pos + tid] = shared.data[tid];
+// if (tid + 256 < residualLen + shared.task.data.residualOrder)
+// output[shared.task.data.residualOffs + pos + tid + 256] = shared.data[tid + 256];
+//}
+//
+//__kernel void cudaCalcPartition(
+// int* partition_lengths,
+// int* residual,
+// int* samples,
+// FLACCLSubframeTask *tasks,
+// int max_porder, // <= 8
+// int psize, // == (shared.task.data.blocksize >> max_porder), < 256
+// int parts_per_block // == 256 / psize, > 0, <= 16
+// )
+//{
+// __local struct {
+// int data[256+32];
+// FLACCLSubframeTask task;
+// } shared;
+// const int tid = get_local_id(0) + (get_local_id(1) << 4);
+// if (tid < sizeof(shared.task) / sizeof(int))
+// ((int*)&shared.task)[tid] = ((int*)(&tasks[get_group_id(1)]))[tid];
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// const int parts = min(parts_per_block, (1 << max_porder) - get_group_id(0) * parts_per_block);
+// const int offs = get_group_id(0) * psize * parts_per_block + tid;
+//
+// // fetch samples
+// if (tid < 32) shared.data[tid] = min(offs, tid + shared.task.data.residualOrder) >= 32 ? samples[shared.task.data.samplesOffs + offs - 32] >> shared.task.data.wbits : 0;
+// shared.data[32 + tid] = tid < parts * psize ? samples[shared.task.data.samplesOffs + offs] >> shared.task.data.wbits : 0;
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// // compute residual
+// int s = 0;
+// for (int c = -shared.task.data.residualOrder; c < 0; c++)
+// s += __mul24(shared.data[32 + tid + c], shared.task.coefs[shared.task.data.residualOrder + c]);
+// s = shared.data[32 + tid] - (s >> shared.task.data.shift);
+//
+// if (offs >= shared.task.data.residualOrder && tid < parts * psize)
+// residual[shared.task.data.residualOffs + offs] = s;
+// else
+// s = 0;
+//
+// // convert to unsigned
+// s = min(0xfffff, (s << 1) ^ (s >> 31));
+//
+// //barrier(CLK_LOCAL_MEM_FENCE);
+// //shared.data[tid] = s;
+// //barrier(CLK_LOCAL_MEM_FENCE);
+//
+// //shared.data[tid] = (shared.data[tid] & (0x0000ffff << (tid & 16))) | (((shared.data[tid ^ 16] & (0x0000ffff << (tid & 16))) << (~tid & 16)) >> (tid & 16));
+// //shared.data[tid] = (shared.data[tid] & (0x00ff00ff << (tid & 8))) | (((shared.data[tid ^ 8] & (0x00ff00ff << (tid & 8))) << (~tid & 8)) >> (tid & 8));
+// //shared.data[tid] = (shared.data[tid] & (0x0f0f0f0f << (tid & 4))) | (((shared.data[tid ^ 4] & (0x0f0f0f0f << (tid & 4))) << (~tid & 4)) >> (tid & 4));
+// //shared.data[tid] = (shared.data[tid] & (0x33333333 << (tid & 2))) | (((shared.data[tid ^ 2] & (0x33333333 << (tid & 2))) << (~tid & 2)) >> (tid & 2));
+// //shared.data[tid] = (shared.data[tid] & (0x55555555 << (tid & 1))) | (((shared.data[tid ^ 1] & (0x55555555 << (tid & 1))) << (~tid & 1)) >> (tid & 1));
+// //shared.data[tid] = __popc(shared.data[tid]);
+//
+// barrier(CLK_LOCAL_MEM_FENCE);
+// shared.data[tid + (tid / psize)] = s;
+// //shared.data[tid] = s;
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// s = (psize - shared.task.data.residualOrder * (get_local_id(0) + get_group_id(0) == 0)) * (get_local_id(1) + 1);
+// int dpos = __mul24(get_local_id(0), psize + 1);
+// //int dpos = __mul24(get_local_id(0), psize);
+// // calc number of unary bits for part get_local_id(0) with rice paramater get_local_id(1)
+//#pragma unroll 0
+// for (int i = 0; i < psize; i++)
+// s += shared.data[dpos + i] >> get_local_id(1);
+//
+// // output length
+// const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_local_id(1) << (max_porder + 1));
+// if (get_local_id(1) <= 14 && get_local_id(0) < parts)
+// partition_lengths[pos + get_group_id(0) * parts_per_block + get_local_id(0)] = s;
+//}
+//
+//__kernel void cudaCalcPartition16(
+// int* partition_lengths,
+// int* residual,
+// int* samples,
+// FLACCLSubframeTask *tasks,
+// int max_porder, // <= 8
+// int psize, // == 16
+// int parts_per_block // == 16
+// )
+//{
+// __local struct {
+// int data[256+32];
+// FLACCLSubframeTask task;
+// } shared;
+// const int tid = get_local_id(0) + (get_local_id(1) << 4);
+// if (tid < sizeof(shared.task) / sizeof(int))
+// ((int*)&shared.task)[tid] = ((int*)(&tasks[get_group_id(1)]))[tid];
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// const int offs = (get_group_id(0) << 8) + tid;
+//
+// // fetch samples
+// if (tid < 32) shared.data[tid] = min(offs, tid + shared.task.data.residualOrder) >= 32 ? samples[shared.task.data.samplesOffs + offs - 32] >> shared.task.data.wbits : 0;
+// shared.data[32 + tid] = samples[shared.task.data.samplesOffs + offs] >> shared.task.data.wbits;
+// // if (tid < 32 && tid >= shared.task.data.residualOrder)
+// //shared.task.coefs[tid] = 0;
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// // compute residual
+// int s = 0;
+// for (int c = -shared.task.data.residualOrder; c < 0; c++)
+// s += __mul24(shared.data[32 + tid + c], shared.task.coefs[shared.task.data.residualOrder + c]);
+// // int spos = 32 + tid - shared.task.data.residualOrder;
+// // int s=
+// //__mul24(shared.data[spos + 0], shared.task.coefs[0]) + __mul24(shared.data[spos + 1], shared.task.coefs[1]) +
+// //__mul24(shared.data[spos + 2], shared.task.coefs[2]) + __mul24(shared.data[spos + 3], shared.task.coefs[3]) +
+// //__mul24(shared.data[spos + 4], shared.task.coefs[4]) + __mul24(shared.data[spos + 5], shared.task.coefs[5]) +
+// //__mul24(shared.data[spos + 6], shared.task.coefs[6]) + __mul24(shared.data[spos + 7], shared.task.coefs[7]) +
+// //__mul24(shared.data[spos + 8], shared.task.coefs[8]) + __mul24(shared.data[spos + 9], shared.task.coefs[9]) +
+// //__mul24(shared.data[spos + 10], shared.task.coefs[10]) + __mul24(shared.data[spos + 11], shared.task.coefs[11]) +
+// //__mul24(shared.data[spos + 12], shared.task.coefs[12]) + __mul24(shared.data[spos + 13], shared.task.coefs[13]) +
+// //__mul24(shared.data[spos + 14], shared.task.coefs[14]) + __mul24(shared.data[spos + 15], shared.task.coefs[15]);
+// s = shared.data[32 + tid] - (s >> shared.task.data.shift);
+//
+// if (get_group_id(0) != 0 || tid >= shared.task.data.residualOrder)
+// residual[shared.task.data.residualOffs + (get_group_id(0) << 8) + tid] = s;
+// else
+// s = 0;
+//
+// // convert to unsigned
+// s = min(0xfffff, (s << 1) ^ (s >> 31));
+// barrier(CLK_LOCAL_MEM_FENCE);
+// shared.data[tid + get_local_id(1)] = s;
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// // calc number of unary bits for part get_local_id(0) with rice paramater get_local_id(1)
+// int dpos = __mul24(get_local_id(0), 17);
+// int sum =
+// (shared.data[dpos + 0] >> get_local_id(1)) + (shared.data[dpos + 1] >> get_local_id(1)) +
+// (shared.data[dpos + 2] >> get_local_id(1)) + (shared.data[dpos + 3] >> get_local_id(1)) +
+// (shared.data[dpos + 4] >> get_local_id(1)) + (shared.data[dpos + 5] >> get_local_id(1)) +
+// (shared.data[dpos + 6] >> get_local_id(1)) + (shared.data[dpos + 7] >> get_local_id(1)) +
+// (shared.data[dpos + 8] >> get_local_id(1)) + (shared.data[dpos + 9] >> get_local_id(1)) +
+// (shared.data[dpos + 10] >> get_local_id(1)) + (shared.data[dpos + 11] >> get_local_id(1)) +
+// (shared.data[dpos + 12] >> get_local_id(1)) + (shared.data[dpos + 13] >> get_local_id(1)) +
+// (shared.data[dpos + 14] >> get_local_id(1)) + (shared.data[dpos + 15] >> get_local_id(1));
+//
+// // output length
+// const int pos = ((15 * get_group_id(1) + get_local_id(1)) << (max_porder + 1)) + (get_group_id(0) << 4) + get_local_id(0);
+// if (get_local_id(1) <= 14)
+// partition_lengths[pos] = sum + (16 - shared.task.data.residualOrder * (get_local_id(0) + get_group_id(0) == 0)) * (get_local_id(1) + 1);
+//}
+//
+//__kernel void cudaCalcLargePartition(
+// int* partition_lengths,
+// int* residual,
+// int* samples,
+// FLACCLSubframeTask *tasks,
+// int max_porder, // <= 8
+// int psize, // == >= 128
+// int parts_per_block // == 1
+// )
+//{
+// __local struct {
+// int data[256];
+// volatile int length[256];
+// FLACCLSubframeTask task;
+// } shared;
+// const int tid = get_local_id(0) + (get_local_id(1) << 4);
+// if (tid < sizeof(shared.task) / sizeof(int))
+// ((int*)&shared.task)[tid] = ((int*)(&tasks[get_group_id(1)]))[tid];
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// int sum = 0;
+// for (int pos = 0; pos < psize; pos += 256)
+// {
+// // fetch residual
+// int offs = get_group_id(0) * psize + pos + tid;
+// int s = (offs >= shared.task.data.residualOrder && pos + tid < psize) ? residual[shared.task.data.residualOffs + offs] : 0;
+// // convert to unsigned
+// shared.data[tid] = min(0xfffff, (s << 1) ^ (s >> 31));
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+//	// calc number of unary bits for each residual sample with each rice parameter
+//#pragma unroll 0
+// for (int i = get_local_id(0); i < min(psize,256); i += 16)
+//	    // for sample (i + get_local_id(0)) with this rice parameter (get_local_id(1))
+// sum += shared.data[i] >> get_local_id(1);
+// barrier(CLK_LOCAL_MEM_FENCE);
+// }
+// shared.length[tid] = min(0xfffff,sum);
+// SUM16(shared.length,tid,+=);
+//
+// // output length
+// const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_local_id(1) << (max_porder + 1));
+// if (get_local_id(1) <= 14 && get_local_id(0) == 0)
+// partition_lengths[pos + get_group_id(0)] = min(0xfffff,shared.length[tid]) + (psize - shared.task.data.residualOrder * (get_group_id(0) == 0)) * (get_local_id(1) + 1);
+//}
+//
+//// Sums partition lengths for a certain k == get_group_id(0)
+//// Requires 128 threads
+//__kernel void cudaSumPartition(
+// int* partition_lengths,
+// int max_porder
+// )
+//{
+// __local struct {
+// volatile int data[512+32]; // max_porder <= 8, data length <= 1 << 9.
+// } shared;
+//
+// const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1));
+//
+// // fetch partition lengths
+// shared.data[get_local_id(0)] = get_local_id(0) < (1 << max_porder) ? partition_lengths[pos + get_local_id(0)] : 0;
+// shared.data[get_local_size(0) + get_local_id(0)] = get_local_size(0) + get_local_id(0) < (1 << max_porder) ? partition_lengths[pos + get_local_size(0) + get_local_id(0)] : 0;
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// int in_pos = (get_local_id(0) << 1);
+// int out_pos = (1 << max_porder) + get_local_id(0);
+// int bs;
+// for (bs = 1 << (max_porder - 1); bs > 32; bs >>= 1)
+// {
+// if (get_local_id(0) < bs) shared.data[out_pos] = shared.data[in_pos] + shared.data[in_pos + 1];
+// in_pos += bs << 1;
+// out_pos += bs;
+// barrier(CLK_LOCAL_MEM_FENCE);
+// }
+// if (get_local_id(0) < 32)
+// for (; bs > 0; bs >>= 1)
+// {
+// shared.data[out_pos] = shared.data[in_pos] + shared.data[in_pos + 1];
+// in_pos += bs << 1;
+// out_pos += bs;
+// }
+// barrier(CLK_LOCAL_MEM_FENCE);
+// if (get_local_id(0) < (1 << max_porder))
+// partition_lengths[pos + (1 << max_porder) + get_local_id(0)] = shared.data[(1 << max_porder) + get_local_id(0)];
+// if (get_local_size(0) + get_local_id(0) < (1 << max_porder))
+// partition_lengths[pos + (1 << max_porder) + get_local_size(0) + get_local_id(0)] = shared.data[(1 << max_porder) + get_local_size(0) + get_local_id(0)];
+//}
+//
+//// Finds optimal rice parameter for up to 16 partitions at a time.
+//// Requires 16x16 threads
+//__kernel void cudaFindRiceParameter(
+// int* rice_parameters,
+// int* partition_lengths,
+// int max_porder
+// )
+//{
+// __local struct {
+// volatile int length[256];
+// volatile int index[256];
+// } shared;
+// const int tid = get_local_id(0) + (get_local_id(1) << 5);
+// const int parts = min(32, 2 << max_porder);
+// const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_local_id(1) << (max_porder + 1));
+//
+// // read length for 32 partitions
+// int l1 = (get_local_id(0) < parts) ? partition_lengths[pos + get_group_id(0) * 32 + get_local_id(0)] : 0xffffff;
+// int l2 = (get_local_id(1) + 8 <= 14 && get_local_id(0) < parts) ? partition_lengths[pos + (8 << (max_porder + 1)) + get_group_id(0) * 32 + get_local_id(0)] : 0xffffff;
+// // find best rice parameter
+// shared.index[tid] = get_local_id(1) + ((l2 < l1) << 3);
+// shared.length[tid] = l1 = min(l1, l2);
+// barrier(CLK_LOCAL_MEM_FENCE);
+//#pragma unroll 3
+// for (int sh = 7; sh >= 5; sh --)
+// {
+// if (tid < (1 << sh))
+// {
+// l2 = shared.length[tid + (1 << sh)];
+// shared.index[tid] = shared.index[tid + ((l2 < l1) << sh)];
+// shared.length[tid] = l1 = min(l1, l2);
+// }
+// barrier(CLK_LOCAL_MEM_FENCE);
+// }
+// if (tid < parts)
+// {
+// // output rice parameter
+// rice_parameters[(get_group_id(1) << (max_porder + 2)) + get_group_id(0) * parts + tid] = shared.index[tid];
+// // output length
+// rice_parameters[(get_group_id(1) << (max_porder + 2)) + (1 << (max_porder + 1)) + get_group_id(0) * parts + tid] = shared.length[tid];
+// }
+//}
+//
+//__kernel void cudaFindPartitionOrder(
+// int* best_rice_parameters,
+// FLACCLSubframeTask *tasks,
+// int* rice_parameters,
+// int max_porder
+// )
+//{
+// __local struct {
+// int data[512];
+// volatile int tmp[256];
+// int length[32];
+// int index[32];
+// //char4 ch[64];
+// FLACCLSubframeTask task;
+// } shared;
+// const int pos = (get_group_id(1) << (max_porder + 2)) + (2 << max_porder);
+// if (get_local_id(0) < sizeof(shared.task) / sizeof(int))
+// ((int*)&shared.task)[get_local_id(0)] = ((int*)(&tasks[get_group_id(1)]))[get_local_id(0)];
+// // fetch partition lengths
+// shared.data[get_local_id(0)] = get_local_id(0) < (2 << max_porder) ? rice_parameters[pos + get_local_id(0)] : 0;
+// shared.data[get_local_id(0) + 256] = get_local_id(0) + 256 < (2 << max_porder) ? rice_parameters[pos + 256 + get_local_id(0)] : 0;
+// barrier(CLK_LOCAL_MEM_FENCE);
+//
+// for (int porder = max_porder; porder >= 0; porder--)
+// {
+// shared.tmp[get_local_id(0)] = (get_local_id(0) < (1 << porder)) * shared.data[(2 << max_porder) - (2 << porder) + get_local_id(0)];
+// barrier(CLK_LOCAL_MEM_FENCE);
+// SUM256(shared.tmp, get_local_id(0), +=);
+// if (get_local_id(0) == 0)
+// shared.length[porder] = shared.tmp[0] + (4 << porder);
+// barrier(CLK_LOCAL_MEM_FENCE);
+// }
+//
+// if (get_local_id(0) < 32)
+// {
+// shared.index[get_local_id(0)] = get_local_id(0);
+// if (get_local_id(0) > max_porder)
+// shared.length[get_local_id(0)] = 0xfffffff;
+// int l1 = shared.length[get_local_id(0)];
+// #pragma unroll 4
+// for (int sh = 3; sh >= 0; sh --)
+// {
+// int l2 = shared.length[get_local_id(0) + (1 << sh)];
+// shared.index[get_local_id(0)] = shared.index[get_local_id(0) + ((l2 < l1) << sh)];
+// shared.length[get_local_id(0)] = l1 = min(l1, l2);
+// }
+// if (get_local_id(0) == 0)
+// tasks[get_group_id(1)].data.porder = shared.index[0];
+// if (get_local_id(0) == 0)
+// {
+// int obits = shared.task.data.obits - shared.task.data.wbits;
+// tasks[get_group_id(1)].data.size =
+// shared.task.data.type == Fixed ? shared.task.data.residualOrder * obits + 6 + l1 :
+// shared.task.data.type == LPC ? shared.task.data.residualOrder * obits + 6 + l1 + 4 + 5 + shared.task.data.residualOrder * shared.task.data.cbits :
+// shared.task.data.type == Constant ? obits : obits * shared.task.data.blocksize;
+// }
+// }
+// barrier(CLK_LOCAL_MEM_FENCE);
+// int porder = shared.index[0];
+// if (get_local_id(0) < (1 << porder))
+// best_rice_parameters[(get_group_id(1) << max_porder) + get_local_id(0)] = rice_parameters[pos - (2 << porder) + get_local_id(0)];
+// // FIXME: should be bytes?
+// // if (get_local_id(0) < (1 << porder))
+// //shared.tmp[get_local_id(0)] = rice_parameters[pos - (2 << porder) + get_local_id(0)];
+// // barrier(CLK_LOCAL_MEM_FENCE);
+// // if (get_local_id(0) < max(1, (1 << porder) >> 2))
+// // {
+// //char4 ch;
+// //ch.x = shared.tmp[(get_local_id(0) << 2)];
+// //ch.y = shared.tmp[(get_local_id(0) << 2) + 1];
+// //ch.z = shared.tmp[(get_local_id(0) << 2) + 2];
+// //ch.w = shared.tmp[(get_local_id(0) << 2) + 3];
+// //shared.ch[get_local_id(0)] = ch
+// // }
+// // barrier(CLK_LOCAL_MEM_FENCE);
+// // if (get_local_id(0) < max(1, (1 << porder) >> 2))
+// //best_rice_parameters[(get_group_id(1) << max_porder) + get_local_id(0)] = shared.ch[get_local_id(0)];
+//}
+//
+//#endif
+//
+//#if 0
+// if (get_local_id(0) < order)
+// {
+// for (int i = 0; i < order; i++)
+// if (get_local_id(0) >= i)
+// sum[get_local_id(0) - i] += coefs[get_local_id(0)] * sample[order - i - 1];
+//	for (int i = order; i < blocksize; i++)
+// {
+// if (!get_local_id(0)) sample[order + i] = s = residual[order + i] + (sum[order + i] >> shift);
+// sum[get_local_id(0) + i + 1] += coefs[get_local_id(0)] * s;
+// }
+// }
+//#endif
+#endif