diff --git a/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj b/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj new file mode 100644 index 0000000..db38a62 --- /dev/null +++ b/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj @@ -0,0 +1,73 @@ + + + Debug + AnyCPU + 8.0.50727 + 2.0 + {DFE50673-906C-4B8F-993B-A24CAD1CA17D} + Library + Properties + CUETools.Codecs.FlaCuda + CUETools.Codecs.FlaCuda + + + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + true + + + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + true + + + + False + ..\..\CUDANET\lib\CUDA.NET.dll + + + + + + + + + + + + {082D6B9E-326E-4D15-9798-EDAE9EDE70A6} + CUETools.Codecs.FLAKE + + + {6458A13A-30EF-45A9-9D58-E5031B17BEE2} + CUETools.Codecs + + + + + PreserveNewest + "%24%28CUDA_BIN_PATH%29\nvcc.exe" flacuda.cu --cubin -cbin "%24%28VCInstallDir%29bin" + + + + + + nvcc flacuda.cu --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include" + + + \ No newline at end of file diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs new file mode 100644 index 0000000..b6e5a14 --- /dev/null +++ b/CUETools.FlaCuda/FlaCudaWriter.cs @@ -0,0 +1,2003 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Security.Cryptography; +using System.Text; +using System.Runtime.InteropServices; +using CUETools.Codecs; +using CUETools.Codecs.FLAKE; +using GASS.CUDA; +using GASS.CUDA.Types; + +namespace CUETools.Codecs.FlaCuda +{ + public class FlaCudaWriter : IAudioDest + { + Stream _IO = null; + string _path; + long _position; + + // number of audio channels + // valid values are 1 to 8 + int channels, ch_code; + + // audio sample rate in Hz + int sample_rate, sr_code0, sr_code1; + + // sample size in bits + // only 16-bit is currently supported + uint bits_per_sample; + int bps_code; + + // total stream samples + // if 0, stream length is unknown + int sample_count; + + FlakeEncodeParams eparams; + + // maximum frame size in bytes + // this can be used to allocate memory for output + int max_frame_size; + + byte[] frame_buffer = null; + + int frame_count = 0; + + long first_frame_offset = 0; + + TimeSpan _userProcessorTime; + + // header bytes + // allocated by flake_encode_init and freed by flake_encode_close + byte[] header; + + int[] samplesBuffer; + int[] verifyBuffer; + int[] residualBuffer; + float[] windowBuffer; + int[] autocorBuffer3int; + float[] autocorBuffer; + int samplesInBuffer = 0; + + int _compressionLevel = 7; + int _blocksize = 0; + int _totalSize = 0; + int _windowsize = 0, _windowcount = 0; + + Crc8 crc8; + Crc16 crc16; + MD5 md5; + + FlacFrame frame; + FlakeReader verify; + + SeekPoint[] seek_table; + int seek_table_offset = -1; + + bool inited = false; + + CUDA cuda; + CUfunction cudaComputeAutocor; + CUfunction cudaEncodeResidual; + CUdeviceptr cudaSamples; + CUdeviceptr cudaWindow; + CUdeviceptr cudaAutocor; + CUdeviceptr cudaCoeffs; + CUdeviceptr cudaShifts; + + public const int MAX_BLOCKSIZE = 8192; + + public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO) + { + if (bitsPerSample != 16) + throw new Exception("Bits per sample must be 16."); + if (channelCount != 2) + throw new Exception("ChannelCount must be 2."); + + channels = channelCount; + sample_rate = sampleRate; + bits_per_sample = (uint) bitsPerSample; + + // flake_validate_params + + _path = path; + _IO = IO; + + samplesBuffer = 
new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)]; + residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)]; + windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS]; + autocorBuffer3int = new int[(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * 64]; + autocorBuffer = new float[(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * 22]; + + eparams.flake_set_defaults(_compressionLevel); + eparams.padding_size = 8192; + + crc8 = new Crc8(); + crc16 = new Crc16(); + frame = new FlacFrame(channels * 2); + } + + public int TotalSize + { + get + { + return _totalSize; + } + } + + public int PaddingLength + { + get + { + return eparams.padding_size; + } + set + { + eparams.padding_size = value; + } + } + + public int CompressionLevel + { + get + { + return _compressionLevel; + } + set + { + if (value < 0 || value > 11) + throw new Exception("unsupported compression level"); + _compressionLevel = value; + eparams.flake_set_defaults(_compressionLevel); + } + } + + //[DllImport("kernel32.dll")] + //static extern bool GetThreadTimes(IntPtr hThread, out long lpCreationTime, out long lpExitTime, out long lpKernelTime, out long lpUserTime); + //[DllImport("kernel32.dll")] + //static extern IntPtr GetCurrentThread(); + + void DoClose() + { + if (inited) + { + while (samplesInBuffer > 0) + { + eparams.block_size = samplesInBuffer; + output_frame(); + } + + if (_IO.CanSeek) + { + if (md5 != null) + { + md5.TransformFinalBlock(frame_buffer, 0, 0); + _IO.Position = 26; + _IO.Write(md5.Hash, 0, md5.Hash.Length); + } + + if (seek_table != null) + { + _IO.Position = seek_table_offset; + int len = write_seekpoints(header, 0, 0); + _IO.Write(header, 4, len - 4); + } + } + _IO.Close(); + + cuda.Free(cudaWindow); + cuda.Free(cudaSamples); + cuda.Free(cudaAutocor); + cuda.Free(cudaCoeffs); + cuda.Free(cudaShifts); + cuda.Dispose(); + inited = false; + } + + //long fake, KernelStart, UserStart; + //GetThreadTimes(GetCurrentThread(), out fake, out fake, out KernelStart, out UserStart); + //_userProcessorTime = new TimeSpan(UserStart); + } + + public void Close() + { + DoClose(); + if (sample_count != 0 && _position != sample_count) + throw new Exception("Samples written differs from the expected sample count."); + } + + public void Delete() + { + if (inited) + { + _IO.Close(); + cuda.Free(cudaWindow); + cuda.Free(cudaSamples); + cuda.Free(cudaAutocor); + cuda.Free(cudaCoeffs); + cuda.Free(cudaShifts); + cuda.Dispose(); + inited = false; + } + + if (_path != "") + File.Delete(_path); + } + + public long Position + { + get + { + return _position; + } + } + + public long FinalSampleCount + { + set { sample_count = (int)value; } + } + + public long BlockSize + { + set { _blocksize = (int)value; } + get { return _blocksize == 0 ? 
eparams.block_size : _blocksize; } + } + + public OrderMethod OrderMethod + { + get { return eparams.order_method; } + set { eparams.order_method = value; } + } + + public PredictionType PredictionType + { + get { return eparams.prediction_type; } + set { eparams.prediction_type = value; } + } + + public StereoMethod StereoMethod + { + get { return eparams.stereo_method; } + set { eparams.stereo_method = value; } + } + + public int MinPrecisionSearch + { + get { return eparams.lpc_min_precision_search; } + set + { + if (value < 0 || value > eparams.lpc_max_precision_search) + throw new Exception("unsupported MinPrecisionSearch value"); + eparams.lpc_min_precision_search = value; + } + } + + public int MaxPrecisionSearch + { + get { return eparams.lpc_max_precision_search; } + set + { + if (value < eparams.lpc_min_precision_search || value >= lpc.MAX_LPC_PRECISIONS) + throw new Exception("unsupported MaxPrecisionSearch value"); + eparams.lpc_max_precision_search = value; + } + } + + public WindowFunction WindowFunction + { + get { return eparams.window_function; } + set { eparams.window_function = value; } + } + + public bool DoMD5 + { + get { return eparams.do_md5; } + set { eparams.do_md5 = value; } + } + + public bool DoVerify + { + get { return eparams.do_verify; } + set { eparams.do_verify = value; } + } + + public bool DoSeekTable + { + get { return eparams.do_seektable; } + set { eparams.do_seektable = value; } + } + + public int VBRMode + { + get { return eparams.variable_block_size; } + set { eparams.variable_block_size = value; } + } + + public int MinPredictionOrder + { + get + { + return PredictionType == PredictionType.Fixed ? + MinFixedOrder : MinLPCOrder; + } + set + { + if (PredictionType == PredictionType.Fixed) + MinFixedOrder = value; + else + MinLPCOrder = value; + } + } + + public int MaxPredictionOrder + { + get + { + return PredictionType == PredictionType.Fixed ? 
+ MaxFixedOrder : MaxLPCOrder; + } + set + { + if (PredictionType == PredictionType.Fixed) + MaxFixedOrder = value; + else + MaxLPCOrder = value; + } + } + + public int MinLPCOrder + { + get + { + return eparams.min_prediction_order; + } + set + { + if (value < 1 || value > eparams.max_prediction_order) + throw new Exception("invalid MinLPCOrder " + value.ToString()); + eparams.min_prediction_order = value; + } + } + + public int MaxLPCOrder + { + get + { + return eparams.max_prediction_order; + } + set + { + if (value > lpc.MAX_LPC_ORDER || value < eparams.min_prediction_order) + throw new Exception("invalid MaxLPCOrder " + value.ToString()); + eparams.max_prediction_order = value; + } + } + + public int EstimationDepth + { + get + { + return eparams.estimation_depth; + } + set + { + if (value > 32 || value < 1) + throw new Exception("invalid estimation_depth " + value.ToString()); + eparams.estimation_depth = value; + } + } + + public int MinFixedOrder + { + get + { + return eparams.min_fixed_order; + } + set + { + if (value < 0 || value > eparams.max_fixed_order) + throw new Exception("invalid MinFixedOrder " + value.ToString()); + eparams.min_fixed_order = value; + } + } + + public int MaxFixedOrder + { + get + { + return eparams.max_fixed_order; + } + set + { + if (value > 4 || value < eparams.min_fixed_order) + throw new Exception("invalid MaxFixedOrder " + value.ToString()); + eparams.max_fixed_order = value; + } + } + + public int MinPartitionOrder + { + get { return eparams.min_partition_order; } + set + { + if (value < 0 || value > eparams.max_partition_order) + throw new Exception("invalid MinPartitionOrder " + value.ToString()); + eparams.min_partition_order = value; + } + } + + public int MaxPartitionOrder + { + get { return eparams.max_partition_order; } + set + { + if (value > 8 || value < eparams.min_partition_order) + throw new Exception("invalid MaxPartitionOrder " + value.ToString()); + eparams.max_partition_order = value; + } + } + + public TimeSpan UserProcessorTime + { + get { return _userProcessorTime; } + } + + public int BitsPerSample + { + get { return 16; } + } + + unsafe uint get_wasted_bits(int* signal, int samples) + { + int i, shift; + int x = 0; + + for (i = 0; i < samples && 0 == (x & 1); i++) + x |= signal[i]; + + if (x == 0) + { + shift = 0; + } + else + { + for (shift = 0; 0 == (x & 1); shift++) + x >>= 1; + } + + if (shift > 0) + { + for (i = 0; i < samples; i++) + signal[i] >>= shift; + } + + return (uint)shift; + } + + /// + /// Copy channel-interleaved input samples into separate subframes + /// + /// + /// + /// + unsafe void copy_samples(int[,] samples, int pos, int block) + { + fixed (int* fsamples = samplesBuffer, src = &samples[pos, 0]) + { + if (channels == 2) + AudioSamples.Deinterlace(fsamples + samplesInBuffer, fsamples + FlaCudaWriter.MAX_BLOCKSIZE + samplesInBuffer, src, block); + else + for (int ch = 0; ch < channels; ch++) + { + int* psamples = fsamples + ch * FlaCudaWriter.MAX_BLOCKSIZE + samplesInBuffer; + for (int i = 0; i < block; i++) + psamples[i] = src[i * channels + ch]; + } + } + samplesInBuffer += block; + } + + static uint rice_encode_count(uint sum, uint n, uint k) + { + return n*(k+1) + ((sum-(n>>1))>>(int)k); + } + + //static unsafe uint find_optimal_rice_param(uint sum, uint n) + //{ + // uint* nbits = stackalloc uint[Flake.MAX_RICE_PARAM + 1]; + // int k_opt = 0; + + // nbits[0] = UINT32_MAX; + // for (int k = 0; k <= Flake.MAX_RICE_PARAM; k++) + // { + // nbits[k] = rice_encode_count(sum, n, (uint)k); + // if (nbits[k] < 
nbits[k_opt]) + // k_opt = k; + // } + // return (uint)k_opt; + //} + + static unsafe int find_optimal_rice_param(uint sum, uint n, out uint nbits_best) + { + int k_opt = 0; + uint a = n; + uint b = sum - (n >> 1); + uint nbits = a + b; + for (int k = 1; k <= Flake.MAX_RICE_PARAM; k++) + { + a += n; + b >>= 1; + uint nbits_k = a + b; + if (nbits_k < nbits) + { + k_opt = k; + nbits = nbits_k; + } + } + nbits_best = nbits; + return k_opt; + } + + unsafe uint calc_decorr_score(FlacFrame frame, int ch) + { + int* s = frame.subframes[ch].samples; + int n = frame.blocksize; + ulong sum = 0; + for (int i = 2; i < n; i++) + sum += (ulong)Math.Abs(s[i] - 2 * s[i - 1] + s[i - 2]); + uint nbits; + find_optimal_rice_param((uint)(2 * sum), (uint)n, out nbits); + return nbits; + } + + unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize) + { + for (int i = 0; i < blocksize; i++) + { + leftM[i] = (leftS[i] + rightS[i]) >> 1; + rightM[i] = leftS[i] - rightS[i]; + } + } + + unsafe void encode_residual_verbatim(int* res, int* smp, uint n) + { + AudioSamples.MemCpy(res, smp, (int) n); + } + + unsafe void encode_residual_fixed(int* res, int* smp, int n, int order) + { + int i; + int s0, s1, s2; + switch (order) + { + case 0: + AudioSamples.MemCpy(res, smp, n); + return; + case 1: + *(res++) = s1 = *(smp++); + for (i = n - 1; i > 0; i--) + { + s0 = *(smp++); + *(res++) = s0 - s1; + s1 = s0; + } + return; + case 2: + *(res++) = s2 = *(smp++); + *(res++) = s1 = *(smp++); + for (i = n - 2; i > 0; i--) + { + s0 = *(smp++); + *(res++) = s0 - 2 * s1 + s2; + s2 = s1; + s1 = s0; + } + return; + case 3: + res[0] = smp[0]; + res[1] = smp[1]; + res[2] = smp[2]; + for (i = 3; i < n; i++) + { + res[i] = smp[i] - 3 * smp[i - 1] + 3 * smp[i - 2] - smp[i - 3]; + } + return; + case 4: + res[0] = smp[0]; + res[1] = smp[1]; + res[2] = smp[2]; + res[3] = smp[3]; + for (i = 4; i < n; i++) + { + res[i] = smp[i] - 4 * smp[i - 1] + 6 * smp[i - 2] - 4 * smp[i - 3] + smp[i - 4]; + } + return; + default: + return; + } + } + + static unsafe uint calc_optimal_rice_params(ref RiceContext rc, int porder, uint* sums, uint n, uint pred_order) + { + uint part = (1U << porder); + uint all_bits = 0; + rc.rparams[0] = find_optimal_rice_param(sums[0], (n >> porder) - pred_order, out all_bits); + uint cnt = (n >> porder); + for (uint i = 1; i < part; i++) + { + uint nbits; + rc.rparams[i] = find_optimal_rice_param(sums[i], cnt, out nbits); + all_bits += nbits; + } + all_bits += (4 * part); + rc.porder = porder; + return all_bits; + } + + static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) + { + // sums for highest level + int parts = (1 << pmax); + uint* res = data + pred_order; + uint cnt = (n >> pmax) - pred_order; + uint sum = 0; + for (uint j = cnt; j > 0; j--) + sum += *(res++); + sums[pmax * Flake.MAX_PARTITIONS + 0] = sum; + cnt = (n >> pmax); + for (int i = 1; i < parts; i++) + { + sum = 0; + for (uint j = cnt; j > 0; j--) + sum += *(res++); + sums[pmax * Flake.MAX_PARTITIONS + i] = sum; + } + // sums for lower levels + for (int i = pmax - 1; i >= pmin; i--) + { + parts = (1 << i); + for (int j = 0; j < parts; j++) + { + sums[i * Flake.MAX_PARTITIONS + j] = + sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j] + + sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j + 1]; + } + } + } + + static unsafe uint calc_rice_params(ref RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order) + { + RiceContext tmp_rc = new RiceContext(), tmp_rc2; + uint* 
udata = stackalloc uint[(int)n]; + uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS]; + //uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER]; + + //assert(pmin >= 0 && pmin <= Flake.MAX_PARTITION_ORDER); + //assert(pmax >= 0 && pmax <= Flake.MAX_PARTITION_ORDER); + //assert(pmin <= pmax); + + for (uint i = 0; i < n; i++) + udata[i] = (uint) ((2 * data[i]) ^ (data[i] >> 31)); + + calc_sums(pmin, pmax, udata, n, pred_order, sums); + + int opt_porder = pmin; + uint opt_bits = AudioSamples.UINT32_MAX; + for (int i = pmin; i <= pmax; i++) + { + uint bits = calc_optimal_rice_params(ref tmp_rc, i, sums + i * Flake.MAX_PARTITIONS, n, pred_order); + if (bits <= opt_bits) + { + opt_porder = i; + opt_bits = bits; + tmp_rc2 = rc; + rc = tmp_rc; + tmp_rc = tmp_rc2; + } + } + + return opt_bits; + } + + static int get_max_p_order(int max_porder, int n, int order) + { + int porder = Math.Min(max_porder, BitReader.log2i(n ^ (n - 1))); + if (order > 0) + porder = Math.Min(porder, BitReader.log2i(n / order)); + return porder; + } + + static unsafe uint calc_rice_params_fixed(ref RiceContext rc, int pmin, int pmax, + int* data, int n, int pred_order, uint bps) + { + pmin = get_max_p_order(pmin, n, pred_order); + pmax = get_max_p_order(pmax, n, pred_order); + uint bits = (uint)pred_order * bps + 6; + bits += calc_rice_params(ref rc, pmin, pmax, data, (uint)n, (uint)pred_order); + return bits; + } + + static unsafe uint calc_rice_params_lpc(ref RiceContext rc, int pmin, int pmax, + int* data, int n, int pred_order, uint bps, uint precision) + { + pmin = get_max_p_order(pmin, n, pred_order); + pmax = get_max_p_order(pmax, n, pred_order); + uint bits = (uint)pred_order * bps + 4 + 5 + (uint)pred_order * precision + 6; + bits += calc_rice_params(ref rc, pmin, pmax, data, (uint)n, (uint)pred_order); + return bits; + } + + // select LPC precision based on block size + static uint get_precision(int blocksize) + { + uint lpc_precision; + if (blocksize <= 192) lpc_precision = 7U; + else if (blocksize <= 384) lpc_precision = 8U; + else if (blocksize <= 576) lpc_precision = 9U; + else if (blocksize <= 1152) lpc_precision = 10U; + else if (blocksize <= 2304) lpc_precision = 11U; + else if (blocksize <= 4608) lpc_precision = 12U; + else if (blocksize <= 8192) lpc_precision = 13U; + else if (blocksize <= 16384) lpc_precision = 14U; + else lpc_precision = 15; + return lpc_precision; + } + + unsafe void encode_residual_lpc_sub(FlacFrame frame, float * lpcs, int iWindow, int order, int ch) + { + // select LPC precision based on block size + uint lpc_precision = get_precision(frame.blocksize); + + for (int i_precision = eparams.lpc_min_precision_search; i_precision <= eparams.lpc_max_precision_search && lpc_precision + i_precision < 16; i_precision++) + // check if we already calculated with this order, window and precision + if ((frame.subframes[ch].lpc_ctx[iWindow].done_lpcs[i_precision] & (1U << (order - 1))) == 0) + { + frame.subframes[ch].lpc_ctx[iWindow].done_lpcs[i_precision] |= (1U << (order - 1)); + + uint cbits = lpc_precision + (uint)i_precision; + + frame.current.type = SubframeType.LPC; + frame.current.order = order; + frame.current.window = iWindow; + + fixed (int* coefs = frame.current.coefs) + { + lpc.quantize_lpc_coefs(lpcs + (frame.current.order - 1) * lpc.MAX_LPC_ORDER, + frame.current.order, cbits, coefs, out frame.current.shift, 15, 0); + + if (frame.current.shift < 0 || frame.current.shift > 15) + throw new Exception("negative shift"); + + ulong csum = 0; + for (int i = 
frame.current.order; i > 0; i--) + csum += (ulong)Math.Abs(coefs[i - 1]); + + if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32) + lpc.encode_residual_long(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift); + else + lpc.encode_residual(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift); + + } + + frame.current.size = calc_rice_params_lpc(ref frame.current.rc, eparams.min_partition_order, eparams.max_partition_order, + frame.current.residual, frame.blocksize, frame.current.order, frame.subframes[ch].obits, cbits); + + frame.ChooseBestSubframe(ch); + } + } + + unsafe void encode_residual_fixed_sub(FlacFrame frame, int order, int ch) + { + if ((frame.subframes[ch].done_fixed & (1U << order)) != 0) + return; // already calculated; + + frame.current.order = order; + frame.current.type = SubframeType.Fixed; + + encode_residual_fixed(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order); + + frame.current.size = calc_rice_params_fixed(ref frame.current.rc, eparams.min_partition_order, eparams.max_partition_order, + frame.current.residual, frame.blocksize, frame.current.order, frame.subframes[ch].obits); + + frame.subframes[ch].done_fixed |= (1U << order); + + frame.ChooseBestSubframe(ch); + } + + unsafe void encode_residual(FlacFrame frame, int ch, PredictionType predict, OrderMethod omethod, int pass) + { + int* smp = frame.subframes[ch].samples; + int i, n = frame.blocksize; + // save best.window, because we can overwrite it later with fixed frame + int best_window = frame.subframes[ch].best.type == SubframeType.LPC ? frame.subframes[ch].best.window : -1; + + // CONSTANT + for (i = 1; i < n; i++) + { + if (smp[i] != smp[0]) break; + } + if (i == n) + { + frame.subframes[ch].best.type = SubframeType.Constant; + frame.subframes[ch].best.residual[0] = smp[0]; + frame.subframes[ch].best.size = frame.subframes[ch].obits; + return; + } + + // VERBATIM + frame.current.type = SubframeType.Verbatim; + frame.current.size = frame.subframes[ch].obits * (uint)frame.blocksize; + frame.ChooseBestSubframe(ch); + + if (n < 5 || predict == PredictionType.None) + return; + + // FIXED + if (predict == PredictionType.Fixed || + (predict == PredictionType.Search && pass != 1) || + //predict == PredictionType.Search || + //(pass == 2 && frame.subframes[ch].best.type == SubframeType.Fixed) || + n <= eparams.max_prediction_order) + { + int max_fixed_order = Math.Min(eparams.max_fixed_order, 4); + int min_fixed_order = Math.Min(eparams.min_fixed_order, max_fixed_order); + + for (i = min_fixed_order; i <= max_fixed_order; i++) + encode_residual_fixed_sub(frame, i, ch); + } + + // LPC + if (n > eparams.max_prediction_order && + (predict == PredictionType.Levinson || + predict == PredictionType.Search) + //predict == PredictionType.Search || + //(pass == 2 && frame.subframes[ch].best.type == SubframeType.LPC)) + ) + { + float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER]; + int min_order = eparams.min_prediction_order; + int max_order = eparams.max_prediction_order; + + for (int iWindow = 0; iWindow < _windowcount; iWindow++) + { + if (pass == 2 && iWindow != best_window) + continue; + + LpcContext lpc_ctx = frame.subframes[ch].lpc_ctx[iWindow]; + + lpc_ctx.GetReflection(max_order, smp, n, frame.window_buffer + iWindow * FlaCudaWriter.MAX_BLOCKSIZE * 2); + lpc_ctx.ComputeLPC(lpcs); + + switch (omethod) + { + case 
OrderMethod.Max: + // always use maximum order + encode_residual_lpc_sub(frame, lpcs, iWindow, max_order, ch); + break; + case OrderMethod.Estimate: + // estimated orders + // Search at reflection coeff thresholds (where they cross 0.10) + { + int found = 0; + for (i = max_order; i >= min_order && found < eparams.estimation_depth; i--) + if (lpc_ctx.IsInterestingOrder(i, max_order)) + { + encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch); + found++; + } + if (0 == found) + encode_residual_lpc_sub(frame, lpcs, iWindow, min_order, ch); + } + break; + case OrderMethod.EstSearch2: + // Search at reflection coeff thresholds (where they cross 0.10) + { + int found = 0; + for (i = min_order; i <= max_order && found < eparams.estimation_depth; i++) + if (lpc_ctx.IsInterestingOrder(i)) + { + encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch); + found++; + } + if (0 == found) + encode_residual_lpc_sub(frame, lpcs, iWindow, min_order, ch); + } + break; + case OrderMethod.Search: + // brute-force optimal order search + for (i = max_order; i >= min_order; i--) + encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch); + break; + case OrderMethod.LogFast: + // Try max, est, 32,16,8,4,2,1 + encode_residual_lpc_sub(frame, lpcs, iWindow, max_order, ch); + for (i = lpc.MAX_LPC_ORDER; i >= min_order; i >>= 1) + if (i < max_order) + encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch); + break; + case OrderMethod.LogSearch: + // do LogFast first + encode_residual_lpc_sub(frame, lpcs, iWindow, max_order, ch); + for (i = lpc.MAX_LPC_ORDER; i >= min_order; i >>= 1) + if (i < max_order) + encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch); + // if found a good order, try to search around it + if (frame.subframes[ch].best.type == SubframeType.LPC) + { + // log search (written by Michael Niedermayer for FFmpeg) + for (int step = lpc.MAX_LPC_ORDER; step > 0; step >>= 1) + { + int last = frame.subframes[ch].best.order; + if (step <= (last + 1) / 2) + for (i = last - step; i <= last + step; i += step) + if (i >= min_order && i <= max_order) + encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch); + } + } + break; + default: + throw new Exception("unknown ordermethod"); + } + } + } + } + + unsafe void encode_selected_residual(FlacFrame frame, int ch, PredictionType predict, OrderMethod omethod, int best_window, int best_order) + { + int* smp = frame.subframes[ch].samples; + int i, n = frame.blocksize; + + // CONSTANT + for (i = 1; i < n; i++) + { + if (smp[i] != smp[0]) break; + } + if (i == n) + { + frame.subframes[ch].best.type = SubframeType.Constant; + frame.subframes[ch].best.residual[0] = smp[0]; + frame.subframes[ch].best.size = frame.subframes[ch].obits; + return; + } + + // VERBATIM + frame.current.type = SubframeType.Verbatim; + frame.current.size = frame.subframes[ch].obits * (uint)frame.blocksize; + frame.ChooseBestSubframe(ch); + + if (n < 5 || predict == PredictionType.None) + return; + + // FIXED + if (predict == PredictionType.Fixed || + (predict == PredictionType.Search) || + n <= eparams.max_prediction_order) + { + int max_fixed_order = Math.Min(eparams.max_fixed_order, 4); + int min_fixed_order = Math.Min(eparams.min_fixed_order, max_fixed_order); + + for (i = min_fixed_order; i <= max_fixed_order; i++) + encode_residual_fixed_sub(frame, i, ch); + } + + // LPC + if (n > eparams.max_prediction_order && + (predict == PredictionType.Levinson || + predict == PredictionType.Search) + ) + { + float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER]; + LpcContext lpc_ctx = 
frame.subframes[ch].lpc_ctx[best_window]; + + lpc_ctx.GetReflection(eparams.max_prediction_order, smp, n, frame.window_buffer + best_window * FlaCudaWriter.MAX_BLOCKSIZE * 2); + lpc_ctx.ComputeLPC(lpcs); + + encode_residual_lpc_sub(frame, lpcs, best_window, best_order, ch); + } + } + + unsafe void output_frame_header(FlacFrame frame, BitWriter bitwriter) + { + bitwriter.writebits(15, 0x7FFC); + bitwriter.writebits(1, eparams.variable_block_size > 0 ? 1 : 0); + bitwriter.writebits(4, frame.bs_code0); + bitwriter.writebits(4, sr_code0); + if (frame.ch_mode == ChannelMode.NotStereo) + bitwriter.writebits(4, ch_code); + else + bitwriter.writebits(4, (int) frame.ch_mode); + bitwriter.writebits(3, bps_code); + bitwriter.writebits(1, 0); + bitwriter.write_utf8(frame_count); + + // custom block size + if (frame.bs_code1 >= 0) + { + if (frame.bs_code1 < 256) + bitwriter.writebits(8, frame.bs_code1); + else + bitwriter.writebits(16, frame.bs_code1); + } + + // custom sample rate + if (sr_code1 > 0) + { + if (sr_code1 < 256) + bitwriter.writebits(8, sr_code1); + else + bitwriter.writebits(16, sr_code1); + } + + // CRC-8 of frame header + bitwriter.flush(); + byte crc = crc8.ComputeChecksum(frame_buffer, 0, bitwriter.Length); + bitwriter.writebits(8, crc); + } + + unsafe void output_residual(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) + { + // rice-encoded block + bitwriter.writebits(2, 0); + + // partition order + int porder = sub.best.rc.porder; + int psize = frame.blocksize >> porder; + //assert(porder >= 0); + bitwriter.writebits(4, porder); + int res_cnt = psize - sub.best.order; + + // residual + int j = sub.best.order; + for (int p = 0; p < (1 << porder); p++) + { + int k = sub.best.rc.rparams[p]; + bitwriter.writebits(4, k); + if (p == 1) res_cnt = psize; + int cnt = Math.Min(res_cnt, frame.blocksize - j); + bitwriter.write_rice_block_signed(k, sub.best.residual + j, cnt); + j += cnt; + } + } + + unsafe void + output_subframe_constant(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) + { + bitwriter.writebits_signed(sub.obits, sub.best.residual[0]); + } + + unsafe void + output_subframe_verbatim(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) + { + int n = frame.blocksize; + for (int i = 0; i < n; i++) + bitwriter.writebits_signed(sub.obits, sub.samples[i]); + // Don't use residual here, because we don't copy samples to residual for verbatim frames. 
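+			// A verbatim subframe stores every sample raw at sub.obits bits; that is why
+			// encode_residual estimates its cost as obits * blocksize before trying predictors.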
+ } + + unsafe void + output_subframe_fixed(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) + { + // warm-up samples + for (int i = 0; i < sub.best.order; i++) + bitwriter.writebits_signed(sub.obits, sub.samples[i]); + + // residual + output_residual(frame, bitwriter, sub); + } + + unsafe void + output_subframe_lpc(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) + { + // warm-up samples + for (int i = 0; i < sub.best.order; i++) + bitwriter.writebits_signed(sub.obits, sub.samples[i]); + + // LPC coefficients + int cbits = 1; + for (int i = 0; i < sub.best.order; i++) + while (cbits < 16 && sub.best.coefs[i] != (sub.best.coefs[i] << (32 - cbits)) >> (32 - cbits)) + cbits++; + bitwriter.writebits(4, cbits - 1); + bitwriter.writebits_signed(5, sub.best.shift); + for (int i = 0; i < sub.best.order; i++) + bitwriter.writebits_signed(cbits, sub.best.coefs[i]); + + // residual + output_residual(frame, bitwriter, sub); + } + + unsafe void output_subframes(FlacFrame frame, BitWriter bitwriter) + { + for (int ch = 0; ch < channels; ch++) + { + FlacSubframeInfo sub = frame.subframes[ch]; + // subframe header + int type_code = (int) sub.best.type; + if (sub.best.type == SubframeType.Fixed) + type_code |= sub.best.order; + if (sub.best.type == SubframeType.LPC) + type_code |= sub.best.order - 1; + bitwriter.writebits(1, 0); + bitwriter.writebits(6, type_code); + bitwriter.writebits(1, sub.wbits != 0 ? 1 : 0); + if (sub.wbits > 0) + bitwriter.writebits((int)sub.wbits, 1); + + // subframe + switch (sub.best.type) + { + case SubframeType.Constant: + output_subframe_constant(frame, bitwriter, sub); + break; + case SubframeType.Verbatim: + output_subframe_verbatim(frame, bitwriter, sub); + break; + case SubframeType.Fixed: + output_subframe_fixed(frame, bitwriter, sub); + break; + case SubframeType.LPC: + output_subframe_lpc(frame, bitwriter, sub); + break; + } + } + } + + void output_frame_footer(BitWriter bitwriter) + { + bitwriter.flush(); + ushort crc = crc16.ComputeChecksum(frame_buffer, 0, bitwriter.Length); + bitwriter.writebits(16, crc); + bitwriter.flush(); + } + + unsafe void encode_residual_pass1(FlacFrame frame, int ch) + { + int max_prediction_order = eparams.max_prediction_order; + int max_fixed_order = eparams.max_fixed_order; + int min_fixed_order = eparams.min_fixed_order; + int lpc_min_precision_search = eparams.lpc_min_precision_search; + int lpc_max_precision_search = eparams.lpc_max_precision_search; + int max_partition_order = eparams.max_partition_order; + int estimation_depth = eparams.estimation_depth; + eparams.min_fixed_order = 2; + eparams.max_fixed_order = 2; + eparams.lpc_min_precision_search = eparams.lpc_max_precision_search; + eparams.max_prediction_order = 8; + eparams.estimation_depth = 1; + encode_residual(frame, ch, eparams.prediction_type, OrderMethod.Estimate, 1); + eparams.min_fixed_order = min_fixed_order; + eparams.max_fixed_order = max_fixed_order; + eparams.max_prediction_order = max_prediction_order; + eparams.lpc_min_precision_search = lpc_min_precision_search; + eparams.lpc_max_precision_search = lpc_max_precision_search; + eparams.max_partition_order = max_partition_order; + eparams.estimation_depth = estimation_depth; + } + + unsafe void encode_residual_pass2(FlacFrame frame, int ch) + { + encode_residual(frame, ch, eparams.prediction_type, eparams.order_method, 2); + } + + unsafe void encode_residual_onepass(FlacFrame frame, int ch) + { + if (_windowcount > 1) + { + encode_residual_pass1(frame, ch); + encode_residual_pass2(frame, ch); 
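+				// Pass 1 above runs a deliberately cheap search (fixed order 2 only, LPC order
+				// capped at 8, a single coefficient precision) just to pick the most promising
+				// window per channel; pass 2 then repeats the full search restricted to that window.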
+ } else + encode_residual(frame, ch, eparams.prediction_type, eparams.order_method, 0); + } + + unsafe uint measure_frame_size(FlacFrame frame, bool do_midside) + { + // crude estimation of header/footer size + uint total = (uint)(32 + ((BitReader.log2i(frame_count) + 4) / 5) * 8 + (eparams.variable_block_size != 0 ? 16 : 0) + 16); + + if (do_midside) + { + uint bitsBest = AudioSamples.UINT32_MAX; + ChannelMode modeBest = ChannelMode.LeftRight; + + if (bitsBest > frame.subframes[2].best.size + frame.subframes[3].best.size) + { + bitsBest = frame.subframes[2].best.size + frame.subframes[3].best.size; + modeBest = ChannelMode.MidSide; + } + if (bitsBest > frame.subframes[3].best.size + frame.subframes[1].best.size) + { + bitsBest = frame.subframes[3].best.size + frame.subframes[1].best.size; + modeBest = ChannelMode.RightSide; + } + if (bitsBest > frame.subframes[3].best.size + frame.subframes[0].best.size) + { + bitsBest = frame.subframes[3].best.size + frame.subframes[0].best.size; + modeBest = ChannelMode.LeftSide; + } + if (bitsBest > frame.subframes[0].best.size + frame.subframes[1].best.size) + { + bitsBest = frame.subframes[0].best.size + frame.subframes[1].best.size; + modeBest = ChannelMode.LeftRight; + } + frame.ch_mode = modeBest; + return total + bitsBest; + } + + for (int ch = 0; ch < channels; ch++) + total += frame.subframes[ch].best.size; + return total; + } + + unsafe delegate void window_function(float* window, int size); + + unsafe void calculate_window(float* window, window_function func, WindowFunction flag) + { + if ((eparams.window_function & flag) == 0 || _windowcount == lpc.MAX_LPC_WINDOWS) + return; + int sz = _windowsize; + float* pos = window + _windowcount * FlaCudaWriter.MAX_BLOCKSIZE * 2; + do + { + func(pos, sz); + if ((sz & 1) != 0) + break; + pos += sz; + sz >>= 1; + } while (sz >= 32); + _windowcount++; + } + + unsafe int encode_frame(out int size) + { + fixed (int* s = samplesBuffer, r = residualBuffer) + fixed (float* window = windowBuffer) + { + frame.InitSize(eparams.block_size, eparams.variable_block_size != 0); + + if (frame.blocksize != _windowsize && frame.blocksize > 4) + { + _windowsize = frame.blocksize; + _windowcount = 0; + calculate_window(window, lpc.window_welch, WindowFunction.Welch); + calculate_window(window, lpc.window_tukey, WindowFunction.Tukey); + calculate_window(window, lpc.window_hann, WindowFunction.Hann); + calculate_window(window, lpc.window_flattop, WindowFunction.Flattop); + calculate_window(window, lpc.window_bartlett, WindowFunction.Bartlett); + if (_windowcount == 0) + throw new Exception("invalid windowfunction"); + cuda.CopyHostToDevice(cudaWindow, windowBuffer); + } + + if (channels != 2 || frame.blocksize <= 32 || eparams.stereo_method == StereoMethod.Independent) + { + frame.window_buffer = window; + frame.current.residual = r + channels * FlaCudaWriter.MAX_BLOCKSIZE; + frame.ch_mode = channels != 2 ? 
ChannelMode.NotStereo : ChannelMode.LeftRight; + for (int ch = 0; ch < channels; ch++) + frame.subframes[ch].Init(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, + bits_per_sample, get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize)); + + for (int ch = 0; ch < channels; ch++) + encode_residual_onepass(frame, ch); + } + else + { + channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE, s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize); + frame.window_buffer = window; + frame.current.residual = r + 4 * FlaCudaWriter.MAX_BLOCKSIZE; + for (int ch = 0; ch < 4; ch++) + frame.subframes[ch].Init(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, + bits_per_sample + (ch == 3 ? 1U : 0U), get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize)); + + int orders = 8; + while (orders < eparams.max_prediction_order + 1 && orders < 32) + orders <<= 1; + int threads = 512; + int threads_x = orders; + int threads_y = threads / threads_x; + int blocks_y = ((threads_x - 1) * (threads_y - 1)) / threads_y; + int blocks = (frame.blocksize + blocks_y * threads_y - 1) / (blocks_y * threads_y); + cuda.CopyHostToDevice(cudaSamples, (IntPtr)s, (uint)FlaCudaWriter.MAX_BLOCKSIZE * 4 * sizeof(int)); + cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaSamples.Pointer); + cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaWindow.Pointer); + cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaAutocor.Pointer); + cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3, (uint)frame.blocksize); + cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint), (uint)FlaCudaWriter.MAX_BLOCKSIZE); + cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint) * 2, (uint)(blocks_y*threads_y)); + cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 3) + sizeof(uint) * 3); + cuda.SetFunctionBlockShape(cudaComputeAutocor, threads_x, threads_y, 1); + cuda.SetFunctionSharedSize(cudaComputeAutocor, (uint)(sizeof(float) * 1090));//545)); + cuda.Launch(cudaComputeAutocor, blocks, 4 * _windowcount); + cuda.CopyDeviceToHost(cudaAutocor, autocorBuffer); + //cuda.CopyDeviceToHost(cudaAutocor, autocorBuffer3int); + + int* shift = stackalloc int[lpc.MAX_LPC_ORDER * 4 * _windowcount]; + int* coefs = stackalloc int[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount]; + uint cbits = get_precision(frame.blocksize) + 1; + AudioSamples.MemSet(coefs, 0, lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount); + + for (int ch = 0; ch < 4; ch++) + for (int iWindow = 0; iWindow < _windowcount; iWindow++) + { + double* ac = stackalloc double[lpc.MAX_LPC_ORDER + 1]; + for (int i = 0; i < orders; i++) + { + ac[i] = 0; + for (int i_block = 0; i_block < blocks; i_block++) + ac[i] += autocorBuffer[orders * (i_block + blocks * (ch + 4 * iWindow)) + i]; + } + //double* ac = stackalloc double[33]; + //for (int i = 0; i < 33; i++) + // ac[i] = autocorBuffer3int[(33 * (ch + 4 * iWindow) + i) * 3] * (double)(1 << 18) + + // autocorBuffer3int[(33 * (ch + 4 * iWindow) + i) * 3 + 1] * (double)(1 << 9) + + // autocorBuffer3int[(33 * (ch + 4 * iWindow) + i) * 3 + 2]; + //fixed (float *ac = &autocorBuffer[orders * (ch + 4 * iWindow)]) + frame.subframes[ch].lpc_ctx[iWindow].ComputeReflection(orders - 1, ac); + float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER]; + frame.subframes[ch].lpc_ctx[iWindow].ComputeLPC(lpcs); + for (int order = 0; order < orders - 1; order++) + 
lpc.quantize_lpc_coefs(lpcs + order * lpc.MAX_LPC_ORDER, + order + 1, cbits, coefs + order * lpc.MAX_LPC_ORDER + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER, + out shift[order + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER], 15, 0); + } + + int max_order = Math.Min(orders - 1, eparams.max_prediction_order); + + cuda.CopyHostToDevice(cudaCoeffs, (IntPtr)coefs, (uint)(sizeof(int) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount)); + cuda.CopyHostToDevice(cudaShifts, (IntPtr)shift, (uint)(sizeof(int) * lpc.MAX_LPC_ORDER* 4 * _windowcount)); + //if (frame_count == 0) + { + cuda.SetParameter(cudaEncodeResidual, 0, (uint)cudaAutocor.Pointer); + cuda.SetParameter(cudaEncodeResidual, IntPtr.Size, (uint)cudaSamples.Pointer); + cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 2, (uint)cudaCoeffs.Pointer); + cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 3, (uint)cudaShifts.Pointer); + cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 4, (uint)frame.blocksize); + cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 4 + sizeof(uint), (uint)FlaCudaWriter.MAX_BLOCKSIZE); + cuda.SetParameterSize(cudaEncodeResidual, (uint)(IntPtr.Size * 4) + sizeof(uint) * 2); + cuda.SetFunctionBlockShape(cudaEncodeResidual, threads_x, 256 / threads_x, 1); + } + //cuda.SetFunctionSharedSize(cudaEncodeResidual, (uint)(sizeof(float) * 1090));//545)); + cuda.Launch(cudaEncodeResidual, max_order, 4 * _windowcount); + cuda.CopyDeviceToHost(cudaAutocor, autocorBuffer3int); + + for (int ch = 0; ch < 4; ch++) + { + frame.subframes[ch].best.size = AudioSamples.UINT32_MAX; + for (int iWindow = 0; iWindow < _windowcount; iWindow++) + { + for (int order = 1; order <= max_order; order++) + { + uint nbits = (uint)autocorBuffer3int[order - 1 + (ch + 4 * iWindow) * max_order]; + nbits += (uint)order * frame.subframes[ch].obits + 4 + 5 + (uint)order * cbits + 6; + if (frame.subframes[ch].best.size > nbits) + { + frame.subframes[ch].best.size = nbits; + frame.subframes[ch].best.order = order; + frame.subframes[ch].best.window = iWindow; + frame.subframes[ch].best.type = SubframeType.LPC; + } + } + //uint[] sums_buf = new uint[Flake.MAX_PARTITION_ORDER * Flake.MAX_PARTITIONS]; + //fixed (uint* sums = sums_buf) + // for (int order = 1; order <= max_order; order++) + //{ + // //uint nbits; + // //find_optimal_rice_param(2*(uint)autocorBuffer3int[order - 1 + (ch + 4 * iWindow) * max_order], (uint)frame.blocksize, out nbits); + // //uint nbits = (uint)autocorBuffer3int[order - 1 + (ch + 4 * iWindow) * max_order]; + // //nbits += (uint)order * frame.subframes[ch].obits + 4 + 5 + (uint)order * cbits + 6; + // for (int ip = 0; ip < 64; ip++) + // sums[6 * Flake.MAX_PARTITIONS + ip] = (uint)autocorBuffer3int[64 * (order - 1 + (ch + 4 * iWindow) * max_order) + ip]; + // for (int ip = 5; ip >= 0; ip--) + // { + // int parts = (1 << ip); + // for (int j = 0; j < parts; j++) + // { + // sums[ip * Flake.MAX_PARTITIONS + j] = + // sums[(ip + 1) * Flake.MAX_PARTITIONS + 2 * j] + + // sums[(ip + 1) * Flake.MAX_PARTITIONS + 2 * j + 1]; + // } + // } + // for (int ip = 0; ip <= get_max_p_order(Math.Min(6, eparams.max_partition_order), frame.blocksize, order); ip++) + // { + // uint nbits = calc_optimal_rice_params(ref frame.current.rc, ip, sums + ip * Flake.MAX_PARTITIONS, (uint)frame.blocksize, (uint)order); + // nbits += (uint)order * frame.subframes[ch].obits + 4 + 5 + (uint)order * cbits + 6; + // if (frame.subframes[ch].best.size > nbits) + // { + // frame.subframes[ch].best.size = nbits; + // frame.subframes[ch].best.order = 
order; + // frame.subframes[ch].best.window = iWindow; + // frame.subframes[ch].best.type = SubframeType.LPC; + // } + // } + //} + } + } + + uint fs = measure_frame_size(frame, true); + frame.ChooseSubframes(); + for (int ch = 0; ch < channels; ch++) + { + frame.subframes[ch].best.size = AudioSamples.UINT32_MAX; + encode_selected_residual(frame, ch, eparams.prediction_type, eparams.order_method, + frame.subframes[ch].best.window, frame.subframes[ch].best.order); + } + } + + BitWriter bitwriter = new BitWriter(frame_buffer, 0, max_frame_size); + + output_frame_header(frame, bitwriter); + output_subframes(frame, bitwriter); + output_frame_footer(bitwriter); + + if (frame_buffer != null) + { + if (eparams.variable_block_size > 0) + frame_count += frame.blocksize; + else + frame_count++; + } + size = frame.blocksize; + return bitwriter.Length; + } + } + + unsafe int output_frame() + { + if (verify != null) + { + fixed (int* s = verifyBuffer, r = samplesBuffer) + for (int ch = 0; ch < channels; ch++) + AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size); + } + + int fs, bs; + //if (0 != eparams.variable_block_size && 0 == (eparams.block_size & 7) && eparams.block_size >= 128) + // fs = encode_frame_vbs(); + //else + fs = encode_frame(out bs); + + if (seek_table != null && _IO.CanSeek) + { + for (int sp = 0; sp < seek_table.Length; sp++) + { + if (seek_table[sp].framesize != 0) + continue; + if (seek_table[sp].number > (ulong)_position + (ulong)bs) + break; + if (seek_table[sp].number >= (ulong)_position) + { + seek_table[sp].number = (ulong)_position; + seek_table[sp].offset = (ulong)(_IO.Position - first_frame_offset); + seek_table[sp].framesize = (uint)bs; + } + } + } + + _position += bs; + _IO.Write(frame_buffer, 0, fs); + _totalSize += fs; + + if (verify != null) + { + int decoded = verify.DecodeFrame(frame_buffer, 0, fs); + if (decoded != fs || verify.Remaining != (ulong)bs) + throw new Exception("validation failed!"); + fixed (int* s = verifyBuffer, r = verify.Samples) + { + for (int ch = 0; ch < channels; ch++) + if (AudioSamples.MemCmp(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, bs)) + throw new Exception("validation failed!"); + } + } + + if (bs < eparams.block_size) + { + fixed (int* s = samplesBuffer) + for (int ch = 0; ch < channels; ch++) + AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, s + bs + ch * FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size - bs); + } + + samplesInBuffer -= bs; + + return bs; + } + + public void Write(int[,] buff, int pos, int sampleCount) + { + if (!inited) + { + cuda = new CUDA(0, true); + cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin")); + cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor"); + cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual"); + //cudaAutocor = cuda.Allocate((uint)(sizeof(int) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 3); + cudaAutocor = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 64); + cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels))); + cudaCoeffs = cuda.Allocate((uint)(sizeof(int) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS)); + cudaShifts = cuda.Allocate((uint)(sizeof(int) * lpc.MAX_LPC_ORDER * (channels == 2 ? 
4 : channels) * lpc.MAX_LPC_WINDOWS)); + cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS); + if (_IO == null) + _IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read); + int header_size = flake_encode_init(); + _IO.Write(header, 0, header_size); + if (_IO.CanSeek) + first_frame_offset = _IO.Position; + inited = true; + } + + int len = sampleCount; + while (len > 0) + { + int block = Math.Min(len, eparams.block_size - samplesInBuffer); + + copy_samples(buff, pos, block); + + if (md5 != null) + { + AudioSamples.FLACSamplesToBytes(buff, pos, frame_buffer, 0, block, channels, (int)bits_per_sample); + md5.TransformBlock(frame_buffer, 0, block * channels * ((int)bits_per_sample >> 3), null, 0); + } + + len -= block; + pos += block; + + while (samplesInBuffer >= eparams.block_size) + output_frame(); + } + } + + public string Path { get { return _path; } } + + string vendor_string = "Flake#0.1"; + + int select_blocksize(int samplerate, int time_ms) + { + int blocksize = Flake.flac_blocksizes[1]; + int target = (samplerate * time_ms) / 1000; + if (eparams.variable_block_size > 0) + { + blocksize = 1024; + while (target >= blocksize) + blocksize <<= 1; + return blocksize >> 1; + } + + for (int i = 0; i < Flake.flac_blocksizes.Length; i++) + if (target >= Flake.flac_blocksizes[i] && Flake.flac_blocksizes[i] > blocksize) + { + blocksize = Flake.flac_blocksizes[i]; + } + return blocksize; + } + + void write_streaminfo(byte[] header, int pos, int last) + { + Array.Clear(header, pos, 38); + BitWriter bitwriter = new BitWriter(header, pos, 38); + + // metadata header + bitwriter.writebits(1, last); + bitwriter.writebits(7, (int)MetadataType.StreamInfo); + bitwriter.writebits(24, 34); + + if (eparams.variable_block_size > 0) + bitwriter.writebits(16, 0); + else + bitwriter.writebits(16, eparams.block_size); + + bitwriter.writebits(16, eparams.block_size); + bitwriter.writebits(24, 0); + bitwriter.writebits(24, max_frame_size); + bitwriter.writebits(20, sample_rate); + bitwriter.writebits(3, channels - 1); + bitwriter.writebits(5, bits_per_sample - 1); + + // total samples + if (sample_count > 0) + { + bitwriter.writebits(4, 0); + bitwriter.writebits(32, sample_count); + } + else + { + bitwriter.writebits(4, 0); + bitwriter.writebits(32, 0); + } + bitwriter.flush(); + } + + /** + * Write vorbis comment metadata block to byte array. + * Just writes the vendor string for now. 
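+	 * Layout: 4-byte metadata block header, 32-bit little-endian vendor string length,
+	 * the vendor string itself, then a 32-bit user comment count of zero.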
+ */ + int write_vorbis_comment(byte[] comment, int pos, int last) + { + BitWriter bitwriter = new BitWriter(comment, pos, 4); + Encoding enc = new ASCIIEncoding(); + int vendor_len = enc.GetBytes(vendor_string, 0, vendor_string.Length, comment, pos + 8); + + // metadata header + bitwriter.writebits(1, last); + bitwriter.writebits(7, (int)MetadataType.VorbisComment); + bitwriter.writebits(24, vendor_len + 8); + + comment[pos + 4] = (byte)(vendor_len & 0xFF); + comment[pos + 5] = (byte)((vendor_len >> 8) & 0xFF); + comment[pos + 6] = (byte)((vendor_len >> 16) & 0xFF); + comment[pos + 7] = (byte)((vendor_len >> 24) & 0xFF); + comment[pos + 8 + vendor_len] = 0; + comment[pos + 9 + vendor_len] = 0; + comment[pos + 10 + vendor_len] = 0; + comment[pos + 11 + vendor_len] = 0; + bitwriter.flush(); + return vendor_len + 12; + } + + int write_seekpoints(byte[] header, int pos, int last) + { + seek_table_offset = pos + 4; + + BitWriter bitwriter = new BitWriter(header, pos, 4 + 18 * seek_table.Length); + + // metadata header + bitwriter.writebits(1, last); + bitwriter.writebits(7, (int)MetadataType.Seektable); + bitwriter.writebits(24, 18 * seek_table.Length); + for (int i = 0; i < seek_table.Length; i++) + { + bitwriter.writebits64(Flake.FLAC__STREAM_METADATA_SEEKPOINT_SAMPLE_NUMBER_LEN, seek_table[i].number); + bitwriter.writebits64(Flake.FLAC__STREAM_METADATA_SEEKPOINT_STREAM_OFFSET_LEN, seek_table[i].offset); + bitwriter.writebits(Flake.FLAC__STREAM_METADATA_SEEKPOINT_FRAME_SAMPLES_LEN, seek_table[i].framesize); + } + bitwriter.flush(); + return 4 + 18 * seek_table.Length; + } + + /** + * Write padding metadata block to byte array. + */ + int + write_padding(byte[] padding, int pos, int last, int padlen) + { + BitWriter bitwriter = new BitWriter(padding, pos, 4); + + // metadata header + bitwriter.writebits(1, last); + bitwriter.writebits(7, (int)MetadataType.Padding); + bitwriter.writebits(24, padlen); + + return padlen + 4; + } + + int write_headers() + { + int header_size = 0; + int last = 0; + + // stream marker + header[0] = 0x66; + header[1] = 0x4C; + header[2] = 0x61; + header[3] = 0x43; + header_size += 4; + + // streaminfo + write_streaminfo(header, header_size, last); + header_size += 38; + + // seek table + if (_IO.CanSeek && seek_table != null) + header_size += write_seekpoints(header, header_size, last); + + // vorbis comment + if (eparams.padding_size == 0) last = 1; + header_size += write_vorbis_comment(header, header_size, last); + + // padding + if (eparams.padding_size > 0) + { + last = 1; + header_size += write_padding(header, header_size, last, eparams.padding_size); + } + + return header_size; + } + + int flake_encode_init() + { + int i, header_len; + + //if(flake_validate_params(s) < 0) + + ch_code = channels - 1; + + // find samplerate in table + for (i = 4; i < 12; i++) + { + if (sample_rate == Flake.flac_samplerates[i]) + { + sr_code0 = i; + break; + } + } + + // if not in table, samplerate is non-standard + if (i == 12) + throw new Exception("non-standard samplerate"); + + for (i = 1; i < 8; i++) + { + if (bits_per_sample == Flake.flac_bitdepths[i]) + { + bps_code = i; + break; + } + } + if (i == 8) + throw new Exception("non-standard bps"); + // FIXME: For now, only 16-bit encoding is supported + if (bits_per_sample != 16) + throw new Exception("non-standard bps"); + + if (_blocksize == 0) + { + if (eparams.block_size == 0) + eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms); + _blocksize = eparams.block_size; + } + else + eparams.block_size = 
_blocksize; + + // set maximum encoded frame size (if larger, re-encodes in verbatim mode) + if (channels == 2) + max_frame_size = 16 + ((eparams.block_size * (int)(bits_per_sample + bits_per_sample + 1) + 7) >> 3); + else + max_frame_size = 16 + ((eparams.block_size * channels * (int)bits_per_sample + 7) >> 3); + + if (_IO.CanSeek && eparams.do_seektable) + { + int seek_points_distance = sample_rate * 10; + int num_seek_points = 1 + sample_count / seek_points_distance; // 1 seek point per 10 seconds + if (sample_count % seek_points_distance == 0) + num_seek_points--; + seek_table = new SeekPoint[num_seek_points]; + for (int sp = 0; sp < num_seek_points; sp++) + { + seek_table[sp].framesize = 0; + seek_table[sp].offset = 0; + seek_table[sp].number = (ulong)(sp * seek_points_distance); + } + } + + // output header bytes + header = new byte[eparams.padding_size + 1024 + (seek_table == null ? 0 : seek_table.Length * 18)]; + header_len = write_headers(); + + // initialize CRC & MD5 + if (_IO.CanSeek && eparams.do_md5) + md5 = new MD5CryptoServiceProvider(); + + if (eparams.do_verify) + { + verify = new FlakeReader(channels, bits_per_sample); + verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channels]; + } + + frame_buffer = new byte[max_frame_size]; + + return header_len; + } + } + + struct FlakeEncodeParams + { + // compression quality + // set by user prior to calling flake_encode_init + // standard values are 0 to 8 + // 0 is lower compression, faster encoding + // 8 is higher compression, slower encoding + // extended values 9 to 12 are slower and/or use + // higher prediction orders + public int compression; + + // prediction order selection method + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 5 + // 0 = use maximum order only + // 1 = use estimation + // 2 = 2-level + // 3 = 4-level + // 4 = 8-level + // 5 = full search + // 6 = log search + public OrderMethod order_method; + + + // stereo decorrelation method + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 2 + // 0 = independent L+R channels + // 1 = mid-side encoding + public StereoMethod stereo_method; + + // block size in samples + // set by the user prior to calling flake_encode_init + // if set to 0, a block size is chosen based on block_time_ms + // can also be changed by user before encoding a frame + public int block_size; + + // block time in milliseconds + // set by the user prior to calling flake_encode_init + // used to calculate block_size based on sample rate + // can also be changed by user before encoding a frame + public int block_time_ms; + + // padding size in bytes + // set by the user prior to calling flake_encode_init + // if set to less than 0, defaults to 4096 + public int padding_size; + + // minimum LPC order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 1 to 32 + public int min_prediction_order; + + // maximum LPC order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 1 to 32 + public int max_prediction_order; + + // Number of LPC orders to try (for estimate mode) + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. 
+ // valid values are 1 to 32 + public int estimation_depth; + + // minimum fixed prediction order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 4 + public int min_fixed_order; + + // maximum fixed prediction order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 4 + public int max_fixed_order; + + // type of linear prediction + // set by user prior to calling flake_encode_init + public PredictionType prediction_type; + + // minimum partition order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 8 + public int min_partition_order; + + // maximum partition order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 8 + public int max_partition_order; + + // whether to use variable block sizes + // set by user prior to calling flake_encode_init + // 0 = fixed block size + // 1 = variable block size + public int variable_block_size; + + // whether to try various lpc_precisions + // 0 - use only one precision + // 1 - try two precisions + public int lpc_max_precision_search; + + public int lpc_min_precision_search; + + public WindowFunction window_function; + + public bool do_md5; + public bool do_verify; + public bool do_seektable; + + public int flake_set_defaults(int lvl) + { + compression = lvl; + + if ((lvl < 0 || lvl > 12) && (lvl != 99)) + { + return -1; + } + + // default to level 5 params + window_function = WindowFunction.Flattop | WindowFunction.Tukey; + order_method = OrderMethod.Estimate; + stereo_method = StereoMethod.Evaluate; + block_size = 0; + block_time_ms = 105; + prediction_type = PredictionType.Search; + min_prediction_order = 1; + max_prediction_order = 12; + estimation_depth = 1; + min_fixed_order = 2; + max_fixed_order = 2; + min_partition_order = 0; + max_partition_order = 6; + variable_block_size = 0; + lpc_min_precision_search = 1; + lpc_max_precision_search = 1; + do_md5 = true; + do_verify = false; + do_seektable = true; + + // differences from level 7 + switch (lvl) + { + case 0: + block_time_ms = 53; + prediction_type = PredictionType.Fixed; + stereo_method = StereoMethod.Independent; + max_partition_order = 4; + break; + case 1: + prediction_type = PredictionType.Levinson; + stereo_method = StereoMethod.Independent; + window_function = WindowFunction.Bartlett; + max_prediction_order = 8; + max_partition_order = 4; + break; + case 2: + stereo_method = StereoMethod.Independent; + window_function = WindowFunction.Bartlett; + max_partition_order = 4; + break; + case 3: + stereo_method = StereoMethod.Estimate; + window_function = WindowFunction.Bartlett; + max_prediction_order = 8; + break; + case 4: + stereo_method = StereoMethod.Estimate; + window_function = WindowFunction.Bartlett; + break; + case 5: + window_function = WindowFunction.Bartlett; + break; + case 6: + stereo_method = StereoMethod.Estimate; + break; + case 7: + break; + case 8: + estimation_depth = 3; + min_fixed_order = 0; + max_fixed_order = 4; + lpc_max_precision_search = 2; + break; + case 9: + window_function = WindowFunction.Bartlett; + max_prediction_order = 32; + break; + case 10: + min_fixed_order = 0; + max_fixed_order = 4; + max_prediction_order = 32; + //lpc_max_precision_search = 2; + break; + case 11: + min_fixed_order = 
0; + max_fixed_order = 4; + max_prediction_order = 32; + estimation_depth = 5; + //lpc_max_precision_search = 2; + variable_block_size = 4; + break; + } + + return 0; + } + }} diff --git a/CUETools.FlaCuda/Properties/AssemblyInfo.cs b/CUETools.FlaCuda/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..da44f74 --- /dev/null +++ b/CUETools.FlaCuda/Properties/AssemblyInfo.cs @@ -0,0 +1,35 @@ +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("CUETools.FlaCuda")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("Microsoft")] +[assembly: AssemblyProduct("CUETools.FlaCuda")] +[assembly: AssemblyCopyright("Copyright © Microsoft 2009")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. +[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("b28ffece-6c89-426b-b227-e647b435cc3d")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Revision and Build Numbers +// by using the '*' as shown below: +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu new file mode 100644 index 0000000..6cc5d87 --- /dev/null +++ b/CUETools.FlaCuda/flacuda.cu @@ -0,0 +1,369 @@ +/* + * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. 
Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +#ifndef _FLACUDA_KERNEL_H_ +#define _FLACUDA_KERNEL_H_ + +extern "C" __global__ void cudaComputeAutocor(const int * samples, const float * window, float* output, int frameSize, int frameOffset, int blocks) +{ + extern __shared__ float fshared[]; + float * const matrix = fshared + 513;//257; + const int iWin = blockIdx.y >> 2; + const int iCh = blockIdx.y & 3; + const int smpBase = iCh * frameOffset; + const int winBase = iWin * 2 * frameOffset; + const int pos = blockIdx.x * blocks; + + // fetch blockDim.x*blockDim.y samples + int tid = threadIdx.x + threadIdx.y * blockDim.x; + fshared[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] * window[winBase + pos + tid] : 0.0f; + __syncthreads(); + + float s = 0.0f; + for (int i = 0; i < blocks; i += blockDim.y) + s += fshared[i + threadIdx.y] * fshared[i + threadIdx.y + threadIdx.x]; + matrix[tid + threadIdx.y] = s; + __syncthreads(); + + // reduction in shared mem + for(unsigned int s=blockDim.y/2; s>1; s>>=1) + { + if (threadIdx.y < s) + matrix[tid + threadIdx.y] += matrix[threadIdx.x + (s + threadIdx.y) * (1 + blockDim.x)]; + __syncthreads(); + } + + // return results + if (threadIdx.y == 0) + output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x] = matrix[threadIdx.x] + matrix[threadIdx.x + 1 + blockDim.x]; +} + +#ifdef __DEVICE_EMULATION__ +#define EMUSYNC __syncthreads() +#else +#define EMUSYNC +#endif + +extern "C" __global__ void cudaEncodeResidual( + int*output, + int*samples, + int*allcoefs, + int*shifts, + int frameSize, + int frameOffset) +{ + __shared__ struct { + int data[256]; + int residual[256]; + int coefs[32]; + int shift; + } shared; + const int smpBase = (blockIdx.y & 3) * frameOffset; + const int residualOrder = blockIdx.x; + const int tid = threadIdx.x + __mul24(threadIdx.y, blockDim.x); + const int step = __mul24(blockDim.x, blockDim.y - 1); + int total = 0; + + shared.residual[tid] = 0; + if (threadIdx.y == 0) shared.coefs[threadIdx.x] = threadIdx.x <= residualOrder ? allcoefs[threadIdx.x + residualOrder * 32 + blockIdx.y * 32 * 32] : 0; + if (tid == 0) shared.shift = shifts[32 * blockIdx.y + residualOrder]; + __syncthreads(); + + for(int pos = 0; pos < frameSize - residualOrder - 1; pos += step) + { + // fetch blockDim.x * blockDim.y samples + shared.data[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] : 0; + __syncthreads(); + + long sum = 0; + for (int c = 0; c <= residualOrder; c++) + sum += shared.data[tid + c] * shared.coefs[residualOrder - c]; + int res = shared.data[tid + residualOrder + 1] - (sum >> shared.shift); + __syncthreads(); + + int limit = min(frameSize - pos - residualOrder - 1, step); + shared.residual[tid] = tid < limit ? 
(2 * res) ^ (res >> 31) : 0; + + __syncthreads(); + // reduction in shared mem + for(unsigned int s=blockDim.x/2; s >= blockDim.y; s>>=1) + { + if (threadIdx.x < s) + shared.residual[tid] += shared.residual[tid + s]; + __syncthreads(); + } + for(unsigned int s=blockDim.y/2; s >= blockDim.x; s>>=1) + { + if (threadIdx.y < s) + shared.residual[tid] += shared.residual[tid + s * blockDim.x]; + __syncthreads(); + } + for(unsigned int s=min(blockDim.x,blockDim.y)/2; s > 0; s>>=1) + { + if (threadIdx.x < s && threadIdx.y < s) + shared.residual[tid] += shared.residual[tid + s] + shared.residual[tid + s * blockDim.x] + shared.residual[tid + s + s * blockDim.x]; + EMUSYNC; + } + +#ifndef __DEVICE_EMULATION__ + if (tid < 16) // Max rice param is really 15 +#endif + { + shared.data[tid] = limit * (tid + 1) + ((shared.residual[0] - (limit >> 1)) >> tid); EMUSYNC; + //if (tid == 16) shared.rice[15] = 0x7fffffff; +#ifndef __DEVICE_EMULATION__ + if (threadIdx.x < 8) +#endif + { + shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 8]); EMUSYNC; + shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 4]); EMUSYNC; + shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 2]); EMUSYNC; + } + } + total += min(shared.data[0], shared.data[1]); + //total += shared.residual[0]; + } + + if (tid == 0) + output[blockIdx.x + blockIdx.y * gridDim.x] = total; +#ifdef __DEVICE_EMULATION__ + if (tid == 0) + printf("%d,%d:2:%d\n", blockIdx.x, blockIdx.y, total); +#endif +} + +#if 0 +extern "C" __global__ void cudaComputeAutocor3int(const int * samples, const float * window, int* output, int frameSize, int frameOffset, int channels) +{ + extern __shared__ short shared[]; + int *ishared = (int*)shared; + const int lag = blockIdx.x; + + // fetch signal, multiply by window and split high bits/low bits + const int iWin = blockIdx.y >> 2; // blockIdx.y/channels; + const int iCh = (blockIdx.y - iWin * channels); + const int smpBase = iCh * frameOffset; + const int winBase = iWin * 2 * frameOffset; + + for(int i = threadIdx.x; i < frameSize; i += blockDim.x) + { + float val = samples[smpBase + i] * window[winBase + i]; + int ival = __float2int_rz(fabs(val)); + int sg = (1 - 2 *signbit(val)); + //int ival = (int) val; + //int sg = ival < 0 ? -1 : 1; + ival = ival < 0 ? 
-ival : ival; + shared[i*2] = __mul24(sg, (ival >> 9)); + shared[i*2+1] = __mul24(sg, (ival & ((1 << 9) - 1))); + } + __syncthreads(); + + // correlation + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + for (int i = threadIdx.x; i < frameSize - lag; i += blockDim.x) + { + sum1 += __mul24(shared[2*i], shared[2*(i+lag)]); + sum2 += __mul24(shared[2*i+1], shared[2*(i+lag)]); + sum2 += __mul24(shared[2*i], shared[2*(i+lag)+1]); + sum3 += __mul24(shared[2*i+1], shared[2*(i+lag)+1]); + } + __syncthreads(); + ishared[threadIdx.x] = sum1; + ishared[threadIdx.x + blockDim.x] = sum2; + ishared[threadIdx.x + blockDim.x * 2] = sum3; + __syncthreads(); + + // reduction in shared mem + for(unsigned int s=blockDim.x/2; s>0; s>>=1) + { + if (threadIdx.x < s) + { + ishared[threadIdx.x] += ishared[threadIdx.x + s]; + ishared[threadIdx.x + blockDim.x] += ishared[threadIdx.x + s + blockDim.x]; + ishared[threadIdx.x + blockDim.x * 2] += ishared[threadIdx.x + s + blockDim.x * 2]; + } + __syncthreads(); + } + + if (threadIdx.x == 0) { + output[(blockIdx.x + blockIdx.y * gridDim.x) * 3] = ishared[threadIdx.x]; + output[(blockIdx.x + blockIdx.y * gridDim.x) * 3 + 1] = ishared[threadIdx.x + blockDim.x]; + output[(blockIdx.x + blockIdx.y * gridDim.x) * 3 + 2] = ishared[threadIdx.x + blockDim.x * 2]; + } +} + +__device__ float Bartlett(int i, int blockSize) +{ + float n = fminf(i, blockSize - i); + float k = 2.0f / blockSize * (blockSize / 2.0f - n); + k = 1.0f - k * k; + return k*k; +} + +extern "C" __global__ void cudaComputeAutocorPart(const int * samples, const float * window, float* output, int frameSize, int frameOffset, int iCh, int iWin) +{ + extern __shared__ float fshared[]; + // fetch signal, multiply by window + //const int iWin = blockIdx.y; + //const int iCh = blockIdx.x; + const int smpBase = iCh * frameOffset; + const int winBase = iWin * 2 * frameOffset; + float * matrix = fshared + 129; + + // initialize results matrix + matrix[threadIdx.x + threadIdx.y * (blockDim.x + 1)] = 0.0f; + + // prefetch blockDim.x + blockDim.y samples + int tid = threadIdx.x + threadIdx.y * blockDim.x; + if (tid < blockDim.x + blockDim.y) + { + if (blockIdx.x * blockDim.y + tid < frameSize) + fshared[tid] = samples[smpBase + blockIdx.x * blockDim.y + tid] * window[winBase + blockIdx.x * blockDim.y + tid]; + else + fshared[tid] = 0.0f; + } + + __syncthreads(); + + matrix[threadIdx.x + threadIdx.y * (1 + blockDim.x)] += fshared[threadIdx.y] * fshared[threadIdx.y + threadIdx.x]; + + __syncthreads(); + + // reduction in shared mem + for(unsigned int s=blockDim.y/2; s>0; s>>=1) + { + if (threadIdx.y < s) + matrix[threadIdx.x + threadIdx.y * (1 + blockDim.x)] += matrix[threadIdx.x + (s + threadIdx.y) * (1 + blockDim.x)]; + __syncthreads(); + } + + // return results + if (threadIdx.y == 0) + output[blockIdx.x * blockDim.x + threadIdx.x] = matrix[threadIdx.x]; +} + +extern "C" __global__ void cudaComputeAutocor2(const int * samples, const float * window, float* output, int frameSize, int frameOffset) +{ + extern __shared__ float fshared[]; + // fetch signal, multiply by window + const int iWin = blockIdx.y; + const int iCh = blockIdx.x; + const int smpBase = iCh * frameOffset; + const int winBase = iWin * 2 * frameOffset; + + for(int i = threadIdx.x + threadIdx.y * blockDim.x; i < frameSize; i += blockDim.x * blockDim.y) + fshared[i] = samples[smpBase + i] * window[winBase + i]; + + __syncthreads(); + + const int lag = threadIdx.y; + + // correlation + float sum = 0.0f; + for (int i = threadIdx.x; i < frameSize - lag; i += 
blockDim.x) + sum += fshared[i] * fshared[i+lag]; + __syncthreads(); + + fshared[threadIdx.x + threadIdx.y * blockDim.x] = sum; + + __syncthreads(); + + // reduction in shared mem + for(unsigned int s=blockDim.x/2; s>0; s>>=1) + { + if (threadIdx.x < s) + fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + s + threadIdx.y * blockDim.x]; + __syncthreads(); + } + + // if (threadIdx.x < 32) + // { + //if (blockDim.x >= 64) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 32 + threadIdx.y * blockDim.x]; + //if (blockDim.x >= 32) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 16 + threadIdx.y * blockDim.x]; + //if (blockDim.x >= 16) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 8 + threadIdx.y * blockDim.x]; + //if (blockDim.x >= 8) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 4 + threadIdx.y * blockDim.x]; + //if (blockDim.x >= 4) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 2 + threadIdx.y * blockDim.x]; + //if (blockDim.x >= 2) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 1 + threadIdx.y * blockDim.x]; + // } + + if (threadIdx.x == 0) { + output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.y + threadIdx.y] + = fshared[threadIdx.x + threadIdx.y * blockDim.x]; + } +} + +extern "C" __global__ void cudaComputeAutocorOld(const int * samples, const float * window, float* output, int frameSize, int frameOffset, int channels) +{ + extern __shared__ float fshared[]; + const int lag = blockIdx.x; + + // fetch signal, multiply by window + const int iWin = blockIdx.y >> 2; // blockIdx.y/channels; + const int iCh = (blockIdx.y - iWin * channels); + const int smpBase = iCh * frameOffset; + const int winBase = iWin * 2 * frameOffset; + + for(int i = threadIdx.x; i < frameSize; i += blockDim.x) + fshared[i] = samples[smpBase + i] * window[winBase + i]; + + __syncthreads(); + + // correlation + float sum = 0.0f; + for (int i = threadIdx.x; i < frameSize - lag; i += blockDim.x) + sum += fshared[i] * fshared[i+lag]; + __syncthreads(); + + fshared[threadIdx.x] = sum; + + __syncthreads(); + + // reduction in shared mem + for(unsigned int s=blockDim.x/2; s>0; s>>=1) + { + if (threadIdx.x < s) + fshared[threadIdx.x] += fshared[threadIdx.x + s]; + __syncthreads(); + } + + if (threadIdx.x == 0) { + output[blockIdx.x + blockIdx.y * gridDim.x] = fshared[threadIdx.x]; + } +} +#endif + +#endif // _FLACUDA_KERNEL_H_
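
Editor's note (not part of the patch): the FlakeEncodeParams struct earlier in this patch is primed by calling flake_set_defaults with a compression level (standard levels 0 to 8, extended levels 9 to 12), after which individual fields can still be overridden before encoding begins. A minimal C# sketch of that flow; the surrounding usage is an assumption for illustration, not code from the patch:

    FlakeEncodeParams eparams = new FlakeEncodeParams();
    if (eparams.flake_set_defaults(7) != 0)       // returns -1 for an unsupported level
        throw new Exception("Invalid compression level.");
    // Fields documented in the struct may still be overridden before encoding starts:
    eparams.block_size = 4096;                    // 0 means: derive from block_time_ms
    eparams.do_verify = true;                     // decode each frame back and compare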
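
Editor's note (not part of the patch): cudaComputeAutocor accumulates, per channel and per window function, the autocorrelation of the windowed signal, which the host side later feeds to LPC analysis. A scalar C# reference of the quantity being computed, with a hypothetical helper name:

    // autoc[lag] = sum over i of (s[i]*w[i]) * (s[i+lag]*w[i+lag])
    static void ComputeAutocorrelation(int[] samples, float[] window, int frameSize, int maxLag, double[] autoc)
    {
        for (int lag = 0; lag <= maxLag; lag++)
        {
            double sum = 0.0;
            for (int i = 0; i < frameSize - lag; i++)
                sum += (double)samples[i] * window[i] * samples[i + lag] * window[i + lag];
            autoc[lag] = sum;
        }
    }

The kernel computes the same sums in float, split across thread blocks: each block appears to write partial sums per lag, which are then accumulated outside the kernel.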
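
Editor's note (not part of the patch): cudaEncodeResidual estimates, for each candidate predictor order, how many bits the Rice-coded residual would cost. Each residual is folded to a non-negative value with (2 * res) ^ (res >> 31), the folded values are summed by the shared-memory reduction, and the cheapest Rice parameter k in 0..15 is chosen from the estimate n*(k+1) + ((sum - n/2) >> k). A scalar C# sketch of that estimate, with hypothetical names and assuming this reading of the kernel:

    // Zigzag-fold a signed residual to a non-negative value, as in the kernel.
    static uint Fold(int res)
    {
        return (uint)((2 * res) ^ (res >> 31));
    }

    // Approximate Rice-coded size in bits of n folded residuals summing to 'sum',
    // taking the best parameter k in 0..15 (mirrors the kernel's min-reduction).
    static long EstimateRiceBits(long sum, int n)
    {
        long best = long.MaxValue;
        for (int k = 0; k < 16; k++)
            best = Math.Min(best, (long)n * (k + 1) + ((sum - (n >> 1)) >> k));
        return best;
    }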