/**
* CUETools.FlaCuda: FLAC audio encoder using CUDA
* Copyright (c) 2009 Gregory S. Chudov
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
using System;
using System.ComponentModel;
using System.Collections.Generic;
using System.IO;
using System.Security.Cryptography;
using System.Threading;
using System.Text;
using CUETools.Codecs;
using CUETools.Codecs.FLAKE;
using GASS.CUDA;
using GASS.CUDA.Types;
namespace CUETools.Codecs.FlaCuda
{
public class FlaCudaWriterSettings
{
public FlaCudaWriterSettings() { DoVerify = false; GPUOnly = true; DoMD5 = true; }
[DefaultValue(false)]
[DisplayName("Verify")]
[SRDescription(typeof(Properties.Resources), "DoVerifyDescription")]
public bool DoVerify { get; set; }
[DefaultValue(true)]
[DisplayName("MD5")]
[SRDescription(typeof(Properties.Resources), "DoMD5Description")]
public bool DoMD5 { get; set; }
[DefaultValue(true)]
[SRDescription(typeof(Properties.Resources), "DescriptionGPUOnly")]
public bool GPUOnly { get; set; }
int cpu_threads = 1;
[DefaultValue(1)]
[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
public int CPUThreads
{
get
{
return cpu_threads;
}
set
{
if (value < 0 || value > 16)
throw new Exception("CPUThreads must be between 0..16");
cpu_threads = value;
}
}
}
[AudioEncoderClass("FlaCuda", "flac", true, "0 1 2 3 4 5 6 7 8 9 10 11", "8", 2, typeof(FlaCudaWriterSettings))]
//[AudioEncoderClass("FlaCuda nonsub", "flac", true, "9 10 11", "9", 1, typeof(FlaCudaWriterSettings))]
public class FlaCudaWriter : IAudioDest
{
Stream _IO = null;
string _path;
long _position;
// number of audio channels
// valid values are 1 to 8
int channels, ch_code;
// audio sample rate in Hz
int sample_rate, sr_code0, sr_code1;
// sample size in bits
// only 16-bit is currently supported
uint bits_per_sample;
int bps_code;
// total stream samples
// if 0, stream length is unknown
int sample_count = -1;
FlakeEncodeParams eparams;
// maximum frame size in bytes
// this can be used to allocate memory for output
int max_frame_size;
int frame_count = 0;
int frame_pos = 0;
long first_frame_offset = 0;
TimeSpan _userProcessorTime;
// header bytes
// allocated by flake_encode_init and freed by flake_encode_close
byte[] header;
float[] windowBuffer;
int samplesInBuffer = 0;
int max_frames = 0;
int _compressionLevel = 7;
int _blocksize = 0;
int _totalSize = 0;
int _windowsize = 0, _windowcount = 0;
Crc8 crc8;
Crc16 crc16;
MD5 md5;
SeekPoint[] seek_table;
int seek_table_offset = -1;
bool inited = false;
CUDA cuda;
FlaCudaTask task1;
FlaCudaTask task2;
FlaCudaTask[] cpu_tasks;
int oldest_cpu_task = 0;
CUdeviceptr cudaWindow;
bool do_lattice = false;
AudioPCMConfig _pcm;
public const int MAX_BLOCKSIZE = 4096 * 16;
internal const int maxFrames = 128;
internal const int maxResidualParts = 64; // not (MAX_BLOCKSIZE + 255) / 256!! 64 is hardcoded in cudaEstimateResidual. It's per block.
internal const int maxAutocorParts = (MAX_BLOCKSIZE + 255) / 256;
public FlaCudaWriter(string path, Stream IO, AudioPCMConfig pcm)
{
_pcm = pcm;
if (pcm.BitsPerSample != 16)
throw new Exception("Bits per sample must be 16.");
if (pcm.ChannelCount != 2)
throw new Exception("ChannelCount must be 2.");
channels = pcm.ChannelCount;
sample_rate = pcm.SampleRate;
bits_per_sample = (uint) pcm.BitsPerSample;
// flake_validate_params
_path = path;
_IO = IO;
windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * lpc.MAX_LPC_WINDOWS];
eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
eparams.padding_size = 8192;
crc8 = new Crc8();
crc16 = new Crc16();
}
public FlaCudaWriter(string path, AudioPCMConfig pcm)
: this(path, null, pcm)
{
}
public int TotalSize
{
get
{
return _totalSize;
}
}
public long Padding
{
get
{
return eparams.padding_size;
}
set
{
eparams.padding_size = value;
}
}
public int CompressionLevel
{
get
{
return _compressionLevel;
}
set
{
if (value < 0 || value > 11)
throw new Exception("unsupported compression level");
_compressionLevel = value;
eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
}
}
FlaCudaWriterSettings _settings = new FlaCudaWriterSettings();
public object Settings
{
get
{
return _settings;
}
set
{
if (value as FlaCudaWriterSettings == null)
throw new Exception("Unsupported options " + value);
_settings = value as FlaCudaWriterSettings;
eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
}
}
public bool UseLattice
{
get
{
return do_lattice;
}
set
{
do_lattice = value;
}
}
//[DllImport("kernel32.dll")]
//static extern bool GetThreadTimes(IntPtr hThread, out long lpCreationTime, out long lpExitTime, out long lpKernelTime, out long lpUserTime);
//[DllImport("kernel32.dll")]
//static extern IntPtr GetCurrentThread();
void DoClose()
{
if (inited)
{
int nFrames = samplesInBuffer / eparams.block_size;
if (nFrames > 0)
do_output_frames(nFrames);
if (samplesInBuffer > 0)
{
eparams.block_size = samplesInBuffer;
do_output_frames(1);
}
if (task2.frameCount > 0)
{
if (cpu_tasks != null)
{
for (int i = 0; i < cpu_tasks.Length; i++)
{
wait_for_cpu_task();
FlaCudaTask task = cpu_tasks[oldest_cpu_task];
oldest_cpu_task = (oldest_cpu_task + 1) % cpu_tasks.Length;
if (task.frameCount > 0)
{
write_result(task);
task.frameCount = 0;
}
}
}
cuda.SynchronizeStream(task2.stream);
process_result(task2);
write_result(task2);
task2.frameCount = 0;
}
if (_IO.CanSeek)
{
if (sample_count <= 0 && _position != 0)
{
BitWriter bitwriter = new BitWriter(header, 0, 4);
bitwriter.writebits(32, (int)_position);
bitwriter.flush();
_IO.Position = 22;
_IO.Write(header, 0, 4);
}
if (md5 != null)
{
md5.TransformFinalBlock(new byte[] { 0 }, 0, 0);
_IO.Position = 26;
_IO.Write(md5.Hash, 0, md5.Hash.Length);
}
if (seek_table != null)
{
_IO.Position = seek_table_offset;
int len = write_seekpoints(header, 0, 0);
_IO.Write(header, 4, len - 4);
}
}
_IO.Close();
cuda.Free(cudaWindow);
task1.Dispose();
task2.Dispose();
if (cpu_tasks != null)
foreach (FlaCudaTask task in cpu_tasks)
task.Dispose();
cuda.UnloadModule();
cuda.DestroyContext();
cuda.Dispose();
inited = false;
}
}
public void Close()
{
DoClose();
if (sample_count > 0 && _position != sample_count)
throw new Exception(string.Format("Samples written differs from the expected sample count. Expected {0}, got {1}.", sample_count, _position));
}
public void Delete()
{
if (inited)
{
_IO.Close();
cuda.Free(cudaWindow);
task1.Dispose();
task2.Dispose();
if (cpu_tasks != null)
foreach (FlaCudaTask task in cpu_tasks)
task.Dispose();
cuda.UnloadModule();
cuda.DestroyContext();
cuda.Dispose();
inited = false;
}
if (_path != "")
File.Delete(_path);
}
public long Position
{
get
{
return _position;
}
}
public long FinalSampleCount
{
set { sample_count = (int)value; }
}
public long BlockSize
{
set {
if (value < 256 || value > MAX_BLOCKSIZE )
throw new Exception("unsupported BlockSize value");
_blocksize = (int)value;
}
get { return _blocksize == 0 ? eparams.block_size : _blocksize; }
}
public StereoMethod StereoMethod
{
get { return eparams.do_midside ? StereoMethod.Search : StereoMethod.Independent; }
set { eparams.do_midside = value != StereoMethod.Independent; }
}
public int MinPrecisionSearch
{
get { return eparams.lpc_min_precision_search; }
set
{
if (value < 0 || value > eparams.lpc_max_precision_search)
throw new Exception("unsupported MinPrecisionSearch value");
eparams.lpc_min_precision_search = value;
}
}
public int MaxPrecisionSearch
{
get { return eparams.lpc_max_precision_search; }
set
{
if (value < eparams.lpc_min_precision_search || value >= lpc.MAX_LPC_PRECISIONS)
throw new Exception("unsupported MaxPrecisionSearch value");
eparams.lpc_max_precision_search = value;
}
}
public WindowFunction WindowFunction
{
get { return eparams.window_function; }
set { eparams.window_function = value; }
}
public bool DoSeekTable
{
get { return eparams.do_seektable; }
set { eparams.do_seektable = value; }
}
public int VBRMode
{
get { return eparams.variable_block_size; }
set { eparams.variable_block_size = value; }
}
public int OrdersPerWindow
{
get
{
return eparams.orders_per_window;
}
set
{
if (value < 0 || value > 32)
throw new Exception("invalid OrdersPerWindow " + value.ToString());
eparams.orders_per_window = value;
}
}
public int MinLPCOrder
{
get
{
return eparams.min_prediction_order;
}
set
{
if (value < 1 || value > eparams.max_prediction_order)
throw new Exception("invalid MinLPCOrder " + value.ToString());
eparams.min_prediction_order = value;
}
}
public int MaxLPCOrder
{
get
{
return eparams.max_prediction_order;
}
set
{
if (value > lpc.MAX_LPC_ORDER || value < eparams.min_prediction_order)
throw new Exception("invalid MaxLPCOrder " + value.ToString());
eparams.max_prediction_order = value;
}
}
public int MinFixedOrder
{
get
{
return eparams.min_fixed_order;
}
set
{
if (value < 0 || value > eparams.max_fixed_order)
throw new Exception("invalid MinFixedOrder " + value.ToString());
eparams.min_fixed_order = value;
}
}
public int MaxFixedOrder
{
get
{
return eparams.max_fixed_order;
}
set
{
if (value > 4 || value < eparams.min_fixed_order)
throw new Exception("invalid MaxFixedOrder " + value.ToString());
eparams.max_fixed_order = value;
}
}
public int MinPartitionOrder
{
get { return eparams.min_partition_order; }
set
{
if (value < 0 || value > eparams.max_partition_order)
throw new Exception("invalid MinPartitionOrder " + value.ToString());
eparams.min_partition_order = value;
}
}
public int MaxPartitionOrder
{
get { return eparams.max_partition_order; }
set
{
if (value > 8 || value < eparams.min_partition_order)
throw new Exception("invalid MaxPartitionOrder " + value.ToString());
eparams.max_partition_order = value;
}
}
public TimeSpan UserProcessorTime
{
get { return _userProcessorTime; }
}
public AudioPCMConfig PCM
{
get { return _pcm; }
}
unsafe void encode_residual_fixed(int* res, int* smp, int n, int order)
{
int i;
int s0, s1, s2;
switch (order)
{
case 0:
AudioSamples.MemCpy(res, smp, n);
return;
case 1:
*(res++) = s1 = *(smp++);
for (i = n - 1; i > 0; i--)
{
s0 = *(smp++);
*(res++) = s0 - s1;
s1 = s0;
}
return;
case 2:
*(res++) = s2 = *(smp++);
*(res++) = s1 = *(smp++);
for (i = n - 2; i > 0; i--)
{
s0 = *(smp++);
*(res++) = s0 - 2 * s1 + s2;
s2 = s1;
s1 = s0;
}
return;
case 3:
res[0] = smp[0];
res[1] = smp[1];
res[2] = smp[2];
for (i = 3; i < n; i++)
{
res[i] = smp[i] - 3 * smp[i - 1] + 3 * smp[i - 2] - smp[i - 3];
}
return;
case 4:
res[0] = smp[0];
res[1] = smp[1];
res[2] = smp[2];
res[3] = smp[3];
for (i = 4; i < n; i++)
{
res[i] = smp[i] - 4 * smp[i - 1] + 6 * smp[i - 2] - 4 * smp[i - 3] + smp[i - 4];
}
return;
default:
return;
}
}
static unsafe uint calc_optimal_rice_params(int porder, int* parm, uint* sums, uint n, uint pred_order)
{
uint part = (1U << porder);
uint cnt = (n >> porder) - pred_order;
int k = cnt > 0 ? Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[0] / cnt)) : 0;
uint all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k);
parm[0] = k;
cnt = (n >> porder);
for (uint i = 1; i < part; i++)
{
k = Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[i] / cnt));
all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k);
parm[i] = k;
}
return all_bits + (4 * part);
}
static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums)
{
for (int i = pmax - 1; i >= pmin; i--)
{
for (int j = 0; j < (1 << i); j++)
{
sums[i * Flake.MAX_PARTITIONS + j] =
sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j] +
sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j + 1];
}
}
}
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
{
int parts = (1 << pmax);
uint* res = data + pred_order;
uint cnt = (n >> pmax) - pred_order;
uint sum = 0;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[0] = sum;
cnt = (n >> pmax);
for (int i = 1; i < parts; i++)
{
sum = 0;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[i] = sum;
}
}
///
/// Special case when (n >> pmax) == 18
///
///
///
///
///
///
///
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
{
int parts = (1 << pmax);
uint* res = data + pred_order;
uint cnt = 18 - pred_order;
uint sum = 0;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[0] = sum;
for (int i = 1; i < parts; i++)
{
sums[i] =
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++);
}
}
///
/// Special case when (n >> pmax) == 18
///
///
///
///
///
///
///
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
{
int parts = (1 << pmax);
uint* res = data + pred_order;
uint cnt = 16 - pred_order;
uint sum = 0;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[0] = sum;
for (int i = 1; i < parts; i++)
{
sums[i] =
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++);
}
}
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order)
{
uint* udata = stackalloc uint[(int)n];
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS];
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
//assert(pmin >= 0 && pmin <= Flake.MAX_PARTITION_ORDER);
//assert(pmax >= 0 && pmax <= Flake.MAX_PARTITION_ORDER);
//assert(pmin <= pmax);
for (uint i = 0; i < n; i++)
udata[i] = (uint)((data[i] << 1) ^ (data[i] >> 31));
// sums for highest level
if ((n >> pmax) == 18)
calc_sums18(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
else if ((n >> pmax) == 16)
calc_sums16(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
else
calc_sums(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
// sums for lower levels
calc_lower_sums(pmin, pmax, sums);
uint opt_bits = AudioSamples.UINT32_MAX;
int opt_porder = pmin;
for (int i = pmin; i <= pmax; i++)
{
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order);
if (bits <= opt_bits)
{
opt_bits = bits;
opt_porder = i;
}
}
rc.porder = opt_porder;
fixed (int* rparms = rc.rparams)
AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder));
return opt_bits;
}
static int get_max_p_order(int max_porder, int n, int order)
{
int porder = Math.Min(max_porder, BitReader.log2i(n ^ (n - 1)));
if (order > 0)
porder = Math.Min(porder, BitReader.log2i(n / order));
return porder;
}
unsafe void output_frame_header(FlacFrame frame)
{
frame.writer.writebits(15, 0x7FFC);
frame.writer.writebits(1, eparams.variable_block_size > 0 ? 1 : 0);
frame.writer.writebits(4, frame.bs_code0);
frame.writer.writebits(4, sr_code0);
if (frame.ch_mode == ChannelMode.NotStereo)
frame.writer.writebits(4, ch_code);
else
frame.writer.writebits(4, (int)frame.ch_mode);
frame.writer.writebits(3, bps_code);
frame.writer.writebits(1, 0);
frame.writer.write_utf8(frame.frame_number);
// custom block size
if (frame.bs_code1 >= 0)
{
if (frame.bs_code1 < 256)
frame.writer.writebits(8, frame.bs_code1);
else
frame.writer.writebits(16, frame.bs_code1);
}
// custom sample rate
if (sr_code1 > 0)
{
if (sr_code1 < 256)
frame.writer.writebits(8, sr_code1);
else
frame.writer.writebits(16, sr_code1);
}
// CRC-8 of frame header
frame.writer.flush();
byte crc = crc8.ComputeChecksum(frame.writer.Buffer, frame.writer_offset, frame.writer.Length - frame.writer_offset);
frame.writer.writebits(8, crc);
}
unsafe void output_residual(FlacFrame frame, FlacSubframeInfo sub)
{
// rice-encoded block
frame.writer.writebits(2, 0);
// partition order
int porder = sub.best.rc.porder;
int psize = frame.blocksize >> porder;
//assert(porder >= 0);
frame.writer.writebits(4, porder);
int res_cnt = psize - sub.best.order;
// residual
int j = sub.best.order;
fixed (byte* fixbuf = frame.writer.Buffer)
for (int p = 0; p < (1 << porder); p++)
{
int k = sub.best.rc.rparams[p];
frame.writer.writebits(4, k);
if (p == 1) res_cnt = psize;
int cnt = Math.Min(res_cnt, frame.blocksize - j);
frame.writer.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt);
j += cnt;
}
}
unsafe void
output_subframe_constant(FlacFrame frame, FlacSubframeInfo sub)
{
frame.writer.writebits_signed(sub.obits, sub.samples[0]);
}
unsafe void
output_subframe_verbatim(FlacFrame frame, FlacSubframeInfo sub)
{
int n = frame.blocksize;
for (int i = 0; i < n; i++)
frame.writer.writebits_signed(sub.obits, sub.samples[i]);
// Don't use residual here, because we don't copy samples to residual for verbatim frames.
}
unsafe void
output_subframe_fixed(FlacFrame frame, FlacSubframeInfo sub)
{
// warm-up samples
for (int i = 0; i < sub.best.order; i++)
frame.writer.writebits_signed(sub.obits, sub.samples[i]);
// residual
output_residual(frame, sub);
}
unsafe void
output_subframe_lpc(FlacFrame frame, FlacSubframeInfo sub)
{
// warm-up samples
for (int i = 0; i < sub.best.order; i++)
frame.writer.writebits_signed(sub.obits, sub.samples[i]);
// LPC coefficients
frame.writer.writebits(4, sub.best.cbits - 1);
frame.writer.writebits_signed(5, sub.best.shift);
for (int i = 0; i < sub.best.order; i++)
frame.writer.writebits_signed(sub.best.cbits, sub.best.coefs[i]);
// residual
output_residual(frame, sub);
}
unsafe void output_subframes(FlacFrame frame)
{
for (int ch = 0; ch < channels; ch++)
{
FlacSubframeInfo sub = frame.subframes[ch];
// subframe header
int type_code = (int) sub.best.type;
if (sub.best.type == SubframeType.Fixed)
type_code |= sub.best.order;
if (sub.best.type == SubframeType.LPC)
type_code |= sub.best.order - 1;
frame.writer.writebits(1, 0);
frame.writer.writebits(6, type_code);
frame.writer.writebits(1, sub.wbits != 0 ? 1 : 0);
if (sub.wbits > 0)
frame.writer.writebits((int)sub.wbits, 1);
//if (frame_writer.Length >= frame_buffer.Length)
// throw new Exception("buffer overflow");
// subframe
switch (sub.best.type)
{
case SubframeType.Constant:
output_subframe_constant(frame, sub);
break;
case SubframeType.Verbatim:
output_subframe_verbatim(frame, sub);
break;
case SubframeType.Fixed:
output_subframe_fixed(frame, sub);
break;
case SubframeType.LPC:
output_subframe_lpc(frame, sub);
break;
}
//if (frame_writer.Length >= frame_buffer.Length)
// throw new Exception("buffer overflow");
}
}
void output_frame_footer(FlacFrame frame)
{
frame.writer.flush();
ushort crc = crc16.ComputeChecksum(frame.writer.Buffer, frame.writer_offset, frame.writer.Length - frame.writer_offset);
frame.writer.writebits(16, crc);
frame.writer.flush();
}
unsafe delegate void window_function(float* window, int size);
unsafe void calculate_window(float* window, window_function func, WindowFunction flag)
{
if ((eparams.window_function & flag) == 0 || _windowcount == lpc.MAX_LPC_WINDOWS)
return;
func(window + _windowcount * _windowsize, _windowsize);
//int sz = _windowsize;
//float* pos = window + _windowcount * FlaCudaWriter.MAX_BLOCKSIZE * 2;
//do
//{
// func(pos, sz);
// if ((sz & 1) != 0)
// break;
// pos += sz;
// sz >>= 1;
//} while (sz >= 32);
_windowcount++;
}
unsafe void initializeSubframeTasks(int blocksize, int channelsCount, int nFrames, FlaCudaTask task)
{
task.nResidualTasks = 0;
task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window);
task.nResidualTasksPerChannel = _windowcount * task.nTasksPerWindow + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order;
if (task.nResidualTasksPerChannel >= 4)
task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7;
task.nAutocorTasksPerChannel = _windowcount;
for (int iFrame = 0; iFrame < nFrames; iFrame++)
{
for (int ch = 0; ch < channelsCount; ch++)
{
for (int iWindow = 0; iWindow < _windowcount; iWindow++)
{
// LPC tasks
for (int order = 0; order < task.nTasksPerWindow; order++)
{
task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.LPC;
task.ResidualTasks[task.nResidualTasks].channel = ch;
task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
task.ResidualTasks[task.nResidualTasks].residualOrder = order + 1;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.nResidualTasks++;
}
}
// Constant frames
if (eparams.do_constant)
{
task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Constant;
task.ResidualTasks[task.nResidualTasks].channel = ch;
task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].residualOrder = 1;
task.ResidualTasks[task.nResidualTasks].shift = 0;
task.ResidualTasks[task.nResidualTasks].coefs[0] = 1;
task.nResidualTasks++;
}
// Fixed prediction
for (int order = eparams.min_fixed_order; order <= eparams.max_fixed_order; order++)
{
task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Fixed;
task.ResidualTasks[task.nResidualTasks].channel = ch;
task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
task.ResidualTasks[task.nResidualTasks].residualOrder = order;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].shift = 0;
switch (order)
{
case 0:
break;
case 1:
task.ResidualTasks[task.nResidualTasks].coefs[0] = 1;
break;
case 2:
task.ResidualTasks[task.nResidualTasks].coefs[1] = 2;
task.ResidualTasks[task.nResidualTasks].coefs[0] = -1;
break;
case 3:
task.ResidualTasks[task.nResidualTasks].coefs[2] = 3;
task.ResidualTasks[task.nResidualTasks].coefs[1] = -3;
task.ResidualTasks[task.nResidualTasks].coefs[0] = 1;
break;
case 4:
task.ResidualTasks[task.nResidualTasks].coefs[3] = 4;
task.ResidualTasks[task.nResidualTasks].coefs[2] = -6;
task.ResidualTasks[task.nResidualTasks].coefs[1] = 4;
task.ResidualTasks[task.nResidualTasks].coefs[0] = -1;
break;
}
task.nResidualTasks++;
}
// Filler
while ((task.nResidualTasks % task.nResidualTasksPerChannel) != 0)
{
task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Verbatim;
task.ResidualTasks[task.nResidualTasks].channel = ch;
task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
task.ResidualTasks[task.nResidualTasks].residualOrder = 0;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].shift = 0;
task.nResidualTasks++;
}
}
}
if (sizeof(FlaCudaSubframeTask) * task.nResidualTasks > task.residualTasksLen)
throw new Exception("oops");
cuda.CopyHostToDeviceAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(FlaCudaSubframeTask) * task.nResidualTasks), task.stream);
task.frameSize = blocksize;
}
unsafe void encode_residual(FlaCudaTask task)
{
bool unpacked = false;
unpack_samples(task, Math.Min(32, task.frameSize));
for (int ch = 0; ch < channels; ch++)
{
switch (task.frame.subframes[ch].best.type)
{
case SubframeType.Constant:
break;
case SubframeType.Verbatim:
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
break;
case SubframeType.Fixed:
if (!_settings.GPUOnly)
{
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples,
task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 6;
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order);
}
break;
case SubframeType.LPC:
fixed (int* coefs = task.frame.subframes[ch].best.coefs)
{
ulong csum = 0;
for (int i = task.frame.subframes[ch].best.order; i > 0; i--)
csum += (ulong)Math.Abs(coefs[i - 1]);
if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly)
{
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32)
lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
else
lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order);
uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 4 + 5 + (uint)task.frame.subframes[ch].best.order * (uint)task.frame.subframes[ch].best.cbits + 6;
//uint oldsize = task.frame.subframes[ch].best.size;
task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order);
//if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize &&
// oldsize <= task.frame.subframes[ch].obits * (uint)task.frame.blocksize)
// throw new Exception("oops");
}
}
break;
}
if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * task.frame.blocksize)
{
#if DEBUG
throw new Exception("larger than verbatim");
#endif
task.frame.subframes[ch].best.type = SubframeType.Verbatim;
task.frame.subframes[ch].best.size = (uint)(task.frame.subframes[ch].obits * task.frame.blocksize);
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
}
}
}
unsafe void select_best_methods(FlacFrame frame, int channelsCount, int iFrame, FlaCudaTask task)
{
if (channelsCount == 4 && channels == 2)
{
if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
frame.ch_mode = ChannelMode.LeftRight;
else if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3)
frame.ch_mode = ChannelMode.LeftSide;
else if (task.BestResidualTasks[iFrame * 2].channel == 3 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
frame.ch_mode = ChannelMode.RightSide;
else if (task.BestResidualTasks[iFrame * 2].channel == 2 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3)
frame.ch_mode = ChannelMode.MidSide;
else
throw new Exception("internal error: invalid stereo mode");
frame.SwapSubframes(0, task.BestResidualTasks[iFrame * 2].channel);
frame.SwapSubframes(1, task.BestResidualTasks[iFrame * 2 + 1].channel);
}
else
frame.ch_mode = channels != 2 ? ChannelMode.NotStereo : ChannelMode.LeftRight;
for (int ch = 0; ch < channels; ch++)
{
int index = ch + iFrame * channels;
frame.subframes[ch].best.residual = ((int*)task.residualBufferPtr) + task.BestResidualTasks[index].residualOffs;
frame.subframes[ch].best.type = SubframeType.Verbatim;
frame.subframes[ch].best.size = (uint)(frame.subframes[ch].obits * frame.blocksize);
frame.subframes[ch].wbits = 0;
if (task.BestResidualTasks[index].size < 0)
throw new Exception("internal error");
if (frame.blocksize > Math.Max(4, eparams.max_prediction_order) && frame.subframes[ch].best.size > task.BestResidualTasks[index].size)
{
frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type;
frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size;
frame.subframes[ch].best.order = task.BestResidualTasks[index].residualOrder;
frame.subframes[ch].best.cbits = task.BestResidualTasks[index].cbits;
frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift;
frame.subframes[ch].obits -= task.BestResidualTasks[index].wbits;
frame.subframes[ch].wbits = task.BestResidualTasks[index].wbits;
frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
{
int* riceParams = ((int*)task.bestRiceParamsPtr) + (index << task.max_porder);
fixed (int* dstParams = frame.subframes[ch].best.rc.rparams)
AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder));
//for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++)
// frame.subframes[ch].best.rc.rparams[i] = riceParams[i];
}
}
}
}
unsafe void estimate_residual(FlaCudaTask task, int channelsCount)
{
if (task.frameSize <= 4)
return;
//int autocorPartSize = (2 * 256 - eparams.max_prediction_order) & ~15;
int autocorPartSize = 32 * 15;
int autocorPartCount = (task.frameSize + autocorPartSize - 1) / autocorPartSize;
if (autocorPartCount > maxAutocorParts)
throw new Exception("internal error");
int threads_y;
if (task.nResidualTasksPerChannel < 4)
threads_y = 8;
else if (task.nResidualTasksPerChannel >= 4 && task.nResidualTasksPerChannel <= 8)
threads_y = task.nResidualTasksPerChannel;
else if ((task.nResidualTasksPerChannel % 8) == 0)
threads_y = 8;
else if ((task.nResidualTasksPerChannel % 7) == 0)
threads_y = 7;
else if ((task.nResidualTasksPerChannel % 6) == 0)
threads_y = 6;
else if ((task.nResidualTasksPerChannel % 5) == 0)
threads_y = 5;
else if ((task.nResidualTasksPerChannel % 4) == 0)
threads_y = 4;
else
throw new Exception("invalid LPC order");
int residualPartSize = 32 * threads_y;
int residualPartCount = (task.frameSize + residualPartSize - 1) / residualPartSize;
if (residualPartCount > maxResidualParts)
throw new Exception("invalid combination of block size and LPC order");
int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
int calcPartitionPartSize = task.frameSize >> max_porder;
while (calcPartitionPartSize < 16 && max_porder > 0)
{
calcPartitionPartSize <<= 1;
max_porder--;
}
int calcPartitionPartCount = (calcPartitionPartSize >= 128) ? 1 : (256 / calcPartitionPartSize);
CUfunction cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : task.cudaChannelDecorr;
CUfunction cudaCalcPartition = calcPartitionPartSize >= 128 ? task.cudaCalcLargePartition : calcPartitionPartSize == 16 && task.frameSize >= 256 ? task.cudaCalcPartition16 : task.cudaCalcPartition;
CUfunction cudaEstimateResidual = task.nResidualTasksPerChannel < 4 ? task.cudaEstimateResidual1 : eparams.max_prediction_order <= 8 ? task.cudaEstimateResidual8 : eparams.max_prediction_order <= 12 ? task.cudaEstimateResidual12 : task.cudaEstimateResidual;
cuda.SetParameter(cudaChannelDecorr, 0 * sizeof(uint), (uint)task.cudaSamples.Pointer);
cuda.SetParameter(cudaChannelDecorr, 1 * sizeof(uint), (uint)task.cudaSamplesBytes.Pointer);
cuda.SetParameter(cudaChannelDecorr, 2 * sizeof(uint), (uint)MAX_BLOCKSIZE);
cuda.SetParameterSize(cudaChannelDecorr, sizeof(uint) * 3U);
cuda.SetFunctionBlockShape(cudaChannelDecorr, 256, 1, 1);
cuda.SetParameter(task.cudaFindWastedBits, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaFindWastedBits, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
cuda.SetParameter(task.cudaFindWastedBits, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameterSize(task.cudaFindWastedBits, sizeof(uint) * 3U);
cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1);
cuda.SetParameter(task.cudaComputeAutocor, 0, (uint)task.cudaAutocorOutput.Pointer);
cuda.SetParameter(task.cudaComputeAutocor, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
cuda.SetParameter(task.cudaComputeAutocor, 2 * sizeof(uint), (uint)cudaWindow.Pointer);
cuda.SetParameter(task.cudaComputeAutocor, 3 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaComputeAutocor, 4 * sizeof(uint), (uint)eparams.max_prediction_order);
cuda.SetParameter(task.cudaComputeAutocor, 5 * sizeof(uint), (uint)task.nAutocorTasksPerChannel - 1);
cuda.SetParameter(task.cudaComputeAutocor, 6 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameterSize(task.cudaComputeAutocor, 7U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1);
cuda.SetParameter(task.cudaComputeLPC, 0, (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaComputeLPC, 1 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameter(task.cudaComputeLPC, 2 * sizeof(uint), (uint)task.cudaAutocorOutput.Pointer);
cuda.SetParameter(task.cudaComputeLPC, 3 * sizeof(uint), (uint)eparams.max_prediction_order);
cuda.SetParameter(task.cudaComputeLPC, 4 * sizeof(uint), (uint)task.cudaLPCData.Pointer);
cuda.SetParameter(task.cudaComputeLPC, 5 * sizeof(uint), (uint)_windowcount);
cuda.SetParameter(task.cudaComputeLPC, 6 * sizeof(uint), (uint)autocorPartCount);
cuda.SetParameterSize(task.cudaComputeLPC, 7U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaComputeLPC, 32, 1, 1);
cuda.SetParameter(task.cudaComputeLPCLattice, 0, (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaComputeLPCLattice, 1 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameter(task.cudaComputeLPCLattice, 2 * sizeof(uint), (uint)task.cudaSamples.Pointer);
cuda.SetParameter(task.cudaComputeLPCLattice, 3 * sizeof(uint), (uint)_windowcount);
cuda.SetParameter(task.cudaComputeLPCLattice, 4 * sizeof(uint), (uint)eparams.max_prediction_order);
cuda.SetParameter(task.cudaComputeLPCLattice, 5 * sizeof(uint), (uint)task.cudaLPCData.Pointer);
cuda.SetParameterSize(task.cudaComputeLPCLattice, 6U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaComputeLPCLattice, 256, 1, 1);
cuda.SetParameter(task.cudaQuantizeLPC, 0, (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaQuantizeLPC, 1 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameter(task.cudaQuantizeLPC, 2 * sizeof(uint), (uint)task.nTasksPerWindow);
cuda.SetParameter(task.cudaQuantizeLPC, 3 * sizeof(uint), (uint)task.cudaLPCData.Pointer);
cuda.SetParameter(task.cudaQuantizeLPC, 4 * sizeof(uint), (uint)eparams.max_prediction_order);
cuda.SetParameter(task.cudaQuantizeLPC, 5 * sizeof(uint), (uint)eparams.lpc_min_precision_search);
cuda.SetParameter(task.cudaQuantizeLPC, 6 * sizeof(uint), (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search));
cuda.SetParameterSize(task.cudaQuantizeLPC, 7U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaQuantizeLPC, 32, 4, 1);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 0, (uint)task.cudaResidualOutput.Pointer);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 1, (uint)task.cudaSamples.Pointer);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 2, (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 3, (uint)eparams.max_prediction_order);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 4, (uint)residualPartSize);
cuda.SetParameterSize(cudaEstimateResidual, 5U * sizeof(uint));
cuda.SetFunctionBlockShape(cudaEstimateResidual, 32, threads_y, 1);
cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)residualPartSize);
cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)residualPartCount);
cuda.SetParameter(task.cudaChooseBestMethod, 4 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameterSize(task.cudaChooseBestMethod, 5U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 8, 1);
cuda.SetParameter(task.cudaCopyBestMethod, 0, (uint)task.cudaBestResidualTasks.Pointer);
cuda.SetParameter(task.cudaCopyBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaCopyBestMethod, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameterSize(task.cudaCopyBestMethod, sizeof(uint) * 3U);
cuda.SetFunctionBlockShape(task.cudaCopyBestMethod, 64, 1, 1);
cuda.SetParameter(task.cudaCopyBestMethodStereo, 0, (uint)task.cudaBestResidualTasks.Pointer);
cuda.SetParameter(task.cudaCopyBestMethodStereo, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaCopyBestMethodStereo, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameterSize(task.cudaCopyBestMethodStereo, sizeof(uint) * 3U);
cuda.SetFunctionBlockShape(task.cudaCopyBestMethodStereo, 64, 1, 1);
cuda.SetParameter(task.cudaEncodeResidual, 0, (uint)task.cudaResidual.Pointer);
cuda.SetParameter(task.cudaEncodeResidual, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
cuda.SetParameter(task.cudaEncodeResidual, 2 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer);
cuda.SetParameterSize(task.cudaEncodeResidual, sizeof(uint) * 3U);
cuda.SetFunctionBlockShape(task.cudaEncodeResidual, residualPartSize, 1, 1);
cuda.SetParameter(cudaCalcPartition, 0, (uint)task.cudaPartitions.Pointer);
cuda.SetParameter(cudaCalcPartition, 1 * sizeof(uint), (uint)task.cudaResidual.Pointer);
cuda.SetParameter(cudaCalcPartition, 2 * sizeof(uint), (uint)task.cudaSamples.Pointer);
cuda.SetParameter(cudaCalcPartition, 3 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer);
cuda.SetParameter(cudaCalcPartition, 4 * sizeof(uint), (uint)max_porder);
cuda.SetParameter(cudaCalcPartition, 5 * sizeof(uint), (uint)calcPartitionPartSize);
cuda.SetParameter(cudaCalcPartition, 6 * sizeof(uint), (uint)calcPartitionPartCount);
cuda.SetParameterSize(cudaCalcPartition, 7U * sizeof(uint));
cuda.SetFunctionBlockShape(cudaCalcPartition, 16, 16, 1);
cuda.SetParameter(task.cudaSumPartition, 0, (uint)task.cudaPartitions.Pointer);
cuda.SetParameter(task.cudaSumPartition, 1 * sizeof(uint), (uint)max_porder);
cuda.SetParameterSize(task.cudaSumPartition, 2U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaSumPartition, Math.Max(32, 1 << (max_porder - 1)), 1, 1);
cuda.SetParameter(task.cudaFindRiceParameter, 0, (uint)task.cudaRiceParams.Pointer);
cuda.SetParameter(task.cudaFindRiceParameter, 1 * sizeof(uint), (uint)task.cudaPartitions.Pointer);
cuda.SetParameter(task.cudaFindRiceParameter, 2 * sizeof(uint), (uint)max_porder);
cuda.SetParameterSize(task.cudaFindRiceParameter, 3U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaFindRiceParameter, 32, 8, 1);
cuda.SetParameter(task.cudaFindPartitionOrder, 0, (uint)task.cudaBestRiceParams.Pointer);
cuda.SetParameter(task.cudaFindPartitionOrder, 1 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer);
cuda.SetParameter(task.cudaFindPartitionOrder, 2 * sizeof(uint), (uint)task.cudaRiceParams.Pointer);
cuda.SetParameter(task.cudaFindPartitionOrder, 3 * sizeof(uint), (uint)max_porder);
cuda.SetParameterSize(task.cudaFindPartitionOrder, 4U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaFindPartitionOrder, 256, 1, 1);
// issue work to the GPU
cuda.LaunchAsync(cudaChannelDecorr, (task.frameCount * task.frameSize + 255) / 256, channels == 2 ? 1 : channels, task.stream);
if (eparams.do_wasted)
cuda.LaunchAsync(task.cudaFindWastedBits, channelsCount * task.frameCount, 1, task.stream);
bool lattice = do_lattice && task.frameSize <= 512 && eparams.max_prediction_order <= 12;
if (!lattice || _windowcount > 1)
{
cuda.LaunchAsync(task.cudaComputeAutocor, autocorPartCount, task.nAutocorTasksPerChannel * channelsCount * task.frameCount, task.stream);
cuda.LaunchAsync(task.cudaComputeLPC, task.nAutocorTasksPerChannel, channelsCount * task.frameCount, task.stream);
}
if (lattice)
cuda.LaunchAsync(task.cudaComputeLPCLattice, 1, channelsCount * task.frameCount, task.stream);
cuda.LaunchAsync(task.cudaQuantizeLPC, task.nAutocorTasksPerChannel, channelsCount * task.frameCount, task.stream);
cuda.LaunchAsync(cudaEstimateResidual, residualPartCount, task.nResidualTasksPerChannel * channelsCount * task.frameCount / (task.nResidualTasksPerChannel < 4 ? 1 : threads_y), task.stream);
cuda.LaunchAsync(task.cudaChooseBestMethod, 1, channelsCount * task.frameCount, task.stream);
if (channels == 2 && channelsCount == 4)
cuda.LaunchAsync(task.cudaCopyBestMethodStereo, 1, task.frameCount, task.stream);
else
cuda.LaunchAsync(task.cudaCopyBestMethod, 1, channels * task.frameCount, task.stream);
if (_settings.GPUOnly)
{
int bsz = calcPartitionPartCount * calcPartitionPartSize;
if (cudaCalcPartition.Pointer == task.cudaCalcLargePartition.Pointer)
cuda.LaunchAsync(task.cudaEncodeResidual, residualPartCount, channels * task.frameCount, task.stream);
cuda.LaunchAsync(cudaCalcPartition, (task.frameSize + bsz - 1) / bsz, channels * task.frameCount, task.stream);
if (max_porder > 0)
cuda.LaunchAsync(task.cudaSumPartition, Flake.MAX_RICE_PARAM + 1, channels * task.frameCount, task.stream);
cuda.LaunchAsync(task.cudaFindRiceParameter, ((2 << max_porder) + 31) / 32, channels * task.frameCount, task.stream);
//if (max_porder > 0) // need to run even if max_porder==0 just to calculate the final frame size
cuda.LaunchAsync(task.cudaFindPartitionOrder, 1, channels * task.frameCount, task.stream);
cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * MAX_BLOCKSIZE * channels), task.stream);
cuda.CopyDeviceToHostAsync(task.cudaBestRiceParams, task.bestRiceParamsPtr, (uint)(sizeof(int) * (1 << max_porder) * channels * task.frameCount), task.stream);
task.max_porder = max_porder;
}
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(FlaCudaSubframeTask) * channels * task.frameCount), task.stream);
}
///
/// Copy channel-interleaved input samples into separate subframes
///
///
///
unsafe void unpack_samples(FlaCudaTask task, int count)
{
int iFrame = task.frame.frame_number;
short* src = ((short*)task.samplesBytesPtr) + iFrame * channels * task.frameSize;
switch (task.frame.ch_mode)
{
case ChannelMode.NotStereo:
for (int ch = 0; ch < channels; ch++)
{
int* s = task.frame.subframes[ch].samples;
int wbits = (int)task.frame.subframes[ch].wbits;
for (int i = 0; i < count; i++)
s[i] = src[i * channels + ch] >>= wbits;
}
break;
case ChannelMode.LeftRight:
{
int* left = task.frame.subframes[0].samples;
int* right = task.frame.subframes[1].samples;
int lwbits = (int)task.frame.subframes[0].wbits;
int rwbits = (int)task.frame.subframes[1].wbits;
for (int i = 0; i < count; i++)
{
int l = *(src++);
int r = *(src++);
left[i] = l >> lwbits;
right[i] = r >> rwbits;
}
break;
}
case ChannelMode.LeftSide:
{
int* left = task.frame.subframes[0].samples;
int* right = task.frame.subframes[1].samples;
int lwbits = (int)task.frame.subframes[0].wbits;
int rwbits = (int)task.frame.subframes[1].wbits;
for (int i = 0; i < count; i++)
{
int l = *(src++);
int r = *(src++);
left[i] = l >> lwbits;
right[i] = (l - r) >> rwbits;
}
break;
}
case ChannelMode.RightSide:
{
int* left = task.frame.subframes[0].samples;
int* right = task.frame.subframes[1].samples;
int lwbits = (int)task.frame.subframes[0].wbits;
int rwbits = (int)task.frame.subframes[1].wbits;
for (int i = 0; i < count; i++)
{
int l = *(src++);
int r = *(src++);
left[i] = (l - r) >> lwbits;
right[i] = r >> rwbits;
}
break;
}
case ChannelMode.MidSide:
{
int* left = task.frame.subframes[0].samples;
int* right = task.frame.subframes[1].samples;
int lwbits = (int)task.frame.subframes[0].wbits;
int rwbits = (int)task.frame.subframes[1].wbits;
for (int i = 0; i < count; i++)
{
int l = *(src++);
int r = *(src++);
left[i] = (l + r) >> (1 + lwbits);
right[i] = (l - r) >> rwbits;
}
break;
}
}
}
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task, int current_frame_number)
{
task.frame.InitSize(task.frameSize, eparams.variable_block_size != 0);
task.frame.frame_number = iFrame;
task.frame.ch_mode = ChannelMode.NotStereo;
fixed (int* smp = task.samplesBuffer)
{
for (int ch = 0; ch < channelCount; ch++)
task.frame.subframes[ch].Init(
smp + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * task.frameSize,
((int*)task.residualBufferPtr) + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * task.frameSize,
_pcm.BitsPerSample + (doMidside && ch == 3 ? 1 : 0), 0);
select_best_methods(task.frame, channelCount, iFrame, task);
//unpack_samples(task);
encode_residual(task);
//task.frame.writer.Reset();
task.frame.frame_number = current_frame_number;
task.frame.writer_offset = task.frame.writer.Length;
output_frame_header(task.frame);
output_subframes(task.frame);
output_frame_footer(task.frame);
if (task.frame.writer.Length - task.frame.writer_offset >= max_frame_size)
throw new Exception("buffer overflow");
return task.frame.writer.Length - task.frame.writer_offset;
}
}
unsafe void send_to_GPU(FlaCudaTask task, int nFrames, int blocksize)
{
bool doMidside = channels == 2 && eparams.do_midside;
int channelsCount = doMidside ? 2 * channels : channels;
if (blocksize != task.frameSize)
task.nResidualTasks = 0;
task.frameCount = nFrames;
task.frameSize = blocksize;
task.frameNumber = eparams.variable_block_size > 0 ? frame_pos : frame_count;
task.framePos = frame_pos;
frame_count += nFrames;
frame_pos += nFrames * blocksize;
cuda.CopyHostToDeviceAsync(task.cudaSamplesBytes, task.samplesBytesPtr, (uint)(sizeof(short) * channels * blocksize * nFrames), task.stream);
}
unsafe void run_GPU_task(FlaCudaTask task)
{
bool doMidside = channels == 2 && eparams.do_midside;
int channelsCount = doMidside ? 2 * channels : channels;
if (task.frameSize != _windowsize && task.frameSize > 4)
fixed (float* window = windowBuffer)
{
_windowsize = task.frameSize;
_windowcount = 0;
calculate_window(window, lpc.window_welch, WindowFunction.Welch);
calculate_window(window, lpc.window_flattop, WindowFunction.Flattop);
calculate_window(window, lpc.window_tukey, WindowFunction.Tukey);
calculate_window(window, lpc.window_hann, WindowFunction.Hann);
calculate_window(window, lpc.window_bartlett, WindowFunction.Bartlett);
if (_windowcount == 0)
throw new Exception("invalid windowfunction");
cuda.CopyHostToDevice(cudaWindow, windowBuffer);
}
if (task.nResidualTasks == 0)
initializeSubframeTasks(task.frameSize, channelsCount, max_frames, task);
estimate_residual(task, channelsCount);
}
unsafe void process_result(FlaCudaTask task)
{
bool doMidside = channels == 2 && eparams.do_midside;
int channelCount = doMidside ? 2 * channels : channels;
long iSample = 0;
long iByte = 0;
task.frame.writer.Reset();
task.frame.writer_offset = 0;
for (int iFrame = 0; iFrame < task.frameCount; iFrame++)
{
//if (0 != eparams.variable_block_size && 0 == (task.blocksize & 7) && task.blocksize >= 128)
// fs = encode_frame_vbs();
//else
int fn = task.frameNumber + (eparams.variable_block_size > 0 ? (int)iSample : iFrame);
int fs = encode_frame(doMidside, channelCount, iFrame, task, fn);
if (task.verify != null)
{
int decoded = task.verify.DecodeFrame(task.frame.writer.Buffer, task.frame.writer_offset, fs);
if (decoded != fs || task.verify.Remaining != task.frameSize)
throw new Exception("validation failed! frame size mismatch");
fixed (int* r = task.verify.Samples)
{
for (int ch = 0; ch < channels; ch++)
{
short* res = ((short*)task.samplesBytesPtr) + iFrame * channels * task.frameSize + ch;
int* smp = r + ch * Flake.MAX_BLOCKSIZE;
for (int i = task.frameSize; i > 0; i--)
{
//if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
if (*res != *(smp++))
throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", iFrame, ch));
res += channels;
}
}
}
}
if (seek_table != null && _IO.CanSeek)
{
for (int sp = 0; sp < seek_table.Length; sp++)
{
if (seek_table[sp].framesize != 0)
continue;
if (seek_table[sp].number >= task.framePos + iSample + task.frameSize)
break;
if (seek_table[sp].number >= task.framePos + iSample)
{
seek_table[sp].number = task.framePos + iSample;
seek_table[sp].offset = iByte;
seek_table[sp].framesize = task.frameSize;
}
}
}
//Array.Copy(task.frame.buffer, 0, task.outputBuffer, iByte, fs);
iSample += task.frameSize;
iByte += fs;
}
task.outputSize = (int)iByte;
if (iByte != task.frame.writer.Length)
throw new Exception("invalid length");
}
unsafe void write_result(FlaCudaTask task)
{
int iSample = task.frameSize * task.frameCount;
if (seek_table != null && _IO.CanSeek)
for (int sp = 0; sp < seek_table.Length; sp++)
{
if (seek_table[sp].number >= task.framePos + iSample)
break;
if (seek_table[sp].number >= task.framePos)
seek_table[sp].offset += _IO.Position - first_frame_offset;
}
_IO.Write(task.outputBuffer, 0, task.outputSize);
_position += iSample;
_totalSize += task.outputSize;
}
public unsafe void InitTasks()
{
bool doMidside = channels == 2 && eparams.do_midside;
int channelCount = doMidside ? 2 * channels : channels;
if (!inited)
{
cuda = new CUDA(true, InitializationFlags.None);
cuda.CreateContext(0, CUCtxFlags.SchedAuto);
using (Stream cubin = GetType().Assembly.GetManifestResourceStream(GetType(), "flacuda.cubin"))
using (StreamReader sr = new StreamReader(cubin))
cuda.LoadModule(new ASCIIEncoding().GetBytes(sr.ReadToEnd()));
//cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
if (_IO == null)
_IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
int header_size = flake_encode_init();
_IO.Write(header, 0, header_size);
if (_IO.CanSeek)
first_frame_offset = _IO.Position;
task1 = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
task2 = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
if (_settings.CPUThreads > 0)
{
cpu_tasks = new FlaCudaTask[_settings.CPUThreads];
for (int i = 0; i < cpu_tasks.Length; i++)
cpu_tasks[i] = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
}
cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
inited = true;
}
}
public unsafe void Write(AudioBuffer buff)
{
InitTasks();
buff.Prepare(this);
int pos = 0;
while (pos < buff.Length)
{
int block = Math.Min(buff.Length - pos, eparams.block_size * max_frames - samplesInBuffer);
fixed (byte* buf = buff.Bytes)
AudioSamples.MemCpy(((byte*)task1.samplesBytesPtr) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign);
samplesInBuffer += block;
pos += block;
int nFrames = samplesInBuffer / eparams.block_size;
if (nFrames >= max_frames)
do_output_frames(nFrames);
}
if (md5 != null)
md5.TransformBlock(buff.Bytes, 0, buff.ByteLength, null, 0);
}
public void wait_for_cpu_task()
{
FlaCudaTask task = cpu_tasks[oldest_cpu_task];
if (task.workThread == null)
return;
lock (task)
{
while (!task.done && task.exception == null)
Monitor.Wait(task);
if (task.exception != null)
throw task.exception;
}
}
public void cpu_task_thread(object param)
{
FlaCudaTask task = param as FlaCudaTask;
try
{
while (true)
{
lock (task)
{
while (task.done && !task.exit)
Monitor.Wait(task);
if (task.exit)
return;
}
process_result(task);
lock (task)
{
task.done = true;
Monitor.Pulse(task);
}
}
}
catch (Exception ex)
{
lock (task)
{
task.exception = ex;
Monitor.Pulse(task);
}
}
}
public void start_cpu_task()
{
FlaCudaTask task = cpu_tasks[oldest_cpu_task];
if (task.workThread == null)
{
task.done = false;
task.exit = false;
task.workThread = new Thread(cpu_task_thread);
task.workThread.IsBackground = true;
//task.workThread.Priority = ThreadPriority.BelowNormal;
task.workThread.Start(task);
}
else
{
lock (task)
{
task.done = false;
Monitor.Pulse(task);
}
}
}
public unsafe void do_output_frames(int nFrames)
{
send_to_GPU(task1, nFrames, eparams.block_size);
if (task2.frameCount > 0)
cuda.SynchronizeStream(task2.stream);
run_GPU_task(task1);
if (task2.frameCount > 0)
{
if (cpu_tasks != null)
{
wait_for_cpu_task();
FlaCudaTask ttmp = cpu_tasks[oldest_cpu_task];
cpu_tasks[oldest_cpu_task] = task2;
task2 = ttmp;
start_cpu_task();
oldest_cpu_task = (oldest_cpu_task + 1) % cpu_tasks.Length;
if (task2.frameCount > 0)
write_result(task2);
}
else
{
process_result(task2);
write_result(task2);
}
}
int bs = eparams.block_size * nFrames;
samplesInBuffer -= bs;
if (samplesInBuffer > 0)
AudioSamples.MemCpy(((byte*)task2.samplesBytesPtr), ((byte*)task1.samplesBytesPtr) + bs * _pcm.BlockAlign, samplesInBuffer * _pcm.BlockAlign);
FlaCudaTask tmp = task1;
task1 = task2;
task2 = tmp;
task1.frameCount = 0;
}
public string Path { get { return _path; } }
public static readonly string vendor_string = "FlaCuda#.91";
int select_blocksize(int samplerate, int time_ms)
{
int blocksize = Flake.flac_blocksizes[1];
int target = (samplerate * time_ms) / 1000;
if (eparams.variable_block_size > 0)
{
blocksize = 1024;
while (target >= blocksize)
blocksize <<= 1;
return blocksize >> 1;
}
for (int i = 0; i < Flake.flac_blocksizes.Length; i++)
if (target >= Flake.flac_blocksizes[i] && Flake.flac_blocksizes[i] > blocksize)
{
blocksize = Flake.flac_blocksizes[i];
}
return blocksize;
}
void write_streaminfo(byte[] header, int pos, int last)
{
Array.Clear(header, pos, 38);
BitWriter bitwriter = new BitWriter(header, pos, 38);
// metadata header
bitwriter.writebits(1, last);
bitwriter.writebits(7, (int)MetadataType.StreamInfo);
bitwriter.writebits(24, 34);
if (eparams.variable_block_size > 0)
bitwriter.writebits(16, 0);
else
bitwriter.writebits(16, eparams.block_size);
bitwriter.writebits(16, eparams.block_size);
bitwriter.writebits(24, 0);
bitwriter.writebits(24, max_frame_size);
bitwriter.writebits(20, sample_rate);
bitwriter.writebits(3, channels - 1);
bitwriter.writebits(5, bits_per_sample - 1);
// total samples
if (sample_count > 0)
{
bitwriter.writebits(4, 0);
bitwriter.writebits(32, sample_count);
}
else
{
bitwriter.writebits(4, 0);
bitwriter.writebits(32, 0);
}
bitwriter.flush();
}
/**
* Write vorbis comment metadata block to byte array.
* Just writes the vendor string for now.
*/
int write_vorbis_comment(byte[] comment, int pos, int last)
{
BitWriter bitwriter = new BitWriter(comment, pos, 4);
Encoding enc = new ASCIIEncoding();
int vendor_len = enc.GetBytes(vendor_string, 0, vendor_string.Length, comment, pos + 8);
// metadata header
bitwriter.writebits(1, last);
bitwriter.writebits(7, (int)MetadataType.VorbisComment);
bitwriter.writebits(24, vendor_len + 8);
comment[pos + 4] = (byte)(vendor_len & 0xFF);
comment[pos + 5] = (byte)((vendor_len >> 8) & 0xFF);
comment[pos + 6] = (byte)((vendor_len >> 16) & 0xFF);
comment[pos + 7] = (byte)((vendor_len >> 24) & 0xFF);
comment[pos + 8 + vendor_len] = 0;
comment[pos + 9 + vendor_len] = 0;
comment[pos + 10 + vendor_len] = 0;
comment[pos + 11 + vendor_len] = 0;
bitwriter.flush();
return vendor_len + 12;
}
int write_seekpoints(byte[] header, int pos, int last)
{
seek_table_offset = pos + 4;
BitWriter bitwriter = new BitWriter(header, pos, 4 + 18 * seek_table.Length);
// metadata header
bitwriter.writebits(1, last);
bitwriter.writebits(7, (int)MetadataType.Seektable);
bitwriter.writebits(24, 18 * seek_table.Length);
for (int i = 0; i < seek_table.Length; i++)
{
bitwriter.writebits64(Flake.FLAC__STREAM_METADATA_SEEKPOINT_SAMPLE_NUMBER_LEN, (ulong)seek_table[i].number);
bitwriter.writebits64(Flake.FLAC__STREAM_METADATA_SEEKPOINT_STREAM_OFFSET_LEN, (ulong)seek_table[i].offset);
bitwriter.writebits(Flake.FLAC__STREAM_METADATA_SEEKPOINT_FRAME_SAMPLES_LEN, seek_table[i].framesize);
}
bitwriter.flush();
return 4 + 18 * seek_table.Length;
}
/**
* Write padding metadata block to byte array.
*/
int
write_padding(byte[] padding, int pos, int last, long padlen)
{
BitWriter bitwriter = new BitWriter(padding, pos, 4);
// metadata header
bitwriter.writebits(1, last);
bitwriter.writebits(7, (int)MetadataType.Padding);
bitwriter.writebits(24, (int)padlen);
return (int)padlen + 4;
}
int write_headers()
{
int header_size = 0;
int last = 0;
// stream marker
header[0] = 0x66;
header[1] = 0x4C;
header[2] = 0x61;
header[3] = 0x43;
header_size += 4;
// streaminfo
write_streaminfo(header, header_size, last);
header_size += 38;
// seek table
if (_IO.CanSeek && seek_table != null)
header_size += write_seekpoints(header, header_size, last);
// vorbis comment
if (eparams.padding_size == 0) last = 1;
header_size += write_vorbis_comment(header, header_size, last);
// padding
if (eparams.padding_size > 0)
{
last = 1;
header_size += write_padding(header, header_size, last, eparams.padding_size);
}
return header_size;
}
int flake_encode_init()
{
int i, header_len;
//if(flake_validate_params(s) < 0)
ch_code = channels - 1;
// find samplerate in table
for (i = 4; i < 12; i++)
{
if (sample_rate == Flake.flac_samplerates[i])
{
sr_code0 = i;
break;
}
}
// if not in table, samplerate is non-standard
if (i == 12)
throw new Exception("non-standard samplerate");
for (i = 1; i < 8; i++)
{
if (bits_per_sample == Flake.flac_bitdepths[i])
{
bps_code = i;
break;
}
}
if (i == 8)
throw new Exception("non-standard bps");
// FIXME: For now, only 16-bit encoding is supported
if (bits_per_sample != 16)
throw new Exception("non-standard bps");
if (_blocksize == 0)
{
if (eparams.block_size == 0)
eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms);
_blocksize = eparams.block_size;
}
else
eparams.block_size = _blocksize;
max_frames = Math.Min(maxFrames, FlaCudaWriter.MAX_BLOCKSIZE / eparams.block_size);
// set maximum encoded frame size (if larger, re-encodes in verbatim mode)
if (channels == 2)
max_frame_size = 16 + ((eparams.block_size * (int)(bits_per_sample + bits_per_sample + 1) + 7) >> 3);
else
max_frame_size = 16 + ((eparams.block_size * channels * (int)bits_per_sample + 7) >> 3);
if (_IO.CanSeek && eparams.do_seektable && sample_count > 0)
{
int seek_points_distance = sample_rate * 10;
int num_seek_points = 1 + sample_count / seek_points_distance; // 1 seek point per 10 seconds
if (sample_count % seek_points_distance == 0)
num_seek_points--;
seek_table = new SeekPoint[num_seek_points];
for (int sp = 0; sp < num_seek_points; sp++)
{
seek_table[sp].framesize = 0;
seek_table[sp].offset = 0;
seek_table[sp].number = sp * seek_points_distance;
}
}
// output header bytes
header = new byte[eparams.padding_size + 1024 + (seek_table == null ? 0 : seek_table.Length * 18)];
header_len = write_headers();
// initialize CRC & MD5
if (_IO.CanSeek && _settings.DoMD5)
md5 = new MD5CryptoServiceProvider();
return header_len;
}
}
struct FlakeEncodeParams
{
// compression quality
// set by user prior to calling flake_encode_init
// standard values are 0 to 8
// 0 is lower compression, faster encoding
// 8 is higher compression, slower encoding
// extended values 9 to 12 are slower and/or use
// higher prediction orders
public int compression;
// stereo decorrelation method
// set by user prior to calling flake_encode_init
// if set to less than 0, it is chosen based on compression.
// valid values are 0 to 2
// 0 = independent L+R channels
// 1 = mid-side encoding
public bool do_midside;
// block size in samples
// set by the user prior to calling flake_encode_init
// if set to 0, a block size is chosen based on block_time_ms
// can also be changed by user before encoding a frame
public int block_size;
// block time in milliseconds
// set by the user prior to calling flake_encode_init
// used to calculate block_size based on sample rate
// can also be changed by user before encoding a frame
public int block_time_ms;
// padding size in bytes
// set by the user prior to calling flake_encode_init
// if set to less than 0, defaults to 4096
public long padding_size;
// minimum LPC order
// set by user prior to calling flake_encode_init
// if set to less than 0, it is chosen based on compression.
// valid values are 1 to 32
public int min_prediction_order;
// maximum LPC order
// set by user prior to calling flake_encode_init
// if set to less than 0, it is chosen based on compression.
// valid values are 1 to 32
public int max_prediction_order;
public int orders_per_window;
// minimum fixed prediction order
// set by user prior to calling flake_encode_init
// if set to less than 0, it is chosen based on compression.
// valid values are 0 to 4
public int min_fixed_order;
// maximum fixed prediction order
// set by user prior to calling flake_encode_init
// if set to less than 0, it is chosen based on compression.
// valid values are 0 to 4
public int max_fixed_order;
// minimum partition order
// set by user prior to calling flake_encode_init
// if set to less than 0, it is chosen based on compression.
// valid values are 0 to 8
public int min_partition_order;
// maximum partition order
// set by user prior to calling flake_encode_init
// if set to less than 0, it is chosen based on compression.
// valid values are 0 to 8
public int max_partition_order;
// whether to use variable block sizes
// set by user prior to calling flake_encode_init
// 0 = fixed block size
// 1 = variable block size
public int variable_block_size;
// whether to try various lpc_precisions
// 0 - use only one precision
// 1 - try two precisions
public int lpc_max_precision_search;
public int lpc_min_precision_search;
public bool do_wasted;
public bool do_constant;
public WindowFunction window_function;
public bool do_seektable;
public int flake_set_defaults(int lvl, bool encode_on_cpu)
{
compression = lvl;
if ((lvl < 0 || lvl > 12) && (lvl != 99))
{
return -1;
}
// default to level 5 params
window_function = WindowFunction.Flattop | WindowFunction.Tukey;
do_midside = true;
block_size = 0;
block_time_ms = 100;
min_fixed_order = 0;
max_fixed_order = 4;
min_prediction_order = 1;
max_prediction_order = 12;
min_partition_order = 0;
max_partition_order = 6;
variable_block_size = 0;
lpc_min_precision_search = 0;
lpc_max_precision_search = 0;
do_seektable = true;
do_wasted = true;
do_constant = true;
// differences from level 7
switch (lvl)
{
case 0:
do_constant = false;
do_wasted = false;
do_midside = false;
orders_per_window = 1;
max_partition_order = 4;
max_prediction_order = 7;
min_fixed_order = 2;
max_fixed_order = 2;
break;
case 1:
do_wasted = false;
do_midside = false;
window_function = WindowFunction.Bartlett;
orders_per_window = 1;
max_prediction_order = 12;
max_partition_order = 4;
break;
case 2:
do_constant = false;
window_function = WindowFunction.Bartlett;
min_fixed_order = 3;
max_fixed_order = 2;
orders_per_window = 1;
max_prediction_order = 7;
max_partition_order = 4;
break;
case 3:
window_function = WindowFunction.Bartlett;
min_fixed_order = 2;
max_fixed_order = 2;
orders_per_window = 6;
max_prediction_order = 7;
max_partition_order = 4;
break;
case 4:
min_fixed_order = 2;
max_fixed_order = 2;
orders_per_window = 3;
max_prediction_order = 8;
max_partition_order = 4;
break;
case 5:
do_constant = false;
min_fixed_order = 2;
max_fixed_order = 2;
orders_per_window = 1;
break;
case 6:
min_fixed_order = 2;
max_fixed_order = 2;
orders_per_window = 3;
break;
case 7:
min_fixed_order = 2;
max_fixed_order = 2;
orders_per_window = 7;
break;
case 8:
orders_per_window = 12;
break;
case 9:
min_fixed_order = 2;
max_fixed_order = 2;
orders_per_window = 3;
max_prediction_order = 32;
break;
case 10:
min_fixed_order = 2;
max_fixed_order = 2;
orders_per_window = 7;
max_prediction_order = 32;
break;
case 11:
min_fixed_order = 2;
max_fixed_order = 2;
orders_per_window = 11;
max_prediction_order = 32;
break;
}
if (!encode_on_cpu)
max_partition_order = 8;
return 0;
}
}
unsafe struct FlaCudaSubframeTask
{
public int residualOrder;
public int samplesOffs;
public int shift;
public int cbits;
public int size;
public int type;
public int obits;
public int blocksize;
public int best_index;
public int channel;
public int residualOffs;
public int wbits;
public int abits;
public int porder;
public fixed int reserved[2];
public fixed int coefs[32];
};
internal class FlaCudaTask
{
CUDA cuda;
public CUfunction cudaStereoDecorr;
public CUfunction cudaChannelDecorr;
public CUfunction cudaChannelDecorr2;
public CUfunction cudaFindWastedBits;
public CUfunction cudaComputeAutocor;
public CUfunction cudaComputeLPC;
public CUfunction cudaComputeLPCLattice;
public CUfunction cudaQuantizeLPC;
public CUfunction cudaEstimateResidual;
public CUfunction cudaEstimateResidual8;
public CUfunction cudaEstimateResidual12;
public CUfunction cudaEstimateResidual1;
public CUfunction cudaChooseBestMethod;
public CUfunction cudaCopyBestMethod;
public CUfunction cudaCopyBestMethodStereo;
public CUfunction cudaEncodeResidual;
public CUfunction cudaCalcPartition;
public CUfunction cudaCalcPartition16;
public CUfunction cudaCalcLargePartition;
public CUfunction cudaSumPartition;
public CUfunction cudaFindRiceParameter;
public CUfunction cudaFindPartitionOrder;
public CUdeviceptr cudaSamplesBytes;
public CUdeviceptr cudaSamples;
public CUdeviceptr cudaLPCData;
public CUdeviceptr cudaResidual;
public CUdeviceptr cudaPartitions;
public CUdeviceptr cudaRiceParams;
public CUdeviceptr cudaBestRiceParams;
public CUdeviceptr cudaAutocorOutput;
public CUdeviceptr cudaResidualTasks;
public CUdeviceptr cudaResidualOutput;
public CUdeviceptr cudaBestResidualTasks;
public IntPtr samplesBytesPtr = IntPtr.Zero;
public IntPtr residualBufferPtr = IntPtr.Zero;
public IntPtr bestRiceParamsPtr = IntPtr.Zero;
public IntPtr residualTasksPtr = IntPtr.Zero;
public IntPtr bestResidualTasksPtr = IntPtr.Zero;
public CUstream stream;
public int[] samplesBuffer;
public byte[] outputBuffer;
public int outputSize = 0;
public int frameSize = 0;
public int frameCount = 0;
public int frameNumber = 0;
public int framePos = 0;
public FlacFrame frame;
public int residualTasksLen;
public int bestResidualTasksLen;
public int samplesBufferLen;
public int nResidualTasks = 0;
public int nResidualTasksPerChannel = 0;
public int nTasksPerWindow = 0;
public int nAutocorTasksPerChannel = 0;
public int max_porder = 0;
public FlakeReader verify;
public Thread workThread = null;
public Exception exception = null;
public bool done = false;
public bool exit = false;
unsafe public FlaCudaTask(CUDA _cuda, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify)
{
cuda = _cuda;
residualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
bestResidualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * FlaCudaWriter.maxFrames;
samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FlaCudaWriter.maxFrames;
int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FlaCudaWriter.maxFrames;
int lpcDataLen = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelCount * FlaCudaWriter.maxFrames;
cudaSamplesBytes = cuda.Allocate((uint)samplesBufferLen / 2);
cudaSamples = cuda.Allocate((uint)samplesBufferLen);
cudaResidual = cuda.Allocate((uint)samplesBufferLen);
cudaLPCData = cuda.Allocate((uint)lpcDataLen);
cudaPartitions = cuda.Allocate((uint)partitionsLen);
cudaRiceParams = cuda.Allocate((uint)riceParamsLen);
cudaBestRiceParams = cuda.Allocate((uint)riceParamsLen / 4);
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FlaCudaWriter.maxResidualParts*/ * FlaCudaWriter.maxFrames));
CUResult cuErr = CUResult.Success;
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, (uint)samplesBufferLen/2);
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref bestRiceParamsPtr, (uint)riceParamsLen / 4);
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)residualTasksLen);
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref bestResidualTasksPtr, (uint)bestResidualTasksLen);
if (cuErr != CUResult.Success)
{
if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero;
if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero;
if (bestRiceParamsPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestRiceParamsPtr); bestRiceParamsPtr = IntPtr.Zero;
if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero;
throw new CUDAException(cuErr);
}
cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
cudaStereoDecorr = cuda.GetModuleFunction("cudaStereoDecorr");
cudaChannelDecorr = cuda.GetModuleFunction("cudaChannelDecorr");
cudaChannelDecorr2 = cuda.GetModuleFunction("cudaChannelDecorr2");
cudaFindWastedBits = cuda.GetModuleFunction("cudaFindWastedBits");
cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
cudaQuantizeLPC = cuda.GetModuleFunction("cudaQuantizeLPC");
cudaComputeLPCLattice = cuda.GetModuleFunction("cudaComputeLPCLattice");
cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
cudaEstimateResidual8 = cuda.GetModuleFunction("cudaEstimateResidual8");
cudaEstimateResidual12 = cuda.GetModuleFunction("cudaEstimateResidual12");
cudaEstimateResidual1 = cuda.GetModuleFunction("cudaEstimateResidual1");
cudaChooseBestMethod = cuda.GetModuleFunction("cudaChooseBestMethod");
cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod");
cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo");
cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
cudaCalcPartition = cuda.GetModuleFunction("cudaCalcPartition");
cudaCalcPartition16 = cuda.GetModuleFunction("cudaCalcPartition16");
cudaCalcLargePartition = cuda.GetModuleFunction("cudaCalcLargePartition");
cudaSumPartition = cuda.GetModuleFunction("cudaSumPartition");
cudaFindRiceParameter = cuda.GetModuleFunction("cudaFindRiceParameter");
cudaFindPartitionOrder = cuda.GetModuleFunction("cudaFindPartitionOrder");
stream = cuda.CreateStream();
samplesBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount];
outputBuffer = new byte[max_frame_size * FlaCudaWriter.maxFrames + 1];
frame = new FlacFrame(channelCount);
frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length);
if (do_verify)
{
verify = new FlakeReader(new AudioPCMConfig((int)bits_per_sample, channels, 44100));
verify.DoCRC = false;
}
}
public void Dispose()
{
if (workThread != null)
{
lock (this)
{
exit = true;
Monitor.Pulse(this);
}
workThread.Join();
workThread = null;
}
cuda.Free(cudaSamples);
cuda.Free(cudaSamplesBytes);
cuda.Free(cudaLPCData);
cuda.Free(cudaResidual);
cuda.Free(cudaPartitions);
cuda.Free(cudaAutocorOutput);
cuda.Free(cudaResidualTasks);
cuda.Free(cudaResidualOutput);
cuda.Free(cudaBestResidualTasks);
CUDADriver.cuMemFreeHost(samplesBytesPtr);
CUDADriver.cuMemFreeHost(residualBufferPtr);
CUDADriver.cuMemFreeHost(bestRiceParamsPtr);
CUDADriver.cuMemFreeHost(residualTasksPtr);
CUDADriver.cuMemFreeHost(bestResidualTasksPtr);
cuda.DestroyStream(stream);
}
public unsafe FlaCudaSubframeTask* ResidualTasks
{
get
{
return (FlaCudaSubframeTask*)residualTasksPtr;
}
}
public unsafe FlaCudaSubframeTask* BestResidualTasks
{
get
{
return (FlaCudaSubframeTask*)bestResidualTasksPtr;
}
}
}
}