optimizations

This commit is contained in:
chudov
2010-11-18 06:06:12 +00:00
parent 884e30e01a
commit 76762f2e16
4 changed files with 73 additions and 34 deletions

View File

@@ -41,6 +41,7 @@ namespace CUETools.Codecs.FLACCL
this.MappedMemory = false;
this.DoMD5 = true;
this.GroupSize = 128;
this.TaskSize = 32;
this.DeviceType = OpenCLDeviceType.GPU;
}
@@ -70,6 +71,10 @@ namespace CUETools.Codecs.FLACCL
[SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")]
public int GroupSize { get; set; }
[DefaultValue(32)]
[SRDescription(typeof(Properties.Resources), "DescriptionTashSize")]
public int TaskSize { get; set; }
[SRDescription(typeof(Properties.Resources), "DescriptionDefines")]
public string Defines { get; set; }
@@ -146,7 +151,6 @@ namespace CUETools.Codecs.FLACCL
byte[] header;
int samplesInBuffer = 0;
int max_frames = 0;
int _compressionLevel = 7;
int _blocksize = 0;
@@ -171,8 +175,7 @@ namespace CUETools.Codecs.FLACCL
AudioPCMConfig _pcm;
public const int MAX_BLOCKSIZE = 4096 * 16;
internal const int maxFrames = 128;
public const int MAX_BLOCKSIZE = 65536;
public FLACCLWriter(string path, Stream IO, AudioPCMConfig pcm)
{
@@ -1023,6 +1026,7 @@ namespace CUETools.Codecs.FLACCL
unsafe void initializeSubframeTasks(int blocksize, int channelsCount, int nFrames, FLACCLTask task)
{
task.channelSize = ((blocksize + 3) & ~3) * nFrames;
task.frameSize = blocksize;
task.nWindowFunctions = 0;
if (task.frameSize > 4)
@@ -1078,7 +1082,7 @@ namespace CUETools.Codecs.FLACCL
task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
task.ResidualTasks[task.nResidualTasks].residualOrder = order + 1;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].wbits = 0;
task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize;
@@ -1093,7 +1097,7 @@ namespace CUETools.Codecs.FLACCL
task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].wbits = 0;
task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize;
@@ -1111,7 +1115,7 @@ namespace CUETools.Codecs.FLACCL
task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
task.ResidualTasks[task.nResidualTasks].residualOrder = order;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].wbits = 0;
task.ResidualTasks[task.nResidualTasks].size = task.ResidualTasks[task.nResidualTasks].obits * blocksize;
@@ -1150,7 +1154,7 @@ namespace CUETools.Codecs.FLACCL
// task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits;
// task.ResidualTasks[task.nResidualTasks].blocksize = blocksize;
// task.ResidualTasks[task.nResidualTasks].residualOrder = 0;
// task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize;
// task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * task.channelSize + iFrame * blocksize;
// task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
// task.ResidualTasks[task.nResidualTasks].shift = 0;
// task.nResidualTasks++;
@@ -1416,8 +1420,8 @@ namespace CUETools.Codecs.FLACCL
{
for (int ch = 0; ch < channelCount; ch++)
task.frame.subframes[ch].Init(
smp + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize,
((int*)task.clResidualPtr) + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize,
smp + ch * task.channelSize + iFrame * task.frameSize,
((int*)task.clResidualPtr) + ch * task.channelSize + iFrame * task.frameSize,
_pcm.BitsPerSample + (doMidside && ch == 3 ? 1 : 0), 0);
select_best_methods(task.frame, channelCount, iFrame, task);
@@ -1462,7 +1466,7 @@ namespace CUETools.Codecs.FLACCL
int channelsCount = doMidside ? 2 * channels : channels;
if (task.nResidualTasks == 0)
initializeSubframeTasks(task.frameSize, channelsCount, max_frames, task);
initializeSubframeTasks(task.frameSize, channelsCount, _settings.TaskSize, task);
estimate_residual(task, channelsCount);
}
@@ -1494,7 +1498,7 @@ namespace CUETools.Codecs.FLACCL
for (int ch = 0; ch < channels; ch++)
{
short* res = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize + ch;
int* smp = r + ch * Flake.MAX_BLOCKSIZE;
int* smp = r + ch * task.channelSize;
for (int i = task.frameSize; i > 0; i--)
{
//if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FLACCLWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
@@ -1683,7 +1687,7 @@ namespace CUETools.Codecs.FLACCL
int pos = 0;
while (pos < buff.Length)
{
int block = Math.Min(buff.Length - pos, eparams.block_size * max_frames - samplesInBuffer);
int block = Math.Min(buff.Length - pos, eparams.block_size * _settings.TaskSize - samplesInBuffer);
fixed (byte* buf = buff.Bytes)
AudioSamples.MemCpy(((byte*)task1.clSamplesBytesPtr) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign);
@@ -1692,7 +1696,7 @@ namespace CUETools.Codecs.FLACCL
pos += block;
int nFrames = samplesInBuffer / eparams.block_size;
if (nFrames >= max_frames)
if (nFrames >= _settings.TaskSize)
do_output_frames(nFrames);
}
if (md5 != null)
@@ -1770,9 +1774,9 @@ namespace CUETools.Codecs.FLACCL
public unsafe void do_output_frames(int nFrames)
{
send_to_GPU(task1, nFrames, eparams.block_size);
run_GPU_task(task1);
if (task2.frameCount > 0)
task2.openCLCQ.Finish();
run_GPU_task(task1);
if (task2.frameCount > 0)
{
if (cpu_tasks != null)
@@ -2011,8 +2015,6 @@ namespace CUETools.Codecs.FLACCL
else
eparams.block_size = _blocksize;
max_frames = Math.Min(maxFrames, FLACCLWriter.MAX_BLOCKSIZE / eparams.block_size);
// set maximum encoded frame size (if larger, re-encodes in verbatim mode)
if (channels == 2)
max_frame_size = 16 + ((eparams.block_size * (int)(bits_per_sample + bits_per_sample + 1) + 7) >> 3);
@@ -2357,6 +2359,7 @@ namespace CUETools.Codecs.FLACCL
public int[] samplesBuffer;
public byte[] outputBuffer;
public int outputSize = 0;
public int channelSize = 0;
public int frameSize = 0;
public int frameCount = 0;
public int frameNumber = 0;
@@ -2397,18 +2400,20 @@ namespace CUETools.Codecs.FLACCL
openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLProgram.Context.Devices[0], prop);
int MAX_ORDER = this.writer.eparams.max_prediction_order;
residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * FLACCLWriter.maxFrames;
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * FLACCLWriter.maxFrames;
int samplesBufferLen = sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channelsCount;
int residualBufferLen = sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels; // need to adjust residualOffset?
int partitionsLen = sizeof(int) * (30 << 8) * channels * FLACCLWriter.maxFrames;
int riceParamsLen = sizeof(int) * (4 << 8) * channels * FLACCLWriter.maxFrames;
int autocorLen = sizeof(float) * (MAX_ORDER + 1) * lpc.MAX_LPC_WINDOWS * channelsCount * FLACCLWriter.maxFrames;
int MAX_FRAMES = this.writer._settings.TaskSize;
int MAX_CHANNELSIZE = MAX_FRAMES * writer.eparams.block_size;
residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES;
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES;
int samplesBufferLen = sizeof(int) * MAX_CHANNELSIZE * channelsCount;
int residualBufferLen = sizeof(int) * MAX_CHANNELSIZE * channels; // need to adjust residualOffset?
int partitionsLen = sizeof(int) * (30 << 8) * channels * MAX_FRAMES;
int riceParamsLen = sizeof(int) * (4 << 8) * channels * MAX_FRAMES;
int autocorLen = sizeof(float) * (MAX_ORDER + 1) * lpc.MAX_LPC_WINDOWS * channelsCount * MAX_FRAMES;
int lpcDataLen = autocorLen * 32;
int resOutLen = sizeof(int) * channelsCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * FLACCLWriter.maxFrames;
int wndLen = sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE /** 2*/ * lpc.MAX_LPC_WINDOWS;
int selectedLen = sizeof(int) * 32 * channelsCount * FLACCLWriter.maxFrames;
int riceLen = sizeof(int) * channels * FLACCLWriter.MAX_BLOCKSIZE;
int resOutLen = sizeof(int) * channelsCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * MAX_FRAMES;
int wndLen = sizeof(float) * MAX_CHANNELSIZE /** 2*/ * lpc.MAX_LPC_WINDOWS;
int selectedLen = sizeof(int) * 32 * channelsCount * MAX_FRAMES;
int riceLen = sizeof(int) * channels * MAX_CHANNELSIZE;
if (!writer._settings.MappedMemory)
{
@@ -2506,8 +2511,8 @@ namespace CUETools.Codecs.FLACCL
}
}
samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelsCount];
outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1];
samplesBuffer = new int[MAX_CHANNELSIZE * channelsCount];
outputBuffer = new byte[max_frame_size * MAX_FRAMES + 1];
frame = new FlacFrame(channelsCount);
frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length);
@@ -2662,9 +2667,9 @@ namespace CUETools.Codecs.FLACCL
clChannelDecorr.SetArgs(
clSamples,
clSamplesBytes,
FLACCLWriter.MAX_BLOCKSIZE / 4);
channelSize / 4);
openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, FLACCLWriter.MAX_BLOCKSIZE / 4);
openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, channelSize / 4);
//openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, (frameSize * frameCount + 3) / 4);
if (eparams.do_wasted)
@@ -2917,7 +2922,7 @@ namespace CUETools.Codecs.FLACCL
if (writer._settings.DoRice)
openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, (channels * frameSize * 17 + 128) / 8 * frameCount, clRiceOutputPtr);
else
openCLCQ.EnqueueReadBuffer(clResidual, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clResidualPtr);
openCLCQ.EnqueueReadBuffer(clResidual, false, 0, sizeof(int) * channelSize * channels, clResidualPtr);
}
}
if (!writer._settings.MappedMemory)

View File

@@ -435,7 +435,7 @@ namespace CUETools.Codecs.FLAKE
// subframe header
uint t1 = bitreader.readbit(); // leading padding bit of the subframe header; must be 0
if (t1 != 0)
throw new Exception("unsupported subframe coding");
throw new Exception("unsupported subframe coding (ch == " + ch.ToString() + ")");
int type_code = (int)bitreader.readbits(6);
frame.subframes[ch].wbits = (int)bitreader.readbit();
if (frame.subframes[ch].wbits != 0)

View File

@@ -56,6 +56,21 @@ namespace CUETools.Codecs
writebits(8, c);
}
/// <summary>
/// Appends <paramref name="len"/> bits to the output by bulk-copying whole bytes
/// from <paramref name="buf"/> instead of going through writebits for each value.
/// </summary>
/// <param name="len">Amount of data to append, measured in bits (it is added to BitLength).</param>
/// <param name="buf">
/// Source bytes. The copy reads from the same byte offset in <paramref name="buf"/>
/// as it writes to in the internal buffer, so the source is expected to be laid out
/// at the writer's current position.
/// </param>
public unsafe void writeints(int len, byte* buf)
{
    // Positions are captured before flush(); presumably flush() moves pending
    // bits into the byte buffer and changes the writer state -- TODO confirm.
    int firstBit = BitLength;
    int lastBit = firstBit + len;
    int firstByte = firstBit / 8;
    int lastByte = lastBit / 8;
    int tailBits = lastBit % 8;

    flush();

    // When the start is not byte-aligned, remember the byte already holding
    // earlier bits so it can be merged back after the copy overwrites it.
    byte keptBits = 0;
    if (firstBit % 8 != 0)
    {
        keptBits = buffer[firstByte];
    }

    fixed (byte* dst = &buffer[0])
    {
        AudioSamples.MemCpy(dst + firstByte, buf + firstByte, lastByte - firstByte);
    }

    buffer[firstByte] |= keptBits;
    buf_ptr = lastByte;

    // Emit the leftover high-order bits of the final, partially used source byte.
    if (tailBits != 0)
    {
        writebits(tailBits, buf[lastByte] >> (8 - tailBits));
    }
}
public void write(params char [] chars)
{
foreach (char c in chars)
@@ -343,5 +358,13 @@ namespace CUETools.Codecs
buf_ptr = buf_start + value;
}
}
/// <summary>
/// Current write position expressed in bits, computed as
/// <c>buf_ptr * 8 + 32 - bit_left</c>.
/// </summary>
/// <remarks>
/// NOTE(review): presumably bit_left is the number of unused bits in a 32-bit
/// accumulator, making (32 - bit_left) the count of pending bits -- confirm
/// against the field's declaration.
/// </remarks>
public int BitLength
{
    get
    {
        int byteBits = buf_ptr * 8;
        return byteBits + 32 - bit_left;
    }
}
}
}

View File

@@ -42,9 +42,18 @@ namespace CUETools.FLACCL.cmd
Console.WriteLine(" --verify Verify during encoding");
Console.WriteLine(" --no-md5 Don't compute MD5 hash");
Console.WriteLine(" --no-seektable Don't generate a seektable");
Console.WriteLine(" --slow-gpu Some encoding stages are done on CPU");
Console.WriteLine(" --cpu-threads Use additional CPU threads");
Console.WriteLine();
Console.WriteLine("OpenCL Options:");
Console.WriteLine();
Console.WriteLine(" --opencl-type <X> CPU or GPU, default GPU");
Console.WriteLine(" --opencl-platform '' 'ATI Stream', 'NVIDIA Cuda', 'Intel OpenCL' etc");
Console.WriteLine(" --group-size # Set GPU workgroup size (64,128,256)");
Console.WriteLine(" --task-size # Set number of frames per GPU call, default 32");
Console.WriteLine(" --slow-gpu Some encoding stages are done on CPU");
Console.WriteLine(" --do-rice Experimental mode, not recommended");
Console.WriteLine(" --define <X> <Y> OpenCL preprocessor definition");
Console.WriteLine();
Console.WriteLine("Advanced Options:");
Console.WriteLine();
Console.WriteLine(" -b # Block size");
@@ -113,6 +122,8 @@ namespace CUETools.FLACCL.cmd
}
else if (args[arg] == "--group-size" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
settings.GroupSize = intarg;
else if (args[arg] == "--task-size" && ++arg < args.Length && int.TryParse(args[arg], out intarg))
settings.TaskSize = intarg;
else if (args[arg] == "--define" && arg + 2 < args.Length)
settings.Defines += "#define " + args[++arg] + " " + args[++arg] + "\n";
else if (args[arg] == "--opencl-platform" && ++arg < args.Length)