utilize multi core CPUs

This commit is contained in:
chudov
2009-12-24 16:04:47 +00:00
parent 8f3b3bef9f
commit a6fcab4d9f
2 changed files with 270 additions and 122 deletions

View File

@@ -21,6 +21,7 @@ using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.IO; using System.IO;
using System.Security.Cryptography; using System.Security.Cryptography;
using System.Threading;
using System.Text; using System.Text;
using CUETools.Codecs; using CUETools.Codecs;
using CUETools.Codecs.FLAKE; using CUETools.Codecs.FLAKE;
@@ -57,10 +58,8 @@ namespace CUETools.Codecs.FlaCuda
// this can be used to allocate memory for output // this can be used to allocate memory for output
int max_frame_size; int max_frame_size;
byte[] frame_buffer = null;
BitWriter frame_writer = null;
int frame_count = 0; int frame_count = 0;
int frame_pos = 0;
long first_frame_offset = 0; long first_frame_offset = 0;
@@ -70,7 +69,6 @@ namespace CUETools.Codecs.FlaCuda
// allocated by flake_encode_init and freed by flake_encode_close // allocated by flake_encode_init and freed by flake_encode_close
byte[] header; byte[] header;
int[] residualBuffer;
float[] windowBuffer; float[] windowBuffer;
byte[] md5_buffer; byte[] md5_buffer;
int samplesInBuffer = 0; int samplesInBuffer = 0;
@@ -85,8 +83,6 @@ namespace CUETools.Codecs.FlaCuda
Crc16 crc16; Crc16 crc16;
MD5 md5; MD5 md5;
FlakeReader verify;
SeekPoint[] seek_table; SeekPoint[] seek_table;
int seek_table_offset = -1; int seek_table_offset = -1;
@@ -95,11 +91,15 @@ namespace CUETools.Codecs.FlaCuda
CUDA cuda; CUDA cuda;
FlaCudaTask task1; FlaCudaTask task1;
FlaCudaTask task2; FlaCudaTask task2;
FlaCudaTask[] cpu_tasks;
int oldest_cpu_task = 0;
CUdeviceptr cudaWindow; CUdeviceptr cudaWindow;
bool encode_on_cpu = false; bool encode_on_cpu = false;
int cpu_threads = 0;
bool do_lattice = false; bool do_lattice = false;
public const int MAX_BLOCKSIZE = 4096 * 16; public const int MAX_BLOCKSIZE = 4096 * 16;
@@ -123,7 +123,6 @@ namespace CUETools.Codecs.FlaCuda
_path = path; _path = path;
_IO = IO; _IO = IO;
residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)];
windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * lpc.MAX_LPC_WINDOWS]; windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * lpc.MAX_LPC_WINDOWS];
md5_buffer = new byte[FlaCudaWriter.MAX_BLOCKSIZE * channels * bits_per_sample / 8]; md5_buffer = new byte[FlaCudaWriter.MAX_BLOCKSIZE * channels * bits_per_sample / 8];
@@ -169,6 +168,20 @@ namespace CUETools.Codecs.FlaCuda
} }
} }
public int CPUThreads
{
get
{
return cpu_threads;
}
set
{
if (value < 0 || value > 16)
throw new Exception("cpu_threads must be between 0..16");
cpu_threads = value;
}
}
public bool GPUOnly public bool GPUOnly
{ {
get get
@@ -213,8 +226,23 @@ namespace CUETools.Codecs.FlaCuda
} }
if (task2.frameCount > 0) if (task2.frameCount > 0)
{ {
if (cpu_tasks != null)
{
for (int i = 0; i < cpu_tasks.Length; i++)
{
wait_for_cpu_task();
FlaCudaTask task = cpu_tasks[oldest_cpu_task];
oldest_cpu_task = (oldest_cpu_task + 1) % cpu_tasks.Length;
if (task.frameCount > 0)
{
write_result(task);
task.frameCount = 0;
}
}
}
cuda.SynchronizeStream(task2.stream); cuda.SynchronizeStream(task2.stream);
process_result(task2); process_result(task2);
write_result(task2);
task2.frameCount = 0; task2.frameCount = 0;
} }
@@ -231,7 +259,7 @@ namespace CUETools.Codecs.FlaCuda
if (md5 != null) if (md5 != null)
{ {
md5.TransformFinalBlock(frame_buffer, 0, 0); md5.TransformFinalBlock(new byte[] { 0 }, 0, 0);
_IO.Position = 26; _IO.Position = 26;
_IO.Write(md5.Hash, 0, md5.Hash.Length); _IO.Write(md5.Hash, 0, md5.Hash.Length);
} }
@@ -248,6 +276,9 @@ namespace CUETools.Codecs.FlaCuda
cuda.Free(cudaWindow); cuda.Free(cudaWindow);
task1.Dispose(); task1.Dispose();
task2.Dispose(); task2.Dispose();
if (cpu_tasks != null)
foreach (FlaCudaTask task in cpu_tasks)
task.Dispose();
cuda.UnloadModule(); cuda.UnloadModule();
cuda.DestroyContext(); cuda.DestroyContext();
cuda.Dispose(); cuda.Dispose();
@@ -270,6 +301,9 @@ namespace CUETools.Codecs.FlaCuda
cuda.Free(cudaWindow); cuda.Free(cudaWindow);
task1.Dispose(); task1.Dispose();
task2.Dispose(); task2.Dispose();
if (cpu_tasks != null)
foreach (FlaCudaTask task in cpu_tasks)
task.Dispose();
cuda.UnloadModule(); cuda.UnloadModule();
cuda.DestroyContext(); cuda.DestroyContext();
cuda.Dispose(); cuda.Dispose();
@@ -714,114 +748,114 @@ namespace CUETools.Codecs.FlaCuda
return porder; return porder;
} }
unsafe void output_frame_header(FlacFrame frame, BitWriter bitwriter) unsafe void output_frame_header(FlacFrame frame)
{ {
bitwriter.writebits(15, 0x7FFC); frame.writer.writebits(15, 0x7FFC);
bitwriter.writebits(1, eparams.variable_block_size > 0 ? 1 : 0); frame.writer.writebits(1, eparams.variable_block_size > 0 ? 1 : 0);
bitwriter.writebits(4, frame.bs_code0); frame.writer.writebits(4, frame.bs_code0);
bitwriter.writebits(4, sr_code0); frame.writer.writebits(4, sr_code0);
if (frame.ch_mode == ChannelMode.NotStereo) if (frame.ch_mode == ChannelMode.NotStereo)
bitwriter.writebits(4, ch_code); frame.writer.writebits(4, ch_code);
else else
bitwriter.writebits(4, (int) frame.ch_mode); frame.writer.writebits(4, (int)frame.ch_mode);
bitwriter.writebits(3, bps_code); frame.writer.writebits(3, bps_code);
bitwriter.writebits(1, 0); frame.writer.writebits(1, 0);
bitwriter.write_utf8(frame_count); frame.writer.write_utf8(frame.frame_number);
// custom block size // custom block size
if (frame.bs_code1 >= 0) if (frame.bs_code1 >= 0)
{ {
if (frame.bs_code1 < 256) if (frame.bs_code1 < 256)
bitwriter.writebits(8, frame.bs_code1); frame.writer.writebits(8, frame.bs_code1);
else else
bitwriter.writebits(16, frame.bs_code1); frame.writer.writebits(16, frame.bs_code1);
} }
// custom sample rate // custom sample rate
if (sr_code1 > 0) if (sr_code1 > 0)
{ {
if (sr_code1 < 256) if (sr_code1 < 256)
bitwriter.writebits(8, sr_code1); frame.writer.writebits(8, sr_code1);
else else
bitwriter.writebits(16, sr_code1); frame.writer.writebits(16, sr_code1);
} }
// CRC-8 of frame header // CRC-8 of frame header
bitwriter.flush(); frame.writer.flush();
byte crc = crc8.ComputeChecksum(frame_buffer, 0, bitwriter.Length); byte crc = crc8.ComputeChecksum(frame.writer.Buffer, frame.writer_offset, frame.writer.Length - frame.writer_offset);
bitwriter.writebits(8, crc); frame.writer.writebits(8, crc);
} }
unsafe void output_residual(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) unsafe void output_residual(FlacFrame frame, FlacSubframeInfo sub)
{ {
// rice-encoded block // rice-encoded block
bitwriter.writebits(2, 0); frame.writer.writebits(2, 0);
// partition order // partition order
int porder = sub.best.rc.porder; int porder = sub.best.rc.porder;
int psize = frame.blocksize >> porder; int psize = frame.blocksize >> porder;
//assert(porder >= 0); //assert(porder >= 0);
bitwriter.writebits(4, porder); frame.writer.writebits(4, porder);
int res_cnt = psize - sub.best.order; int res_cnt = psize - sub.best.order;
// residual // residual
int j = sub.best.order; int j = sub.best.order;
fixed (byte* fixbuf = &frame_buffer[0]) fixed (byte* fixbuf = frame.writer.Buffer)
for (int p = 0; p < (1 << porder); p++) for (int p = 0; p < (1 << porder); p++)
{ {
int k = sub.best.rc.rparams[p]; int k = sub.best.rc.rparams[p];
bitwriter.writebits(4, k); frame.writer.writebits(4, k);
if (p == 1) res_cnt = psize; if (p == 1) res_cnt = psize;
int cnt = Math.Min(res_cnt, frame.blocksize - j); int cnt = Math.Min(res_cnt, frame.blocksize - j);
bitwriter.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt); frame.writer.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt);
j += cnt; j += cnt;
} }
} }
unsafe void unsafe void
output_subframe_constant(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) output_subframe_constant(FlacFrame frame, FlacSubframeInfo sub)
{ {
bitwriter.writebits_signed(sub.obits, sub.samples[0]); frame.writer.writebits_signed(sub.obits, sub.samples[0]);
} }
unsafe void unsafe void
output_subframe_verbatim(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) output_subframe_verbatim(FlacFrame frame, FlacSubframeInfo sub)
{ {
int n = frame.blocksize; int n = frame.blocksize;
for (int i = 0; i < n; i++) for (int i = 0; i < n; i++)
bitwriter.writebits_signed(sub.obits, sub.samples[i]); frame.writer.writebits_signed(sub.obits, sub.samples[i]);
// Don't use residual here, because we don't copy samples to residual for verbatim frames. // Don't use residual here, because we don't copy samples to residual for verbatim frames.
} }
unsafe void unsafe void
output_subframe_fixed(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) output_subframe_fixed(FlacFrame frame, FlacSubframeInfo sub)
{ {
// warm-up samples // warm-up samples
for (int i = 0; i < sub.best.order; i++) for (int i = 0; i < sub.best.order; i++)
bitwriter.writebits_signed(sub.obits, sub.samples[i]); frame.writer.writebits_signed(sub.obits, sub.samples[i]);
// residual // residual
output_residual(frame, bitwriter, sub); output_residual(frame, sub);
} }
unsafe void unsafe void
output_subframe_lpc(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) output_subframe_lpc(FlacFrame frame, FlacSubframeInfo sub)
{ {
// warm-up samples // warm-up samples
for (int i = 0; i < sub.best.order; i++) for (int i = 0; i < sub.best.order; i++)
bitwriter.writebits_signed(sub.obits, sub.samples[i]); frame.writer.writebits_signed(sub.obits, sub.samples[i]);
// LPC coefficients // LPC coefficients
bitwriter.writebits(4, sub.best.cbits - 1); frame.writer.writebits(4, sub.best.cbits - 1);
bitwriter.writebits_signed(5, sub.best.shift); frame.writer.writebits_signed(5, sub.best.shift);
for (int i = 0; i < sub.best.order; i++) for (int i = 0; i < sub.best.order; i++)
bitwriter.writebits_signed(sub.best.cbits, sub.best.coefs[i]); frame.writer.writebits_signed(sub.best.cbits, sub.best.coefs[i]);
// residual // residual
output_residual(frame, bitwriter, sub); output_residual(frame, sub);
} }
unsafe void output_subframes(FlacFrame frame, BitWriter bitwriter) unsafe void output_subframes(FlacFrame frame)
{ {
for (int ch = 0; ch < channels; ch++) for (int ch = 0; ch < channels; ch++)
{ {
@@ -832,11 +866,11 @@ namespace CUETools.Codecs.FlaCuda
type_code |= sub.best.order; type_code |= sub.best.order;
if (sub.best.type == SubframeType.LPC) if (sub.best.type == SubframeType.LPC)
type_code |= sub.best.order - 1; type_code |= sub.best.order - 1;
bitwriter.writebits(1, 0); frame.writer.writebits(1, 0);
bitwriter.writebits(6, type_code); frame.writer.writebits(6, type_code);
bitwriter.writebits(1, sub.wbits != 0 ? 1 : 0); frame.writer.writebits(1, sub.wbits != 0 ? 1 : 0);
if (sub.wbits > 0) if (sub.wbits > 0)
bitwriter.writebits((int)sub.wbits, 1); frame.writer.writebits((int)sub.wbits, 1);
//if (frame_writer.Length >= frame_buffer.Length) //if (frame_writer.Length >= frame_buffer.Length)
// throw new Exception("buffer overflow"); // throw new Exception("buffer overflow");
@@ -845,16 +879,16 @@ namespace CUETools.Codecs.FlaCuda
switch (sub.best.type) switch (sub.best.type)
{ {
case SubframeType.Constant: case SubframeType.Constant:
output_subframe_constant(frame, bitwriter, sub); output_subframe_constant(frame, sub);
break; break;
case SubframeType.Verbatim: case SubframeType.Verbatim:
output_subframe_verbatim(frame, bitwriter, sub); output_subframe_verbatim(frame, sub);
break; break;
case SubframeType.Fixed: case SubframeType.Fixed:
output_subframe_fixed(frame, bitwriter, sub); output_subframe_fixed(frame, sub);
break; break;
case SubframeType.LPC: case SubframeType.LPC:
output_subframe_lpc(frame, bitwriter, sub); output_subframe_lpc(frame, sub);
break; break;
} }
//if (frame_writer.Length >= frame_buffer.Length) //if (frame_writer.Length >= frame_buffer.Length)
@@ -862,12 +896,12 @@ namespace CUETools.Codecs.FlaCuda
} }
} }
void output_frame_footer(BitWriter bitwriter) void output_frame_footer(FlacFrame frame)
{ {
bitwriter.flush(); frame.writer.flush();
ushort crc = crc16.ComputeChecksum(frame_buffer, 0, bitwriter.Length); ushort crc = crc16.ComputeChecksum(frame.writer.Buffer, frame.writer_offset, frame.writer.Length - frame.writer_offset);
bitwriter.writebits(16, crc); frame.writer.writebits(16, crc);
bitwriter.flush(); frame.writer.flush();
} }
unsafe delegate void window_function(float* window, int size); unsafe delegate void window_function(float* window, int size);
@@ -1004,6 +1038,7 @@ namespace CUETools.Codecs.FlaCuda
case SubframeType.Verbatim: case SubframeType.Verbatim:
break; break;
case SubframeType.Fixed: case SubframeType.Fixed:
if (encode_on_cpu)
{ {
encode_residual_fixed(frame.subframes[ch].best.residual, frame.subframes[ch].samples, encode_residual_fixed(frame.subframes[ch].best.residual, frame.subframes[ch].samples,
frame.blocksize, frame.subframes[ch].best.order); frame.blocksize, frame.subframes[ch].best.order);
@@ -1323,39 +1358,33 @@ namespace CUETools.Codecs.FlaCuda
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(FlaCudaSubframeTask) * channels * task.frameCount), task.stream); cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(FlaCudaSubframeTask) * channels * task.frameCount), task.stream);
} }
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task) unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task, int current_frame_number)
{ {
fixed (int* r = residualBuffer) fixed (int* r = task.residualBuffer)
{ {
FlacFrame frame = task.frame; task.frame.InitSize(task.frameSize, eparams.variable_block_size != 0);
frame.InitSize(task.frameSize, eparams.variable_block_size != 0);
for (int ch = 0; ch < channelCount; ch++) for (int ch = 0; ch < channelCount; ch++)
{ {
int* s = ((int*)task.samplesBufferPtr) + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * task.frameSize; int* s = ((int*)task.samplesBufferPtr) + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * task.frameSize;
frame.subframes[ch].Init(s, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, task.frame.subframes[ch].Init(s, r + ch * FlaCudaWriter.MAX_BLOCKSIZE,
bits_per_sample + (doMidside && ch == 3 ? 1U : 0U), 0); bits_per_sample + (doMidside && ch == 3 ? 1U : 0U), 0);
} }
select_best_methods(frame, channelCount, iFrame, task); select_best_methods(task.frame, channelCount, iFrame, task);
encode_residual(frame); encode_residual(task.frame);
frame_writer.Reset(); //task.frame.writer.Reset();
task.frame.frame_number = current_frame_number;
task.frame.writer_offset = task.frame.writer.Length;
output_frame_header(frame, frame_writer); output_frame_header(task.frame);
output_subframes(frame, frame_writer); output_subframes(task.frame);
output_frame_footer(frame_writer); output_frame_footer(task.frame);
if (frame_writer.Length >= frame_buffer.Length) if (task.frame.writer.Length - task.frame.writer_offset >= max_frame_size)
throw new Exception("buffer overflow"); throw new Exception("buffer overflow");
if (frame_buffer != null) return task.frame.writer.Length - task.frame.writer_offset;
{
if (eparams.variable_block_size > 0)
frame_count += frame.blocksize;
else
frame_count++;
}
return frame_writer.Length;
} }
} }
@@ -1367,8 +1396,12 @@ namespace CUETools.Codecs.FlaCuda
task.nResidualTasks = 0; task.nResidualTasks = 0;
task.frameCount = nFrames; task.frameCount = nFrames;
task.frameSize = blocksize; task.frameSize = blocksize;
task.frameNumber = eparams.variable_block_size > 0 ? frame_pos : frame_count;
task.framePos = frame_pos;
frame_count += nFrames;
frame_pos += nFrames * blocksize;
cuda.CopyHostToDeviceAsync(task.cudaSamplesBytes, task.samplesBytesPtr, (uint)(sizeof(short) * channels * blocksize * nFrames), task.stream); cuda.CopyHostToDeviceAsync(task.cudaSamplesBytes, task.samplesBytesPtr, (uint)(sizeof(short) * channels * blocksize * nFrames), task.stream);
if (verify != null) if (task.verify != null)
{ {
int* r = (int*)task.samplesBufferPtr; int* r = (int*)task.samplesBufferPtr;
fixed (int* s = task.verifyBuffer) fixed (int* s = task.verifyBuffer)
@@ -1402,19 +1435,35 @@ namespace CUETools.Codecs.FlaCuda
estimate_residual(task, channelsCount); estimate_residual(task, channelsCount);
} }
unsafe int process_result(FlaCudaTask task) unsafe void process_result(FlaCudaTask task)
{ {
bool doMidside = channels == 2 && eparams.do_midside; bool doMidside = channels == 2 && eparams.do_midside;
int channelCount = doMidside ? 2 * channels : channels; int channelCount = doMidside ? 2 * channels : channels;
int bs = 0; int iSample = 0;
int iByte = 0;
task.frame.writer.Reset();
task.frame.writer_offset = 0;
for (int iFrame = 0; iFrame < task.frameCount; iFrame++) for (int iFrame = 0; iFrame < task.frameCount; iFrame++)
{ {
//if (0 != eparams.variable_block_size && 0 == (task.blocksize & 7) && task.blocksize >= 128) //if (0 != eparams.variable_block_size && 0 == (task.blocksize & 7) && task.blocksize >= 128)
// fs = encode_frame_vbs(); // fs = encode_frame_vbs();
//else //else
int fs = encode_frame(doMidside, channelCount, iFrame, task); int fn = task.frameNumber + (eparams.variable_block_size > 0 ? iSample : iFrame);
bs += task.frameSize; int fs = encode_frame(doMidside, channelCount, iFrame, task, fn);
if (task.verify != null)
{
int decoded = task.verify.DecodeFrame(task.frame.writer.Buffer, task.frame.writer_offset, fs);
if (decoded != fs || task.verify.Remaining != (ulong)task.frameSize)
throw new Exception("validation failed! frame size mismatch");
fixed (int* s = task.verifyBuffer, r = task.verify.Samples)
{
for (int ch = 0; ch < channels; ch++)
if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
throw new Exception("validation failed!");
}
}
if (seek_table != null && _IO.CanSeek) if (seek_table != null && _IO.CanSeek)
{ {
@@ -1422,35 +1471,42 @@ namespace CUETools.Codecs.FlaCuda
{ {
if (seek_table[sp].framesize != 0) if (seek_table[sp].framesize != 0)
continue; continue;
if (seek_table[sp].number > (ulong)_position + (ulong)task.frameSize) if (seek_table[sp].number >= (ulong)(task.framePos + iSample + task.frameSize))
break; break;
if (seek_table[sp].number >= (ulong)_position) if (seek_table[sp].number >= (ulong)(task.framePos + iSample))
{ {
seek_table[sp].number = (ulong)_position; seek_table[sp].number = (ulong)(task.framePos + iSample);
seek_table[sp].offset = (ulong)(_IO.Position - first_frame_offset); seek_table[sp].offset = (ulong)iByte;
seek_table[sp].framesize = (uint)task.frameSize; seek_table[sp].framesize = (uint)task.frameSize;
} }
} }
} }
_position += task.frameSize; //Array.Copy(task.frame.buffer, 0, task.outputBuffer, iByte, fs);
_IO.Write(frame_buffer, 0, fs);
_totalSize += fs;
if (verify != null) iSample += task.frameSize;
{ iByte += fs;
int decoded = verify.DecodeFrame(frame_buffer, 0, fs);
if (decoded != fs || verify.Remaining != (ulong)task.frameSize)
throw new Exception("validation failed! frame size mismatch");
fixed (int* s = task.verifyBuffer, r = verify.Samples)
{
for (int ch = 0; ch < channels; ch++)
if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize))
throw new Exception("validation failed!");
}
}
} }
return bs; task.outputSize = iByte;
if (iByte != task.frame.writer.Length)
throw new Exception("invalid length");
}
unsafe void write_result(FlaCudaTask task)
{
int iSample = task.frameSize * task.frameCount;
if (seek_table != null && _IO.CanSeek)
for (int sp = 0; sp < seek_table.Length; sp++)
{
if (seek_table[sp].number >= (ulong)(task.framePos + iSample))
break;
if (seek_table[sp].number >= (ulong)(task.framePos))
seek_table[sp].offset += (ulong)(_IO.Position - first_frame_offset);
}
_IO.Write(task.outputBuffer, 0, task.outputSize);
_position += iSample;
_totalSize += task.outputSize;
} }
public unsafe void Write(int[,] buff, int pos, int sampleCount) public unsafe void Write(int[,] buff, int pos, int sampleCount)
@@ -1466,15 +1522,23 @@ namespace CUETools.Codecs.FlaCuda
using (StreamReader sr = new StreamReader(cubin)) using (StreamReader sr = new StreamReader(cubin))
cuda.LoadModule(new ASCIIEncoding().GetBytes(sr.ReadToEnd())); cuda.LoadModule(new ASCIIEncoding().GetBytes(sr.ReadToEnd()));
//cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin")); //cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
task1 = new FlaCudaTask(cuda, channelCount);
task2 = new FlaCudaTask(cuda, channelCount);
cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
if (_IO == null) if (_IO == null)
_IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read); _IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
int header_size = flake_encode_init(); int header_size = flake_encode_init();
_IO.Write(header, 0, header_size); _IO.Write(header, 0, header_size);
if (_IO.CanSeek) if (_IO.CanSeek)
first_frame_offset = _IO.Position; first_frame_offset = _IO.Position;
task1 = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, eparams.do_verify);
task2 = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, eparams.do_verify);
if (cpu_threads > 0)
{
cpu_tasks = new FlaCudaTask[cpu_threads];
for (int i = 0; i < cpu_tasks.Length; i++)
cpu_tasks[i] = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, eparams.do_verify);
}
cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
inited = true; inited = true;
} }
int len = sampleCount; int len = sampleCount;
@@ -1499,6 +1563,58 @@ namespace CUETools.Codecs.FlaCuda
} }
} }
public void wait_for_cpu_task()
{
FlaCudaTask task = cpu_tasks[oldest_cpu_task];
if (task.workThread == null)
return;
lock (task)
{
while (!task.done)
Monitor.Wait(task);
}
}
public void cpu_task_thread(object param)
{
FlaCudaTask task = param as FlaCudaTask;
while (true)
{
lock (task)
{
while (task.done)
Monitor.Wait(task);
}
process_result(task);
lock (task)
{
task.done = true;
Monitor.Pulse(task);
}
}
}
public void start_cpu_task()
{
FlaCudaTask task = cpu_tasks[oldest_cpu_task];
if (task.workThread == null)
{
task.done = false;
task.workThread = new Thread(cpu_task_thread);
task.workThread.IsBackground = true;
//task.workThread.Priority = ThreadPriority.BelowNormal;
task.workThread.Start(task);
}
else
{
lock (task)
{
task.done = false;
Monitor.Pulse(task);
}
}
}
public unsafe void do_output_frames(int nFrames) public unsafe void do_output_frames(int nFrames)
{ {
bool doMidside = channels == 2 && eparams.do_midside; bool doMidside = channels == 2 && eparams.do_midside;
@@ -1509,7 +1625,28 @@ namespace CUETools.Codecs.FlaCuda
cuda.SynchronizeStream(task2.stream); cuda.SynchronizeStream(task2.stream);
run_GPU_task(task1); run_GPU_task(task1);
if (task2.frameCount > 0) if (task2.frameCount > 0)
process_result(task2); {
if (cpu_tasks != null)
{
wait_for_cpu_task();
FlaCudaTask ttmp = cpu_tasks[oldest_cpu_task];
cpu_tasks[oldest_cpu_task] = task2;
task2 = ttmp;
start_cpu_task();
oldest_cpu_task = (oldest_cpu_task + 1) % cpu_tasks.Length;
if (task2.frameCount > 0)
write_result(task2);
}
else
{
process_result(task2);
write_result(task2);
}
}
int bs = eparams.block_size * nFrames; int bs = eparams.block_size * nFrames;
if (bs < samplesInBuffer) if (bs < samplesInBuffer)
{ {
@@ -1759,15 +1896,6 @@ namespace CUETools.Codecs.FlaCuda
if (_IO.CanSeek && eparams.do_md5) if (_IO.CanSeek && eparams.do_md5)
md5 = new MD5CryptoServiceProvider(); md5 = new MD5CryptoServiceProvider();
if (eparams.do_verify)
{
verify = new FlakeReader(channels, bits_per_sample);
verify.DoCRC = false;
}
frame_buffer = new byte[max_frame_size + 1];
frame_writer = new BitWriter(frame_buffer, 0, max_frame_size + 1);
return header_len; return header_len;
} }
} }
@@ -1939,7 +2067,7 @@ namespace CUETools.Codecs.FlaCuda
case 4: case 4:
min_fixed_order = 2; min_fixed_order = 2;
max_fixed_order = 2; max_fixed_order = 2;
orders_per_window = 4; orders_per_window = 3;
max_prediction_order = 8; max_prediction_order = 8;
max_partition_order = 4; max_partition_order = 4;
break; break;
@@ -2052,9 +2180,14 @@ namespace CUETools.Codecs.FlaCuda
public IntPtr residualTasksPtr = IntPtr.Zero; public IntPtr residualTasksPtr = IntPtr.Zero;
public IntPtr bestResidualTasksPtr = IntPtr.Zero; public IntPtr bestResidualTasksPtr = IntPtr.Zero;
public CUstream stream; public CUstream stream;
public int[] residualBuffer;
public int[] verifyBuffer; public int[] verifyBuffer;
public byte[] outputBuffer;
public int outputSize = 0;
public int frameSize = 0; public int frameSize = 0;
public int frameCount = 0; public int frameCount = 0;
public int frameNumber = 0;
public int framePos = 0;
public FlacFrame frame; public FlacFrame frame;
public int residualTasksLen; public int residualTasksLen;
public int bestResidualTasksLen; public int bestResidualTasksLen;
@@ -2065,7 +2198,12 @@ namespace CUETools.Codecs.FlaCuda
public int nAutocorTasksPerChannel = 0; public int nAutocorTasksPerChannel = 0;
public int max_porder = 0; public int max_porder = 0;
unsafe public FlaCudaTask(CUDA _cuda, int channelCount) public FlakeReader verify;
public Thread workThread = null;
public bool done = false;
unsafe public FlaCudaTask(CUDA _cuda, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify)
{ {
cuda = _cuda; cuda = _cuda;
@@ -2136,7 +2274,17 @@ namespace CUETools.Codecs.FlaCuda
stream = cuda.CreateStream(); stream = cuda.CreateStream();
verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; // should be channels, not channelCount. And should null if not doing verify! verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; // should be channels, not channelCount. And should null if not doing verify!
//residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * Math.Max(10, channelCount + 1)];
residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)];
outputBuffer = new byte[max_frame_size * FlaCudaWriter.maxFrames + 1];
frame = new FlacFrame(channelCount); frame = new FlacFrame(channelCount);
frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length);
if (do_verify)
{
verify = new FlakeReader(channels, bits_per_sample);
verify.DoCRC = false;
}
} }
public void Dispose() public void Dispose()

View File

@@ -810,7 +810,7 @@ code {
name = cudaEstimateResidual8 name = cudaEstimateResidual8
lmem = 0 lmem = 0
smem = 3760 smem = 3760
reg = 14 reg = 13
bar = 1 bar = 1
const { const {
segname = const segname = const
@@ -875,11 +875,11 @@ code {
0x00020611 0xc0000780 0x00070a09 0xc0000780 0x00020611 0xc0000780 0x00070a09 0xc0000780
0xd8154809 0x20000780 0x1900e204 0x1900e000 0xd8154809 0x20000780 0x1900e204 0x1900e000
0x4c01c205 0x00218780 0x6c00c031 0x80204780 0x4c01c205 0x00218780 0x6c00c031 0x80204780
0x1900e62c 0x1900e410 0x1900ea04 0x1900e800 0x1900e62c 0x1900e410 0x1900e804 0x1900ea00
0x1900ee0c 0x1900ec08 0x4c0bc62d 0x00218780 0x1900ee0c 0x1900ec08 0x4c0bc62d 0x00218780
0xd4115809 0x20000780 0x6c04c42d 0x8022c780 0xd4115809 0x20000780 0x6c04c42d 0x8022c780
0x1900e010 0x4d41ea34 0x20001805 0x0402c780 0x1900e010 0x200b982c 0x6c01c805 0x8022c780
0x6c00c801 0x80234780 0x4d43ee0c 0x20008200 0x4c03ce0d 0x00218780 0x6c00ca01 0x80204780
0x6c02cc05 0x8020c780 0x20000001 0x04004780 0x6c02cc05 0x8020c780 0x20000001 0x04004780
0x30040001 0xec000780 0x2040c001 0x04200784 0x30040001 0xec000780 0x2040c001 0x04200784
0x301f0005 0xec100780 0x30010001 0xc4100780 0x301f0005 0xec100780 0x30010001 0xc4100780