mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
optimizations
This commit is contained in:
@@ -59,6 +59,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
int max_frame_size;
|
int max_frame_size;
|
||||||
|
|
||||||
byte[] frame_buffer = null;
|
byte[] frame_buffer = null;
|
||||||
|
BitWriter frame_writer = null;
|
||||||
|
|
||||||
int frame_count = 0;
|
int frame_count = 0;
|
||||||
|
|
||||||
@@ -70,7 +71,6 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
// allocated by flake_encode_init and freed by flake_encode_close
|
// allocated by flake_encode_init and freed by flake_encode_close
|
||||||
byte[] header;
|
byte[] header;
|
||||||
|
|
||||||
int[] verifyBuffer;
|
|
||||||
int[] residualBuffer;
|
int[] residualBuffer;
|
||||||
float[] windowBuffer;
|
float[] windowBuffer;
|
||||||
byte[] md5_buffer;
|
byte[] md5_buffer;
|
||||||
@@ -85,7 +85,6 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
Crc16 crc16;
|
Crc16 crc16;
|
||||||
MD5 md5;
|
MD5 md5;
|
||||||
|
|
||||||
FlacFrame _frame;
|
|
||||||
FlakeReader verify;
|
FlakeReader verify;
|
||||||
|
|
||||||
SeekPoint[] seek_table;
|
SeekPoint[] seek_table;
|
||||||
@@ -94,31 +93,18 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
bool inited = false;
|
bool inited = false;
|
||||||
|
|
||||||
CUDA cuda;
|
CUDA cuda;
|
||||||
CUfunction cudaComputeAutocor;
|
FlaCudaTask task1;
|
||||||
CUfunction cudaComputeLPC;
|
FlaCudaTask task2;
|
||||||
CUfunction cudaEstimateResidual;
|
|
||||||
CUfunction cudaSumResidualChunks;
|
|
||||||
CUfunction cudaSumResidual;
|
|
||||||
CUfunction cudaEncodeResidual;
|
|
||||||
CUdeviceptr cudaSamples;
|
|
||||||
CUdeviceptr cudaWindow;
|
CUdeviceptr cudaWindow;
|
||||||
CUdeviceptr cudaAutocorTasks;
|
|
||||||
CUdeviceptr cudaAutocorOutput;
|
|
||||||
CUdeviceptr cudaResidualTasks;
|
|
||||||
CUdeviceptr cudaResidualOutput;
|
|
||||||
IntPtr samplesBufferPtr = IntPtr.Zero;
|
|
||||||
IntPtr autocorTasksPtr = IntPtr.Zero;
|
|
||||||
IntPtr residualTasksPtr = IntPtr.Zero;
|
|
||||||
CUstream cudaStream;
|
|
||||||
CUstream cudaStream1;
|
|
||||||
|
|
||||||
int nResidualTasks = 0;
|
int nResidualTasks = 0;
|
||||||
int nAutocorTasks = 0;
|
int nAutocorTasks = 0;
|
||||||
int maxFrames = 8;
|
|
||||||
|
|
||||||
const int MAX_BLOCKSIZE = 4608 * 4;
|
public const int MAX_BLOCKSIZE = 4608 * 4;
|
||||||
const int maxResidualParts = (MAX_BLOCKSIZE + 32 * 3) / (32 * 3);
|
internal const int maxFrames = 8;
|
||||||
const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32);
|
internal const int maxResidualParts = (MAX_BLOCKSIZE + 32 * 3) / (32 * 3);
|
||||||
|
internal const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32);
|
||||||
|
|
||||||
public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
|
public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
|
||||||
{
|
{
|
||||||
@@ -145,7 +131,6 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
|
|
||||||
crc8 = new Crc8();
|
crc8 = new Crc8();
|
||||||
crc16 = new Crc16();
|
crc16 = new Crc16();
|
||||||
_frame = new FlacFrame(channels * 2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int TotalSize
|
public int TotalSize
|
||||||
@@ -192,11 +177,12 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
{
|
{
|
||||||
if (inited)
|
if (inited)
|
||||||
{
|
{
|
||||||
while (samplesInBuffer > 0)
|
if (samplesInBuffer > 0)
|
||||||
{
|
{
|
||||||
eparams.block_size = samplesInBuffer;
|
eparams.block_size = samplesInBuffer;
|
||||||
output_frames();
|
output_frames(1);
|
||||||
}
|
}
|
||||||
|
samplesInBuffer = 0;
|
||||||
|
|
||||||
if (_IO.CanSeek)
|
if (_IO.CanSeek)
|
||||||
{
|
{
|
||||||
@@ -226,16 +212,8 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
_IO.Close();
|
_IO.Close();
|
||||||
|
|
||||||
cuda.Free(cudaWindow);
|
cuda.Free(cudaWindow);
|
||||||
cuda.Free(cudaSamples);
|
task1.Dispose();
|
||||||
cuda.Free(cudaAutocorTasks);
|
task2.Dispose();
|
||||||
cuda.Free(cudaAutocorOutput);
|
|
||||||
cuda.Free(cudaResidualTasks);
|
|
||||||
cuda.Free(cudaResidualOutput);
|
|
||||||
CUDADriver.cuMemFreeHost(samplesBufferPtr);
|
|
||||||
CUDADriver.cuMemFreeHost(residualTasksPtr);
|
|
||||||
CUDADriver.cuMemFreeHost(autocorTasksPtr);
|
|
||||||
cuda.DestroyStream(cudaStream);
|
|
||||||
cuda.DestroyStream(cudaStream1);
|
|
||||||
cuda.Dispose();
|
cuda.Dispose();
|
||||||
inited = false;
|
inited = false;
|
||||||
}
|
}
|
||||||
@@ -258,16 +236,8 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
{
|
{
|
||||||
_IO.Close();
|
_IO.Close();
|
||||||
cuda.Free(cudaWindow);
|
cuda.Free(cudaWindow);
|
||||||
cuda.Free(cudaSamples);
|
task1.Dispose();
|
||||||
cuda.Free(cudaAutocorTasks);
|
task2.Dispose();
|
||||||
cuda.Free(cudaAutocorOutput);
|
|
||||||
cuda.Free(cudaResidualTasks);
|
|
||||||
cuda.Free(cudaResidualOutput);
|
|
||||||
CUDADriver.cuMemFreeHost(samplesBufferPtr);
|
|
||||||
CUDADriver.cuMemFreeHost(residualTasksPtr);
|
|
||||||
CUDADriver.cuMemFreeHost(autocorTasksPtr);
|
|
||||||
cuda.DestroyStream(cudaStream);
|
|
||||||
cuda.DestroyStream(cudaStream1);
|
|
||||||
cuda.Dispose();
|
cuda.Dispose();
|
||||||
inited = false;
|
inited = false;
|
||||||
}
|
}
|
||||||
@@ -450,17 +420,18 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
/// <param name="samples"></param>
|
/// <param name="samples"></param>
|
||||||
/// <param name="pos"></param>
|
/// <param name="pos"></param>
|
||||||
/// <param name="block"></param>
|
/// <param name="block"></param>
|
||||||
unsafe void copy_samples(int[,] samples, int pos, int block)
|
unsafe void copy_samples(int[,] samples, int pos, int block, FlaCudaTask task)
|
||||||
{
|
{
|
||||||
int* fsamples = (int*)samplesBufferPtr;
|
int* s = ((int*)task.samplesBufferPtr) + samplesInBuffer;
|
||||||
fixed (int *src = &samples[pos, 0])
|
fixed (int *src = &samples[pos, 0])
|
||||||
{
|
{
|
||||||
if (channels == 2)
|
if (channels == 2 && eparams.do_midside)
|
||||||
AudioSamples.Deinterlace(fsamples + samplesInBuffer, fsamples + FlaCudaWriter.MAX_BLOCKSIZE + samplesInBuffer, src, block);
|
channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE,
|
||||||
|
s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, src, block);
|
||||||
else
|
else
|
||||||
for (int ch = 0; ch < channels; ch++)
|
for (int ch = 0; ch < channels; ch++)
|
||||||
{
|
{
|
||||||
int* psamples = fsamples + ch * FlaCudaWriter.MAX_BLOCKSIZE + samplesInBuffer;
|
int* psamples = s + ch * FlaCudaWriter.MAX_BLOCKSIZE;
|
||||||
for (int i = 0; i < block; i++)
|
for (int i = 0; i < block; i++)
|
||||||
psamples[i] = src[i * channels + ch];
|
psamples[i] = src[i * channels + ch];
|
||||||
}
|
}
|
||||||
@@ -509,20 +480,19 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
return k_opt;
|
return k_opt;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize)
|
unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int* src, int blocksize)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < blocksize; i++)
|
for (int i = 0; i < blocksize; i++)
|
||||||
{
|
{
|
||||||
leftM[i] = (leftS[i] + rightS[i]) >> 1;
|
int l = *(src++);
|
||||||
rightM[i] = leftS[i] - rightS[i];
|
int r = *(src++);
|
||||||
|
leftS[i] = l;
|
||||||
|
rightS[i] = r;
|
||||||
|
leftM[i] = (l + r) >> 1;
|
||||||
|
rightM[i] = l - r;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe void encode_residual_verbatim(int* res, int* smp, uint n)
|
|
||||||
{
|
|
||||||
AudioSamples.MemCpy(res, smp, (int) n);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsafe void encode_residual_fixed(int* res, int* smp, int n, int order)
|
unsafe void encode_residual_fixed(int* res, int* smp, int n, int order)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
@@ -624,9 +594,8 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsafe uint calc_rice_params(ref RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order)
|
static unsafe uint calc_rice_params(ref RiceContext rc, ref RiceContext tmp_rc, int pmin, int pmax, int* data, uint n, uint pred_order)
|
||||||
{
|
{
|
||||||
RiceContext tmp_rc = new RiceContext(), tmp_rc2;
|
|
||||||
uint* udata = stackalloc uint[(int)n];
|
uint* udata = stackalloc uint[(int)n];
|
||||||
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
|
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
|
||||||
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
|
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
|
||||||
@@ -649,7 +618,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
{
|
{
|
||||||
opt_porder = i;
|
opt_porder = i;
|
||||||
opt_bits = bits;
|
opt_bits = bits;
|
||||||
tmp_rc2 = rc;
|
RiceContext tmp_rc2 = rc;
|
||||||
rc = tmp_rc;
|
rc = tmp_rc;
|
||||||
tmp_rc = tmp_rc2;
|
tmp_rc = tmp_rc2;
|
||||||
}
|
}
|
||||||
@@ -666,42 +635,6 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
return porder;
|
return porder;
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsafe uint calc_rice_params_fixed(ref RiceContext rc, int pmin, int pmax,
|
|
||||||
int* data, int n, int pred_order, uint bps)
|
|
||||||
{
|
|
||||||
pmin = get_max_p_order(pmin, n, pred_order);
|
|
||||||
pmax = get_max_p_order(pmax, n, pred_order);
|
|
||||||
uint bits = (uint)pred_order * bps + 6;
|
|
||||||
bits += calc_rice_params(ref rc, pmin, pmax, data, (uint)n, (uint)pred_order);
|
|
||||||
return bits;
|
|
||||||
}
|
|
||||||
|
|
||||||
static unsafe uint calc_rice_params_lpc(ref RiceContext rc, int pmin, int pmax,
|
|
||||||
int* data, int n, int pred_order, uint bps, uint precision)
|
|
||||||
{
|
|
||||||
pmin = get_max_p_order(pmin, n, pred_order);
|
|
||||||
pmax = get_max_p_order(pmax, n, pred_order);
|
|
||||||
uint bits = (uint)pred_order * bps + 4 + 5 + (uint)pred_order * precision + 6;
|
|
||||||
bits += calc_rice_params(ref rc, pmin, pmax, data, (uint)n, (uint)pred_order);
|
|
||||||
return bits;
|
|
||||||
}
|
|
||||||
|
|
||||||
// select LPC precision based on block size
|
|
||||||
static uint get_precision(int blocksize)
|
|
||||||
{
|
|
||||||
uint lpc_precision;
|
|
||||||
if (blocksize <= 192) lpc_precision = 7U;
|
|
||||||
else if (blocksize <= 384) lpc_precision = 8U;
|
|
||||||
else if (blocksize <= 576) lpc_precision = 9U;
|
|
||||||
else if (blocksize <= 1152) lpc_precision = 10U;
|
|
||||||
else if (blocksize <= 2304) lpc_precision = 11U;
|
|
||||||
else if (blocksize <= 4608) lpc_precision = 12U;
|
|
||||||
else if (blocksize <= 8192) lpc_precision = 13U;
|
|
||||||
else if (blocksize <= 16384) lpc_precision = 14U;
|
|
||||||
else lpc_precision = 15;
|
|
||||||
return lpc_precision;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsafe void output_frame_header(FlacFrame frame, BitWriter bitwriter)
|
unsafe void output_frame_header(FlacFrame frame, BitWriter bitwriter)
|
||||||
{
|
{
|
||||||
bitwriter.writebits(15, 0x7FFC);
|
bitwriter.writebits(15, 0x7FFC);
|
||||||
@@ -799,14 +732,10 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
bitwriter.writebits_signed(sub.obits, sub.samples[i]);
|
bitwriter.writebits_signed(sub.obits, sub.samples[i]);
|
||||||
|
|
||||||
// LPC coefficients
|
// LPC coefficients
|
||||||
int cbits = 1;
|
bitwriter.writebits(4, sub.best.cbits - 1);
|
||||||
for (int i = 0; i < sub.best.order; i++)
|
|
||||||
while (cbits < 16 && sub.best.coefs[i] != (sub.best.coefs[i] << (32 - cbits)) >> (32 - cbits))
|
|
||||||
cbits++;
|
|
||||||
bitwriter.writebits(4, cbits - 1);
|
|
||||||
bitwriter.writebits_signed(5, sub.best.shift);
|
bitwriter.writebits_signed(5, sub.best.shift);
|
||||||
for (int i = 0; i < sub.best.order; i++)
|
for (int i = 0; i < sub.best.order; i++)
|
||||||
bitwriter.writebits_signed(cbits, sub.best.coefs[i]);
|
bitwriter.writebits_signed(sub.best.cbits, sub.best.coefs[i]);
|
||||||
|
|
||||||
// residual
|
// residual
|
||||||
output_residual(frame, bitwriter, sub);
|
output_residual(frame, bitwriter, sub);
|
||||||
@@ -829,6 +758,9 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
if (sub.wbits > 0)
|
if (sub.wbits > 0)
|
||||||
bitwriter.writebits((int)sub.wbits, 1);
|
bitwriter.writebits((int)sub.wbits, 1);
|
||||||
|
|
||||||
|
//if (frame_writer.Length >= frame_buffer.Length)
|
||||||
|
// throw new Exception("buffer overflow");
|
||||||
|
|
||||||
// subframe
|
// subframe
|
||||||
switch (sub.best.type)
|
switch (sub.best.type)
|
||||||
{
|
{
|
||||||
@@ -845,6 +777,8 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
output_subframe_lpc(frame, bitwriter, sub);
|
output_subframe_lpc(frame, bitwriter, sub);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
//if (frame_writer.Length >= frame_buffer.Length)
|
||||||
|
// throw new Exception("buffer overflow");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -914,10 +848,10 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
_windowcount++;
|
_windowcount++;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe void initialize_autocorTasks(int blocksize, int channelsCount, int max_order, int nFrames)
|
unsafe void initialize_autocorTasks(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
|
||||||
{
|
{
|
||||||
computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)autocorTasksPtr;
|
computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)task.autocorTasksPtr;
|
||||||
encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
|
encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)task.residualTasksPtr;
|
||||||
nAutocorTasks = 0;
|
nAutocorTasks = 0;
|
||||||
nResidualTasks = 0;
|
nResidualTasks = 0;
|
||||||
for (int iFrame = 0; iFrame < nFrames; iFrame++)
|
for (int iFrame = 0; iFrame < nFrames; iFrame++)
|
||||||
@@ -975,9 +909,13 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cuda.CopyHostToDeviceAsync(cudaAutocorTasks, autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), cudaStream);
|
if (sizeof(encodeResidualTaskStruct) * nResidualTasks > task.residualTasksLen)
|
||||||
cuda.CopyHostToDeviceAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
|
throw new Exception("oops");
|
||||||
cuda.SynchronizeStream(cudaStream);
|
if (sizeof(computeAutocorTaskStruct) * nAutocorTasks > task.autocorTasksLen)
|
||||||
|
throw new Exception("oops");
|
||||||
|
cuda.CopyHostToDeviceAsync(task.cudaAutocorTasks, task.autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), task.stream);
|
||||||
|
cuda.CopyHostToDeviceAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), task.stream);
|
||||||
|
task.blocksize = blocksize;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe void encode_residual(FlacFrame frame)
|
unsafe void encode_residual(FlacFrame frame)
|
||||||
@@ -991,11 +929,15 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
case SubframeType.Verbatim:
|
case SubframeType.Verbatim:
|
||||||
break;
|
break;
|
||||||
case SubframeType.Fixed:
|
case SubframeType.Fixed:
|
||||||
encode_residual_fixed(frame.subframes[ch].best.residual, frame.subframes[ch].samples,
|
{
|
||||||
frame.blocksize, frame.subframes[ch].best.order);
|
encode_residual_fixed(frame.subframes[ch].best.residual, frame.subframes[ch].samples,
|
||||||
frame.subframes[ch].best.size = calc_rice_params_fixed(
|
frame.blocksize, frame.subframes[ch].best.order);
|
||||||
ref frame.subframes[ch].best.rc, eparams.min_partition_order, eparams.max_partition_order,
|
|
||||||
frame.subframes[ch].best.residual, frame.blocksize, frame.subframes[ch].best.order, frame.subframes[ch].obits);
|
int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||||
|
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||||
|
uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 6;
|
||||||
|
frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case SubframeType.LPC:
|
case SubframeType.LPC:
|
||||||
fixed (int* coefs = frame.subframes[ch].best.coefs)
|
fixed (int* coefs = frame.subframes[ch].best.coefs)
|
||||||
@@ -1007,18 +949,28 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
|
lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
|
||||||
else
|
else
|
||||||
lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
|
lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
|
||||||
frame.subframes[ch].best.size = calc_rice_params_lpc(
|
|
||||||
ref frame.subframes[ch].best.rc, eparams.min_partition_order, eparams.max_partition_order,
|
int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||||
frame.subframes[ch].best.residual, frame.blocksize, frame.subframes[ch].best.order, frame.subframes[ch].obits, (uint)frame.subframes[ch].best.cbits);
|
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||||
|
uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits + 6;
|
||||||
|
frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if (frame.subframes[ch].best.size > frame.subframes[ch].obits * (uint)frame.blocksize)
|
||||||
|
{
|
||||||
|
#if DEBUG
|
||||||
|
throw new Exception("larger than verbatim");
|
||||||
|
#endif
|
||||||
|
frame.subframes[ch].best.type = SubframeType.Verbatim;
|
||||||
|
frame.subframes[ch].best.size = frame.subframes[ch].obits * (uint)frame.blocksize;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe void select_best_methods(FlacFrame frame, int channelsCount, int max_order, int iFrame)
|
unsafe void select_best_methods(FlacFrame frame, int channelsCount, int max_order, int iFrame, FlaCudaTask task)
|
||||||
{
|
{
|
||||||
encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
|
encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)task.residualTasksPtr;
|
||||||
for (int ch = 0; ch < channelsCount; ch++)
|
for (int ch = 0; ch < channelsCount; ch++)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
@@ -1089,14 +1041,13 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe void estimate_residual(int blocksize, int channelsCount, int max_order, int nFrames)
|
unsafe void estimate_residual(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
|
||||||
{
|
{
|
||||||
if (blocksize <= 4)
|
if (blocksize <= 4)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
compute_autocorellation(blocksize, channelsCount, max_order, nFrames);
|
compute_autocorellation(blocksize, channelsCount, max_order, nFrames, task);
|
||||||
|
|
||||||
uint cbits = get_precision(blocksize) + 1;
|
|
||||||
int threads_y;
|
int threads_y;
|
||||||
if (max_order >= 4 && max_order <= 8)
|
if (max_order >= 4 && max_order <= 8)
|
||||||
threads_y = max_order;
|
threads_y = max_order;
|
||||||
@@ -1118,31 +1069,29 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
if (partCount > maxResidualParts)
|
if (partCount > maxResidualParts)
|
||||||
throw new Exception("invalid combination of block size and LPC order");
|
throw new Exception("invalid combination of block size and LPC order");
|
||||||
|
|
||||||
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 0, (uint)cudaResidualOutput.Pointer);
|
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 0, (uint)task.cudaResidualOutput.Pointer);
|
||||||
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 1, (uint)cudaSamples.Pointer);
|
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 1, (uint)task.cudaSamples.Pointer);
|
||||||
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 2, (uint)cudaResidualTasks.Pointer);
|
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 2, (uint)task.cudaResidualTasks.Pointer);
|
||||||
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 3, (uint)max_order);
|
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 3, (uint)max_order);
|
||||||
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 4, (uint)blocksize);
|
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 4, (uint)blocksize);
|
||||||
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 5, (uint)partSize);
|
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 5, (uint)partSize);
|
||||||
cuda.SetParameterSize(cudaEstimateResidual, sizeof(uint) * 6);
|
cuda.SetParameterSize(task.cudaEstimateResidual, sizeof(uint) * 6);
|
||||||
cuda.SetFunctionBlockShape(cudaEstimateResidual, 32, threads_y, 1);
|
cuda.SetFunctionBlockShape(task.cudaEstimateResidual, 32, threads_y, 1);
|
||||||
|
|
||||||
cuda.SetParameter(cudaSumResidual, 0, (uint)cudaResidualTasks.Pointer);
|
cuda.SetParameter(task.cudaSumResidual, 0, (uint)task.cudaResidualTasks.Pointer);
|
||||||
cuda.SetParameter(cudaSumResidual, sizeof(uint), (uint)cudaResidualOutput.Pointer);
|
cuda.SetParameter(task.cudaSumResidual, sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
|
||||||
cuda.SetParameter(cudaSumResidual, sizeof(uint) * 2, (uint)partSize);
|
cuda.SetParameter(task.cudaSumResidual, sizeof(uint) * 2, (uint)partSize);
|
||||||
cuda.SetParameter(cudaSumResidual, sizeof(uint) * 3, (uint)partCount);
|
cuda.SetParameter(task.cudaSumResidual, sizeof(uint) * 3, (uint)partCount);
|
||||||
cuda.SetParameterSize(cudaSumResidual, sizeof(uint) * 4U);
|
cuda.SetParameterSize(task.cudaSumResidual, sizeof(uint) * 4U);
|
||||||
cuda.SetFunctionBlockShape(cudaSumResidual, 64, 1, 1);
|
cuda.SetFunctionBlockShape(task.cudaSumResidual, 64, 1, 1);
|
||||||
|
|
||||||
// issue work to the GPU
|
// issue work to the GPU
|
||||||
cuda.LaunchAsync(cudaEstimateResidual, partCount, (nResidualTasks / threads_y * nFrames) / maxFrames, cudaStream);
|
cuda.LaunchAsync(task.cudaEstimateResidual, partCount, (nResidualTasks / threads_y * nFrames) / maxFrames, task.stream);
|
||||||
//cuda.LaunchAsync(cudaSumResidualChunks, partCount, nResidualTasks, cudaStream);
|
cuda.LaunchAsync(task.cudaSumResidual, 1, (nResidualTasks * nFrames) / maxFrames, task.stream);
|
||||||
cuda.LaunchAsync(cudaSumResidual, 1, (nResidualTasks * nFrames) / maxFrames, cudaStream);
|
cuda.CopyDeviceToHostAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nResidualTasks * nFrames) / maxFrames)), task.stream);
|
||||||
cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nResidualTasks * nFrames) / maxFrames)), cudaStream);
|
|
||||||
cuda.SynchronizeStream(cudaStream);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe void compute_autocorellation(int blocksize, int channelsCount, int max_order, int nFrames)
|
unsafe void compute_autocorellation(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
|
||||||
{
|
{
|
||||||
int autocorThreads = 256;
|
int autocorThreads = 256;
|
||||||
int partSize = 2 * autocorThreads - max_order;
|
int partSize = 2 * autocorThreads - max_order;
|
||||||
@@ -1155,46 +1104,43 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
if (blocksize <= 4)
|
if (blocksize <= 4)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocorOutput.Pointer);
|
cuda.SetParameter(task.cudaComputeAutocor, 0, (uint)task.cudaAutocorOutput.Pointer);
|
||||||
cuda.SetParameter(cudaComputeAutocor, sizeof(uint), (uint)cudaSamples.Pointer);
|
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint), (uint)task.cudaSamples.Pointer);
|
||||||
cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer);
|
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer);
|
||||||
cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 3, (uint)cudaAutocorTasks.Pointer);
|
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 3, (uint)task.cudaAutocorTasks.Pointer);
|
||||||
cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order);
|
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order);
|
||||||
cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)blocksize);
|
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)blocksize);
|
||||||
cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)partSize);
|
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)partSize);
|
||||||
cuda.SetParameterSize(cudaComputeAutocor, (uint)(sizeof(uint) * 4) + sizeof(uint) * 3);
|
cuda.SetParameterSize(task.cudaComputeAutocor, (uint)(sizeof(uint) * 4) + sizeof(uint) * 3);
|
||||||
cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1);
|
cuda.SetFunctionBlockShape(task.cudaComputeAutocor, autocorThreads, 1, 1);
|
||||||
|
|
||||||
cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaResidualTasks.Pointer);
|
cuda.SetParameter(task.cudaComputeLPC, 0, (uint)task.cudaResidualTasks.Pointer);
|
||||||
cuda.SetParameter(cudaComputeLPC, sizeof(uint), (uint)cudaAutocorOutput.Pointer);
|
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint), (uint)task.cudaAutocorOutput.Pointer);
|
||||||
cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 2, (uint)cudaAutocorTasks.Pointer);
|
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 2, (uint)task.cudaAutocorTasks.Pointer);
|
||||||
cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 3, (uint)max_order);
|
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3, (uint)max_order);
|
||||||
cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)partCount);
|
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)partCount);
|
||||||
cuda.SetParameterSize(cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2);
|
cuda.SetParameterSize(task.cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2);
|
||||||
cuda.SetFunctionBlockShape(cudaComputeLPC, (partCount + 31) & ~31, 1, 1);
|
cuda.SetFunctionBlockShape(task.cudaComputeLPC, (partCount + 31) & ~31, 1, 1);
|
||||||
|
|
||||||
// issue work to the GPU
|
// issue work to the GPU
|
||||||
cuda.CopyHostToDeviceAsync(cudaSamples, samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), cudaStream);
|
cuda.LaunchAsync(task.cudaComputeAutocor, partCount, (nAutocorTasks * nFrames) / maxFrames, task.stream);
|
||||||
cuda.LaunchAsync(cudaComputeAutocor, partCount, (nAutocorTasks * nFrames) / maxFrames, cudaStream);
|
cuda.LaunchAsync(task.cudaComputeLPC, 1, (nAutocorTasks * nFrames) / maxFrames, task.stream);
|
||||||
cuda.LaunchAsync(cudaComputeLPC, 1, (nAutocorTasks * nFrames) / maxFrames, cudaStream);
|
|
||||||
//cuda.SynchronizeStream(cudaStream);
|
|
||||||
//cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame)
|
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task)
|
||||||
{
|
{
|
||||||
fixed (int* r = residualBuffer)
|
fixed (int* r = residualBuffer)
|
||||||
{
|
{
|
||||||
FlacFrame frame = _frame;
|
FlacFrame frame = task.frame;
|
||||||
frame.InitSize(eparams.block_size, eparams.variable_block_size != 0);
|
frame.InitSize(eparams.block_size, eparams.variable_block_size != 0);
|
||||||
for (int ch = 0; ch < channelCount; ch++)
|
for (int ch = 0; ch < channelCount; ch++)
|
||||||
{
|
{
|
||||||
int* s = ((int*)samplesBufferPtr) + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * eparams.block_size;
|
int* s = ((int*)task.samplesBufferPtr) + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * eparams.block_size;
|
||||||
frame.subframes[ch].Init(s, r + ch * FlaCudaWriter.MAX_BLOCKSIZE,
|
frame.subframes[ch].Init(s, r + ch * FlaCudaWriter.MAX_BLOCKSIZE,
|
||||||
bits_per_sample + (doMidside && ch == 3 ? 1U : 0U), 0);// get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize));
|
bits_per_sample + (doMidside && ch == 3 ? 1U : 0U), 0);// get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize));
|
||||||
}
|
}
|
||||||
|
|
||||||
select_best_methods(frame, channelCount, eparams.max_prediction_order, iFrame);
|
select_best_methods(frame, channelCount, eparams.max_prediction_order, iFrame, task);
|
||||||
|
|
||||||
if (doMidside)
|
if (doMidside)
|
||||||
{
|
{
|
||||||
@@ -1206,11 +1152,13 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
|
|
||||||
encode_residual(frame);
|
encode_residual(frame);
|
||||||
|
|
||||||
BitWriter bitwriter = new BitWriter(frame_buffer, 0, max_frame_size);
|
frame_writer.Reset();
|
||||||
|
|
||||||
output_frame_header(frame, bitwriter);
|
output_frame_header(frame, frame_writer);
|
||||||
output_subframes(frame, bitwriter);
|
output_subframes(frame, frame_writer);
|
||||||
output_frame_footer(bitwriter);
|
output_frame_footer(frame_writer);
|
||||||
|
if (frame_writer.Length >= frame_buffer.Length)
|
||||||
|
throw new Exception("buffer overflow");
|
||||||
|
|
||||||
if (frame_buffer != null)
|
if (frame_buffer != null)
|
||||||
{
|
{
|
||||||
@@ -1219,26 +1167,22 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
else
|
else
|
||||||
frame_count++;
|
frame_count++;
|
||||||
}
|
}
|
||||||
return bitwriter.Length;
|
return frame_writer.Length;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe int output_frames()
|
unsafe void send_to_GPU(int nFrames, FlaCudaTask task)
|
||||||
{
|
{
|
||||||
bool doMidside = channels == 2 && eparams.do_midside;
|
bool doMidside = channels == 2 && eparams.do_midside;
|
||||||
int channelCount = doMidside ? 2 * channels : channels;
|
int channelCount = doMidside ? 2 * channels : channels;
|
||||||
int nFrames = Math.Min(samplesInBuffer / eparams.block_size, maxFrames);
|
|
||||||
|
|
||||||
if (nFrames < 1)
|
cuda.CopyHostToDeviceAsync(task.cudaSamples, task.samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount), task.stream);
|
||||||
throw new Exception("oops");
|
}
|
||||||
|
|
||||||
if (verify != null)
|
unsafe void run_GPU_task(int nFrames, FlaCudaTask task)
|
||||||
{
|
{
|
||||||
int* r = (int*)samplesBufferPtr;
|
bool doMidside = channels == 2 && eparams.do_midside;
|
||||||
fixed (int* s = verifyBuffer)
|
int channelCount = doMidside ? 2 * channels : channels;
|
||||||
for (int ch = 0; ch < channels; ch++)
|
|
||||||
AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, samplesInBuffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (eparams.block_size != _windowsize && eparams.block_size > 4)
|
if (eparams.block_size != _windowsize && eparams.block_size > 4)
|
||||||
fixed (float* window = windowBuffer)
|
fixed (float* window = windowBuffer)
|
||||||
@@ -1253,17 +1197,25 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
if (_windowcount == 0)
|
if (_windowcount == 0)
|
||||||
throw new Exception("invalid windowfunction");
|
throw new Exception("invalid windowfunction");
|
||||||
cuda.CopyHostToDevice<float>(cudaWindow, windowBuffer);
|
cuda.CopyHostToDevice<float>(cudaWindow, windowBuffer);
|
||||||
initialize_autocorTasks(eparams.block_size, channelCount, eparams.max_prediction_order, maxFrames);
|
|
||||||
}
|
}
|
||||||
|
if (eparams.block_size != task.blocksize)
|
||||||
|
initialize_autocorTasks(eparams.block_size, channelCount, eparams.max_prediction_order, maxFrames, task);
|
||||||
|
|
||||||
if (doMidside)
|
if (verify != null)
|
||||||
{
|
{
|
||||||
int* s = ((int*)samplesBufferPtr);
|
int* r = (int*)task.samplesBufferPtr;
|
||||||
channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE,
|
fixed (int* s = task.verifyBuffer)
|
||||||
s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size * nFrames);
|
for (int ch = 0; ch < channels; ch++)
|
||||||
|
AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size * nFrames);
|
||||||
}
|
}
|
||||||
|
|
||||||
estimate_residual(eparams.block_size, channelCount, eparams.max_prediction_order, nFrames);
|
estimate_residual(eparams.block_size, channelCount, eparams.max_prediction_order, nFrames, task);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe int process_result(int nFrames, FlaCudaTask task)
|
||||||
|
{
|
||||||
|
bool doMidside = channels == 2 && eparams.do_midside;
|
||||||
|
int channelCount = doMidside ? 2 * channels : channels;
|
||||||
|
|
||||||
int bs = 0;
|
int bs = 0;
|
||||||
for (int iFrame = 0; iFrame < nFrames; iFrame++)
|
for (int iFrame = 0; iFrame < nFrames; iFrame++)
|
||||||
@@ -1271,7 +1223,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
//if (0 != eparams.variable_block_size && 0 == (eparams.block_size & 7) && eparams.block_size >= 128)
|
//if (0 != eparams.variable_block_size && 0 == (eparams.block_size & 7) && eparams.block_size >= 128)
|
||||||
// fs = encode_frame_vbs();
|
// fs = encode_frame_vbs();
|
||||||
//else
|
//else
|
||||||
int fs = encode_frame(doMidside, channelCount, iFrame);
|
int fs = encode_frame(doMidside, channelCount, iFrame, task);
|
||||||
bs += eparams.block_size;
|
bs += eparams.block_size;
|
||||||
|
|
||||||
if (seek_table != null && _IO.CanSeek)
|
if (seek_table != null && _IO.CanSeek)
|
||||||
@@ -1300,7 +1252,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
int decoded = verify.DecodeFrame(frame_buffer, 0, fs);
|
int decoded = verify.DecodeFrame(frame_buffer, 0, fs);
|
||||||
if (decoded != fs || verify.Remaining != (ulong)eparams.block_size)
|
if (decoded != fs || verify.Remaining != (ulong)eparams.block_size)
|
||||||
throw new Exception("validation failed!");
|
throw new Exception("validation failed!");
|
||||||
fixed (int* s = verifyBuffer, r = verify.Samples)
|
fixed (int* s = task.verifyBuffer, r = verify.Samples)
|
||||||
{
|
{
|
||||||
for (int ch = 0; ch < channels; ch++)
|
for (int ch = 0; ch < channels; ch++)
|
||||||
if (AudioSamples.MemCmp(s + iFrame * eparams.block_size + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, eparams.block_size))
|
if (AudioSamples.MemCmp(s + iFrame * eparams.block_size + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, eparams.block_size))
|
||||||
@@ -1308,56 +1260,33 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (bs < samplesInBuffer)
|
|
||||||
{
|
|
||||||
int* s = (int*)samplesBufferPtr;
|
|
||||||
for (int ch = 0; ch < channels; ch++)
|
|
||||||
AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, s + bs + ch * FlaCudaWriter.MAX_BLOCKSIZE, samplesInBuffer - bs);
|
|
||||||
}
|
|
||||||
|
|
||||||
samplesInBuffer -= bs;
|
|
||||||
|
|
||||||
return bs;
|
return bs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsafe void output_frames(int nFrames)
|
||||||
|
{
|
||||||
|
send_to_GPU(nFrames, task1);
|
||||||
|
run_GPU_task(nFrames, task1);
|
||||||
|
cuda.SynchronizeStream(task1.stream);
|
||||||
|
process_result(nFrames, task1);
|
||||||
|
}
|
||||||
|
|
||||||
public unsafe void Write(int[,] buff, int pos, int sampleCount)
|
public unsafe void Write(int[,] buff, int pos, int sampleCount)
|
||||||
{
|
{
|
||||||
|
bool doMidside = channels == 2 && eparams.do_midside;
|
||||||
|
int channelCount = doMidside ? 2 * channels : channels;
|
||||||
|
|
||||||
if (!inited)
|
if (!inited)
|
||||||
{
|
{
|
||||||
cuda = new CUDA(true, InitializationFlags.None);
|
cuda = new CUDA(true, InitializationFlags.None);
|
||||||
cuda.CreateContext(0, CUCtxFlags.BlockingSync);
|
cuda.CreateContext(0, CUCtxFlags.SchedAuto);
|
||||||
using (Stream cubin = GetType().Assembly.GetManifestResourceStream(GetType(), "flacuda.cubin"))
|
using (Stream cubin = GetType().Assembly.GetManifestResourceStream(GetType(), "flacuda.cubin"))
|
||||||
using (StreamReader sr = new StreamReader(cubin))
|
using (StreamReader sr = new StreamReader(cubin))
|
||||||
cuda.LoadModule(new ASCIIEncoding().GetBytes(sr.ReadToEnd()));
|
cuda.LoadModule(new ASCIIEncoding().GetBytes(sr.ReadToEnd()));
|
||||||
//cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
|
//cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
|
||||||
cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
|
task1 = new FlaCudaTask(cuda, channelCount);
|
||||||
cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
|
task2 = new FlaCudaTask(cuda, channelCount);
|
||||||
cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
|
|
||||||
cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
|
|
||||||
cudaSumResidualChunks = cuda.GetModuleFunction("cudaSumResidualChunks");
|
|
||||||
cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
|
|
||||||
cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
|
|
||||||
cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
|
cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
|
||||||
cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * maxFrames));
|
|
||||||
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * maxAutocorParts));
|
|
||||||
cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * (lpc.MAX_LPC_WINDOWS + 1) * maxFrames));
|
|
||||||
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * maxResidualParts));
|
|
||||||
//cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4) * maxResidualParts));
|
|
||||||
CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
|
|
||||||
if (cuErr == CUResult.Success)
|
|
||||||
cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * maxFrames));
|
|
||||||
if (cuErr == CUResult.Success)
|
|
||||||
cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * maxFrames));
|
|
||||||
if (cuErr != CUResult.Success)
|
|
||||||
{
|
|
||||||
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
|
|
||||||
if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
|
|
||||||
if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
|
|
||||||
throw new CUDAException(cuErr);
|
|
||||||
}
|
|
||||||
cudaStream = cuda.CreateStream();
|
|
||||||
cudaStream1 = cuda.CreateStream();
|
|
||||||
if (_IO == null)
|
if (_IO == null)
|
||||||
_IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
|
_IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
|
||||||
int header_size = flake_encode_init();
|
int header_size = flake_encode_init();
|
||||||
@@ -1366,13 +1295,13 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
first_frame_offset = _IO.Position;
|
first_frame_offset = _IO.Position;
|
||||||
inited = true;
|
inited = true;
|
||||||
}
|
}
|
||||||
|
int have_data = 0;
|
||||||
int len = sampleCount;
|
int len = sampleCount;
|
||||||
while (len > 0)
|
while (len > 0)
|
||||||
{
|
{
|
||||||
int block = Math.Min(len, FlaCudaWriter.MAX_BLOCKSIZE - samplesInBuffer);
|
int block = Math.Min(len, Math.Min(FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size * maxFrames) - samplesInBuffer);
|
||||||
|
|
||||||
copy_samples(buff, pos, block);
|
copy_samples(buff, pos, block, task1);
|
||||||
|
|
||||||
if (md5 != null)
|
if (md5 != null)
|
||||||
{
|
{
|
||||||
@@ -1383,14 +1312,43 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
len -= block;
|
len -= block;
|
||||||
pos += block;
|
pos += block;
|
||||||
|
|
||||||
while (samplesInBuffer >= eparams.block_size)
|
int nFrames = samplesInBuffer / eparams.block_size;
|
||||||
output_frames();
|
if (nFrames > 0)
|
||||||
|
{
|
||||||
|
#if DEBUG
|
||||||
|
if (nFrames > maxFrames)
|
||||||
|
throw new Exception("oops");
|
||||||
|
#endif
|
||||||
|
send_to_GPU(nFrames, task1);
|
||||||
|
cuda.SynchronizeStream(task2.stream);
|
||||||
|
run_GPU_task(nFrames, task1);
|
||||||
|
if (have_data > 0)
|
||||||
|
process_result(have_data, task2);
|
||||||
|
int bs = eparams.block_size * nFrames;
|
||||||
|
if (bs < samplesInBuffer)
|
||||||
|
{
|
||||||
|
int* s1 = (int*)task1.samplesBufferPtr;
|
||||||
|
int* s2 = (int*)task2.samplesBufferPtr;
|
||||||
|
for (int ch = 0; ch < channelCount; ch++)
|
||||||
|
AudioSamples.MemCpy(s2 + ch * FlaCudaWriter.MAX_BLOCKSIZE, s1 + bs + ch * FlaCudaWriter.MAX_BLOCKSIZE, samplesInBuffer - bs);
|
||||||
|
}
|
||||||
|
samplesInBuffer -= bs;
|
||||||
|
have_data = nFrames;
|
||||||
|
FlaCudaTask tmp = task1;
|
||||||
|
task1 = task2;
|
||||||
|
task2 = tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (have_data > 0)
|
||||||
|
{
|
||||||
|
cuda.SynchronizeStream(task2.stream);
|
||||||
|
process_result(have_data, task2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public string Path { get { return _path; } }
|
public string Path { get { return _path; } }
|
||||||
|
|
||||||
string vendor_string = "FlaCuda#0.1";
|
string vendor_string = "FlaCuda#0.4";
|
||||||
|
|
||||||
int select_blocksize(int samplerate, int time_ms)
|
int select_blocksize(int samplerate, int time_ms)
|
||||||
{
|
{
|
||||||
@@ -1620,12 +1578,10 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
md5 = new MD5CryptoServiceProvider();
|
md5 = new MD5CryptoServiceProvider();
|
||||||
|
|
||||||
if (eparams.do_verify)
|
if (eparams.do_verify)
|
||||||
{
|
|
||||||
verify = new FlakeReader(channels, bits_per_sample);
|
verify = new FlakeReader(channels, bits_per_sample);
|
||||||
verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channels];
|
|
||||||
}
|
|
||||||
|
|
||||||
frame_buffer = new byte[max_frame_size];
|
frame_buffer = new byte[max_frame_size + 1];
|
||||||
|
frame_writer = new BitWriter(frame_buffer, 0, max_frame_size + 1);
|
||||||
|
|
||||||
return header_len;
|
return header_len;
|
||||||
}
|
}
|
||||||
@@ -1740,9 +1696,8 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
do_midside = false;
|
do_midside = false;
|
||||||
window_function = WindowFunction.Bartlett;
|
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
max_prediction_order = 6;
|
max_prediction_order = 4;
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
do_midside = false;
|
do_midside = false;
|
||||||
@@ -1753,24 +1708,24 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
case 2:
|
case 2:
|
||||||
window_function = WindowFunction.Bartlett;
|
window_function = WindowFunction.Bartlett;
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
max_prediction_order = 4;
|
max_prediction_order = 5;
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
window_function = WindowFunction.Bartlett;
|
window_function = WindowFunction.Bartlett;
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
max_prediction_order = 5;
|
max_prediction_order = 7;
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
window_function = WindowFunction.Bartlett;
|
window_function = WindowFunction.Bartlett;
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
max_prediction_order = 7;
|
max_prediction_order = 8;
|
||||||
break;
|
break;
|
||||||
case 5:
|
case 5:
|
||||||
window_function = WindowFunction.Bartlett;
|
|
||||||
max_prediction_order = 8;
|
max_prediction_order = 8;
|
||||||
break;
|
break;
|
||||||
case 6:
|
case 6:
|
||||||
max_prediction_order = 8;
|
window_function = WindowFunction.Bartlett;
|
||||||
|
max_prediction_order = 12;
|
||||||
break;
|
break;
|
||||||
case 7:
|
case 7:
|
||||||
max_prediction_order = 10;
|
max_prediction_order = 10;
|
||||||
@@ -1810,4 +1765,81 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
public fixed int reserved[11];
|
public fixed int reserved[11];
|
||||||
public fixed int coefs[32];
|
public fixed int coefs[32];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
internal class FlaCudaTask
|
||||||
|
{
|
||||||
|
CUDA cuda;
|
||||||
|
public CUfunction cudaComputeAutocor;
|
||||||
|
public CUfunction cudaComputeLPC;
|
||||||
|
public CUfunction cudaEstimateResidual;
|
||||||
|
//public CUfunction cudaSumResidualChunks;
|
||||||
|
public CUfunction cudaSumResidual;
|
||||||
|
//public CUfunction cudaEncodeResidual;
|
||||||
|
public CUdeviceptr cudaSamples;
|
||||||
|
public CUdeviceptr cudaAutocorTasks;
|
||||||
|
public CUdeviceptr cudaAutocorOutput;
|
||||||
|
public CUdeviceptr cudaResidualTasks;
|
||||||
|
public CUdeviceptr cudaResidualOutput;
|
||||||
|
public IntPtr samplesBufferPtr = IntPtr.Zero;
|
||||||
|
public IntPtr autocorTasksPtr = IntPtr.Zero;
|
||||||
|
public IntPtr residualTasksPtr = IntPtr.Zero;
|
||||||
|
public CUstream stream;
|
||||||
|
public int[] verifyBuffer;
|
||||||
|
public int blocksize = 0;
|
||||||
|
public FlacFrame frame;
|
||||||
|
public int autocorTasksLen;
|
||||||
|
public int residualTasksLen;
|
||||||
|
public int samplesBufferLen;
|
||||||
|
|
||||||
|
unsafe public FlaCudaTask(CUDA _cuda, int channelCount)
|
||||||
|
{
|
||||||
|
cuda = _cuda;
|
||||||
|
|
||||||
|
autocorTasksLen = sizeof(computeAutocorTaskStruct) * channelCount * lpc.MAX_LPC_WINDOWS * FlaCudaWriter.maxFrames;
|
||||||
|
residualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * lpc.MAX_LPC_ORDER * (lpc.MAX_LPC_WINDOWS + 1) * FlaCudaWriter.maxFrames;
|
||||||
|
samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
|
||||||
|
|
||||||
|
cudaSamples = cuda.Allocate((uint)samplesBufferLen);
|
||||||
|
cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen);
|
||||||
|
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FlaCudaWriter.maxAutocorParts));
|
||||||
|
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
|
||||||
|
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * FlaCudaWriter.maxResidualParts));
|
||||||
|
CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen);
|
||||||
|
if (cuErr == CUResult.Success)
|
||||||
|
cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)autocorTasksLen);
|
||||||
|
if (cuErr == CUResult.Success)
|
||||||
|
cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)residualTasksLen);
|
||||||
|
if (cuErr != CUResult.Success)
|
||||||
|
{
|
||||||
|
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
|
||||||
|
if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
|
||||||
|
if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
|
||||||
|
throw new CUDAException(cuErr);
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
|
||||||
|
cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
|
||||||
|
cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
|
||||||
|
cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
|
||||||
|
//cudaSumResidualChunks = cuda.GetModuleFunction("cudaSumResidualChunks");
|
||||||
|
//cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
|
||||||
|
|
||||||
|
stream = cuda.CreateStream();
|
||||||
|
verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; // should be channels, not channelCount. And should null if not doing verify!
|
||||||
|
frame = new FlacFrame(channelCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void Dispose()
|
||||||
|
{
|
||||||
|
cuda.Free(cudaSamples);
|
||||||
|
cuda.Free(cudaAutocorTasks);
|
||||||
|
cuda.Free(cudaAutocorOutput);
|
||||||
|
cuda.Free(cudaResidualTasks);
|
||||||
|
cuda.Free(cudaResidualOutput);
|
||||||
|
CUDADriver.cuMemFreeHost(samplesBufferPtr);
|
||||||
|
CUDADriver.cuMemFreeHost(residualTasksPtr);
|
||||||
|
CUDADriver.cuMemFreeHost(autocorTasksPtr);
|
||||||
|
cuda.DestroyStream(stream);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -81,15 +81,12 @@ extern "C" __global__ void cudaComputeAutocor(
|
|||||||
//if (tid < 256) shared.product[tid] += shared.product[tid + 256]; __syncthreads();
|
//if (tid < 256) shared.product[tid] += shared.product[tid + 256]; __syncthreads();
|
||||||
if (tid < 128) shared.product[tid] += shared.product[tid + 128]; __syncthreads();
|
if (tid < 128) shared.product[tid] += shared.product[tid + 128]; __syncthreads();
|
||||||
if (tid < 64) shared.product[tid] += shared.product[tid + 64]; __syncthreads();
|
if (tid < 64) shared.product[tid] += shared.product[tid + 64]; __syncthreads();
|
||||||
if (tid < 32)
|
if (tid < 32) shared.product[tid] += shared.product[tid + 32]; __syncthreads();
|
||||||
{
|
shared.product[tid] += shared.product[tid + 16];
|
||||||
shared.product[tid] += shared.product[tid + 32];
|
shared.product[tid] += shared.product[tid + 8];
|
||||||
shared.product[tid] += shared.product[tid + 16];
|
shared.product[tid] += shared.product[tid + 4];
|
||||||
shared.product[tid] += shared.product[tid + 8];
|
shared.product[tid] += shared.product[tid + 2];
|
||||||
shared.product[tid] += shared.product[tid + 4];
|
if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1];
|
||||||
shared.product[tid] += shared.product[tid + 2];
|
|
||||||
if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1];
|
|
||||||
}
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
}
|
}
|
||||||
// return results
|
// return results
|
||||||
@@ -167,7 +164,7 @@ extern "C" __global__ void cudaComputeLPC(
|
|||||||
shared.ldr[tid] += (tid < order) * __fmul_rz(reff, shared.ldr[order - 1 - tid]) + (tid == order) * reff;
|
shared.ldr[tid] += (tid < order) * __fmul_rz(reff, shared.ldr[order - 1 - tid]) + (tid == order) * reff;
|
||||||
|
|
||||||
// Quantization
|
// Quantization
|
||||||
int precision = 13;
|
int precision = 13 - (order > 8);
|
||||||
int taskNo = shared.task.residualOffs + order;
|
int taskNo = shared.task.residualOffs + order;
|
||||||
shared.bits[tid] = __mul24((33 - __clz(__float2int_rn(fabs(shared.ldr[tid]) * (1 << 15))) - precision), tid <= order);
|
shared.bits[tid] = __mul24((33 - __clz(__float2int_rn(fabs(shared.ldr[tid]) * (1 << 15))) - precision), tid <= order);
|
||||||
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
|
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ code {
|
|||||||
name = cudaComputeAutocor
|
name = cudaComputeAutocor
|
||||||
lmem = 0
|
lmem = 0
|
||||||
smem = 3264
|
smem = 3264
|
||||||
reg = 9
|
reg = 10
|
||||||
bar = 1
|
bar = 1
|
||||||
const {
|
const {
|
||||||
segname = const
|
segname = const
|
||||||
@@ -47,17 +47,18 @@ code {
|
|||||||
0xc0000a01 0x00000780 0x10039003 0x00000780
|
0xc0000a01 0x00000780 0x10039003 0x00000780
|
||||||
0x1000f801 0x0403c780 0x00020c05 0xc0000782
|
0x1000f801 0x0403c780 0x00020c05 0xc0000782
|
||||||
0x04001601 0xe4200780 0x861ffe03 0x00000000
|
0x04001601 0xe4200780 0x861ffe03 0x00000000
|
||||||
0x307cd1fd 0x6c2047c8 0x10081003 0x00000280
|
0x307cd1fd 0x6c2047c8 0x10085003 0x00000280
|
||||||
0x300209fd 0x6c00c7e8 0x30040dfd 0x6c0187f8
|
0x300209fd 0x6c00c7e8 0x30040dfd 0x6c0187f8
|
||||||
0x308105fd 0x6c40c7c8 0x00000019 0x20000780
|
0x308105fd 0x6c40c7c8 0x00000019 0x20000780
|
||||||
0x2101f011 0x00000003 0x1000f815 0x0403c780
|
0x2101f011 0x00000003 0x1000f815 0x0403c780
|
||||||
0x308205fd 0x6c40c7c8 0x0000001d 0x20000780
|
0x308205fd 0x6c40c7c8 0x0000001d 0x20000780
|
||||||
0x308305fd 0x6c40c7d8 0x20000a21 0x04008780
|
0x308305fd 0x6c40c7c8 0x00000021 0x20000780
|
||||||
0x20009001 0x00000013 0x00020009 0xc0000780
|
0x307c05fd 0x6c0087d8 0x20000a25 0x04008780
|
||||||
0x1800d601 0x0423c780 0x0002100d 0xc0000780
|
0x20009201 0x00000013 0x00020009 0xc0000780
|
||||||
0xc400d621 0x00200780 0x00000609 0xc0000780
|
0x1800d601 0x0423c780 0x0002120d 0xc0000780
|
||||||
0x1c00d601 0x0423c780 0x1000f821 0x0403f280
|
0xc400d625 0x00200780 0x00000609 0xc0000780
|
||||||
0xe800d601 0x00220780 0x10001001 0x0403e280
|
0x1c00d601 0x0423c780 0x1000f825 0x0403f280
|
||||||
|
0xe800d601 0x00224780 0x10001201 0x0403e280
|
||||||
0x08041601 0xe4200780 0x861ffe03 0x00000000
|
0x08041601 0xe4200780 0x861ffe03 0x00000000
|
||||||
0x00000c01 0xa00007c0 0x00000609 0xc0000680
|
0x00000c01 0xa00007c0 0x00000609 0xc0000680
|
||||||
0xd8145811 0x20000680 0xd810580d 0x20000680
|
0xd8145811 0x20000680 0xd810580d 0x20000680
|
||||||
@@ -67,21 +68,22 @@ code {
|
|||||||
0xd8125811 0x20000680 0xd810580d 0x20000680
|
0xd8125811 0x20000680 0xd810580d 0x20000680
|
||||||
0x1000c001 0x0423c684 0xbc00c001 0x00200680
|
0x1000c001 0x0423c684 0xbc00c001 0x00200680
|
||||||
0x08041601 0xe4200680 0x861ffe03 0x00000000
|
0x08041601 0xe4200680 0x861ffe03 0x00000000
|
||||||
0xa007c003 0x00000000 0x1007c003 0x00001100
|
0x00001001 0xa00007c0 0x00000609 0xc0000680
|
||||||
0x00000609 0xc0000780 0xd8115811 0x20000780
|
0xd8115811 0x20000680 0xd810580d 0x20000680
|
||||||
0xd810580d 0x20000780 0x1000c001 0x0423c784
|
0x1000c001 0x0423c684 0xbc00c001 0x00200680
|
||||||
0xbc00c001 0x00200780 0x08041601 0xe4200780
|
0x08041601 0xe4200680 0x861ffe03 0x00000000
|
||||||
|
0x00000609 0xc0000780 0xd810580d 0x20000780
|
||||||
0x1c00e001 0x0423c780 0xbc00c001 0x00200780
|
0x1c00e001 0x0423c780 0xbc00c001 0x00200780
|
||||||
0x08041601 0xe4200780 0x1d00f000 0xbd006000
|
0x08041601 0xe4200780 0x1d00f000 0xbd006000
|
||||||
0x08041601 0xe4200780 0x1d00e800 0xbd006000
|
0x08041601 0xe4200780 0x1d00e800 0xbd006000
|
||||||
0x08041601 0xe4200780 0x1d00e400 0xbd006000
|
0x08041601 0xe4200780 0x1d00e400 0xbd006000
|
||||||
0x08041601 0xe4200780 0x307c05fd 0x6c0147c8
|
0x08041601 0xe4200780 0xa0080003 0x00000000
|
||||||
0x1007c003 0x00000280 0xd010580d 0x20000780
|
0x10080003 0x00001100 0xd010580d 0x20000780
|
||||||
0x1c00c201 0x0423c780 0x00020a09 0xc0000780
|
0x1c00c201 0x0423c780 0x00020a09 0xc0000780
|
||||||
0xbc00c001 0x00200780 0x08061601 0xe4200780
|
0xbc00c001 0x00200780 0x08061601 0xe4200780
|
||||||
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
||||||
0x20018a15 0x00000003 0x30040bfd 0x6c0147c8
|
0x20018a15 0x00000003 0x30040bfd 0x6c0147c8
|
||||||
0x10047003 0x00000280 0x3002d1fd 0x6c2047c8
|
0x10049003 0x00000280 0x3002d1fd 0x6c2047c8
|
||||||
0x30000003 0x00000280 0x10004e01 0x0023c780
|
0x30000003 0x00000280 0x10004e01 0x0023c780
|
||||||
0x60004805 0x00204780 0x2101f001 0x00000003
|
0x60004805 0x00204780 0x2101f001 0x00000003
|
||||||
0x40030011 0x00000780 0x60020211 0x00010780
|
0x40030011 0x00000780 0x60020211 0x00010780
|
||||||
@@ -285,12 +287,12 @@ code {
|
|||||||
segname = const
|
segname = const
|
||||||
segnum = 1
|
segnum = 1
|
||||||
offset = 0
|
offset = 0
|
||||||
bytes = 52
|
bytes = 56
|
||||||
mem {
|
mem {
|
||||||
0x00000003 0x0000001f 0x0000003f 0x00000040
|
0x00000003 0x0000001f 0x0000003f 0x00000040
|
||||||
0x00000001 0x00000020 0x7e800000 0x0000000f
|
0x00000001 0x00000020 0x7e800000 0x00000008
|
||||||
0x00001fff 0xffffe000 0x3e800000 0x0000009e
|
0x0000000c 0x0000000f 0xfffff000 0x00000fff
|
||||||
0x00000008
|
0x3e800000 0x0000009e
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
bincode {
|
bincode {
|
||||||
@@ -345,7 +347,7 @@ code {
|
|||||||
0x213fee11 0x0fffffff 0x1000f815 0x0403c780
|
0x213fee11 0x0fffffff 0x1000f815 0x0403c780
|
||||||
0xd0047005 0x20000780 0xb08601fd 0x605107d8
|
0xd0047005 0x20000780 0xb08601fd 0x605107d8
|
||||||
0x10000005 0x0403c780 0xa400c019 0xe4204780
|
0x10000005 0x0403c780 0xa400c019 0xe4204780
|
||||||
0xc08a0c19 0x00401680 0xc08a0205 0x00401680
|
0xc08c0c19 0x00401680 0xc08c0205 0x00401680
|
||||||
0x90000204 0xc0010c04 0xd0047005 0x20000780
|
0x90000204 0xc0010c04 0xd0047005 0x20000780
|
||||||
0xc401c019 0x0020c780 0xb0060000 0x20458818
|
0xc401c019 0x0020c780 0xb0060000 0x20458818
|
||||||
0x300605fd 0x6c0187d8 0xa0077003 0x00000000
|
0x300605fd 0x6c0187d8 0xa0077003 0x00000000
|
||||||
@@ -362,9 +364,11 @@ code {
|
|||||||
0xa800da05 0xc4304780 0xc0000205 0x04700003
|
0xa800da05 0xc4304780 0xc0000205 0x04700003
|
||||||
0xa0000205 0x8c0047d0 0x2000d619 0x04214780
|
0xa0000205 0x8c0047d0 0x2000d619 0x04214780
|
||||||
0xa0000205 0x44065680 0x30170205 0xec101680
|
0xa0000205 0x44065680 0x30170205 0xec101680
|
||||||
0x31000205 0x0442d680 0x10000a05 0x2440d100
|
0x31000205 0x04435680 0x10000a05 0x2440d100
|
||||||
0x30020a1d 0x6c0187d0 0x30148205 0x00000003
|
0x30870bfd 0x6c4107d8 0x100d801d 0x00000003
|
||||||
0xd0840e1d 0x04400780 0x40070205 0x00018780
|
0x1000101d 0x2440d280 0x20000e05 0x04004780
|
||||||
|
0x30020a1d 0x6c0187e0 0xd0840e1d 0x04400780
|
||||||
|
0x30218205 0x00000003 0x40070205 0x00018780
|
||||||
0x00000609 0xc0000780 0x08005a01 0xe4204780
|
0x00000609 0xc0000780 0x08005a01 0xe4204780
|
||||||
0xd801680d 0x20000780 0x1c00e005 0x0423c780
|
0xd801680d 0x20000780 0x1c00e005 0x0423c780
|
||||||
0x3c01c005 0x8c200780 0x08005a01 0xe4204780
|
0x3c01c005 0x8c200780 0x08005a01 0xe4204780
|
||||||
@@ -375,23 +379,25 @@ code {
|
|||||||
0x08005a01 0xe4204780 0x1c00c205 0x0423c780
|
0x08005a01 0xe4204780 0x1c00c205 0x0423c780
|
||||||
0x3c01c005 0x8c200780 0x08005a01 0xe4204780
|
0x3c01c005 0x8c200780 0x08005a01 0xe4204780
|
||||||
0xd0016809 0x20000780 0x390fe005 0x00000003
|
0xd0016809 0x20000780 0x390fe005 0x00000003
|
||||||
0x30870205 0xac400780 0x1001801d 0x00000003
|
0x30890205 0xac400780 0x1001801d 0x00000003
|
||||||
0x307c0205 0x8c000780 0x30010e1d 0xc4000780
|
0x307c0205 0x8c000780 0x30010e1d 0xc4000780
|
||||||
0xa0000e1d 0x44014780 0xc407da1d 0x00200780
|
0xa0000e21 0x44014780 0x103f801d 0x000001ff
|
||||||
0xa0000e1d 0xac004780 0x30880e1d 0xac400780
|
0xc408da21 0x00200780 0x1000161d 0x2440d280
|
||||||
0xa00b3003 0x00000000 0x30890e1d 0x8c400780
|
0xa0001021 0xac004780 0x30080e21 0xac000780
|
||||||
0x100b3003 0x00001100 0x30070c21 0xc4100780
|
0x1000801d 0x0ffffe03 0x1000141d 0x2440d280
|
||||||
|
0x30080e1d 0x8c000780 0xa00bb003 0x00000000
|
||||||
|
0x100bb003 0x00002100 0x30070c21 0xc4100780
|
||||||
0x30060c25 0xc4100780 0x20099020 0x2108e820
|
0x30060c25 0xc4100780 0x20099020 0x2108e820
|
||||||
0x20000621 0x04020780 0x20009021 0x00000007
|
0x20000621 0x04020780 0x20009021 0x00000007
|
||||||
0xd00e101d 0xa0c00780 0xf0000001 0xe0000002
|
0xd00e101d 0xa0c00780 0xf0000001 0xe0000002
|
||||||
0x30070c21 0xc4100680 0x30060c25 0xc4100680
|
0x30070c21 0xc4100680 0x30060c25 0xc4100680
|
||||||
0x20001021 0x04024680 0x2000c821 0x04220680
|
0x20001021 0x04024680 0x2000c821 0x04220680
|
||||||
0x21001021 0x04430680 0xd00e1005 0xa0c00680
|
0x21001021 0x0441c680 0xd00e1005 0xa0c00680
|
||||||
0x307c0ffd 0x6c0087d8 0xa0000e05 0x44065500
|
0x307c0ffd 0x6c0087d8 0xa0000e05 0x44065500
|
||||||
0x30170205 0xec101500 0x31000205 0x0442d500
|
0x30170205 0xec101500 0x31000205 0x04435500
|
||||||
0x10000a05 0x2440d280 0xd007001d 0x0402c780
|
0x10000a05 0x2440d280 0xd007001d 0x0402c780
|
||||||
0x307c0ffd 0x6c0087d8 0xa0000e1d 0x44065500
|
0x307c0ffd 0x6c0087d8 0xa0000e1d 0x44065500
|
||||||
0x30170e1d 0xec101500 0x31000e1d 0x0442d500
|
0x30170e1d 0xec101500 0x31000e1d 0x04435500
|
||||||
0x10000a1d 0x2440d280 0x30070205 0x8c000780
|
0x10000a1d 0x2440d280 0x30070205 0x8c000780
|
||||||
0x00000605 0xc0000780 0x30218205 0x00000003
|
0x00000605 0xc0000780 0x30218205 0x00000003
|
||||||
0x04005a01 0xe4204780 0xd4016809 0x20000780
|
0x04005a01 0xe4204780 0xd4016809 0x20000780
|
||||||
@@ -402,8 +408,8 @@ code {
|
|||||||
0x04005a01 0xe4204780 0x1800c405 0x0423c780
|
0x04005a01 0xe4204780 0x1800c405 0x0423c780
|
||||||
0x3801c005 0x8c200780 0x04005a01 0xe4204780
|
0x3801c005 0x8c200780 0x04005a01 0xe4204780
|
||||||
0x1800c205 0x0423c780 0x3801c005 0x8c200780
|
0x1800c205 0x0423c780 0x3801c005 0x8c200780
|
||||||
0x04005a01 0xe4204780 0xa00e2003 0x00000000
|
0x04005a01 0xe4204780 0xa00ea003 0x00000000
|
||||||
0x100e2003 0x00000100 0x30070c05 0xc4100780
|
0x100ea003 0x00000100 0x30070c05 0xc4100780
|
||||||
0x30060c19 0xc4100780 0x20000205 0x04018780
|
0x30060c19 0xc4100780 0x20000205 0x04018780
|
||||||
0xd0016805 0x20000780 0x2101e818 0x1500e004
|
0xd0016805 0x20000780 0x2101e818 0x1500e004
|
||||||
0x200c8c19 0x00000003 0xd00e0c05 0xa0c00780
|
0x200c8c19 0x00000003 0xd00e0c05 0xa0c00780
|
||||||
|
|||||||
Reference in New Issue
Block a user