optimizations

This commit is contained in:
chudov
2009-09-13 16:08:21 +00:00
parent f7a52fbcdf
commit 866257d0de
3 changed files with 287 additions and 285 deletions

View File

@@ -73,6 +73,7 @@ namespace CUETools.Codecs.FlaCuda
int[] verifyBuffer; int[] verifyBuffer;
int[] residualBuffer; int[] residualBuffer;
float[] windowBuffer; float[] windowBuffer;
byte[] md5_buffer;
int samplesInBuffer = 0; int samplesInBuffer = 0;
int _compressionLevel = 5; int _compressionLevel = 5;
@@ -84,7 +85,7 @@ namespace CUETools.Codecs.FlaCuda
Crc16 crc16; Crc16 crc16;
MD5 md5; MD5 md5;
FlacFrame frame; FlacFrame _frame;
FlakeReader verify; FlakeReader verify;
SeekPoint[] seek_table; SeekPoint[] seek_table;
@@ -105,7 +106,6 @@ namespace CUETools.Codecs.FlaCuda
CUdeviceptr cudaAutocorOutput; CUdeviceptr cudaAutocorOutput;
CUdeviceptr cudaResidualTasks; CUdeviceptr cudaResidualTasks;
CUdeviceptr cudaResidualOutput; CUdeviceptr cudaResidualOutput;
CUdeviceptr cudaResidualSums;
IntPtr samplesBufferPtr = IntPtr.Zero; IntPtr samplesBufferPtr = IntPtr.Zero;
IntPtr autocorTasksPtr = IntPtr.Zero; IntPtr autocorTasksPtr = IntPtr.Zero;
IntPtr residualTasksPtr = IntPtr.Zero; IntPtr residualTasksPtr = IntPtr.Zero;
@@ -114,9 +114,10 @@ namespace CUETools.Codecs.FlaCuda
int nResidualTasks = 0; int nResidualTasks = 0;
int nAutocorTasks = 0; int nAutocorTasks = 0;
int maxFrames = 8;
const int MAX_BLOCKSIZE = 8192; const int MAX_BLOCKSIZE = 4608 * 4;
const int maxResidualParts = 64; const int maxResidualParts = (MAX_BLOCKSIZE + 32 * 3) / (32 * 3);
const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32); const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32);
public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO) public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
@@ -137,13 +138,14 @@ namespace CUETools.Codecs.FlaCuda
residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)]; residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)];
windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS]; windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS];
md5_buffer = new byte[FlaCudaWriter.MAX_BLOCKSIZE * channels * bits_per_sample / 8];
eparams.flake_set_defaults(_compressionLevel); eparams.flake_set_defaults(_compressionLevel);
eparams.padding_size = 8192; eparams.padding_size = 8192;
crc8 = new Crc8(); crc8 = new Crc8();
crc16 = new Crc16(); crc16 = new Crc16();
frame = new FlacFrame(channels * 2); _frame = new FlacFrame(channels * 2);
} }
public int TotalSize public int TotalSize
@@ -193,11 +195,20 @@ namespace CUETools.Codecs.FlaCuda
while (samplesInBuffer > 0) while (samplesInBuffer > 0)
{ {
eparams.block_size = samplesInBuffer; eparams.block_size = samplesInBuffer;
output_frame(); output_frames();
} }
if (_IO.CanSeek) if (_IO.CanSeek)
{ {
if (sample_count == 0 && _position != 0)
{
BitWriter bitwriter = new BitWriter(header, 0, 4);
bitwriter.writebits(32, (int)_position);
bitwriter.flush();
_IO.Position = 22;
_IO.Write(header, 0, 4);
}
if (md5 != null) if (md5 != null)
{ {
md5.TransformFinalBlock(frame_buffer, 0, 0); md5.TransformFinalBlock(frame_buffer, 0, 0);
@@ -220,7 +231,6 @@ namespace CUETools.Codecs.FlaCuda
cuda.Free(cudaAutocorOutput); cuda.Free(cudaAutocorOutput);
cuda.Free(cudaResidualTasks); cuda.Free(cudaResidualTasks);
cuda.Free(cudaResidualOutput); cuda.Free(cudaResidualOutput);
cuda.Free(cudaResidualSums);
CUDADriver.cuMemFreeHost(samplesBufferPtr); CUDADriver.cuMemFreeHost(samplesBufferPtr);
CUDADriver.cuMemFreeHost(residualTasksPtr); CUDADriver.cuMemFreeHost(residualTasksPtr);
CUDADriver.cuMemFreeHost(autocorTasksPtr); CUDADriver.cuMemFreeHost(autocorTasksPtr);
@@ -253,7 +263,6 @@ namespace CUETools.Codecs.FlaCuda
cuda.Free(cudaAutocorOutput); cuda.Free(cudaAutocorOutput);
cuda.Free(cudaResidualTasks); cuda.Free(cudaResidualTasks);
cuda.Free(cudaResidualOutput); cuda.Free(cudaResidualOutput);
cuda.Free(cudaResidualSums);
CUDADriver.cuMemFreeHost(samplesBufferPtr); CUDADriver.cuMemFreeHost(samplesBufferPtr);
CUDADriver.cuMemFreeHost(residualTasksPtr); CUDADriver.cuMemFreeHost(residualTasksPtr);
CUDADriver.cuMemFreeHost(autocorTasksPtr); CUDADriver.cuMemFreeHost(autocorTasksPtr);
@@ -905,34 +914,37 @@ namespace CUETools.Codecs.FlaCuda
_windowcount++; _windowcount++;
} }
unsafe void initialize_autocorTasks(int channelsCount, int max_order) unsafe void initialize_autocorTasks(int blocksize, int channelsCount, int max_order, int nFrames)
{ {
computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)autocorTasksPtr; computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)autocorTasksPtr;
encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr; encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
nAutocorTasks = 0; nAutocorTasks = 0;
nResidualTasks = 0; nResidualTasks = 0;
for (int iFrame = 0; iFrame < nFrames; iFrame++)
{
for (int ch = 0; ch < channelsCount; ch++) for (int ch = 0; ch < channelsCount; ch++)
{
for (int iWindow = 0; iWindow < _windowcount; iWindow++) for (int iWindow = 0; iWindow < _windowcount; iWindow++)
{ {
// Autocorelation task // Autocorelation task
autocorTasks[nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE; autocorTasks[nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE; autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE;
autocorTasks[nAutocorTasks].residualOffs = max_order * (iWindow + (_windowcount + 1) * (ch + iFrame * channelsCount));
autocorTasks[nAutocorTasks].blocksize = blocksize;
nAutocorTasks++; nAutocorTasks++;
// LPC tasks // LPC tasks
for (int order = 1; order <= max_order; order++) for (int order = 1; order <= max_order; order++)
{ {
residualTasks[nResidualTasks].residualOrder = order <= max_order ? order : 0; residualTasks[nResidualTasks].residualOrder = order <= max_order ? order : 0;
residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE; residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
nResidualTasks++; nResidualTasks++;
} }
} }
// Fixed prediction // Fixed prediction
for (int ch = 0; ch < channelsCount; ch++)
{
for (int order = 1; order <= max_order; order++) for (int order = 1; order <= max_order; order++)
{ {
residualTasks[nResidualTasks].residualOrder = order <= 4 ? order : 0; residualTasks[nResidualTasks].residualOrder = order <= 4 ? order : 0;
residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE; residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
residualTasks[nResidualTasks].shift = 0; residualTasks[nResidualTasks].shift = 0;
switch (order) switch (order)
{ {
@@ -962,7 +974,7 @@ namespace CUETools.Codecs.FlaCuda
nResidualTasks++; nResidualTasks++;
} }
} }
}
cuda.CopyHostToDeviceAsync(cudaAutocorTasks, autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), cudaStream); cuda.CopyHostToDeviceAsync(cudaAutocorTasks, autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), cudaStream);
cuda.CopyHostToDeviceAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream); cuda.CopyHostToDeviceAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
cuda.SynchronizeStream(cudaStream); cuda.SynchronizeStream(cudaStream);
@@ -1004,7 +1016,7 @@ namespace CUETools.Codecs.FlaCuda
} }
} }
unsafe void select_best_methods(FlacFrame frame, int channelsCount, int max_order, int partCount) unsafe void select_best_methods(FlacFrame frame, int channelsCount, int max_order, int iFrame)
{ {
encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr; encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
for (int ch = 0; ch < channelsCount; ch++) for (int ch = 0; ch < channelsCount; ch++)
@@ -1037,7 +1049,7 @@ namespace CUETools.Codecs.FlaCuda
{ {
for (int order = 1; order <= max_order && order < frame.blocksize; order++) for (int order = 1; order <= max_order && order < frame.blocksize; order++)
{ {
int index = (order - 1) + max_order * (iWindow + _windowcount * ch); int index = (order - 1) + max_order * (iWindow + (_windowcount + 1) * (ch + iFrame * channelsCount));
int cbits = residualTasks[index].cbits; int cbits = residualTasks[index].cbits;
int nbits = order * (int)frame.subframes[ch].obits + 4 + 5 + order * cbits + 6 + residualTasks[index].size; int nbits = order * (int)frame.subframes[ch].obits + 4 + 5 + order * cbits + 6 + residualTasks[index].size;
if (residualTasks[index].residualOrder != order) if (residualTasks[index].residualOrder != order)
@@ -1062,7 +1074,7 @@ namespace CUETools.Codecs.FlaCuda
{ {
for (int order = 1; order <= 5 && order <= max_order && order < frame.blocksize; order++) for (int order = 1; order <= 5 && order <= max_order && order < frame.blocksize; order++)
{ {
int index = (order - 1) + max_order * (ch + _windowcount * channelsCount); int index = (order - 1) + max_order * (_windowcount + (_windowcount + 1) * (ch + iFrame * channelsCount));
int forder = order == 5 ? 0 : order; int forder = order == 5 ? 0 : order;
int nbits = forder * (int)frame.subframes[ch].obits + 6 + residualTasks[index].size; int nbits = forder * (int)frame.subframes[ch].obits + 6 + residualTasks[index].size;
if (residualTasks[index].residualOrder != (order == 5 ? 1 : order)) if (residualTasks[index].residualOrder != (order == 5 ? 1 : order))
@@ -1077,15 +1089,14 @@ namespace CUETools.Codecs.FlaCuda
} }
} }
unsafe void estimate_residual(FlacFrame frame, int channelsCount, int max_order, int autocorPartCount, out int partCount) unsafe void estimate_residual(int blocksize, int channelsCount, int max_order, int nFrames)
{ {
if (frame.blocksize <= 4) if (blocksize <= 4)
{
partCount = 0;
return; return;
}
uint cbits = get_precision(frame.blocksize) + 1; compute_autocorellation(blocksize, channelsCount, max_order, nFrames);
uint cbits = get_precision(blocksize) + 1;
int threads_y; int threads_y;
if (max_order >= 4 && max_order <= 8) if (max_order >= 4 && max_order <= 8)
threads_y = max_order; threads_y = max_order;
@@ -1102,8 +1113,7 @@ namespace CUETools.Codecs.FlaCuda
else else
throw new Exception("invalid LPC order"); throw new Exception("invalid LPC order");
int partSize = 32 * (threads_y - 1); int partSize = 32 * (threads_y - 1);
int partCount = (blocksize + partSize - 1) / partSize;
partCount = (frame.blocksize + partSize - 1) / partSize;
if (partCount > maxResidualParts) if (partCount > maxResidualParts)
throw new Exception("invalid combination of block size and LPC order"); throw new Exception("invalid combination of block size and LPC order");
@@ -1112,19 +1122,11 @@ namespace CUETools.Codecs.FlaCuda
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 1, (uint)cudaSamples.Pointer); cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 1, (uint)cudaSamples.Pointer);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 2, (uint)cudaResidualTasks.Pointer); cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 2, (uint)cudaResidualTasks.Pointer);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 3, (uint)max_order); cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 3, (uint)max_order);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 4, (uint)frame.blocksize); cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 4, (uint)blocksize);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 5, (uint)partSize); cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 5, (uint)partSize);
cuda.SetParameterSize(cudaEstimateResidual, sizeof(uint) * 6); cuda.SetParameterSize(cudaEstimateResidual, sizeof(uint) * 6);
cuda.SetFunctionBlockShape(cudaEstimateResidual, 32, threads_y, 1); cuda.SetFunctionBlockShape(cudaEstimateResidual, 32, threads_y, 1);
//cuda.SetParameter(cudaSumResidualChunks, 0, (uint)cudaResidualSums.Pointer);
//cuda.SetParameter(cudaSumResidualChunks, sizeof(uint), (uint)cudaResidualTasks.Pointer);
//cuda.SetParameter(cudaSumResidualChunks, sizeof(uint) * 2, (uint)cudaResidualOutput.Pointer);
//cuda.SetParameter(cudaSumResidualChunks, sizeof(uint) * 3, (uint)frame.blocksize);
//cuda.SetParameter(cudaSumResidualChunks, sizeof(uint) * 4, (uint)partSize);
//cuda.SetParameterSize(cudaSumResidualChunks, sizeof(uint) * 5U);
//cuda.SetFunctionBlockShape(cudaSumResidualChunks, residualThreads, 1, 1);
cuda.SetParameter(cudaSumResidual, 0, (uint)cudaResidualTasks.Pointer); cuda.SetParameter(cudaSumResidual, 0, (uint)cudaResidualTasks.Pointer);
cuda.SetParameter(cudaSumResidual, sizeof(uint), (uint)cudaResidualOutput.Pointer); cuda.SetParameter(cudaSumResidual, sizeof(uint), (uint)cudaResidualOutput.Pointer);
cuda.SetParameter(cudaSumResidual, sizeof(uint) * 2, (uint)partSize); cuda.SetParameter(cudaSumResidual, sizeof(uint) * 2, (uint)partSize);
@@ -1133,24 +1135,24 @@ namespace CUETools.Codecs.FlaCuda
cuda.SetFunctionBlockShape(cudaSumResidual, 64, 1, 1); cuda.SetFunctionBlockShape(cudaSumResidual, 64, 1, 1);
// issue work to the GPU // issue work to the GPU
cuda.LaunchAsync(cudaEstimateResidual, partCount, nResidualTasks / threads_y, cudaStream); cuda.LaunchAsync(cudaEstimateResidual, partCount, (nResidualTasks / threads_y * nFrames) / maxFrames, cudaStream);
//cuda.LaunchAsync(cudaSumResidualChunks, partCount, nResidualTasks, cudaStream); //cuda.LaunchAsync(cudaSumResidualChunks, partCount, nResidualTasks, cudaStream);
cuda.LaunchAsync(cudaSumResidual, 1, nResidualTasks, cudaStream); cuda.LaunchAsync(cudaSumResidual, 1, (nResidualTasks * nFrames) / maxFrames, cudaStream);
cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream); cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nResidualTasks * nFrames) / maxFrames)), cudaStream);
cuda.SynchronizeStream(cudaStream); cuda.SynchronizeStream(cudaStream);
} }
unsafe void compute_autocorellation(FlacFrame frame, int channelsCount, int max_order, out int partCount) unsafe void compute_autocorellation(int blocksize, int channelsCount, int max_order, int nFrames)
{ {
int autocorThreads = 256; int autocorThreads = 256;
int partSize = 2 * autocorThreads - max_order; int partSize = 2 * autocorThreads - max_order;
partSize &= 0xffffff0; partSize &= 0xffffff0;
partCount = (frame.blocksize + partSize - 1) / partSize; int partCount = (blocksize + partSize - 1) / partSize;
if (partCount > maxAutocorParts) if (partCount > maxAutocorParts)
throw new Exception("internal error"); throw new Exception("internal error");
if (frame.blocksize <= 4) if (blocksize <= 4)
return; return;
cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocorOutput.Pointer); cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocorOutput.Pointer);
@@ -1158,7 +1160,7 @@ namespace CUETools.Codecs.FlaCuda
cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer); cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer);
cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 3, (uint)cudaAutocorTasks.Pointer); cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 3, (uint)cudaAutocorTasks.Pointer);
cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order); cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order);
cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)frame.blocksize); cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)blocksize);
cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)partSize); cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)partSize);
cuda.SetParameterSize(cudaComputeAutocor, (uint)(sizeof(uint) * 4) + sizeof(uint) * 3); cuda.SetParameterSize(cudaComputeAutocor, (uint)(sizeof(uint) * 4) + sizeof(uint) * 3);
cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1); cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1);
@@ -1173,50 +1175,26 @@ namespace CUETools.Codecs.FlaCuda
// issue work to the GPU // issue work to the GPU
cuda.CopyHostToDeviceAsync(cudaSamples, samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), cudaStream); cuda.CopyHostToDeviceAsync(cudaSamples, samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), cudaStream);
cuda.LaunchAsync(cudaComputeAutocor, partCount, nAutocorTasks, cudaStream); cuda.LaunchAsync(cudaComputeAutocor, partCount, (nAutocorTasks * nFrames) / maxFrames, cudaStream);
cuda.LaunchAsync(cudaComputeLPC, 1, nAutocorTasks, cudaStream); cuda.LaunchAsync(cudaComputeLPC, 1, (nAutocorTasks * nFrames) / maxFrames, cudaStream);
//cuda.SynchronizeStream(cudaStream); //cuda.SynchronizeStream(cudaStream);
//cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream1); //cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream1);
} }
unsafe int encode_frame(out int size) unsafe int encode_frame(bool doMidside, int channelCount, int iFrame)
{ {
int* s = (int*)samplesBufferPtr;
fixed (int* r = residualBuffer) fixed (int* r = residualBuffer)
fixed (float* window = windowBuffer)
{ {
FlacFrame frame = _frame;
frame.InitSize(eparams.block_size, eparams.variable_block_size != 0); frame.InitSize(eparams.block_size, eparams.variable_block_size != 0);
for (int ch = 0; ch < channelCount; ch++)
bool doMidside = channels == 2 && eparams.do_midside;
int channelCount = doMidside ? 2 * channels : channels;
if (frame.blocksize != _windowsize && frame.blocksize > 4)
{ {
_windowsize = frame.blocksize; int* s = ((int*)samplesBufferPtr) + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * eparams.block_size;
_windowcount = 0; frame.subframes[ch].Init(s, r + ch * FlaCudaWriter.MAX_BLOCKSIZE,
calculate_window(window, lpc.window_welch, WindowFunction.Welch); bits_per_sample + (doMidside && ch == 3 ? 1U : 0U), 0);// get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize));
calculate_window(window, lpc.window_tukey, WindowFunction.Tukey);
calculate_window(window, lpc.window_hann, WindowFunction.Hann);
calculate_window(window, lpc.window_flattop, WindowFunction.Flattop);
calculate_window(window, lpc.window_bartlett, WindowFunction.Bartlett);
if (_windowcount == 0)
throw new Exception("invalid windowfunction");
cuda.CopyHostToDevice<float>(cudaWindow, windowBuffer);
initialize_autocorTasks(channelCount, eparams.max_prediction_order);
} }
if (doMidside) select_best_methods(frame, channelCount, eparams.max_prediction_order, iFrame);
channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE, s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize);
frame.window_buffer = window;
for (int ch = 0; ch < channelCount; ch++)
frame.subframes[ch].Init(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE,
bits_per_sample + (doMidside && ch == 3 ? 1U : 0U), get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize));
int autocorPartCount, residualPartCount;
compute_autocorellation(frame, channelCount, eparams.max_prediction_order, out autocorPartCount);
estimate_residual(frame, channelCount, eparams.max_prediction_order, autocorPartCount, out residualPartCount);
select_best_methods(frame, channelCount, eparams.max_prediction_order, residualPartCount);
if (doMidside) if (doMidside)
{ {
@@ -1241,26 +1219,60 @@ namespace CUETools.Codecs.FlaCuda
else else
frame_count++; frame_count++;
} }
size = frame.blocksize;
return bitwriter.Length; return bitwriter.Length;
} }
} }
unsafe int output_frame() unsafe int output_frames()
{ {
bool doMidside = channels == 2 && eparams.do_midside;
int channelCount = doMidside ? 2 * channels : channels;
int nFrames = Math.Min(samplesInBuffer / eparams.block_size, maxFrames);
if (nFrames < 1)
throw new Exception("oops");
if (verify != null) if (verify != null)
{ {
int* r = (int*)samplesBufferPtr; int* r = (int*)samplesBufferPtr;
fixed (int* s = verifyBuffer) fixed (int* s = verifyBuffer)
for (int ch = 0; ch < channels; ch++) for (int ch = 0; ch < channels; ch++)
AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size); AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, samplesInBuffer);
} }
int fs, bs; if (eparams.block_size != _windowsize && eparams.block_size > 4)
fixed (float* window = windowBuffer)
{
_windowsize = eparams.block_size;
_windowcount = 0;
calculate_window(window, lpc.window_welch, WindowFunction.Welch);
calculate_window(window, lpc.window_tukey, WindowFunction.Tukey);
calculate_window(window, lpc.window_hann, WindowFunction.Hann);
calculate_window(window, lpc.window_flattop, WindowFunction.Flattop);
calculate_window(window, lpc.window_bartlett, WindowFunction.Bartlett);
if (_windowcount == 0)
throw new Exception("invalid windowfunction");
cuda.CopyHostToDevice<float>(cudaWindow, windowBuffer);
initialize_autocorTasks(eparams.block_size, channelCount, eparams.max_prediction_order, maxFrames);
}
if (doMidside)
{
int* s = ((int*)samplesBufferPtr);
channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE,
s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size * nFrames);
}
estimate_residual(eparams.block_size, channelCount, eparams.max_prediction_order, nFrames);
int bs = 0;
for (int iFrame = 0; iFrame < nFrames; iFrame++)
{
//if (0 != eparams.variable_block_size && 0 == (eparams.block_size & 7) && eparams.block_size >= 128) //if (0 != eparams.variable_block_size && 0 == (eparams.block_size & 7) && eparams.block_size >= 128)
// fs = encode_frame_vbs(); // fs = encode_frame_vbs();
//else //else
fs = encode_frame(out bs); int fs = encode_frame(doMidside, channelCount, iFrame);
bs += eparams.block_size;
if (seek_table != null && _IO.CanSeek) if (seek_table != null && _IO.CanSeek)
{ {
@@ -1268,39 +1280,40 @@ namespace CUETools.Codecs.FlaCuda
{ {
if (seek_table[sp].framesize != 0) if (seek_table[sp].framesize != 0)
continue; continue;
if (seek_table[sp].number > (ulong)_position + (ulong)bs) if (seek_table[sp].number > (ulong)_position + (ulong)eparams.block_size)
break; break;
if (seek_table[sp].number >= (ulong)_position) if (seek_table[sp].number >= (ulong)_position)
{ {
seek_table[sp].number = (ulong)_position; seek_table[sp].number = (ulong)_position;
seek_table[sp].offset = (ulong)(_IO.Position - first_frame_offset); seek_table[sp].offset = (ulong)(_IO.Position - first_frame_offset);
seek_table[sp].framesize = (uint)bs; seek_table[sp].framesize = (uint)eparams.block_size;
} }
} }
} }
_position += bs; _position += eparams.block_size;
_IO.Write(frame_buffer, 0, fs); _IO.Write(frame_buffer, 0, fs);
_totalSize += fs; _totalSize += fs;
if (verify != null) if (verify != null)
{ {
int decoded = verify.DecodeFrame(frame_buffer, 0, fs); int decoded = verify.DecodeFrame(frame_buffer, 0, fs);
if (decoded != fs || verify.Remaining != (ulong)bs) if (decoded != fs || verify.Remaining != (ulong)eparams.block_size)
throw new Exception("validation failed!"); throw new Exception("validation failed!");
fixed (int* s = verifyBuffer, r = verify.Samples) fixed (int* s = verifyBuffer, r = verify.Samples)
{ {
for (int ch = 0; ch < channels; ch++) for (int ch = 0; ch < channels; ch++)
if (AudioSamples.MemCmp(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, bs)) if (AudioSamples.MemCmp(s + iFrame * eparams.block_size + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, eparams.block_size))
throw new Exception("validation failed!"); throw new Exception("validation failed!");
} }
} }
}
if (bs < eparams.block_size) if (bs < samplesInBuffer)
{ {
int* s = (int*)samplesBufferPtr; int* s = (int*)samplesBufferPtr;
for (int ch = 0; ch < channels; ch++) for (int ch = 0; ch < channels; ch++)
AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, s + bs + ch * FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size - bs); AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, s + bs + ch * FlaCudaWriter.MAX_BLOCKSIZE, samplesInBuffer - bs);
} }
samplesInBuffer -= bs; samplesInBuffer -= bs;
@@ -1326,17 +1339,16 @@ namespace CUETools.Codecs.FlaCuda
cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual"); cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels))); cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS); cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS)); cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * maxFrames));
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts); cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * maxAutocorParts));
cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * (lpc.MAX_LPC_WINDOWS + 1))); cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * (lpc.MAX_LPC_WINDOWS + 1) * maxFrames));
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4))); cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * maxResidualParts));
cudaResidualSums = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4) * maxResidualParts));
//cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4) * maxResidualParts)); //cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4) * maxResidualParts));
CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE)); CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
if (cuErr == CUResult.Success) if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS)); cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * maxFrames));
if (cuErr == CUResult.Success) if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER)); cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * maxFrames));
if (cuErr != CUResult.Success) if (cuErr != CUResult.Success)
{ {
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero; if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
@@ -1358,21 +1370,21 @@ namespace CUETools.Codecs.FlaCuda
int len = sampleCount; int len = sampleCount;
while (len > 0) while (len > 0)
{ {
int block = Math.Min(len, eparams.block_size - samplesInBuffer); int block = Math.Min(len, FlaCudaWriter.MAX_BLOCKSIZE - samplesInBuffer);
copy_samples(buff, pos, block); copy_samples(buff, pos, block);
if (md5 != null) if (md5 != null)
{ {
AudioSamples.FLACSamplesToBytes(buff, pos, frame_buffer, 0, block, channels, (int)bits_per_sample); AudioSamples.FLACSamplesToBytes(buff, pos, md5_buffer, 0, block, channels, (int)bits_per_sample);
md5.TransformBlock(frame_buffer, 0, block * channels * ((int)bits_per_sample >> 3), null, 0); md5.TransformBlock(md5_buffer, 0, block * channels * ((int)bits_per_sample >> 3), null, 0);
} }
len -= block; len -= block;
pos += block; pos += block;
while (samplesInBuffer >= eparams.block_size) while (samplesInBuffer >= eparams.block_size)
output_frame(); output_frames();
} }
} }
@@ -1584,7 +1596,7 @@ namespace CUETools.Codecs.FlaCuda
else else
max_frame_size = 16 + ((eparams.block_size * channels * (int)bits_per_sample + 7) >> 3); max_frame_size = 16 + ((eparams.block_size * channels * (int)bits_per_sample + 7) >> 3);
if (_IO.CanSeek && eparams.do_seektable) if (_IO.CanSeek && eparams.do_seektable && sample_count != 0)
{ {
int seek_points_distance = sample_rate * 10; int seek_points_distance = sample_rate * 10;
int num_seek_points = 1 + sample_count / seek_points_distance; // 1 seek point per 10 seconds int num_seek_points = 1 + sample_count / seek_points_distance; // 1 seek point per 10 seconds
@@ -1729,24 +1741,24 @@ namespace CUETools.Codecs.FlaCuda
case 0: case 0:
do_midside = false; do_midside = false;
window_function = WindowFunction.Bartlett; window_function = WindowFunction.Bartlett;
max_prediction_order = 6;
max_partition_order = 4; max_partition_order = 4;
max_prediction_order = 6;
break; break;
case 1: case 1:
do_midside = false; do_midside = false;
window_function = WindowFunction.Bartlett; window_function = WindowFunction.Bartlett;
max_partition_order = 4;
max_prediction_order = 8; max_prediction_order = 8;
max_partition_order = 6;
break; break;
case 2: case 2:
do_midside = false; window_function = WindowFunction.Bartlett;
max_partition_order = 6; max_partition_order = 4;
max_prediction_order = 8; max_prediction_order = 4;
break; break;
case 3: case 3:
window_function = WindowFunction.Bartlett; window_function = WindowFunction.Bartlett;
max_partition_order = 4; max_partition_order = 4;
max_prediction_order = 4; max_prediction_order = 5;
break; break;
case 4: case 4:
window_function = WindowFunction.Bartlett; window_function = WindowFunction.Bartlett;
@@ -1758,7 +1770,7 @@ namespace CUETools.Codecs.FlaCuda
max_prediction_order = 8; max_prediction_order = 8;
break; break;
case 6: case 6:
window_function = WindowFunction.Bartlett; max_prediction_order = 8;
break; break;
case 7: case 7:
max_prediction_order = 10; max_prediction_order = 10;
@@ -1784,6 +1796,8 @@ namespace CUETools.Codecs.FlaCuda
{ {
public int samplesOffs; public int samplesOffs;
public int windowOffs; public int windowOffs;
public int residualOffs;
public int blocksize;
}; };
unsafe struct encodeResidualTaskStruct unsafe struct encodeResidualTaskStruct

View File

@@ -24,6 +24,8 @@ typedef struct
{ {
int samplesOffs; int samplesOffs;
int windowOffs; int windowOffs;
int residualOffs;
int blocksize;
} computeAutocorTaskStruct; } computeAutocorTaskStruct;
typedef struct typedef struct
@@ -155,7 +157,7 @@ extern "C" __global__ void cudaComputeLPC(
if (tid < 32) if (tid < 32)
{ {
int precision = 13; int precision = 13;
int taskNo = (blockIdx.x + blockIdx.y * gridDim.x) * max_order + order; int taskNo = shared.task.residualOffs + order;
shared.bits[tid] = __mul24((33 - __clz(__float2int_rn(fabs(shared.tmp[tid]) * (1 << 15))) - precision), tid <= order); shared.bits[tid] = __mul24((33 - __clz(__float2int_rn(fabs(shared.tmp[tid]) * (1 << 15))) - precision), tid <= order);
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]); shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]); shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);

View File

@@ -4,7 +4,7 @@ modname {cubin}
code { code {
name = cudaComputeAutocor name = cudaComputeAutocor
lmem = 0 lmem = 0
smem = 3256 smem = 3264
reg = 10 reg = 10
bar = 1 bar = 1
const { const {
@@ -13,14 +13,14 @@ code {
offset = 0 offset = 0
bytes = 16 bytes = 16
mem { mem {
0x00000001 0x0000007f 0x0000003f 0x0000001f 0x00000003 0x0000007f 0x0000003f 0x0000001f
} }
} }
bincode { bincode {
0xa0000009 0x04000780 0x308005fd 0x644107c8 0xa0000009 0x04000780 0x308005fd 0x644107c8
0xa000b003 0x00000000 0x3002040d 0xc4100780 0xa000b003 0x00000000 0x3002040d 0xc4100780
0x1000b003 0x00000280 0xa0004e01 0x04200780 0x1000b003 0x00000280 0xa0004e01 0x04200780
0x30030001 0xc4100780 0x2100ee00 0x20008600 0x30040001 0xc4100780 0x2100ee00 0x20008600
0xd00e0001 0x80c00780 0x00000605 0xc0000780 0xd00e0001 0x80c00780 0x00000605 0xc0000780
0x04065801 0xe4200780 0xf0000001 0xe0000002 0x04065801 0xe4200780 0xf0000001 0xe0000002
0x861ffe03 0x00000000 0xa0004c05 0x04200780 0x861ffe03 0x00000000 0xa0004c05 0x04200780
@@ -278,132 +278,118 @@ code {
code { code {
name = cudaComputeLPC name = cudaComputeLPC
lmem = 0 lmem = 0
smem = 564 smem = 572
reg = 9 reg = 9
bar = 1 bar = 1
const { const {
segname = const segname = const
segnum = 1 segnum = 1
offset = 0 offset = 0
bytes = 40 bytes = 48
mem { mem {
0x00000001 0x0000001f 0x7e800000 0x3f800000 0x00000003 0x0000001f 0x7e800000 0x3f800000
0x0000000f 0x00001fff 0xffffe000 0x3e800000 0x00000001 0x0000000f 0x00001fff 0xffffe000
0x00000020 0x0000009e 0x3e800000 0x00000020 0x0000009e 0x00000008
} }
} }
bincode { bincode {
0xa000000d 0x04000780 0x308007fd 0x644107c8 0xa0000009 0x04000780 0x308005fd 0x644107c8
0xa000b003 0x00000000 0x1000b003 0x00000280 0xa000b003 0x00000000 0x1000b003 0x00000280
0xa0004e01 0x04200780 0x30030001 0xc4100780 0xa0004e01 0x04200780 0x30040001 0xc4100780
0x30020605 0xc4100780 0x2100ec00 0x20008200 0x30020405 0xc4100780 0x2100ec00 0x20008200
0xd00e0001 0x80c00780 0x00020605 0xc0000780 0xd00e0001 0x80c00780 0x00020405 0xc0000780
0x04001201 0xe4200780 0x3003ce01 0x6c2187d2 0x04001201 0xe4200780 0x3002ce01 0x6c2187d2
0xa00001fd 0x0c0147c8 0x00020605 0xc0001680 0xa00001fd 0x0c0147c8 0x00020405 0xc0001680
0x0400d601 0xe43f1680 0x861ffe03 0x00000000 0x0400da01 0xe43f1680 0x861ffe03 0x00000000
0x307cd1fd 0x6c20c7d8 0x1002b003 0x00001280 0x307cd1fd 0x6c20c7d8 0x1002b003 0x00001280
0x1000f805 0x0403c780 0xa0027003 0x00000000 0x1000f805 0x0403c780 0xa0027003 0x00000000
0x10027003 0x00000100 0x1000d001 0x0423c780 0x10027003 0x00000100 0x1000d001 0x0423c780
0x40014e09 0x00200780 0x30100409 0xc4100780 0x40014e0d 0x00200780 0x3010060d 0xc4100780
0x60004e09 0x00208780 0x2101ee01 0x00000003 0x60004e0d 0x0020c780 0x2101ee01 0x00000003
0x20000409 0x04004780 0x40010811 0x00000780 0x2000060d 0x04004780 0x40010c11 0x00000780
0x60000a11 0x00010780 0x30100811 0xc4100780 0x60000e11 0x00010780 0x30100811 0xc4100780
0x60000801 0x00010780 0x20000601 0x04000780 0x60000c01 0x00010780 0x20000401 0x04000780
0x30020001 0xc4100780 0x00020605 0xc0000780 0x30020001 0xc4100780 0x00020405 0xc0000780
0x2000ca01 0x04200780 0xd00e0001 0x80c00780 0x2000ca01 0x04200780 0xd00e0001 0x80c00780
0xd4035809 0x20000780 0xb800c001 0x00200780 0xd4036809 0x20000780 0xb800c001 0x00200780
0x0400d601 0xe4200780 0xf0000001 0xe0000002 0x0400da01 0xe4200780 0xf0000001 0xe0000002
0x20018205 0x00000003 0x3001d1fd 0x6c2147d8 0x20018205 0x00000003 0x3001d1fd 0x6c2147d8
0x10013003 0x00001280 0x861ffe03 0x00000000 0x10013003 0x00001280 0x861ffe03 0x00000000
0x30810601 0x6c40c7d0 0xa00001fd 0x0c0147c8 0x30810401 0x6c40c7d0 0xa00001fd 0x0c0147c8
0x00020605 0xc0001680 0x04001601 0xe43f1680 0x00020405 0xc0001680 0x04001a01 0xe43f1680
0xd0035805 0x20000780 0x307ccffd 0x6c20c7d8 0xd0036805 0x20000780 0x307ccffd 0x6c20c7d8
0x1400c005 0x0423c780 0x30000003 0x00001280 0x1400c001 0x0423c780 0x30000003 0x00001280
0x10248001 0x00000003 0x00000005 0xc0000780 0x10248005 0x00000003 0x00000205 0xc0000780
0x30020611 0xc4100780 0x1000f815 0x0403c780 0x3002040d 0xc4100780 0x1000f811 0x0403c780
0x20400a01 0x0400c780 0x00020009 0xc0000780 0x20400805 0x04008780 0x00020209 0xc0000780
0xa004e003 0x00000000 0x30030bfd 0x6c00c7d8 0xa004e003 0x00000000 0x300209fd 0x6c00c7d8
0x1004e003 0x00000100 0xd8035811 0x20000780 0x1004e003 0x00000100 0xd8036811 0x20000780
0x0000080d 0xc0000780 0x1000c001 0x0423c784 0x0000060d 0xc0000780 0x1000c005 0x0423c784
0xcc00d601 0x00200780 0x1000f801 0x0403d280 0xcc01da05 0x00200780 0x1000f805 0x0403d280
0xdc016011 0x20000780 0x0c005601 0xe4200780 0xdc017011 0x20000780 0x0c005a01 0xe4204780
0xb000de01 0x00200784 0x0c005601 0xe4200780 0xb000de05 0x00204784 0x0c005a01 0xe4204780
0xb000ce01 0x00200784 0x0c005601 0xe4200780 0xb000ce05 0x00204784 0x0c005a01 0xe4204780
0xb000c601 0x00200784 0x0c005601 0xe4200780 0xb000c605 0x00204784 0x0c005a01 0xe4204780
0xb000c201 0x00200784 0x0c005601 0xe4200780 0xb000c205 0x00204784 0x0c005a01 0xe4204780
0xb000c001 0x00200784 0x0c005601 0xe4200780 0xb000c005 0x00204784 0x0c005a01 0xe4204780
0xf0000001 0xe0000002 0x861ffe03 0x00000000 0xf0000001 0xe0000002 0x861ffe03 0x00000000
0xd0015811 0x20000780 0xd403180d 0x20000780 0xd0016811 0x20000780 0xd403280d 0x20000780
0x1000c001 0x0423c784 0xbc00c009 0x00200780 0x1000c005 0x0423c784 0xbc00c015 0x00204780
0xb08203fd 0x605107e8 0x10000201 0x0403c780 0xb08201fd 0x605107e8 0x10000005 0x0403c780
0xa0000409 0xe4004780 0xc0870409 0x00402680 0xa0000a15 0xe4004780 0xc0880a15 0x00402680
0xc0870001 0x00402680 0x90000000 0xc0000418 0xc0880205 0x00402680 0x90000204 0xc0010a18
0xc806d409 0x00200780 0x1000f809 0x0403d280 0xc806d815 0x00200780 0x1000f815 0x0403d280
0x30030bfd 0x6c0147d8 0xb0000c01 0x00008780 0x300209fd 0x6c0147d8 0xb0000c05 0x00014780
0x10000401 0x0403d280 0x0000080d 0xc0000780 0x10000a05 0x0403d280 0x0000060d 0xc0000780
0xe1060c09 0x0440c780 0xbd007600 0xc0020204 0xe1060c15 0x0440c780 0xbd017a04 0xc0050000
0xa00d6003 0x00000000 0x0c001601 0xe4200780 0xa00bb003 0x00000000 0x0c001a01 0xe4204780
0x100d6003 0x00000100 0xa0000001 0xc4104780 0x100bb003 0x00000100 0xa0000205 0xc4104780
0xc0000001 0x04700003 0xa0000001 0x8c0047d0 0xc0000205 0x04700003 0xa0000215 0x8c0047d0
0xa0000001 0x44065680 0x30170001 0xec101680 0x2000d605 0x04210780 0xa0000a15 0x44065680
0x31000001 0x04425680 0x10001001 0x2440d100 0x30170a15 0xec101680 0x31000a15 0x04429680
0x30030a09 0x6c0187d0 0x30148001 0x00000003 0x10001215 0x2440d100 0x30020819 0x6c0187d0
0xd0800409 0x04400780 0x0000080d 0xc0000780 0x30148a15 0x00000003 0xd0840c19 0x04400780
0x40020001 0x00018780 0xdc026011 0x20000780 0x0000060d 0xc0000780 0x40060a15 0x00018780
0x0c009601 0xe4200780 0x3000de01 0x8c200784 0xdc027011 0x20000780 0x0c009a01 0xe4214780
0x0c009601 0xe4200780 0x3000ce01 0x8c200784 0x3005de15 0x8c200784 0x0c009a01 0xe4214780
0x0c009601 0xe4200780 0x3000c601 0x8c200784 0x3005ce15 0x8c200784 0x0c009a01 0xe4214780
0x0c009601 0xe4200780 0x3000c201 0x8c200784 0x3005c615 0x8c200784 0x0c009a01 0xe4214780
0x0c009601 0xe4200780 0x3000c001 0x8c200784 0x3005c215 0x8c200784 0x0c009a01 0xe4214780
0x0c009601 0xe4200780 0xd002580d 0x20000780 0x3005c015 0x8c200784 0x0c009a01 0xe4214780
0x3d0fe001 0x00000003 0x30840001 0xac400780 0xd002680d 0x20000780 0x3d0fe015 0x00000003
0x10018009 0x00000003 0x307c001d 0x8c000780 0x30850a15 0xac400780 0x10018019 0x00000003
0x30070401 0xc4000780 0xa0000001 0x44014780 0x307c0a15 0x8c000780 0x30050c19 0xc4000780
0xc800d601 0x00200780 0xa0000001 0xac004780 0xa0000c19 0x44014780 0xc806da19 0x00200780
0x30850001 0xac400780 0xa0099003 0x00000000 0xa0000c19 0xac004780 0x30860c19 0xac400780
0x30860019 0x8c400780 0x10099003 0x00001100 0xa0091003 0x00000000 0x30870c19 0x8c400780
0xa0004c09 0x04200780 0x10004e01 0x0023c780 0x10091003 0x00001100 0x3007021d 0xc4100780
0x60004809 0x00208780 0x1000ce01 0x0423c780 0x30060221 0xc4100780 0x20088e1c 0x2107e81c
0x40050021 0x00000780 0x60040221 0x00020780 0x2000061d 0x0401c780 0x20008e1d 0x00000007
0x30101021 0xc4100780 0x60040001 0x00020780 0xd00e0e19 0xa0c00780 0x307c041d 0x6c0087e2
0x20000001 0x04014780 0x30070009 0xc4100780 0xa0000ffd 0x0c0147d8 0x3007021d 0xc4102680
0x30060001 0xc4100780 0x20008400 0x2100e800 0x30060221 0xc4102680 0x20000e1d 0x04022680
0x20000801 0x04000780 0x20008001 0x00000007 0x2000c81d 0x0421e680 0x21000e1d 0x0442e680
0xd00e0019 0xa0c00780 0x307c0601 0x6c0087e2 0xd00e0e15 0xa0c02680 0x307c0dfd 0x6c0087e8
0xa00001fd 0x0c0147d8 0xa00ab003 0x00000000 0xa0000c15 0x44066500 0x30170a15 0xec102500
0x100ab003 0x00002100 0xa0004c09 0x04200780 0x31000a15 0x0442a500 0x10001215 0x2440e280
0x10004e01 0x0023c780 0x60004809 0x00208780 0xd0060019 0x0402c780 0x307c0dfd 0x6c0087e8
0x1000ce01 0x0423c780 0x40050021 0x00000780 0xa0000c19 0x44066500 0x30170c19 0xec102500
0x60040221 0x00020780 0x30101021 0xc4100780 0x31000c19 0x0442a500 0x10001219 0x2440e280
0x60040001 0x00020780 0x20000001 0x04014780 0x30060a15 0x8c000780 0x00000609 0xc0000780
0x30070009 0xc4100780 0x30060001 0xc4100780 0x30218a15 0x00000003 0xd802700d 0x20000780
0x20008400 0x2100e800 0x20088001 0x00000003 0x08009a01 0xe4214780 0x3c05de15 0x8c200780
0xd00e001d 0xa0c00780 0x307c0dfd 0x6c0087ea 0x08009a01 0xe4214780 0x3c05ce15 0x8c200780
0xa0000c01 0x44066500 0x30170001 0xec102500 0x08009a01 0xe4214780 0x3c05c615 0x8c200780
0x31000001 0x04426500 0x10001001 0x2440e280 0x08009a01 0xe4214780 0x3c05c215 0x8c200780
0xd0060009 0x0402c780 0x307c05fd 0x6c0087e8 0x08009a01 0xe4214780 0x3c05c015 0x8c200780
0xa0000409 0x44066500 0x30170409 0xec102500 0x08009a01 0xe4214780 0x100bb003 0x00001100
0x31000409 0x04426500 0x10001009 0x2440e280 0x30070215 0xc4100780 0x30060205 0xc4100780
0x30020001 0x8c000780 0x00000809 0xc0000780 0x20000a05 0x04004780 0xd0026809 0x20000780
0x30218001 0x00000003 0xd802600d 0x20000780 0x2101e814 0x1900e004 0x200c8a15 0x00000003
0x08009601 0xe4200780 0x3c00de01 0x8c200780 0xd00e0a05 0xa0c00780 0xf0000001 0xe0000002
0x08009601 0xe4200780 0x3c00ce01 0x8c200780 0x861ffe03 0x00000000 0x20018811 0x00000003
0x08009601 0xe4200780 0x3c00c601 0x8c200780 0x3004cffd 0x6c2147d8 0xd4000805 0x20000780
0x08009601 0xe4200780 0x3c00c201 0x8c200780 0x10038003 0x00001280 0xf0000001 0xe0000001
0x08009601 0xe4200780 0x3c00c001 0x8c200780
0x08009601 0xe4200780 0x100d6003 0x00001100
0xa0004c09 0x04200780 0x10004e01 0x0023c780
0x60004809 0x00208780 0x1000ce01 0x0423c780
0x40050019 0x00000780 0x60040219 0x00018780
0x30100c19 0xc4100780 0x60040001 0x00018780
0x20000001 0x04014780 0x30070009 0xc4100780
0x30060001 0xc4100780 0x20000401 0x04000780
0xd0025809 0x20000780 0x2100e808 0x1900e000
0x200c8409 0x00000003 0xd00e0401 0xa0c00780
0xf0000001 0xe0000002 0x861ffe03 0x00000000
0x20018a15 0x00000003 0x3005cffd 0x6c2147d8
0xd4000805 0x20000780 0x10038003 0x00001280
0xf0000001 0xe0000001
} }
} }
code { code {