optimizations

This commit is contained in:
chudov
2009-09-16 17:11:36 +00:00
parent eba87d6db0
commit 672c0cb20e
3 changed files with 491 additions and 178 deletions

View File

@@ -851,7 +851,6 @@ namespace CUETools.Codecs.FlaCuda
unsafe void initialize_autocorTasks(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
{
computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)task.autocorTasksPtr;
encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)task.residualTasksPtr;
nAutocorTasks = 0;
nResidualTasks = 0;
for (int iFrame = 0; iFrame < nFrames; iFrame++)
@@ -869,40 +868,44 @@ namespace CUETools.Codecs.FlaCuda
// LPC tasks
for (int order = 1; order <= max_order; order++)
{
residualTasks[nResidualTasks].residualOrder = order <= max_order ? order : 0;
residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.LPC;
task.ResidualTasks[nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
task.ResidualTasks[nResidualTasks].blocksize = blocksize;
task.ResidualTasks[nResidualTasks].residualOrder = order <= max_order ? order : 0;
task.ResidualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
nResidualTasks++;
}
}
// Fixed prediction
for (int order = 1; order <= max_order; order++)
{
residualTasks[nResidualTasks].residualOrder = order <= 4 ? order : 0;
residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
residualTasks[nResidualTasks].shift = 0;
task.ResidualTasks[nResidualTasks].type = order <= 5 ? (int)SubframeType.Fixed : (int)SubframeType.Verbatim;
task.ResidualTasks[nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
task.ResidualTasks[nResidualTasks].blocksize = blocksize;
task.ResidualTasks[nResidualTasks].residualOrder = order <= 4 ? order : 0;
task.ResidualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[nResidualTasks].shift = 0;
switch (order)
{
case 5:
residualTasks[nResidualTasks].residualOrder = 1;
residualTasks[nResidualTasks].coefs[0] = 0;
break;
case 1:
residualTasks[nResidualTasks].coefs[0] = 1;
task.ResidualTasks[nResidualTasks].coefs[0] = 1;
break;
case 2:
residualTasks[nResidualTasks].coefs[1] = 2;
residualTasks[nResidualTasks].coefs[0] = -1;
task.ResidualTasks[nResidualTasks].coefs[1] = 2;
task.ResidualTasks[nResidualTasks].coefs[0] = -1;
break;
case 3:
residualTasks[nResidualTasks].coefs[2] = 3;
residualTasks[nResidualTasks].coefs[1] = -3;
residualTasks[nResidualTasks].coefs[0] = 1;
task.ResidualTasks[nResidualTasks].coefs[2] = 3;
task.ResidualTasks[nResidualTasks].coefs[1] = -3;
task.ResidualTasks[nResidualTasks].coefs[0] = 1;
break;
case 4:
residualTasks[nResidualTasks].coefs[3] = 4;
residualTasks[nResidualTasks].coefs[2] = -6;
residualTasks[nResidualTasks].coefs[1] = 4;
residualTasks[nResidualTasks].coefs[0] = -1;
task.ResidualTasks[nResidualTasks].coefs[3] = 4;
task.ResidualTasks[nResidualTasks].coefs[2] = -6;
task.ResidualTasks[nResidualTasks].coefs[1] = 4;
task.ResidualTasks[nResidualTasks].coefs[0] = -1;
break;
}
nResidualTasks++;
@@ -970,7 +973,6 @@ namespace CUETools.Codecs.FlaCuda
unsafe void select_best_methods(FlacFrame frame, int channelsCount, int max_order, int iFrame, FlaCudaTask task)
{
encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)task.residualTasksPtr;
for (int ch = 0; ch < channelsCount; ch++)
{
int i;
@@ -994,51 +996,59 @@ namespace CUETools.Codecs.FlaCuda
if (frame.blocksize <= 4)
return;
// LPC
for (int ch = 0; ch < channelsCount; ch++)
{
for (int iWindow = 0; iWindow < _windowcount; iWindow++)
int index = ch + iFrame * channelsCount;
if (frame.subframes[ch].best.size > task.BestResidualTasks[index].size)
{
for (int order = 1; order <= max_order && order < frame.blocksize; order++)
{
int index = (order - 1) + max_order * (iWindow + (_windowcount + 1) * (ch + iFrame * channelsCount));
int cbits = residualTasks[index].cbits;
int nbits = order * (int)frame.subframes[ch].obits + 4 + 5 + order * cbits + 6 + residualTasks[index].size;
if (residualTasks[index].residualOrder != order)
throw new Exception("oops");
if (frame.subframes[ch].best.size > nbits)
{
frame.subframes[ch].best.type = SubframeType.LPC;
frame.subframes[ch].best.size = (uint)nbits;
frame.subframes[ch].best.order = order;
frame.subframes[ch].best.window = iWindow;
frame.subframes[ch].best.cbits = cbits;
frame.subframes[ch].best.shift = residualTasks[index].shift;
for (int i = 0; i < order; i++)
frame.subframes[ch].best.coefs[i] = residualTasks[index].coefs[order - 1 - i];
}
}
frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type;
frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size;
frame.subframes[ch].best.order = task.BestResidualTasks[index].residualOrder;
frame.subframes[ch].best.cbits = task.BestResidualTasks[index].cbits;
frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift;
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
AudioSamples.MemCpy(frame.subframes[ch].best.residual + frame.subframes[ch].best.order, (int*)task.residualBufferPtr + task.BestResidualTasks[index].samplesOffs, frame.blocksize - frame.subframes[ch].best.order);
}
//for (int iWindow = 0; iWindow < _windowcount; iWindow++)
//{
// for (int order = 1; order <= max_order && order < frame.blocksize; order++)
// {
// int index = (order - 1) + max_order * (iWindow + (_windowcount + 1) * (ch + iFrame * channelsCount));
// if (task.ResidualTasks[index].residualOrder != order || task.ResidualTasks[index].type != (int)SubframeType.LPC)
// throw new Exception("oops");
// if (frame.subframes[ch].best.size > task.ResidualTasks[index].size)
// {
// frame.subframes[ch].best.type = SubframeType.LPC;
// frame.subframes[ch].best.size = (uint)task.ResidualTasks[index].size;
// frame.subframes[ch].best.order = task.ResidualTasks[index].residualOrder;
// //frame.subframes[ch].best.window = iWindow;
// frame.subframes[ch].best.cbits = task.ResidualTasks[index].cbits;
// frame.subframes[ch].best.shift = task.ResidualTasks[index].shift;
// for (int i = 0; i < order; i++)
// frame.subframes[ch].best.coefs[i] = task.ResidualTasks[index].coefs[order - 1 - i];
// }
// }
//}
}
// FIXED
for (int ch = 0; ch < channelsCount; ch++)
{
for (int order = 1; order <= 5 && order <= max_order && order < frame.blocksize; order++)
{
int index = (order - 1) + max_order * (_windowcount + (_windowcount + 1) * (ch + iFrame * channelsCount));
int forder = order == 5 ? 0 : order;
int nbits = forder * (int)frame.subframes[ch].obits + 6 + residualTasks[index].size;
if (residualTasks[index].residualOrder != (order == 5 ? 1 : order))
throw new Exception("oops");
if (frame.subframes[ch].best.size > nbits)
{
frame.subframes[ch].best.type = SubframeType.Fixed;
frame.subframes[ch].best.size = (uint)nbits;
frame.subframes[ch].best.order = forder;
}
}
}
//for (int ch = 0; ch < channelsCount; ch++)
//{
// for (int order = 1; order <= 5 && order <= max_order && order < frame.blocksize; order++)
// {
// int index = (order - 1) + max_order * (_windowcount + (_windowcount + 1) * (ch + iFrame * channelsCount));
// int forder = order == 5 ? 0 : order;
// if (task.ResidualTasks[index].residualOrder != (order == 5 ? 1 : order))
// throw new Exception("oops");
// if (frame.subframes[ch].best.size > task.ResidualTasks[index].size)
// {
// frame.subframes[ch].best.type = SubframeType.Fixed;
// frame.subframes[ch].best.size = (uint)task.ResidualTasks[index].size;
// frame.subframes[ch].best.order = forder;
// }
// }
//}
}
unsafe void estimate_residual(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
@@ -1063,7 +1073,7 @@ namespace CUETools.Codecs.FlaCuda
threads_y = 4;
else
throw new Exception("invalid LPC order");
int partSize = 32 * (threads_y - 1);
int partSize = 32 * threads_y;
int partCount = (blocksize + partSize - 1) / partSize;
if (partCount > maxResidualParts)
@@ -1085,10 +1095,27 @@ namespace CUETools.Codecs.FlaCuda
cuda.SetParameterSize(task.cudaSumResidual, sizeof(uint) * 4U);
cuda.SetFunctionBlockShape(task.cudaSumResidual, 64, 1, 1);
int tasksPerChannel = (_windowcount + 1) * max_order;
int nBestTasks = nResidualTasks / tasksPerChannel;
cuda.SetParameter(task.cudaChooseBestResidual, 0, (uint)task.cudaBestResidualTasks.Pointer);
cuda.SetParameter(task.cudaChooseBestResidual, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaChooseBestResidual, 2 * sizeof(uint), (uint)tasksPerChannel);
cuda.SetParameterSize(task.cudaChooseBestResidual, sizeof(uint) * 3U);
cuda.SetFunctionBlockShape(task.cudaChooseBestResidual, 256, 1, 1);
cuda.SetParameter(task.cudaEncodeResidual, 0, (uint)task.cudaResidual.Pointer);
cuda.SetParameter(task.cudaEncodeResidual, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
cuda.SetParameter(task.cudaEncodeResidual, 2 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer);
cuda.SetParameterSize(task.cudaEncodeResidual, sizeof(uint) * 3U);
cuda.SetFunctionBlockShape(task.cudaEncodeResidual, partSize, 1, 1);
// issue work to the GPU
cuda.LaunchAsync(task.cudaEstimateResidual, partCount, (nResidualTasks / threads_y * nFrames) / maxFrames, task.stream);
cuda.LaunchAsync(task.cudaSumResidual, 1, (nResidualTasks * nFrames) / maxFrames, task.stream);
cuda.CopyDeviceToHostAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nResidualTasks * nFrames) / maxFrames)), task.stream);
cuda.LaunchAsync(task.cudaChooseBestResidual, 1, (nBestTasks * nFrames) / maxFrames, task.stream);
//cuda.LaunchAsync(task.cudaEncodeResidual, partCount, (nBestTasks * nFrames) / maxFrames, task.stream);
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nBestTasks * nFrames) / maxFrames)), task.stream);
//cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), task.stream);
}
unsafe void compute_autocorellation(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
@@ -1762,7 +1789,10 @@ namespace CUETools.Codecs.FlaCuda
public int shift;
public int cbits;
public int size;
public fixed int reserved[11];
public int type;
public int obits;
public int blocksize;
public fixed int reserved[8];
public fixed int coefs[32];
};
@@ -1772,23 +1802,29 @@ namespace CUETools.Codecs.FlaCuda
public CUfunction cudaComputeAutocor;
public CUfunction cudaComputeLPC;
public CUfunction cudaEstimateResidual;
public CUfunction cudaChooseBestResidual;
//public CUfunction cudaSumResidualChunks;
public CUfunction cudaSumResidual;
//public CUfunction cudaEncodeResidual;
public CUfunction cudaEncodeResidual;
public CUdeviceptr cudaSamples;
public CUdeviceptr cudaResidual;
public CUdeviceptr cudaAutocorTasks;
public CUdeviceptr cudaAutocorOutput;
public CUdeviceptr cudaResidualTasks;
public CUdeviceptr cudaResidualOutput;
public CUdeviceptr cudaBestResidualTasks;
public IntPtr samplesBufferPtr = IntPtr.Zero;
public IntPtr residualBufferPtr = IntPtr.Zero;
public IntPtr autocorTasksPtr = IntPtr.Zero;
public IntPtr residualTasksPtr = IntPtr.Zero;
public IntPtr bestResidualTasksPtr = IntPtr.Zero;
public CUstream stream;
public int[] verifyBuffer;
public int blocksize = 0;
public FlacFrame frame;
public int autocorTasksLen;
public int residualTasksLen;
public int bestResidualTasksLen;
public int samplesBufferLen;
unsafe public FlaCudaTask(CUDA _cuda, int channelCount)
@@ -1797,23 +1833,32 @@ namespace CUETools.Codecs.FlaCuda
autocorTasksLen = sizeof(computeAutocorTaskStruct) * channelCount * lpc.MAX_LPC_WINDOWS * FlaCudaWriter.maxFrames;
residualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * lpc.MAX_LPC_ORDER * (lpc.MAX_LPC_WINDOWS + 1) * FlaCudaWriter.maxFrames;
bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames;
samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
cudaSamples = cuda.Allocate((uint)samplesBufferLen);
cudaResidual = cuda.Allocate((uint)samplesBufferLen);
cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen);
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FlaCudaWriter.maxAutocorParts));
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * FlaCudaWriter.maxResidualParts));
CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen);
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)autocorTasksLen);
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)residualTasksLen);
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref bestResidualTasksPtr, (uint)bestResidualTasksLen);
if (cuErr != CUResult.Success)
{
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero;
if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero;
throw new CUDAException(cuErr);
}
@@ -1821,8 +1866,9 @@ namespace CUETools.Codecs.FlaCuda
cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
cudaChooseBestResidual = cuda.GetModuleFunction("cudaChooseBestResidual");
cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
//cudaSumResidualChunks = cuda.GetModuleFunction("cudaSumResidualChunks");
//cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
stream = cuda.CreateStream();
verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; // should be channels, not channelCount. And should null if not doing verify!
@@ -1832,14 +1878,34 @@ namespace CUETools.Codecs.FlaCuda
public void Dispose()
{
cuda.Free(cudaSamples);
cuda.Free(cudaResidual);
cuda.Free(cudaAutocorTasks);
cuda.Free(cudaAutocorOutput);
cuda.Free(cudaResidualTasks);
cuda.Free(cudaResidualOutput);
cuda.Free(cudaBestResidualTasks);
CUDADriver.cuMemFreeHost(samplesBufferPtr);
CUDADriver.cuMemFreeHost(residualBufferPtr);
CUDADriver.cuMemFreeHost(residualTasksPtr);
CUDADriver.cuMemFreeHost(bestResidualTasksPtr);
CUDADriver.cuMemFreeHost(autocorTasksPtr);
cuda.DestroyStream(stream);
}
public unsafe encodeResidualTaskStruct* ResidualTasks
{
get
{
return (encodeResidualTaskStruct*)residualTasksPtr;
}
}
public unsafe encodeResidualTaskStruct* BestResidualTasks
{
get
{
return (encodeResidualTaskStruct*)bestResidualTasksPtr;
}
}
}
}

View File

@@ -28,6 +28,14 @@ typedef struct
int blocksize;
} computeAutocorTaskStruct;
typedef enum
{
Constant = 0,
Verbatim = 1,
Fixed = 8,
LPC = 32
} SubframeType;
typedef struct
{
int residualOrder; // <= 32
@@ -35,7 +43,10 @@ typedef struct
int shift;
int cbits;
int size;
int reserved[11];
int type;
int obits;
int blocksize;
int reserved[8];
int coefs[32];
} encodeResidualTaskStruct;
@@ -201,11 +212,11 @@ extern "C" __global__ void cudaEstimateResidual(
encodeResidualTaskStruct *tasks,
int max_order,
int frameSize,
int partSize // should be 224
int partSize // should be blockDim.x * blockDim.y == 256
)
{
__shared__ struct {
int data[32*8];
int data[32*9];
volatile int residual[32*8];
encodeResidualTaskStruct task[8];
} shared;
@@ -218,22 +229,24 @@ extern "C" __global__ void cudaEstimateResidual(
// fetch samples
shared.data[tid] = tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] : 0;
const int residualLen = max(0,min(frameSize - pos - shared.task[threadIdx.y].residualOrder, partSize)) * (shared.task[threadIdx.y].residualOrder != 0);
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task[0].samplesOffs + pos + tid + partSize] : 0;
const int residualLen = max(0,min(frameSize - pos - shared.task[threadIdx.y].residualOrder, partSize));
__syncthreads();
shared.residual[tid] = 0;
shared.task[threadIdx.y].coefs[threadIdx.x] = threadIdx.x < max_order ? tasks[blockIdx.y * blockDim.y + threadIdx.y].coefs[threadIdx.x] : 0;
for (int i = threadIdx.x; i - threadIdx.x < residualLen; i += blockDim.x) // += 32
for (int i = blockDim.y * (shared.task[threadIdx.y].type == Verbatim); i < blockDim.y; i++) // += 32
{
int ptr = threadIdx.x + (i<<5);
// compute residual
int sum = 0;
int c = 0;
for (c = 0; c < shared.task[threadIdx.y].residualOrder; c++)
sum += __mul24(shared.data[i + c], shared.task[threadIdx.y].coefs[c]);
sum = shared.data[i + c] - (sum >> shared.task[threadIdx.y].shift);
shared.residual[tid] += __mul24(i < residualLen, (sum << 1) ^ (sum >> 31));
sum += __mul24(shared.data[ptr + c], shared.task[threadIdx.y].coefs[c]);
sum = shared.data[ptr + c] - (sum >> shared.task[threadIdx.y].shift);
shared.residual[tid] += __mul24(ptr < residualLen, (sum << 1) ^ (sum >> 31));
}
// enable this line when using blockDim.x == 64
@@ -250,7 +263,7 @@ extern "C" __global__ void cudaEstimateResidual(
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 4]);
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 2]);
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 1]);
if (threadIdx.x == 0 && shared.task[threadIdx.y].residualOrder != 0)
if (threadIdx.x == 0)
output[(blockIdx.y * blockDim.y + threadIdx.y) * gridDim.x + blockIdx.x] = shared.residual[tid];
}
@@ -342,17 +355,79 @@ extern "C" __global__ void cudaSumResidual(
shared.partLen[tid] += shared.partLen[tid + 1];
// return sum
if (tid == 0)
tasks[blockIdx.y].size = shared.partLen[0];
tasks[blockIdx.y].size = shared.task.type == Fixed ?
shared.task.residualOrder * shared.task.obits + 6 + shared.partLen[0] : shared.task.type == LPC ?
shared.task.residualOrder * shared.task.obits + 4 + 5 + shared.task.residualOrder * shared.task.cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[0] :
shared.task.obits * shared.task.blocksize;
}
#define BEST_INDEX(a,b) ((a) + ((b) - (a)) * (shared.length[b] < shared.length[a]))
extern "C" __global__ void cudaChooseBestResidual(
encodeResidualTaskStruct *tasks_out,
encodeResidualTaskStruct *tasks,
int count
)
{
__shared__ struct {
volatile int index[128];
int length[256];
} shared;
//shared.index[threadIdx.x] = threadIdx.x;
shared.length[threadIdx.x] = (threadIdx.x < count) ? tasks[threadIdx.x + count * blockIdx.y].size : 0x7fffffff;
__syncthreads();
//if (threadIdx.x < 128) shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 128]); __syncthreads();
if (threadIdx.x < 128) shared.index[threadIdx.x] = BEST_INDEX(threadIdx.x, threadIdx.x + 128); __syncthreads();
if (threadIdx.x < 64) shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 64]); __syncthreads();
if (threadIdx.x < 32)
{
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 32]);
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 16]);
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 8]);
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 4]);
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 2]);
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 1]);
}
__syncthreads();
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
((int*)(tasks_out + blockIdx.y))[threadIdx.x] = ((int*)(tasks + count * blockIdx.y + shared.index[0]))[threadIdx.x];
// if (threadIdx.x == 0)
//tasks[count * blockIdx.y].best = count * blockIdx.y + shared.index[0];
}
extern "C" __global__ void cudaEncodeResidual(
int*output,
int*samples,
encodeResidualTaskStruct *tasks,
int frameSize,
int partSize // should be <= blockDim - max_order
encodeResidualTaskStruct *tasks
)
{
__shared__ struct {
int data[256 + 32];
encodeResidualTaskStruct task;
} shared;
const int tid = threadIdx.x;
if (threadIdx.x < sizeof(encodeResidualTaskStruct))
((int*)&shared.task)[threadIdx.x] = ((int*)(&tasks[blockIdx.y]))[threadIdx.x];
__syncthreads();
const int partSize = blockDim.x;
const int pos = blockIdx.x * partSize;
const int dataLen = min(shared.task.blocksize - pos, partSize + shared.task.residualOrder);
// fetch samples
shared.data[tid] = tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0;
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task.samplesOffs + pos + tid + partSize] : 0;
const int residualLen = max(0,min(shared.task.blocksize - pos - shared.task.residualOrder, partSize));
__syncthreads();
// compute residual
int sum = 0;
for (int c = 0; c < shared.task.residualOrder; c++)
sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
if (tid < residualLen)
output[shared.task.samplesOffs + pos + tid] = shared.data[tid + shared.task.residualOrder] - (sum >> shared.task.shift);
}
#endif

View File

@@ -1,6 +1,93 @@
architecture {sm_10}
abiversion {1}
modname {cubin}
code {
name = cudaChooseBestResidual
lmem = 0
smem = 1564
reg = 7
bar = 1
const {
segname = const
segnum = 1
offset = 0
bytes = 16
mem {
0x0000007f 0x0000003f 0x0000001f 0x0000002f
}
}
bincode {
0xa0000009 0x04000780 0x3002cdfd 0x6420c7c8
0xa0010003 0x00000000 0x1000f003 0x00000280
0x1000cc01 0x0423c780 0x40014e05 0x00200780
0x30100205 0xc4100780 0x60004e01 0x00204780
0x20000001 0x04008780 0x30070005 0xc4100780
0x30060001 0xc4100780 0x20008200 0x2100ea00
0x20108001 0x00000003 0xd00e0001 0x80c00780
0x10010003 0x00000780 0x103f8001 0x07ffffff
0x00020405 0xc0000782 0x04010e01 0xe4200780
0x3002040d 0xc4100780 0x861ffe03 0x00000000
0x308005fd 0x644107c8 0xa001f003 0x00000000
0x1001f003 0x00000280 0x00000605 0xc0000780
0xd408380d 0x20000780 0xd4043809 0x20000780
0x1c00c001 0x0423c780 0x3800c1fd 0x6c2107c8
0x20008401 0x0000000b 0x10000401 0x0403c500
0x04000e01 0xe4200780 0xf0000001 0xe0000002
0x861ffe03 0x00000000 0x308105fd 0x644107c8
0xa0030003 0x00000000 0x10030003 0x00000280
0x00000605 0xc0000780 0xd4023809 0x20000780
0x0802c00d 0xc0200780 0x0402ce11 0xc0200780
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
0xd0043811 0x20000784 0x1d00e004 0x2940e000
0x3001c005 0x6c20c784 0xd0010001 0x04020780
0x2400ce01 0x04200780 0x04000e01 0xe4200780
0xf0000001 0xe0000002 0x861ffe03 0x00000000
0x308205fd 0x644107c8 0xa0074003 0x00000000
0x10074003 0x00000280 0x00000605 0xc0000780
0xd4013809 0x20000780 0x0802c00d 0xc0200780
0x0402ce11 0xc0200780 0xdc04380d 0x20000780
0x1400ce01 0x0423c780 0xd0043811 0x20000784
0x1d00e004 0x2940e000 0x3001c005 0x6c20c784
0xd0010001 0x04020780 0x2400ce01 0x04200780
0x04000e01 0xe4200780 0x0402ee0d 0xc0200780
0x0402ce09 0xc0200780 0xdc04380d 0x20000780
0x1400ce01 0x0423c780 0xd8043809 0x20000780
0x1c00c005 0x0423c780 0x2440ee01 0x04200780
0x3801c005 0x6c20c780 0xd0010001 0x04020780
0x2400ce01 0x04200780 0x04000e01 0xe4200780
0x0402de0d 0xc0200780 0x0402ce09 0xc0200780
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
0xd8043809 0x20000780 0x1d00e004 0x2540fe00
0x3801c005 0x6c20c780 0xd0010001 0x04020780
0x2400ce01 0x04200780 0x04000e01 0xe4200780
0x0402d60d 0xc0200780 0x0402ce09 0xc0200780
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
0xd8043809 0x20000780 0x1d00e004 0x2540f600
0x3801c005 0x6c20c780 0xd0010001 0x04020780
0x2400ce01 0x04200780 0x04000e01 0xe4200780
0x0402d20d 0xc0200780 0x0402ce09 0xc0200780
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
0xd8043809 0x20000780 0x1d00e004 0x2540f200
0x3801c005 0x6c20c780 0xd0010001 0x04020780
0x2400ce01 0x04200780 0x04000e01 0xe4200780
0x0402d00d 0xc0200780 0x0402ce09 0xc0200780
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
0xd8043809 0x20000780 0x1d00e004 0x2540f000
0x3801c005 0x6c20c780 0xd0010001 0x04020780
0x2400ce01 0x04200780 0x04000e01 0xe4200780
0xf0000001 0xe0000002 0x861ffe03 0x00000000
0x308305fd 0x644107c8 0x30000003 0x00000280
0xa0004e05 0x04200780 0x1000cc01 0x0423c780
0x40020209 0x00000780 0x30100409 0xc4100780
0x60020001 0x00008780 0x3007ce09 0xc4300780
0x3006ce11 0xc4300780 0x30070015 0xc4100780
0x30060019 0xc4100780 0x30070201 0xc4100780
0x30060205 0xc4100780 0x20048408 0x20068a10
0x20018000 0x20028608 0x2104ea10 0x2100e804
0x20000401 0x04010780 0xd00e0001 0x80c00780
0x20000605 0x04004780 0xd00e0201 0xa0c00781
}
}
code {
name = cudaComputeAutocor
lmem = 0
@@ -96,105 +183,121 @@ code {
code {
name = cudaEstimateResidual
lmem = 0
smem = 3624
smem = 3752
reg = 10
bar = 1
const {
segname = const
segnum = 1
offset = 0
bytes = 12
bytes = 20
mem {
0x000003ff 0x0000000f 0x0000000e
0x000003ff 0x0000000f 0x0000001f 0x00000001
0x0000000e
}
}
bincode {
0xd0800205 0x00400780 0xa0000209 0x04000780
0x30070405 0xc4100780 0x3006040d 0xc4100780
0xa0000001 0x04000780 0x20000205 0x0400c780
0x30020015 0xc4100780 0x2000020d 0x04014780
0x308101fd 0x644107c8 0x00000205 0xc0000780
0x00000609 0xc0000780 0xa0015003 0x00000000
0x10015003 0x00000280 0x10004409 0x0023c780
0x60024e05 0x00208780 0x3007020d 0xc4100780
0x30060205 0xc4100780 0x20018604 0x2101ec04
0x20000a05 0x04004780 0xd00e0205 0x80c00780
0x08041401 0xe4204780 0xf0000001 0xe0000002
0x861ffe03 0x00000000 0xa0004c11 0x04200780
0x1000d205 0x0423c780 0x40080619 0x00000780
0xa000420d 0x04200780 0x30100c19 0xc4100780
0x60080411 0x00018780 0x40060a05 0x00000780
0x30100219 0xc4100780 0x1000d205 0x0423c780
0x60060819 0x00018780 0x2101ee20 0x2144f004
0x20000c1d 0x04000780 0x30080221 0xac000780
0x30080ffd 0x6c0187c8 0xd010580d 0x20000780
0x2c00c011 0x04210500 0x20000e11 0x04010500
0x30020811 0xc4100500 0x2000ca11 0x04210500
0xd00e0811 0x80c00500 0x1000f811 0x0403c280
0x00020e0d 0xc0000780 0x0c001401 0xe4210780
0xd410500d 0x20000780 0x3c00c005 0x04204780
0x3c7cc011 0x6c208780 0x3001d205 0xac200780
0x307c0205 0x8c000780 0xd0040211 0x04020780
0xd4005005 0x20000780 0x861ffe03 0x00000000
0x3000cffd 0x6420c7c8 0xa0045003 0x00000000
0x00020e0d 0xc0000780 0x0c021401 0xe43f0780
0x10044003 0x00000280 0x10004409 0x0023c780
0x60024e05 0x00208780 0x30070221 0xc4100780
0x30060205 0xc4100780 0x20019004 0x2101ec04
0x20000a05 0x04004780 0x20008205 0x00000007
0xd00e0205 0x80c00780 0x10045003 0x00000780
0x1000f805 0x0403c780 0x307c09fd 0x640087ca
0x08043401 0xe4204780 0xa0072003 0x00000000
0x10000005 0x0403c780 0x10072003 0x00000280
0xd4100009 0x20000780 0x387cc1fd 0x6c20c7c8
0xa005e003 0x00000000 0x1000f815 0x0403c780
0x1000f825 0x0403c780 0x1005e003 0x00000280
0xa005c003 0x00000000 0xd4000009 0x20000780
0x20000221 0x04014780 0x0002100d 0xc0000780
0xd8108011 0x20000780 0x1000c021 0x0423c784
0x6c08d425 0x80224780 0xd410000d 0x20000780
0x20018a15 0x00000003 0x3c05c1fd 0x6c2147c8
0xd8000809 0x20000780 0x10052003 0x00000280
0xd4100009 0x20000782 0x1800c015 0x0423c780
0x20000215 0x04014782 0x00020a09 0xc0000780
0xd410100d 0x20000780 0x1c00c015 0x0423c780
0x30051215 0xec000780 0x2840d415 0x04214780
0x301f0a21 0xec100780 0x30010a15 0xc4100780
0xd0051021 0x04008780 0x00020e0d 0xc0000780
0xdc085009 0x20000780 0x30010825 0x6c010780
0x1800c015 0x0423c780 0xa0001225 0x2c014780
0x60081215 0x80014780 0x20000205 0x0400c780
0x0c021401 0xe4214780 0x20400215 0x04000780
0x300509fd 0x640107c8 0x1004a003 0x00000280
0x00020e0d 0xc0000782 0xdc085011 0x20000780
0x1000e005 0x0423c784 0x2000c005 0x04204784
0x0c021401 0xe4204780 0x1000d005 0x0423c784
0x2000c005 0x04204784 0x0c021401 0xe4204780
0x1000c805 0x0423c784 0x2000c005 0x04204784
0x0c021401 0xe4204780 0x1000c405 0x0423c784
0x2000c005 0x04204784 0x0c021401 0xe4204780
0x20018005 0x00000003 0x1000c20d 0x0423c784
0x2000c00d 0x0420c784 0x40031015 0x00000780
0x0c021401 0xe420c780 0x6002120d 0x00014780
0x30820021 0x64410780 0x00020c09 0xc0000780
0xd8085009 0x20000780 0x30100615 0xc4100780
0xa0001019 0x2c014780 0x3001080d 0xec100780
0x60021011 0x00014780 0x407f8c15 0x0007ffff
0x2943e004 0x2005880c 0x30000205 0xec000780
0x20000205 0x0400c780 0x0c021401 0xe4204780
0x1000d005 0x0423c784 0x3001c005 0xac200784
0x0c021401 0xe4204780 0x1000c805 0x0423c784
0x3001c005 0xac200784 0x0c021401 0xe4204780
0x1000c405 0x0423c784 0x3001c005 0xac200784
0x0c021401 0xe4204780 0x1000c205 0x0423c784
0x3001c005 0xac200784 0x307c01fd 0x640147c8
0x0c021401 0xe4204780 0x30000003 0x00000280
0xd4100005 0x20000780 0x347cc1fd 0x6c2087c8
0x30000003 0x00000280 0x10004401 0x0023c780
0x60004e01 0x00208780 0x40014805 0x00200780
0xa0000011 0x04000780 0x20000201 0x0400c780
0x30020805 0xc4100780 0x2000000d 0x04004780
0x00000005 0xc0000780 0x308109fd 0x644107c8
0x00000609 0xc0000780 0xa0018003 0x00000000
0xa0004401 0x04200780 0x10018003 0x00000280
0x40014e0d 0x00200780 0x3010060d 0xc4100780
0x60004e01 0x0020c780 0x20000001 0x04008780
0x3007000d 0xc4100780 0x30060001 0xc4100780
0x20008600 0x2100ec00 0x20000201 0x04000780
0xd00e0001 0x80c00780 0x08045401 0xe4200780
0xf0000001 0xe0000002 0x861ffe03 0x00000000
0xa0004c0d 0x04200780 0x1000d201 0x0423c780
0x40060215 0x00000780 0x30100a15 0xc4100780
0x6006000d 0x00014780 0x40054201 0x00200780
0x30100015 0xc4100780 0x1000d201 0x0423c780
0x60044215 0x00214780 0x2100ee1c 0x2143f000
0x20000a19 0x04010780 0x3007001d 0xac000780
0x30060ffd 0x6c00c7c8 0xa0030003 0x00000000
0x1002f003 0x00000280 0xd011580d 0x20000780
0x2d03e020 0x20088c20 0x30021021 0xc4100780
0x2000ca21 0x04220780 0xd00e1021 0x80c00780
0x10030003 0x00000780 0x1000f821 0x0403c780
0x00020c0d 0xc0000782 0x0c001401 0xe4220780
0x30820dfd 0x6c4107c8 0xa0043003 0x00000000
0x10043003 0x00000280 0x2000d221 0x04218780
0x0002100d 0xc0000780 0x30080ffd 0x6c00c7c8
0xa0042003 0x00000000 0x10041003 0x00000280
0xd0115811 0x20000780 0x2000c00d 0x0420c784
0x2106f21c 0x2007860c 0x3002060d 0xc4100780
0x2000ca0d 0x0420c780 0xd00e060d 0x80c00780
0x10042003 0x00000780 0x1000f80d 0x0403c780
0x0c001401 0xe420c782 0xd411500d 0x20000782
0x3c00c00d 0x04200780 0xd4005005 0x20000780
0x861ffe03 0x00000000 0x3004cffd 0x6420c7c8
0xa0059003 0x00000000 0x00020c0d 0xc0000780
0x0c025401 0xe43f0780 0x10058003 0x00000280
0xa0004401 0x04200780 0x40014e1d 0x00200780
0x30100e1d 0xc4100780 0x60004e01 0x0021c780
0x20000001 0x04008780 0x3007001d 0xc4100780
0x30060001 0xc4100780 0x20008e00 0x2100ec00
0x20000201 0x04000780 0x20008001 0x00000007
0xd00e0001 0x80c00780 0x10059003 0x00000780
0x1000f801 0x0403c780 0x08047401 0xe4200782
0xd4112809 0x20000780 0x3883c005 0x6c608780
0xa0004401 0x04200780 0xd001001d 0x04000780
0x30000ffd 0x640187c8 0xa0093003 0x00000000
0x10091003 0x00000280 0x3003d201 0xac200780
0x307c000d 0x8c000780 0xd4110009 0x20000780
0x387cc1fd 0x6c20c7c8 0xa007c003 0x00000000
0x1000f801 0x0403c780 0x1000f825 0x0403c780
0x1007a003 0x00000280 0x30050e05 0xc4100780
0x20000221 0x04010780 0x200a9005 0x00000003
0x0002020d 0xc0000780 0xa0077003 0x00000000
0xd4000009 0x20000780 0xd8118011 0x20000780
0x1000c005 0x0423c784 0x6e01c225 0x80224780
0x20018001 0x00000003 0xd4110011 0x20000780
0x3000c1fd 0x6c2147cc 0xd8000809 0x20000780
0x1006f003 0x00000280 0xd4110009 0x20000782
0x1800c001 0x0423c780 0x1007c003 0x00000780
0x30050e05 0xc4100780 0x20000221 0x04010780
0x20001001 0x04000782 0x00020009 0xc0000780
0xd411100d 0x20000780 0x1c00c001 0x0423c780
0x30001201 0xec000780 0x2840d401 0x04200780
0x301f0005 0xec100780 0x30010001 0xc4100780
0xd0000205 0x04008780 0x30080601 0x6c010780
0x00020c0d 0xc0000780 0xdc095009 0x20000780
0xa0000021 0x2c014780 0x1800c001 0x0423c780
0x60011005 0x80000780 0x20018e1d 0x00000003
0xa0004401 0x04200780 0x30000ffd 0x640147c8
0x0c025401 0xe4204780 0x10063003 0x00000280
0x10093003 0x00000780 0x3003d201 0xac200780
0x307c000d 0x8c000780 0x00020c0d 0xc0000782
0xdc095005 0x20000780 0x1400e001 0x0423c780
0x2400c001 0x04200780 0x0c025401 0xe4200780
0x1500f000 0x2500e000 0x0c025401 0xe4200780
0x1500e800 0x2500e000 0x0c025401 0xe4200780
0x1500e400 0x2500e000 0x0c025401 0xe4200780
0x1400c205 0x0423c780 0x20018801 0x00000003
0x2400c005 0x04204780 0x0c025401 0xe4204780
0x40010c05 0x00000780 0x60000e05 0x00004780
0x30100205 0xc4100780 0x60000c01 0x00004780
0x30840805 0x64410780 0x00020a09 0xc0000780
0xd8095009 0x20000780 0xa0000215 0x2c014780
0x30010605 0xec100780 0x407f8a0d 0x0007ffff
0x2941e004 0x2003800c 0x30040201 0xec000780
0x20000001 0x0400c780 0x0c025401 0xe4200780
0x1400d001 0x0423c780 0x3400c001 0xac200780
0x0c025401 0xe4200780 0x1400c801 0x0423c780
0x3400c001 0xac200780 0x0c025401 0xe4200780
0x1400c401 0x0423c780 0x3400c001 0xac200780
0x0c025401 0xe4200780 0x1400c201 0x0423c780
0x3400c001 0xac200780 0x307c09fd 0x640147c8
0x0c025401 0xe4200780 0x30000003 0x00000280
0xa0004401 0x04200780 0x40014e05 0x00200780
0x30100205 0xc4100780 0x60004e01 0x00204780
0x20000001 0x04008780 0x40014805 0x00200780
0x30100205 0xc4100780 0x60004801 0x00204780
0xa0004c11 0x04200780 0x20000001 0x04010780
0x00020e0d 0xc0000780 0xdc085005 0x20000780
0xa0004c0d 0x04200780 0x20000001 0x0400c780
0x00020c0d 0xc0000780 0xdc095005 0x20000780
0x30020005 0xc4100780 0x1500e000 0x2101e804
0xd00e0201 0xa0c00781
}
@@ -202,11 +305,62 @@ code {
code {
name = cudaEncodeResidual
lmem = 0
smem = 36
reg = 0
bar = 0
smem = 1372
reg = 6
bar = 1
const {
segname = const
segnum = 1
offset = 0
bytes = 8
mem {
0x000000bf 0x0000001f
}
}
bincode {
0xf0000001 0xe0000001
0xa0000005 0x04000780 0x308003fd 0x644107c8
0xa000d003 0x00000000 0x1000d003 0x00000280
0xa0004e01 0x04200780 0x30070009 0xc4100780
0x30060001 0xc4100780 0x20000401 0x04000780
0x30020209 0xc4100780 0x2100ec00 0x20008400
0xd00e0001 0x80c00780 0x00020205 0xc0000780
0x04024e01 0xe4200780 0xf0000001 0xe0000002
0x861ffe03 0x00000000 0xa0004201 0x04200780
0x40014c09 0x00200780 0x30100409 0xc4100780
0xd0093805 0x20000780 0x60004c09 0x00208780
0x2500e00c 0x2542ee10 0x3004060d 0xac000780
0x300107fd 0x6c00c7c8 0xa0020003 0x00000000
0x1001f003 0x00000280 0xd0094005 0x20000780
0x2502e010 0x20048210 0x30020811 0xc4100780
0x2000ca11 0x04210780 0xd00e0811 0x80c00780
0x10020003 0x00000780 0x1000f811 0x0403c780
0x00020205 0xc0000782 0x308103fd 0x6c4107c8
0x04000e01 0xe4210780 0xa0033003 0x00000000
0x10033003 0x00000280 0x20000011 0x04004780
0x300309fd 0x6c0187c8 0x00020805 0xc0000780
0xa0032003 0x00000000 0x10031003 0x00000280
0xd0094009 0x20000780 0x2001800c 0x2902e010
0x2000060d 0x04010780 0x3002060d 0xc4100780
0x2000ca0d 0x0420c780 0xd00e060d 0x80c00780
0x10032003 0x00000780 0x1000f80d 0x0403c780
0x04000e01 0xe420c782 0xd0093805 0x20000782
0x2542ee0c 0x3503e00c 0x30030001 0xac000780
0x307c0011 0x8c000780 0x861ffe03 0x00000000
0xd0093805 0x20000780 0x347cc1fd 0x6c20c7c8
0x1000f80d 0x0403c780 0x1400c001 0x0423c780
0x1004b003 0x00000280 0x101c8001 0x00000003
0x00000005 0xc0000780 0x1000f815 0x0403c780
0x20000a01 0x04004780 0xd409800d 0x20000780
0x00020009 0xc0000780 0xd0093811 0x20000780
0x20018a15 0x00000003 0x1c00c001 0x0423c780
0x3005c1fd 0x6c2147cc 0x6800ce0d 0x8020c780
0xd4000805 0x20000780 0x1000c001 0x0423c784
0x10040003 0x00000280 0x300109fd 0x6c00c7c8
0x30000003 0x00000280 0xd0094005 0x20000780
0x2502e008 0x20008210 0x1500e200 0x20028204
0x00020805 0xc0000780 0x30000609 0xec000780
0x30020201 0xc4100780 0x2542ee04 0x2100e800
0xd00e0005 0xa0c00781
}
}
code {
@@ -422,15 +576,16 @@ code {
name = cudaSumResidual
lmem = 0
smem = 1248
reg = 4
reg = 5
bar = 1
const {
segname = const
segnum = 1
offset = 0
bytes = 8
bytes = 20
mem {
0x0000002f 0x0000001f
0x0000002f 0x0000001f 0x00000008 0x00000020
0x00000001
}
}
bincode {
@@ -460,9 +615,26 @@ code {
0x2400d401 0x04200780 0x04001001 0xe4200780
0x2400d201 0x04200780 0x307c03fd 0x6c0147c8
0x04001001 0xe4200780 0x30000003 0x00000280
0xa0004e01 0x04200780 0x30070005 0xc4100780
0x30060001 0xc4100780 0x20008200 0x2100e804
0x1000d001 0x0423c780 0x20108205 0x00000003
0xd00e0201 0xa0c00781
0xd0086805 0x20000780 0x3482c1fd 0x6c6147c8
0x10040003 0x00000280 0xd0084005 0x20000780
0x1500ec00 0x1500e004 0x40030009 0x00000780
0x60020209 0x00008780 0x30100409 0xc4100780
0x60020001 0x00008780 0x2000d001 0x04200780
0x20068001 0x00000003 0x10058003 0x00000780
0xd0086805 0x20000780 0x3483c1fd 0x6c6147c8
0x10052003 0x00000280 0xd0084005 0x20000780
0x1500e604 0x1500e000 0x2400cc05 0x04204780
0x3002ce0d 0xc4300780 0x40030009 0x00000780
0x301f0611 0xec100780 0x60020209 0x00008780
0xd0840811 0x04400780 0x30100409 0xc4100780
0x2000080d 0x0400c780 0x60020001 0x00008780
0x30010605 0xec100780 0x20018000 0x2100f000
0x200f8001 0x00000003 0x10058003 0x00000780
0xd0087005 0x20000780 0x1500e000 0x1500e204
0x40030009 0x00000780 0x60020209 0x00008780
0x30100409 0xc4100780 0x60020001 0x00008780
0xa0004e05 0x04200780 0x30070209 0xc4100780
0x30060205 0xc4100780 0x20018404 0x2101e804
0x20108205 0x00000003 0xd00e0201 0xa0c00781
}
}