mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
trying to do rice partitioning on gpu
This commit is contained in:
@@ -1055,6 +1055,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift;
|
frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift;
|
||||||
frame.subframes[ch].obits -= (uint)task.BestResidualTasks[index].wbits;
|
frame.subframes[ch].obits -= (uint)task.BestResidualTasks[index].wbits;
|
||||||
frame.subframes[ch].wbits = (uint)task.BestResidualTasks[index].wbits;
|
frame.subframes[ch].wbits = (uint)task.BestResidualTasks[index].wbits;
|
||||||
|
frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
|
||||||
if (frame.subframes[ch].wbits != 0)
|
if (frame.subframes[ch].wbits != 0)
|
||||||
for (int i = 0; i < frame.blocksize; i++)
|
for (int i = 0; i < frame.blocksize; i++)
|
||||||
frame.subframes[ch].samples[i] >>= (int)frame.subframes[ch].wbits;
|
frame.subframes[ch].samples[i] >>= (int)frame.subframes[ch].wbits;
|
||||||
@@ -1062,32 +1063,10 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
|
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
|
||||||
if (!encode_on_cpu && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
|
if (!encode_on_cpu && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
|
||||||
{
|
{
|
||||||
frame.subframes[ch].best.size = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 6;
|
|
||||||
if (frame.subframes[ch].best.type == SubframeType.LPC)
|
|
||||||
frame.subframes[ch].best.size += 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits;
|
|
||||||
AudioSamples.MemCpy(frame.subframes[ch].best.residual, (int*)task.residualBufferPtr + task.BestResidualTasks[index].residualOffs, frame.blocksize);
|
AudioSamples.MemCpy(frame.subframes[ch].best.residual, (int*)task.residualBufferPtr + task.BestResidualTasks[index].residualOffs, frame.blocksize);
|
||||||
int* riceParams = ((int*)task.riceParamsPtr) + (4 << task.max_porder) * index;
|
int* riceParams = ((int*)task.bestRiceParamsPtr) + (index << task.max_porder);
|
||||||
int* partLengths = ((int*)task.riceParamsPtr) + (4 << task.max_porder) * index + (2 << task.max_porder);
|
for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++)
|
||||||
int opt_porder = task.max_porder;
|
frame.subframes[ch].best.rc.rparams[i] = riceParams[i];
|
||||||
int opt_pos = 0;
|
|
||||||
int opt_bits = 0xfffffff;
|
|
||||||
for (int porder = task.max_porder; porder >= 0; porder--)
|
|
||||||
{
|
|
||||||
int in_pos = (2 << task.max_porder) - (2 << porder);
|
|
||||||
int sum = (1 << porder) * 4;
|
|
||||||
for (int p = 0; p < (1 << porder); p++)
|
|
||||||
sum += partLengths[in_pos + p];// +(riceParams[in_pos + p] + 1) * ((frame.blocksize >> porder) - (p != 0 ? 0 : frame.subframes[ch].best.order));
|
|
||||||
if (sum < opt_bits)
|
|
||||||
{
|
|
||||||
opt_bits = sum;
|
|
||||||
opt_porder = porder;
|
|
||||||
opt_pos = in_pos;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
frame.subframes[ch].best.rc.porder = opt_porder;
|
|
||||||
for (int i = 0; i < (1 << opt_porder); i++)
|
|
||||||
frame.subframes[ch].best.rc.rparams[i] = riceParams[opt_pos + i];
|
|
||||||
frame.subframes[ch].best.size += (uint)opt_bits;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1132,10 +1111,10 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
calcPartitionPartSize <<= 1;
|
calcPartitionPartSize <<= 1;
|
||||||
max_porder--;
|
max_porder--;
|
||||||
}
|
}
|
||||||
int calcPartitionPartCount = (calcPartitionPartSize >= 64) ? 1 : (256 / calcPartitionPartSize);
|
int calcPartitionPartCount = (calcPartitionPartSize >= 128) ? 1 : (256 / calcPartitionPartSize);
|
||||||
|
|
||||||
CUfunction cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : task.cudaChannelDecorr;
|
CUfunction cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : task.cudaChannelDecorr;
|
||||||
CUfunction cudaCalcPartition = calcPartitionPartSize >= 64 ? task.cudaCalcLargePartition : task.cudaCalcPartition;
|
CUfunction cudaCalcPartition = calcPartitionPartSize >= 128 ? task.cudaCalcLargePartition : task.cudaCalcPartition;
|
||||||
|
|
||||||
cuda.SetParameter(cudaChannelDecorr, 0 * sizeof(uint), (uint)task.cudaSamples.Pointer);
|
cuda.SetParameter(cudaChannelDecorr, 0 * sizeof(uint), (uint)task.cudaSamples.Pointer);
|
||||||
cuda.SetParameter(cudaChannelDecorr, 1 * sizeof(uint), (uint)task.cudaSamplesBytes.Pointer);
|
cuda.SetParameter(cudaChannelDecorr, 1 * sizeof(uint), (uint)task.cudaSamplesBytes.Pointer);
|
||||||
@@ -1229,7 +1208,14 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
cuda.SetParameter(task.cudaFindRiceParameter, 1 * sizeof(uint), (uint)task.cudaPartitions.Pointer);
|
cuda.SetParameter(task.cudaFindRiceParameter, 1 * sizeof(uint), (uint)task.cudaPartitions.Pointer);
|
||||||
cuda.SetParameter(task.cudaFindRiceParameter, 2 * sizeof(uint), (uint)max_porder);
|
cuda.SetParameter(task.cudaFindRiceParameter, 2 * sizeof(uint), (uint)max_porder);
|
||||||
cuda.SetParameterSize(task.cudaFindRiceParameter, 3U * sizeof(uint));
|
cuda.SetParameterSize(task.cudaFindRiceParameter, 3U * sizeof(uint));
|
||||||
cuda.SetFunctionBlockShape(task.cudaFindRiceParameter, 16, 16, 1);
|
cuda.SetFunctionBlockShape(task.cudaFindRiceParameter, 8, 32, 1);
|
||||||
|
|
||||||
|
cuda.SetParameter(task.cudaFindPartitionOrder, 0, (uint)task.cudaBestRiceParams.Pointer);
|
||||||
|
cuda.SetParameter(task.cudaFindPartitionOrder, 1 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer);
|
||||||
|
cuda.SetParameter(task.cudaFindPartitionOrder, 2 * sizeof(uint), (uint)task.cudaRiceParams.Pointer);
|
||||||
|
cuda.SetParameter(task.cudaFindPartitionOrder, 3 * sizeof(uint), (uint)max_porder);
|
||||||
|
cuda.SetParameterSize(task.cudaFindPartitionOrder, 4U * sizeof(uint));
|
||||||
|
cuda.SetFunctionBlockShape(task.cudaFindPartitionOrder, 256, 1, 1);
|
||||||
|
|
||||||
// issue work to the GPU
|
// issue work to the GPU
|
||||||
cuda.LaunchAsync(cudaChannelDecorr, (task.frameCount * task.frameSize + 255) / 256, channels == 2 ? 1 : channels, task.stream);
|
cuda.LaunchAsync(cudaChannelDecorr, (task.frameCount * task.frameSize + 255) / 256, channels == 2 ? 1 : channels, task.stream);
|
||||||
@@ -1255,9 +1241,11 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
cuda.LaunchAsync(cudaCalcPartition, (task.frameSize + bsz - 1) / bsz, channels * task.frameCount, task.stream);
|
cuda.LaunchAsync(cudaCalcPartition, (task.frameSize + bsz - 1) / bsz, channels * task.frameCount, task.stream);
|
||||||
if (max_porder > 0)
|
if (max_porder > 0)
|
||||||
cuda.LaunchAsync(task.cudaSumPartition, Flake.MAX_RICE_PARAM + 1, channels * task.frameCount, task.stream);
|
cuda.LaunchAsync(task.cudaSumPartition, Flake.MAX_RICE_PARAM + 1, channels * task.frameCount, task.stream);
|
||||||
cuda.LaunchAsync(task.cudaFindRiceParameter, ((2 << max_porder) + 15) / 16, channels * task.frameCount, task.stream);
|
cuda.LaunchAsync(task.cudaFindRiceParameter, ((2 << max_porder) + 31) / 32, channels * task.frameCount, task.stream);
|
||||||
|
//if (max_porder > 0) // need to run even if max_porder==0 just to calculate the final frame size
|
||||||
|
cuda.LaunchAsync(task.cudaFindPartitionOrder, 1, channels * task.frameCount, task.stream);
|
||||||
cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * MAX_BLOCKSIZE * channels), task.stream);
|
cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * MAX_BLOCKSIZE * channels), task.stream);
|
||||||
cuda.CopyDeviceToHostAsync(task.cudaRiceParams, task.riceParamsPtr, (uint)(sizeof(int) * (4 << max_porder) * channels * task.frameCount), task.stream);
|
cuda.CopyDeviceToHostAsync(task.cudaBestRiceParams, task.bestRiceParamsPtr, (uint)(sizeof(int) * (1 << max_porder) * channels * task.frameCount), task.stream);
|
||||||
task.max_porder = max_porder;
|
task.max_porder = max_porder;
|
||||||
}
|
}
|
||||||
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * channels * task.frameCount), task.stream);
|
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * channels * task.frameCount), task.stream);
|
||||||
@@ -1925,7 +1913,8 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
public int residualOffs;
|
public int residualOffs;
|
||||||
public int wbits;
|
public int wbits;
|
||||||
public int abits;
|
public int abits;
|
||||||
public fixed int reserved[3];
|
public int porder;
|
||||||
|
public fixed int reserved[2];
|
||||||
public fixed int coefs[32];
|
public fixed int coefs[32];
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -1948,11 +1937,13 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
public CUfunction cudaCalcLargePartition;
|
public CUfunction cudaCalcLargePartition;
|
||||||
public CUfunction cudaSumPartition;
|
public CUfunction cudaSumPartition;
|
||||||
public CUfunction cudaFindRiceParameter;
|
public CUfunction cudaFindRiceParameter;
|
||||||
|
public CUfunction cudaFindPartitionOrder;
|
||||||
public CUdeviceptr cudaSamplesBytes;
|
public CUdeviceptr cudaSamplesBytes;
|
||||||
public CUdeviceptr cudaSamples;
|
public CUdeviceptr cudaSamples;
|
||||||
public CUdeviceptr cudaResidual;
|
public CUdeviceptr cudaResidual;
|
||||||
public CUdeviceptr cudaPartitions;
|
public CUdeviceptr cudaPartitions;
|
||||||
public CUdeviceptr cudaRiceParams;
|
public CUdeviceptr cudaRiceParams;
|
||||||
|
public CUdeviceptr cudaBestRiceParams;
|
||||||
public CUdeviceptr cudaAutocorTasks;
|
public CUdeviceptr cudaAutocorTasks;
|
||||||
public CUdeviceptr cudaAutocorOutput;
|
public CUdeviceptr cudaAutocorOutput;
|
||||||
public CUdeviceptr cudaResidualTasks;
|
public CUdeviceptr cudaResidualTasks;
|
||||||
@@ -1961,7 +1952,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
public IntPtr samplesBytesPtr = IntPtr.Zero;
|
public IntPtr samplesBytesPtr = IntPtr.Zero;
|
||||||
public IntPtr samplesBufferPtr = IntPtr.Zero;
|
public IntPtr samplesBufferPtr = IntPtr.Zero;
|
||||||
public IntPtr residualBufferPtr = IntPtr.Zero;
|
public IntPtr residualBufferPtr = IntPtr.Zero;
|
||||||
public IntPtr riceParamsPtr = IntPtr.Zero;
|
public IntPtr bestRiceParamsPtr = IntPtr.Zero;
|
||||||
public IntPtr autocorTasksPtr = IntPtr.Zero;
|
public IntPtr autocorTasksPtr = IntPtr.Zero;
|
||||||
public IntPtr residualTasksPtr = IntPtr.Zero;
|
public IntPtr residualTasksPtr = IntPtr.Zero;
|
||||||
public IntPtr bestResidualTasksPtr = IntPtr.Zero;
|
public IntPtr bestResidualTasksPtr = IntPtr.Zero;
|
||||||
@@ -1996,6 +1987,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
cudaResidual = cuda.Allocate((uint)samplesBufferLen);
|
cudaResidual = cuda.Allocate((uint)samplesBufferLen);
|
||||||
cudaPartitions = cuda.Allocate((uint)partitionsLen);
|
cudaPartitions = cuda.Allocate((uint)partitionsLen);
|
||||||
cudaRiceParams = cuda.Allocate((uint)riceParamsLen);
|
cudaRiceParams = cuda.Allocate((uint)riceParamsLen);
|
||||||
|
cudaBestRiceParams = cuda.Allocate((uint)riceParamsLen / 4);
|
||||||
cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen);
|
cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen);
|
||||||
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
|
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
|
||||||
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
|
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
|
||||||
@@ -2009,7 +2001,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
if (cuErr == CUResult.Success)
|
if (cuErr == CUResult.Success)
|
||||||
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
|
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
|
||||||
if (cuErr == CUResult.Success)
|
if (cuErr == CUResult.Success)
|
||||||
cuErr = CUDADriver.cuMemAllocHost(ref riceParamsPtr, (uint)riceParamsLen);
|
cuErr = CUDADriver.cuMemAllocHost(ref bestRiceParamsPtr, (uint)riceParamsLen / 4);
|
||||||
if (cuErr == CUResult.Success)
|
if (cuErr == CUResult.Success)
|
||||||
cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)autocorTasksLen);
|
cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)autocorTasksLen);
|
||||||
if (cuErr == CUResult.Success)
|
if (cuErr == CUResult.Success)
|
||||||
@@ -2021,7 +2013,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero;
|
if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero;
|
||||||
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
|
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
|
||||||
if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero;
|
if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero;
|
||||||
if (riceParamsPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(riceParamsPtr); riceParamsPtr = IntPtr.Zero;
|
if (bestRiceParamsPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestRiceParamsPtr); bestRiceParamsPtr = IntPtr.Zero;
|
||||||
if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
|
if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
|
||||||
if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
|
if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
|
||||||
if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero;
|
if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero;
|
||||||
@@ -2044,6 +2036,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
cudaCalcLargePartition = cuda.GetModuleFunction("cudaCalcLargePartition");
|
cudaCalcLargePartition = cuda.GetModuleFunction("cudaCalcLargePartition");
|
||||||
cudaSumPartition = cuda.GetModuleFunction("cudaSumPartition");
|
cudaSumPartition = cuda.GetModuleFunction("cudaSumPartition");
|
||||||
cudaFindRiceParameter = cuda.GetModuleFunction("cudaFindRiceParameter");
|
cudaFindRiceParameter = cuda.GetModuleFunction("cudaFindRiceParameter");
|
||||||
|
cudaFindPartitionOrder = cuda.GetModuleFunction("cudaFindPartitionOrder");
|
||||||
|
|
||||||
stream = cuda.CreateStream();
|
stream = cuda.CreateStream();
|
||||||
verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; // should be channels, not channelCount. And should null if not doing verify!
|
verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; // should be channels, not channelCount. And should null if not doing verify!
|
||||||
@@ -2064,7 +2057,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
CUDADriver.cuMemFreeHost(samplesBytesPtr);
|
CUDADriver.cuMemFreeHost(samplesBytesPtr);
|
||||||
CUDADriver.cuMemFreeHost(samplesBufferPtr);
|
CUDADriver.cuMemFreeHost(samplesBufferPtr);
|
||||||
CUDADriver.cuMemFreeHost(residualBufferPtr);
|
CUDADriver.cuMemFreeHost(residualBufferPtr);
|
||||||
CUDADriver.cuMemFreeHost(riceParamsPtr);
|
CUDADriver.cuMemFreeHost(bestRiceParamsPtr);
|
||||||
CUDADriver.cuMemFreeHost(residualTasksPtr);
|
CUDADriver.cuMemFreeHost(residualTasksPtr);
|
||||||
CUDADriver.cuMemFreeHost(bestResidualTasksPtr);
|
CUDADriver.cuMemFreeHost(bestResidualTasksPtr);
|
||||||
CUDADriver.cuMemFreeHost(autocorTasksPtr);
|
CUDADriver.cuMemFreeHost(autocorTasksPtr);
|
||||||
|
|||||||
@@ -52,8 +52,9 @@ typedef struct
|
|||||||
int residualOffs;
|
int residualOffs;
|
||||||
int wbits;
|
int wbits;
|
||||||
int abits;
|
int abits;
|
||||||
int reserved[3];
|
int porder;
|
||||||
int coefs[32];
|
int reserved[2];
|
||||||
|
int coefs[32]; // fixme: should be short?
|
||||||
} encodeResidualTaskStruct;
|
} encodeResidualTaskStruct;
|
||||||
|
|
||||||
#define SUM16(buf,tid,op) buf[tid] op buf[tid + 8]; buf[tid] op buf[tid + 4]; buf[tid] op buf[tid + 2]; buf[tid] op buf[tid + 1];
|
#define SUM16(buf,tid,op) buf[tid] op buf[tid + 8]; buf[tid] op buf[tid + 4]; buf[tid] op buf[tid + 2]; buf[tid] op buf[tid + 1];
|
||||||
@@ -668,7 +669,7 @@ extern "C" __global__ void cudaChooseBestMethod(
|
|||||||
int obits = shared.task[threadIdx.y].obits - shared.task[threadIdx.y].wbits;
|
int obits = shared.task[threadIdx.y].obits - shared.task[threadIdx.y].wbits;
|
||||||
shared.length[task + threadIdx.y] =
|
shared.length[task + threadIdx.y] =
|
||||||
min(obits * shared.task[threadIdx.y].blocksize,
|
min(obits * shared.task[threadIdx.y].blocksize,
|
||||||
shared.task[threadIdx.y].type == Fixed ? shared.task[threadIdx.y].residualOrder * obits + 6 + shared.partLen[threadIdx.y * 32] :
|
shared.task[threadIdx.y].type == Fixed ? shared.task[threadIdx.y].residualOrder * obits + 6 + (4 * partCount/2) + shared.partLen[threadIdx.y * 32] :
|
||||||
shared.task[threadIdx.y].type == LPC ? shared.task[threadIdx.y].residualOrder * obits + 4 + 5 + shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[threadIdx.y * 32] :
|
shared.task[threadIdx.y].type == LPC ? shared.task[threadIdx.y].residualOrder * obits + 4 + 5 + shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[threadIdx.y * 32] :
|
||||||
shared.task[threadIdx.y].type == Constant ? obits * (1 + shared.task[threadIdx.y].blocksize * (shared.partLen[threadIdx.y * 32] != 0)) :
|
shared.task[threadIdx.y].type == Constant ? obits * (1 + shared.task[threadIdx.y].blocksize * (shared.partLen[threadIdx.y * 32] != 0)) :
|
||||||
obits * shared.task[threadIdx.y].blocksize);
|
obits * shared.task[threadIdx.y].blocksize);
|
||||||
@@ -846,13 +847,16 @@ extern "C" __global__ void cudaCalcPartition(
|
|||||||
int s = (offs >= shared.task.residualOrder && tid < parts * psize) ? residual[shared.task.residualOffs + offs] : 0;
|
int s = (offs >= shared.task.residualOrder && tid < parts * psize) ? residual[shared.task.residualOffs + offs] : 0;
|
||||||
// convert to unsigned
|
// convert to unsigned
|
||||||
shared.data[tid] = min(0xfffff, (s << 1) ^ (s >> 31));
|
shared.data[tid] = min(0xfffff, (s << 1) ^ (s >> 31));
|
||||||
shared.length[tid] = (psize - shared.task.residualOrder * (threadIdx.y + blockIdx.x == 0)) * (threadIdx.x + 1);
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
|
int sum = 0;
|
||||||
|
int dpos = threadIdx.y * psize;
|
||||||
// calc number of unary bits for each residual part with each rice paramater
|
// calc number of unary bits for each residual part with each rice paramater
|
||||||
|
#pragma unroll 0
|
||||||
for (int i = 0; i < psize; i++)
|
for (int i = 0; i < psize; i++)
|
||||||
// for part (threadIdx.y) with this rice paramater (threadIdx.x)
|
// for part (threadIdx.y) with this rice paramater (threadIdx.x)
|
||||||
shared.length[tid] += shared.data[threadIdx.y * psize + i] >> threadIdx.x;
|
sum += shared.data[dpos + i] >> threadIdx.x;
|
||||||
|
shared.length[tid] = sum + (psize - shared.task.residualOrder * (threadIdx.y + blockIdx.x == 0)) * (threadIdx.x + 1);
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
// output length (transposed: k is now threadIdx.y)
|
// output length (transposed: k is now threadIdx.y)
|
||||||
@@ -861,6 +865,54 @@ extern "C" __global__ void cudaCalcPartition(
|
|||||||
partition_lengths[pos + blockIdx.x * parts_per_block + threadIdx.x] = shared.length[threadIdx.y + (threadIdx.x << 4)];
|
partition_lengths[pos + blockIdx.x * parts_per_block + threadIdx.x] = shared.length[threadIdx.y + (threadIdx.x << 4)];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern "C" __global__ void cudaCalcPartition1(
|
||||||
|
int* partition_lengths,
|
||||||
|
int* residual,
|
||||||
|
encodeResidualTaskStruct *tasks,
|
||||||
|
int max_porder, // <= 8
|
||||||
|
int psize, // == (shared.task.blocksize >> max_porder), < 256
|
||||||
|
int parts_per_block // == 256 / psize, > 0, <= 16
|
||||||
|
)
|
||||||
|
{
|
||||||
|
__shared__ struct {
|
||||||
|
int data[256];
|
||||||
|
int length[256];
|
||||||
|
int plen[256];
|
||||||
|
encodeResidualTaskStruct task;
|
||||||
|
} shared;
|
||||||
|
const int tid = threadIdx.x + (threadIdx.y << 4);
|
||||||
|
if (tid < sizeof(shared.task) / sizeof(int))
|
||||||
|
((int*)&shared.task)[tid] = ((int*)(&tasks[blockIdx.y]))[tid];
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
const int parts = min(parts_per_block, (1 << max_porder) - blockIdx.x * parts_per_block);
|
||||||
|
|
||||||
|
// fetch residual
|
||||||
|
int offs = blockIdx.x * psize * parts_per_block + tid;
|
||||||
|
int s = (offs >= shared.task.residualOrder && tid < parts * psize) ? residual[shared.task.residualOffs + offs] : 0;
|
||||||
|
// convert to unsigned
|
||||||
|
shared.data[tid] = min(0xfffff, (s << 1) ^ (s >> 31));
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
for (int k = 0; k < 15; k++)
|
||||||
|
{
|
||||||
|
shared.length[tid] = 0;
|
||||||
|
// calc number of unary bits for each residual part with each rice paramater
|
||||||
|
// for part (threadIdx.y) with rice paramater k
|
||||||
|
for (int i = 0; i < psize; i += 16)
|
||||||
|
shared.length[tid] += shared.data[threadIdx.y * psize + i + threadIdx.x] >> k; // * (i + threadIdx.x < psize)
|
||||||
|
SUM16(shared.length,tid,+=);
|
||||||
|
if (threadIdx.x == 0 && threadIdx.y < parts)
|
||||||
|
shared.plen[(k << 4) + threadIdx.y] = shared.length[tid];
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
// output length
|
||||||
|
const int pos = blockIdx.x * parts_per_block + threadIdx.x;
|
||||||
|
const int len1 = (psize - shared.task.residualOrder * (pos == 0)) * (threadIdx.y + 1);
|
||||||
|
if (threadIdx.y <= 14 && threadIdx.x < parts)
|
||||||
|
partition_lengths[((threadIdx.y + 15 * blockIdx.y) << (max_porder + 1)) + pos] = shared.plen[tid] + len1;
|
||||||
|
}
|
||||||
|
|
||||||
extern "C" __global__ void cudaCalcLargePartition(
|
extern "C" __global__ void cudaCalcLargePartition(
|
||||||
int* partition_lengths,
|
int* partition_lengths,
|
||||||
int* residual,
|
int* residual,
|
||||||
@@ -880,7 +932,7 @@ extern "C" __global__ void cudaCalcLargePartition(
|
|||||||
((int*)&shared.task)[tid] = ((int*)(&tasks[blockIdx.y]))[tid];
|
((int*)&shared.task)[tid] = ((int*)(&tasks[blockIdx.y]))[tid];
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
shared.length[tid] = 0;
|
int sum = 0;
|
||||||
for (int pos = 0; pos < psize; pos += 256)
|
for (int pos = 0; pos < psize; pos += 256)
|
||||||
{
|
{
|
||||||
// fetch residual
|
// fetch residual
|
||||||
@@ -892,12 +944,12 @@ extern "C" __global__ void cudaCalcLargePartition(
|
|||||||
|
|
||||||
// calc number of unary bits for each residual sample with each rice paramater
|
// calc number of unary bits for each residual sample with each rice paramater
|
||||||
#pragma unroll 0
|
#pragma unroll 0
|
||||||
for (int i = 0; i < min(psize,256); i += 16)
|
for (int i = threadIdx.x; i < min(psize,256); i += 16)
|
||||||
// for sample (i + threadIdx.x) with this rice paramater (threadIdx.y)
|
// for sample (i + threadIdx.x) with this rice paramater (threadIdx.y)
|
||||||
shared.length[tid] += shared.data[i + threadIdx.x] >> threadIdx.y;
|
sum += shared.data[i] >> threadIdx.y;
|
||||||
shared.length[tid] = min(0xfffff, shared.length[tid]);
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
}
|
}
|
||||||
|
shared.length[tid] = min(0xfffff,sum);
|
||||||
SUM16(shared.length,tid,+=);
|
SUM16(shared.length,tid,+=);
|
||||||
|
|
||||||
// output length
|
// output length
|
||||||
@@ -919,7 +971,7 @@ extern "C" __global__ void cudaSumPartition(
|
|||||||
|
|
||||||
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (blockIdx.x << (max_porder + 1));
|
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (blockIdx.x << (max_porder + 1));
|
||||||
|
|
||||||
// fetch residual
|
// fetch partition lengths
|
||||||
shared.data[threadIdx.x] = threadIdx.x < (1 << max_porder) ? partition_lengths[pos + threadIdx.x] : 0;
|
shared.data[threadIdx.x] = threadIdx.x < (1 << max_porder) ? partition_lengths[pos + threadIdx.x] : 0;
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
for (int porder = max_porder - 1; porder >= 0; porder--)
|
for (int porder = max_porder - 1; porder >= 0; porder--)
|
||||||
@@ -936,7 +988,7 @@ extern "C" __global__ void cudaSumPartition(
|
|||||||
// Finds optimal rice parameter for up to 16 partitions at a time.
|
// Finds optimal rice parameter for up to 16 partitions at a time.
|
||||||
// Requires 16x16 threads
|
// Requires 16x16 threads
|
||||||
extern "C" __global__ void cudaFindRiceParameter(
|
extern "C" __global__ void cudaFindRiceParameter(
|
||||||
int* output,
|
int* rice_parameters,
|
||||||
int* partition_lengths,
|
int* partition_lengths,
|
||||||
int max_porder
|
int max_porder
|
||||||
)
|
)
|
||||||
@@ -944,22 +996,22 @@ extern "C" __global__ void cudaFindRiceParameter(
|
|||||||
__shared__ struct {
|
__shared__ struct {
|
||||||
volatile int length[256];
|
volatile int length[256];
|
||||||
volatile int index[256];
|
volatile int index[256];
|
||||||
|
volatile int outlen[32];
|
||||||
|
volatile int outidx[32];
|
||||||
} shared;
|
} shared;
|
||||||
const int tid = threadIdx.x + (threadIdx.y << 4);
|
const int tid = threadIdx.x + (threadIdx.y << 3);
|
||||||
const int parts = min(16, 2 << max_porder);
|
const int parts = min(32, 2 << max_porder);
|
||||||
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (threadIdx.y << (max_porder + 1));
|
const int pos = (15 << (max_porder + 1)) * blockIdx.y + ((tid >> 5) << (max_porder + 1));
|
||||||
|
|
||||||
// read length for 16 partitions
|
// read length for 32 partitions
|
||||||
shared.length[tid] = (threadIdx.y <= 14 && threadIdx.x < parts) ? partition_lengths[pos + blockIdx.x * 16 + threadIdx.x] : 0xffffff;
|
shared.index[tid] = ((tid & 31) < parts) ? partition_lengths[pos + blockIdx.x * 32 + (tid & 31)] : 0xffffff;
|
||||||
|
shared.length[tid] = ((tid >> 5) + 8 <= 14 && (tid & 31) < parts) ? partition_lengths[pos + (8 << (max_porder + 1)) + blockIdx.x * 32 + (tid & 31)] : 0xffffff;
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
// transpose
|
// transpose
|
||||||
//shared.length[tid] = shared.index[threadIdx.y + (threadIdx.x << 4)];
|
int l1 = shared.index[threadIdx.y + (threadIdx.x << 5)];
|
||||||
int l1 = shared.length[threadIdx.y + (threadIdx.x << 4)];
|
int l2 = shared.length[threadIdx.y + (threadIdx.x << 5)];
|
||||||
__syncthreads();
|
|
||||||
shared.length[tid] = l1;
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
// find best rice parameter
|
// find best rice parameter
|
||||||
int l2 = shared.length[tid + 8];
|
|
||||||
shared.index[tid] = threadIdx.x + ((l2 < l1) << 3);
|
shared.index[tid] = threadIdx.x + ((l2 < l1) << 3);
|
||||||
shared.length[tid] = l1 = min(l1, l2);
|
shared.length[tid] = l1 = min(l1, l2);
|
||||||
#pragma unroll 2
|
#pragma unroll 2
|
||||||
@@ -970,12 +1022,81 @@ extern "C" __global__ void cudaFindRiceParameter(
|
|||||||
shared.length[tid] = l1 = min(l1, l2);
|
shared.length[tid] = l1 = min(l1, l2);
|
||||||
}
|
}
|
||||||
l2 = shared.length[tid + 1];
|
l2 = shared.length[tid + 1];
|
||||||
|
if (threadIdx.x == 0 && threadIdx.y < parts)
|
||||||
|
shared.outidx[threadIdx.y] = shared.index[tid + (l2 < l1)];
|
||||||
|
if (threadIdx.x == 0 && threadIdx.y < parts)
|
||||||
|
shared.outlen[threadIdx.y] = min(l1, l2);
|
||||||
|
__syncthreads();
|
||||||
// output rice parameter
|
// output rice parameter
|
||||||
if (threadIdx.x == 0 && threadIdx.y < parts)
|
if (tid < parts)
|
||||||
output[(blockIdx.y << (max_porder + 2)) + blockIdx.x * parts + threadIdx.y] = shared.index[tid + (l2 < l1)];
|
rice_parameters[(blockIdx.y << (max_porder + 2)) + blockIdx.x * parts + tid] = shared.outidx[tid];
|
||||||
// output length
|
// output length
|
||||||
if (threadIdx.x == 0 && threadIdx.y < parts)
|
if (tid < parts)
|
||||||
output[(blockIdx.y << (max_porder + 2)) + (1 << (max_porder + 1)) + blockIdx.x * parts + threadIdx.y] = min(l1, l2);
|
rice_parameters[(blockIdx.y << (max_porder + 2)) + (1 << (max_porder + 1)) + blockIdx.x * parts + tid] = shared.outlen[tid];
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" __global__ void cudaFindPartitionOrder(
|
||||||
|
int* best_rice_parameters,
|
||||||
|
encodeResidualTaskStruct *tasks,
|
||||||
|
int* rice_parameters,
|
||||||
|
int max_porder
|
||||||
|
)
|
||||||
|
{
|
||||||
|
__shared__ struct {
|
||||||
|
int data[512];
|
||||||
|
volatile int tmp[256];
|
||||||
|
int length[32];
|
||||||
|
int index[32];
|
||||||
|
encodeResidualTaskStruct task;
|
||||||
|
} shared;
|
||||||
|
const int pos = (blockIdx.y << (max_porder + 2)) + (2 << max_porder);
|
||||||
|
if (threadIdx.x < sizeof(shared.task) / sizeof(int))
|
||||||
|
((int*)&shared.task)[threadIdx.x] = ((int*)(&tasks[blockIdx.y]))[threadIdx.x];
|
||||||
|
// fetch partition lengths
|
||||||
|
shared.data[threadIdx.x] = threadIdx.x < (2 << max_porder) ? rice_parameters[pos + threadIdx.x] : 0;
|
||||||
|
shared.data[threadIdx.x + 256] = threadIdx.x + 256 < (2 << max_porder) ? rice_parameters[pos + 256 + threadIdx.x] : 0;
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
for (int porder = max_porder; porder >= 0; porder--)
|
||||||
|
{
|
||||||
|
shared.tmp[threadIdx.x] = (threadIdx.x < (1 << porder)) * shared.data[(2 << max_porder) - (2 << porder) + threadIdx.x];
|
||||||
|
__syncthreads();
|
||||||
|
SUM256(shared.tmp, threadIdx.x, +=);
|
||||||
|
if (threadIdx.x == 0)
|
||||||
|
shared.length[porder] = shared.tmp[0] + (4 << porder);
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (threadIdx.x < 32)
|
||||||
|
{
|
||||||
|
shared.index[threadIdx.x] = threadIdx.x;
|
||||||
|
if (threadIdx.x > max_porder)
|
||||||
|
shared.length[threadIdx.x] = 0xfffffff;
|
||||||
|
int l1 = shared.length[threadIdx.x];
|
||||||
|
#pragma unroll 4
|
||||||
|
for (int sh = 3; sh >= 0; sh --)
|
||||||
|
{
|
||||||
|
int l2 = shared.length[threadIdx.x + (1 << sh)];
|
||||||
|
shared.index[threadIdx.x] = shared.index[threadIdx.x + ((l2 < l1) << sh)];
|
||||||
|
shared.length[threadIdx.x] = l1 = min(l1, l2);
|
||||||
|
}
|
||||||
|
if (threadIdx.x == 0)
|
||||||
|
tasks[blockIdx.y].porder = shared.index[0];
|
||||||
|
if (threadIdx.x == 0)
|
||||||
|
{
|
||||||
|
int obits = shared.task.obits - shared.task.wbits;
|
||||||
|
tasks[blockIdx.y].size =
|
||||||
|
shared.task.type == Fixed ? shared.task.residualOrder * obits + 6 + l1 :
|
||||||
|
shared.task.type == LPC ? shared.task.residualOrder * obits + 6 + l1 + 4 + 5 + shared.task.residualOrder * shared.task.cbits :
|
||||||
|
shared.task.type == Constant ? obits : obits * shared.task.blocksize;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
int porder = shared.index[0];
|
||||||
|
//shared.data[threadIdx.x] = threadIdx.x < (1 << porder) ? rice_parameters[pos - (2 << porder) + threadIdx.x] : 0;
|
||||||
|
if (threadIdx.x < (1 << porder))
|
||||||
|
best_rice_parameters[(blockIdx.y << max_porder) + threadIdx.x] = rice_parameters[pos - (2 << porder) + threadIdx.x];
|
||||||
|
// FIXME: should be bytes?
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -431,6 +431,155 @@ code {
|
|||||||
0xd00e0201 0xa0c00781
|
0xd00e0201 0xa0c00781
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
code {
|
||||||
|
name = cudaFindPartitionOrder
|
||||||
|
lmem = 0
|
||||||
|
smem = 3552
|
||||||
|
reg = 8
|
||||||
|
bar = 1
|
||||||
|
const {
|
||||||
|
segname = const
|
||||||
|
segnum = 1
|
||||||
|
offset = 0
|
||||||
|
bytes = 36
|
||||||
|
mem {
|
||||||
|
0x0000002f 0x0000001f 0x0000007f 0x0000003f
|
||||||
|
0xffffffff 0x00000008 0x00000020 0x00000002
|
||||||
|
0x0fffffff
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bincode {
|
||||||
|
0xa000000d 0x04000780 0x308007fd 0x644107c8
|
||||||
|
0xa000d003 0x00000000 0x30020609 0xc4100780
|
||||||
|
0x1000d003 0x00000280 0xa0004e01 0x04200780
|
||||||
|
0x30070005 0xc4100780 0x30060001 0xc4100780
|
||||||
|
0x20008200 0x2100ea00 0x20000401 0x04000780
|
||||||
|
0xd00e0001 0x80c00780 0x00000405 0xc0000780
|
||||||
|
0x04069001 0xe4200780 0x1000ce01 0x0423c782
|
||||||
|
0x10028005 0x00000003 0x30000211 0xc4000780
|
||||||
|
0x300309fd 0x6400c7c8 0xa001c003 0x00000000
|
||||||
|
0x1001b003 0x00000280 0xa0004e01 0x04200780
|
||||||
|
0x2102ee05 0x00000003 0x30010001 0xc4000780
|
||||||
|
0x20008800 0x20008600 0x30020001 0xc4100780
|
||||||
|
0x2000cc01 0x04200780 0xd00e0001 0x80c00780
|
||||||
|
0x1001c003 0x00000780 0x1000f801 0x0403c780
|
||||||
|
0x00000405 0xc0000782 0x20008605 0x00000013
|
||||||
|
0x300109fd 0x6400c7c8 0x04001001 0xe4200780
|
||||||
|
0xa002c003 0x00000000 0x1002b003 0x00000280
|
||||||
|
0xa0004e01 0x04200780 0x2102ee05 0x00000003
|
||||||
|
0x30010001 0xc4000780 0x20008800 0x20008600
|
||||||
|
0x30020001 0xc4100780 0x2000cc01 0x04200780
|
||||||
|
0x20008001 0x00000043 0xd00e0001 0x80c00780
|
||||||
|
0x1002c003 0x00000780 0x1000f801 0x0403c780
|
||||||
|
0x00000405 0xc0000782 0x04021001 0xe4200780
|
||||||
|
0x861ffe03 0x00000000 0x307ccffd 0x6c2047c8
|
||||||
|
0x1000ce01 0x0423c780 0x10076003 0x00000280
|
||||||
|
0x308207fd 0x6440c7e8 0x308307fd 0x6440c7f8
|
||||||
|
0x308107fd 0x6440c7c8 0x307c07fd 0x640087d8
|
||||||
|
0x00000015 0x20001780 0x300007fd 0xe40007d8
|
||||||
|
0x10000e05 0x2440d500 0x30000205 0xc4001500
|
||||||
|
0x20400805 0x04005500 0x20000605 0x04005500
|
||||||
|
0x00020205 0xc0001500 0x1400d005 0x0423d500
|
||||||
|
0x1000f805 0x0403d280 0x00000405 0xc0000780
|
||||||
|
0x04041001 0xe4204780 0x861ffe03 0x00000000
|
||||||
|
0x00000405 0xc0002680 0xd414400d 0x20002680
|
||||||
|
0xd4104009 0x20002680 0x1c00c005 0x0423e680
|
||||||
|
0x2800c005 0x04206680 0x04041001 0xe4206680
|
||||||
|
0x861ffe03 0x00000000 0x00000405 0xc0003680
|
||||||
|
0xd412400d 0x20003680 0xd4104009 0x20003680
|
||||||
|
0x1c00c005 0x0423f680 0x2800c005 0x04207680
|
||||||
|
0x04041001 0xe4207680 0x861ffe03 0x00000000
|
||||||
|
0x00000405 0xc0000680 0xd411400d 0x20000680
|
||||||
|
0xd4104009 0x20000680 0x1c00c005 0x0423c680
|
||||||
|
0x2800c005 0x04204680 0x04041001 0xe4204680
|
||||||
|
0x861ffe03 0x00000000 0xa0066003 0x00000000
|
||||||
|
0x10066003 0x00000100 0x00000405 0xc0000780
|
||||||
|
0xd4104009 0x20000780 0x1800e005 0x0423c780
|
||||||
|
0x2800c005 0x04204780 0x04041001 0xe4204780
|
||||||
|
0x1900f004 0x2901e004 0x04041001 0xe4204780
|
||||||
|
0x1900e804 0x2901e004 0x04041001 0xe4204780
|
||||||
|
0x1900e404 0x2901e004 0x04041001 0xe4204780
|
||||||
|
0x1900e204 0x2901e004 0x04041001 0xe4204780
|
||||||
|
0xf0000001 0xe0000002 0xa0070003 0x00000000
|
||||||
|
0x00000a01 0xa00007d0 0x10070003 0x00001100
|
||||||
|
0x10048005 0x00000003 0xd0104005 0x20000780
|
||||||
|
0x30000205 0xc4000780 0x00020009 0xc0000780
|
||||||
|
0x2400c005 0x04204780 0x08061001 0xe4204780
|
||||||
|
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
||||||
|
0x203f8001 0x0fffffff 0x308401fd 0x6c4147d8
|
||||||
|
0x10037003 0x00001280 0x10077003 0x00000780
|
||||||
|
0x308107fd 0x6440c7c8 0xa00eb003 0x00000000
|
||||||
|
0x100eb003 0x00000100 0x00000405 0xc0000780
|
||||||
|
0x3003cffd 0x642187c8 0x04065001 0xe420c780
|
||||||
|
0x00000405 0xc0000500 0x10001001 0x2440c500
|
||||||
|
0x04061001 0xe4200500 0x10001001 0x2440c500
|
||||||
|
0x00000405 0xc0000280 0xd4184005 0x20000280
|
||||||
|
0x1400c001 0x0423c280 0x00000405 0xc0000780
|
||||||
|
0x20108605 0x00000033 0x00020209 0xc0000780
|
||||||
|
0x20088605 0x00000003 0x3800c1fd 0x6c2047c8
|
||||||
|
0x10000605 0x0403c500 0x0002020d 0xc0000780
|
||||||
|
0xdc19400d 0x20000780 0x3800c005 0xac200780
|
||||||
|
0x1c00c001 0x0423c780 0x200c8609 0x00000033
|
||||||
|
0x04065001 0xe4200780 0x00020409 0xc0000780
|
||||||
|
0x04061001 0xe4204780 0x20048601 0x00000003
|
||||||
|
0x3801c1fd 0x6c2047c8 0x10000601 0x0403c500
|
||||||
|
0x0002000d 0xc0000780 0xdc19400d 0x20000780
|
||||||
|
0x3801c005 0xac200780 0x1c00c001 0x0423c780
|
||||||
|
0x200a8609 0x00000033 0x04065001 0xe4200780
|
||||||
|
0x00020409 0xc0000780 0x04061001 0xe4204780
|
||||||
|
0x20028601 0x00000003 0x3801c1fd 0x6c2047c8
|
||||||
|
0x10000601 0x0403c500 0x0002000d 0xc0000780
|
||||||
|
0xdc19400d 0x20000780 0x3801c005 0xac200780
|
||||||
|
0x1c00c001 0x0423c780 0x20098609 0x00000033
|
||||||
|
0x04065001 0xe4200780 0x00020409 0xc0000780
|
||||||
|
0x04061001 0xe4204780 0x20018601 0x00000003
|
||||||
|
0x3801c1fd 0x6c2047c8 0x10000601 0x0403c500
|
||||||
|
0x0002000d 0xc0000780 0xdc19400d 0x20000780
|
||||||
|
0x3801c015 0xac200780 0x1c00c001 0x0423c780
|
||||||
|
0x307c0605 0x640087d0 0x04065001 0xe4200780
|
||||||
|
0xa00003fd 0x0c0147c8 0xa00bc003 0x00000000
|
||||||
|
0x04061001 0xe4214780 0x100bc003 0x00001100
|
||||||
|
0xa0004e01 0x04200780 0x30070005 0xc4100780
|
||||||
|
0x30060001 0xc4100780 0x20000201 0x04000780
|
||||||
|
0xd0194005 0x20000780 0x2100ea04 0x1500e000
|
||||||
|
0x20348205 0x00000003 0xd00e0201 0xa0c00780
|
||||||
|
0xf0000001 0xe0000002 0x100eb003 0x00000100
|
||||||
|
0xd01a6805 0x20000780 0x1400cc01 0x0423c780
|
||||||
|
0x3485c1fd 0x6c6147c8 0x2440c201 0x04200780
|
||||||
|
0x100cc003 0x00000280 0xd01a4005 0x20000780
|
||||||
|
0x1400c005 0x0423c780 0x40030009 0x00000780
|
||||||
|
0x60020209 0x00008780 0x30100409 0xc4100780
|
||||||
|
0x60020001 0x00008780 0x20000001 0x04014780
|
||||||
|
0x20068001 0x00000003 0x100e5003 0x00000780
|
||||||
|
0xd01a6805 0x20000780 0x3486c1fd 0x6c6147c8
|
||||||
|
0x100dc003 0x00000280 0xd01a4005 0x20000780
|
||||||
|
0x1400c009 0x0423c780 0x1400c605 0x0423c780
|
||||||
|
0x4005001c 0x40040618 0x6004021d 0x0001c780
|
||||||
|
0x60050419 0x00018780 0x30100e1d 0xc4100780
|
||||||
|
0x30100c19 0xc4100780 0x6004001d 0x0001c780
|
||||||
|
0x60040401 0x00018780 0x20058e04 0x20018000
|
||||||
|
0x200f8009 0x00000003 0x100e4003 0x00000780
|
||||||
|
0xd01a6805 0x20000780 0x1400c405 0x0423c780
|
||||||
|
0x40030009 0x00000780 0x60020215 0x00008780
|
||||||
|
0x10000009 0x0403c780 0x347cc1fd 0x6c2147c8
|
||||||
|
0x30100a15 0xc4100780 0x60020009 0x00014280
|
||||||
|
0x10000401 0x0403c780 0xa0004e05 0x04200780
|
||||||
|
0x30070209 0xc4100780 0x30060205 0xc4100780
|
||||||
|
0x20018404 0x2101ea04 0x20108205 0x00000003
|
||||||
|
0xd00e0201 0xa0c00780 0xf0000001 0xe0000002
|
||||||
|
0x861ffe03 0x00000000 0xd0194005 0x20000780
|
||||||
|
0x1400c001 0x0423c780 0x300007fd 0xe40007c8
|
||||||
|
0x30000003 0x00000280 0xa0004e05 0x04200780
|
||||||
|
0x2102ee01 0x00000003 0xd0194005 0x20000780
|
||||||
|
0x10028015 0x00000003 0x30000209 0xc4000780
|
||||||
|
0x1500e000 0x20028808 0x30000a11 0xc4000780
|
||||||
|
0x1100ee00 0x20448408 0x30000201 0xc4000780
|
||||||
|
0x20028604 0x20008600 0x30020205 0xc4100780
|
||||||
|
0x30020009 0xc4100780 0x2000cc01 0x04204780
|
||||||
|
0xd00e0001 0x80c00780 0x2000c805 0x04208780
|
||||||
|
0xd00e0201 0xa0c00781
|
||||||
|
}
|
||||||
|
}
|
||||||
code {
|
code {
|
||||||
name = cudaEstimateResidual
|
name = cudaEstimateResidual
|
||||||
lmem = 0
|
lmem = 0
|
||||||
@@ -559,6 +708,93 @@ code {
|
|||||||
0x1500e000 0x2101e804 0xd00e0201 0xa0c00781
|
0x1500e000 0x2101e804 0xd00e0201 0xa0c00781
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
code {
|
||||||
|
name = cudaCalcPartition1
|
||||||
|
lmem = 0
|
||||||
|
smem = 3304
|
||||||
|
reg = 11
|
||||||
|
bar = 1
|
||||||
|
const {
|
||||||
|
segname = const
|
||||||
|
segnum = 1
|
||||||
|
offset = 0
|
||||||
|
bytes = 24
|
||||||
|
mem {
|
||||||
|
0x000003ff 0x0000002f 0x000fffff 0x00000001
|
||||||
|
0x0000000f 0x0000000e
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bincode {
|
||||||
|
0x10000005 0x0403c780 0xd0800601 0x00400780
|
||||||
|
0xa0000001 0x04000780 0xa0000415 0x04000780
|
||||||
|
0x30040005 0xc4100780 0x20000a21 0x04004780
|
||||||
|
0x308111fd 0x644107c8 0xa0012003 0x00000000
|
||||||
|
0x30021019 0xc4100780 0x10012003 0x00000280
|
||||||
|
0xa0004e05 0x04200780 0x30070209 0xc4100780
|
||||||
|
0x30060205 0xc4100780 0x20018404 0x2101ec04
|
||||||
|
0x20000c05 0x04004780 0xd00e0205 0x80c00780
|
||||||
|
0x00000c05 0xc0000780 0x04061401 0xe4204780
|
||||||
|
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
||||||
|
0xa0004c0d 0x04200780 0x1100f204 0x1100f008
|
||||||
|
0x4006061c 0x40050c24 0x10018029 0x00000003
|
||||||
|
0x1000ce11 0x0423c780 0x30100e1d 0xc4100780
|
||||||
|
0x30101225 0xc4100780 0x30041411 0xc4000780
|
||||||
|
0x6006041d 0x0001c780 0x60040c0d 0x00024780
|
||||||
|
0x30048e10 0x1100f008 0x40060625 0x00000780
|
||||||
|
0x3004d211 0xa4200780 0x60070425 0x00024780
|
||||||
|
0x40051029 0x00000780 0x30101225 0xc4100780
|
||||||
|
0x60041229 0x00028780 0x60060405 0x00024780
|
||||||
|
0xd0185005 0x20000780 0x3010140d 0xc4100780
|
||||||
|
0x20000205 0x04020780 0x60041009 0x0000c780
|
||||||
|
0x3401c1fd 0x6c20c7c8 0x300211fd 0x6c0042c8
|
||||||
|
0xa0035003 0x00000000 0x10034003 0x00000100
|
||||||
|
0xd018a005 0x20000780 0x2400c005 0x04204780
|
||||||
|
0x30020205 0xc4100780 0x2000ca05 0x04204780
|
||||||
|
0xd00e0205 0x80c00780 0x10035003 0x00000780
|
||||||
|
0x1000f805 0x0403c780 0x301f0209 0xec100782
|
||||||
|
0x30010205 0xc4100780 0xd0010405 0x04008780
|
||||||
|
0x00000c05 0xc0000780 0x30820205 0xac400780
|
||||||
|
0x04001401 0xe4204780 0x861ffe03 0x00000000
|
||||||
|
0x307c0a05 0x64008780 0x30000809 0x64010780
|
||||||
|
0xd0830205 0x04400780 0xd0830409 0x04400780
|
||||||
|
0xd002020d 0x04000780 0x307cd1fd 0x6c2107c8
|
||||||
|
0x1000f821 0x0403c780 0x00000c05 0xc0000780
|
||||||
|
0x1000f809 0x0403c780 0x04021401 0xe43f0780
|
||||||
|
0x1005b003 0x00000100 0x1000d005 0x0423c780
|
||||||
|
0x40010425 0x00000780 0x60000625 0x00024780
|
||||||
|
0x30101225 0xc4100780 0x60000429 0x00024780
|
||||||
|
0x20001405 0x04014780 0x200a8225 0x00000003
|
||||||
|
0x2000d029 0x04228780 0x00021205 0xc0000780
|
||||||
|
0xa005a003 0x00000000 0x20000a25 0x04028780
|
||||||
|
0x3408c029 0xec200780 0x20108205 0x00000003
|
||||||
|
0x20000409 0x04028780 0x00000c09 0xc0000780
|
||||||
|
0x300903fd 0x6c0047d8 0xd4008005 0x20000780
|
||||||
|
0x08021401 0xe4208780 0x10052003 0x00001280
|
||||||
|
0xf0000001 0xe0000002 0x00000c05 0xc0000780
|
||||||
|
0xd4085809 0x20000780 0x2800ce05 0x04208780
|
||||||
|
0x04021401 0xe4204780 0x2800c605 0x04204780
|
||||||
|
0x04021401 0xe4204780 0x2800c205 0x04204780
|
||||||
|
0x04021401 0xe4204780 0x2800c005 0x04204780
|
||||||
|
0x307c07fd 0x6c0087d8 0x04021401 0xe4204780
|
||||||
|
0x30041009 0xc4101500 0x20000009 0x04009500
|
||||||
|
0x00020405 0xc0001500 0x04041401 0xe4205500
|
||||||
|
0x20019021 0x00000003 0x308411fd 0x6c4147d8
|
||||||
|
0x10043003 0x00001280 0x861ffe03 0x00000000
|
||||||
|
0x300509fd 0x640107c8 0x308501fd 0x6440c2c8
|
||||||
|
0x30000003 0x00000100 0xd0185005 0x20000780
|
||||||
|
0x1000d00d 0x0423c780 0x20018009 0x00000003
|
||||||
|
0x1000d005 0x0423c780 0x3503e00c 0x40030810
|
||||||
|
0x610f2e01 0x00000003 0x60020a29 0x00010780
|
||||||
|
0x40070825 0x00000780 0x2101ee21 0x00000003
|
||||||
|
0x20000e11 0x040147c0 0x30101429 0xc4100780
|
||||||
|
0x60060a1d 0x00024780 0x30080015 0xc4000780
|
||||||
|
0x60020801 0x00028780 0x30100e1d 0xc4100780
|
||||||
|
0x20000805 0x04014780 0x00000c05 0xc0000780
|
||||||
|
0x60060801 0x0001c100 0x30020205 0xc4100780
|
||||||
|
0xd4105005 0x20000780 0x2101e804 0x2500e000
|
||||||
|
0xd00e0201 0xa0c00781
|
||||||
|
}
|
||||||
|
}
|
||||||
code {
|
code {
|
||||||
name = cudaChooseBestMethod
|
name = cudaChooseBestMethod
|
||||||
lmem = 0
|
lmem = 0
|
||||||
@@ -571,7 +807,7 @@ code {
|
|||||||
offset = 0
|
offset = 0
|
||||||
bytes = 28
|
bytes = 28
|
||||||
mem {
|
mem {
|
||||||
0x000003ff 0x00000008 0x00000020 0x00000001
|
0x000003ff 0x00000008 0x00000001 0x00000020
|
||||||
0x0000007f 0x0000003f 0x0000001f
|
0x0000007f 0x0000003f 0x0000001f
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -581,10 +817,10 @@ code {
|
|||||||
0x20000211 0x04000780 0x103f8001 0x07ffffff
|
0x20000211 0x04000780 0x103f8001 0x07ffffff
|
||||||
0x00020805 0xc0000780 0x307ccffd 0x6c20c7c8
|
0x00020805 0xc0000780 0x307ccffd 0x6c20c7c8
|
||||||
0x04011001 0xe4200780 0x00070609 0xc0000780
|
0x04011001 0xe4200780 0x00070609 0xc0000780
|
||||||
0x10090003 0x00000280 0xa0004415 0x04200780
|
0x10096003 0x00000280 0xa0004415 0x04200780
|
||||||
0x1000f819 0x0403c780 0x20000c1d 0x0400c780
|
0x1000f819 0x0403c780 0x20000c1d 0x0400c780
|
||||||
0x3007cffd 0x6420c7c8 0xa008d003 0x00000000
|
0x3007cffd 0x6420c7c8 0xa0093003 0x00000000
|
||||||
0x1008d003 0x00000280 0x1000ce01 0x0423c780
|
0x10093003 0x00000280 0x1000ce01 0x0423c780
|
||||||
0x40014e09 0x00200780 0x30100409 0xc4100780
|
0x40014e09 0x00200780 0x30100409 0xc4100780
|
||||||
0x60004e21 0x00208780 0x30070601 0xc4100780
|
0x60004e21 0x00208780 0x30070601 0xc4100780
|
||||||
0x30070c2d 0xc4100780 0x30060c31 0xc4100780
|
0x30070c2d 0xc4100780 0x30060c31 0xc4100780
|
||||||
@@ -613,34 +849,37 @@ code {
|
|||||||
0x1000c401 0x0423c784 0x2000c001 0x04200784
|
0x1000c401 0x0423c784 0x2000c001 0x04200784
|
||||||
0x0c031001 0xe4200780 0x1000c201 0x0423c784
|
0x0c031001 0xe4200780 0x1000c201 0x0423c784
|
||||||
0x2000c001 0x04200784 0x307c03fd 0x640147c8
|
0x2000c001 0x04200784 0x307c03fd 0x640147c8
|
||||||
0x0c031001 0xe4200780 0x1008d003 0x00000280
|
0x0c031001 0xe4200780 0x10093003 0x00000280
|
||||||
0xd414680d 0x20000780 0x1d00ec08 0x1d00e400
|
0xd414680d 0x20000780 0x1d00ec08 0x1d00e400
|
||||||
0x2c40c209 0x04208780 0x40050021 0x00000780
|
0x2c40c209 0x04208780 0x40050021 0x00000780
|
||||||
0x60040221 0x00020780 0x30101021 0xc4100780
|
0x60040221 0x00020780 0x30101021 0xc4100780
|
||||||
0x3c81c1fd 0x6c6147c8 0x60040021 0x00020780
|
0x3c81c1fd 0x6c6147c8 0x60040021 0x00020780
|
||||||
0xa008a003 0x00000000 0x10060003 0x00000280
|
0xa0090003 0x00000000 0x10066003 0x00000280
|
||||||
0xd4144005 0x20000780 0x1400c001 0x0423c780
|
0xd4144005 0x20000780 0x1400c001 0x0423c780
|
||||||
0x40050025 0x00000780 0x60040225 0x00024780
|
0x3002cc25 0xc4300780 0x4005002d 0x00000780
|
||||||
0x30101225 0xc4100780 0x60040001 0x00024780
|
0x301f1229 0xec100780 0x6004022d 0x0002c780
|
||||||
|
0xd0821429 0x04400780 0x3010162d 0xc4100780
|
||||||
|
0x20001425 0x04024780 0x60040009 0x0002c780
|
||||||
|
0x30011201 0xec100780 0x20000401 0x04000780
|
||||||
0xd80c4005 0x20000780 0x2400c001 0x04200780
|
0xd80c4005 0x20000780 0x2400c001 0x04200780
|
||||||
0x20068001 0x00000003 0x1008a003 0x00000780
|
0x20068001 0x00000003 0x10090003 0x00000780
|
||||||
0xd414680d 0x20000780 0x3c82c1fd 0x6c6147c8
|
0xd414680d 0x20000780 0x3c83c1fd 0x6c6147c8
|
||||||
0xa0089003 0x00000000 0x10074003 0x00000280
|
0xa008f003 0x00000000 0x1007a003 0x00000280
|
||||||
0xd4144005 0x20000780 0x2502e608 0x1500e000
|
0xd4144005 0x20000780 0x2502e608 0x1500e000
|
||||||
0x3002cc25 0xc4300780 0x40050029 0x00000780
|
0x3002cc25 0xc4300780 0x40050029 0x00000780
|
||||||
0x301f122d 0xec100780 0x60040229 0x00028780
|
0x301f122d 0xec100780 0x60040229 0x00028780
|
||||||
0xd083162d 0x04400780 0x30101429 0xc4100780
|
0xd082162d 0x04400780 0x30101429 0xc4100780
|
||||||
0x20001625 0x04024780 0x60040001 0x00028780
|
0x20001625 0x04024780 0x60040001 0x00028780
|
||||||
0x30011209 0xec100780 0x20000001 0x04008780
|
0x30011209 0xec100780 0x20000001 0x04008780
|
||||||
0xd80c4005 0x20000780 0x2400c001 0x04200780
|
0xd80c4005 0x20000780 0x2400c001 0x04200780
|
||||||
0x200f8001 0x00000003 0x10089003 0x00000780
|
0x200f8001 0x00000003 0x1008f003 0x00000780
|
||||||
0xd414680d 0x20000780 0x3c7cc1fd 0x6c2147c8
|
0xd414680d 0x20000780 0x3c7cc1fd 0x6c2147c8
|
||||||
0xa0088003 0x00000000 0x10082003 0x00000280
|
0xa008e003 0x00000000 0x10088003 0x00000280
|
||||||
0xd80c400d 0x20000780 0xd4147805 0x20000780
|
0xd80c400d 0x20000780 0xd4147805 0x20000780
|
||||||
0x3c7cc1fd 0x6c2087c8 0x2501e001 0x00000003
|
0x3c7cc1fd 0x6c2087c8 0x2501e001 0x00000003
|
||||||
0x10000601 0x2440c280 0x40050025 0x00000780
|
0x10000401 0x2440c280 0x40050025 0x00000780
|
||||||
0x60040225 0x00024780 0x30101225 0xc4100780
|
0x60040225 0x00024780 0x30101225 0xc4100780
|
||||||
0x60040001 0x00024780 0x10088003 0x00000780
|
0x60040001 0x00024780 0x1008e003 0x00000780
|
||||||
0xd4147805 0x20000780 0x1400c001 0x0423c780
|
0xd4147805 0x20000780 0x1400c001 0x0423c780
|
||||||
0x40050025 0x00000780 0x60040225 0x00024780
|
0x40050025 0x00000780 0x60040225 0x00024780
|
||||||
0x30101225 0xc4100780 0x60040001 0x00024780
|
0x30101225 0xc4100780 0x60040001 0x00024780
|
||||||
@@ -649,7 +888,7 @@ code {
|
|||||||
0x04011001 0xe4200780 0x20000c19 0x04014782
|
0x04011001 0xe4200780 0x20000c19 0x04014782
|
||||||
0x3006cffd 0x6c2107c8 0x1000d003 0x00000280
|
0x3006cffd 0x6c2107c8 0x1000d003 0x00000280
|
||||||
0x861ffe03 0x00000000 0x3004cffd 0x6c20c7c8
|
0x861ffe03 0x00000000 0x3004cffd 0x6c20c7c8
|
||||||
0xa00a1003 0x00000000 0x100a1003 0x00000280
|
0xa00a7003 0x00000000 0x100a7003 0x00000280
|
||||||
0x1000ce01 0x0423c780 0x40014e05 0x00200780
|
0x1000ce01 0x0423c780 0x40014e05 0x00200780
|
||||||
0x30100205 0xc4100780 0x60004e01 0x00204780
|
0x30100205 0xc4100780 0x60004e01 0x00204780
|
||||||
0x20000001 0x04010780 0x30070005 0xc4100780
|
0x20000001 0x04010780 0x30070005 0xc4100780
|
||||||
@@ -659,16 +898,16 @@ code {
|
|||||||
0xd00e0201 0xa0c00780 0xf0000001 0xe0000002
|
0xd00e0201 0xa0c00780 0xf0000001 0xe0000002
|
||||||
0x861ffe03 0x00000000 0x00020805 0xc0000780
|
0x861ffe03 0x00000000 0x00020805 0xc0000780
|
||||||
0xd4044005 0x20000780 0x308409fd 0x6c4107c8
|
0xd4044005 0x20000780 0x308409fd 0x6c4107c8
|
||||||
0xa00b4003 0x00000000 0x1400c001 0x0423c780
|
0xa00ba003 0x00000000 0x1400c001 0x0423c780
|
||||||
0x100b4003 0x00000280 0x00020805 0xc0000780
|
0x100ba003 0x00000280 0x00020805 0xc0000780
|
||||||
0xd408400d 0x20000780 0xd4044009 0x20000780
|
0xd408400d 0x20000780 0xd4044009 0x20000780
|
||||||
0x1c00c001 0x0423c780 0x3800c1fd 0x6c2107c8
|
0x1c00c001 0x0423c780 0x3800c1fd 0x6c2107c8
|
||||||
0x1c00c001 0x0423c780 0x20008805 0x0000000b
|
0x1c00c001 0x0423c780 0x20008805 0x0000000b
|
||||||
0x3800c001 0xac200780 0x10000805 0x0403c500
|
0x3800c001 0xac200780 0x10000805 0x0403c500
|
||||||
0x04001001 0xe4204780 0x04011001 0xe4200780
|
0x04001001 0xe4204780 0x04011001 0xe4200780
|
||||||
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
||||||
0x308509fd 0x6c4107c8 0xa00c3003 0x00000000
|
0x308509fd 0x6c4107c8 0xa00c9003 0x00000000
|
||||||
0x100c3003 0x00000280 0x00020805 0xc0000780
|
0x100c9003 0x00000280 0x00020805 0xc0000780
|
||||||
0xd4064009 0x20000780 0x20008805 0x00000007
|
0xd4064009 0x20000780 0x20008805 0x00000007
|
||||||
0x3800c1fd 0x6c2047c8 0x10000805 0x0403c500
|
0x3800c1fd 0x6c2047c8 0x10000805 0x0403c500
|
||||||
0x0002020d 0xc0000780 0x3800c001 0xac200780
|
0x0002020d 0xc0000780 0x3800c001 0xac200780
|
||||||
@@ -707,7 +946,7 @@ code {
|
|||||||
0x1400c205 0x0423c780 0x60004e09 0x00208780
|
0x1400c205 0x0423c780 0x60004e09 0x00208780
|
||||||
0x3401c1fd 0x6c2107c8 0x10048011 0x00000003
|
0x3401c1fd 0x6c2107c8 0x10048011 0x00000003
|
||||||
0x10208001 0x00000003 0x30070405 0xc4100780
|
0x10208001 0x00000003 0x30070405 0xc4100780
|
||||||
0x3006040d 0xc4100780 0x21000801 0x04408280
|
0x3006040d 0xc4100780 0x21000801 0x0440c280
|
||||||
0x20000205 0x0400c780 0x00000005 0xc0000780
|
0x20000205 0x0400c780 0x00000005 0xc0000780
|
||||||
0x2101e800 0x2502e004 0x20208001 0x00000003
|
0x2101e800 0x2502e004 0x20208001 0x00000003
|
||||||
0xd00e0005 0xa0c00781
|
0xd00e0005 0xa0c00781
|
||||||
@@ -820,74 +1059,93 @@ code {
|
|||||||
code {
|
code {
|
||||||
name = cudaFindRiceParameter
|
name = cudaFindRiceParameter
|
||||||
lmem = 0
|
lmem = 0
|
||||||
smem = 2076
|
smem = 2332
|
||||||
reg = 9
|
reg = 10
|
||||||
bar = 1
|
bar = 1
|
||||||
const {
|
const {
|
||||||
segname = const
|
segname = const
|
||||||
segnum = 1
|
segnum = 1
|
||||||
offset = 0
|
offset = 0
|
||||||
bytes = 16
|
bytes = 20
|
||||||
mem {
|
mem {
|
||||||
0x00000010 0x000003ff 0x0000000e 0x00000001
|
0x000003ff 0x00000020 0x0000001f 0x00000001
|
||||||
|
0x0000000e
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
bincode {
|
bincode {
|
||||||
|
0xd0800205 0x00400780 0xa0000211 0x04000780
|
||||||
0x10028009 0x00000003 0x1000cc05 0x0423c780
|
0x10028009 0x00000003 0x1000cc05 0x0423c780
|
||||||
0x30010409 0xc4000780 0x10000005 0x0403c780
|
0xa0000015 0x04000780 0x30030801 0xc4100780
|
||||||
0x3080040d 0xac400780 0xa0000401 0x04000780
|
0x30010405 0xc4000780 0x20000a01 0x04000780
|
||||||
0xd0820609 0x00400780 0x300007fd 0x640107c8
|
0x30810209 0xac400780 0xd0820005 0x04400780
|
||||||
0xa0000411 0x04000780 0x308209fd 0x6440c2c8
|
0x3001040d 0x6c0107d0 0xa00007fd 0x0c0147c8
|
||||||
0xa001b003 0x00000000 0x1001a003 0x00000100
|
0xa001e003 0x00000000 0x30050019 0xec100780
|
||||||
0x2101ec09 0x00000003 0x100f8005 0x00000003
|
0x1001d003 0x00001100 0x2101ec1d 0x00000003
|
||||||
0x30020205 0xc4000780 0x40034e15 0x00200780
|
0x100f800d 0x00000003 0x3007060d 0xc4000780
|
||||||
0x30100a15 0xc4100780 0x60024e15 0x00214780
|
0x40074e21 0x00200780 0x30101021 0xc4100780
|
||||||
0x30020805 0xc4000780 0x20000a09 0x04004780
|
0x60064e21 0x00220780 0x30070c0d 0xc4000780
|
||||||
0x60804c05 0x00600780 0x20000205 0x04008780
|
0x2000101d 0x0400c780 0x60824c0d 0x00604780
|
||||||
|
0x2000060d 0x0401c780 0x3002060d 0xc4100780
|
||||||
|
0x2000ca0d 0x0420c780 0xd00e060d 0x80c00780
|
||||||
|
0x1001e003 0x00000780 0x103f800d 0x000fffff
|
||||||
|
0x1000f81d 0x0403c782 0x20088c21 0x00000003
|
||||||
|
0x30841021 0x6c40c780 0x1000061d 0x2440c280
|
||||||
|
0xa0001021 0x2c014780 0x00020005 0xc0000780
|
||||||
|
0xd0080ffd 0x040007c8 0x04020e01 0xe420c780
|
||||||
|
0xa0039003 0x00000000 0x10038003 0x00000100
|
||||||
|
0x2101ec1d 0x00000003 0x100f800d 0x00000003
|
||||||
|
0x3007060d 0xc4000780 0x40074e21 0x00200780
|
||||||
|
0x30101021 0xc4100780 0x60064e0d 0x00220780
|
||||||
|
0x10088021 0x00000003 0x30070c25 0xc4000780
|
||||||
|
0x30071019 0xc4000780 0x2009860c 0x20038c0c
|
||||||
|
0x61202c0d 0x00000003 0x20000205 0x0400c780
|
||||||
0x30020205 0xc4100780 0x2000ca05 0x04204780
|
0x30020205 0xc4100780 0x2000ca05 0x04204780
|
||||||
0xd00e0209 0x80c00780 0x1001b003 0x00000780
|
0xd00e0205 0x80c00780 0x10039003 0x00000780
|
||||||
0x103f8009 0x000fffff 0x30040805 0xc4100782
|
0x103f8005 0x000fffff 0x04000e01 0xe4204782
|
||||||
0x20000205 0x04000780 0x00020205 0xc0000780
|
0x861ffe03 0x00000000 0x30050a05 0xc4100780
|
||||||
0x04000e01 0xe4208780 0x861ffe03 0x00000000
|
0x20000205 0x04010780 0x00020209 0xc0000780
|
||||||
0x30040009 0xc4100780 0x20000809 0x04008780
|
0xd808380d 0x20000780 0x1900ee18 0x1d00e004
|
||||||
0x00020405 0xc0000780 0x1400ce09 0x0423c780
|
0x861ffe03 0x00000000 0x300603fd 0x6c0107c8
|
||||||
0x861ffe03 0x00000000 0x00020205 0xc0000780
|
0x20088a0d 0x00000003 0x10000a0d 0x0403c500
|
||||||
0x04000e01 0xe4208780 0x861ffe03 0x00000000
|
0x30060219 0xac000780 0x2004801d 0x00000003
|
||||||
0x00020205 0xc0000780 0x20088015 0x00000003
|
0x04020e01 0xe420c780 0x00020e09 0xc0000780
|
||||||
0x3402dffd 0x6c2047c8 0x3402de19 0xac200780
|
0x04000e01 0xe4218780 0x10000005 0x0403c780
|
||||||
0x10000015 0x0403c500 0x2004821d 0x00000003
|
0x3806cffd 0x6c2047c8 0x10000e05 0x0403c280
|
||||||
0x04020e01 0xe4214780 0x00020e09 0xc0000780
|
0x0002020d 0xc0000780 0xdc08380d 0x20000780
|
||||||
0x04000e01 0xe4218780 0x10000209 0x0403c780
|
0x3806ce0d 0xac200780 0x1c00c005 0x0423c780
|
||||||
0x3806cffd 0x6c2047c8 0x10000e09 0x0403c280
|
0x20028019 0x00000003 0x04020e01 0xe4204780
|
||||||
0x0002040d 0xc0000780 0xdc08380d 0x20000780
|
0x00020c09 0xc0000780 0x04000e01 0xe420c780
|
||||||
0x3806ce15 0xac200780 0x1c00c009 0x0423c780
|
0x10000005 0x0403c780 0x3803cffd 0x6c2047c8
|
||||||
0x20028219 0x00000003 0x04020e01 0xe4208780
|
0x10000c05 0x0403c280 0x0002020d 0xc0000780
|
||||||
0x00020c09 0xc0000780 0x04000e01 0xe4214780
|
0xdc08380d 0x20000780 0x3803ce19 0xac200780
|
||||||
0x10000209 0x0403c780 0x3805cffd 0x6c2047c8
|
0x1c00c005 0x0423c780 0x307c0a0d 0x64008780
|
||||||
0x10000c09 0x0403c280 0x0002040d 0xc0000780
|
0x30040415 0x64010780 0x04020e01 0xe4204780
|
||||||
0xdc08380d 0x20000780 0x3805ce15 0xac200780
|
0xd0830605 0x04400780 0xd0830a0d 0x04400780
|
||||||
0x1c00c009 0x0423c780 0x307c0001 0x64008780
|
0x04000e01 0xe4218780 0xd0030215 0x040007c0
|
||||||
0x30040619 0x64010780 0x04020e01 0xe4208780
|
0xa006c003 0x00000000 0x1400d005 0x0423c780
|
||||||
0xd0830001 0x04400780 0xd0830c09 0x04400780
|
0x1006c003 0x00000100 0x3406d005 0x6c204780
|
||||||
0x04000e01 0xe4214780 0xd0020009 0x040007c0
|
0x30000205 0x04000780 0x00020209 0xc0000780
|
||||||
0xa005d003 0x00000000 0x1400d001 0x0423c780
|
0xd8083809 0x20000780 0x0002080d 0xc0000780
|
||||||
0x1005d003 0x00000100 0x40074c01 0x00200780
|
0x1500f004 0x1900e00c 0x0c044e01 0xe420c780
|
||||||
0xa0004e19 0x04200780 0x2102ec1d 0x00000003
|
0x307c0bfd 0x6c0087ca 0x30010c05 0xac000500
|
||||||
0x00020205 0xc0000780 0x30100001 0xc4100780
|
0x00020809 0xc0000500 0x08040e01 0xe4204500
|
||||||
0x30070c19 0xc4000780 0x3405d01d 0x6c204780
|
0x861ffe03 0x00000000 0x30000405 0x6c0107d0
|
||||||
0x60064c01 0x00200780 0x20478204 0x20068000
|
0xa00003fd 0x0c0147c8 0xa0080003 0x00000000
|
||||||
0x00020209 0xc0000780 0x20000801 0x04000780
|
0x10080003 0x00001100 0x40054c11 0x00200780
|
||||||
0xd8083809 0x20000780 0x30020005 0xc4100780
|
0xa0004e05 0x04200780 0x2102ec0d 0x00000003
|
||||||
0x1900e000 0x2101e804 0xd00e0201 0xa0c00780
|
0x30100811 0xc4100780 0x30030205 0xc4000780
|
||||||
0x1400d001 0x0423c780 0x307c05fd 0x6c0087ca
|
0x60044c0d 0x00210780 0x20018604 0x20018004
|
||||||
0x30000003 0x00000280 0x2101ec09 0x00000003
|
0xd4113809 0x20000780 0x3002020d 0xc4100780
|
||||||
0x40074c21 0x00200780 0x10018005 0x00000003
|
0x1900e004 0x2103e80c 0xd00e0605 0xa0c00780
|
||||||
0xa0004e19 0x04200780 0x2102ec1d 0x00000003
|
0xf0000001 0xe0000002 0x30000003 0x00000100
|
||||||
0x30101021 0xc4100780 0x30020205 0xc4000780
|
0x2101ec0d 0x00000003 0x40054c19 0x00200780
|
||||||
0x30070c09 0xc4000780 0x60064c0d 0x00220780
|
0x10018005 0x00000003 0xa0004e11 0x04200780
|
||||||
0x20028204 0x20048608 0x20000405 0x04004780
|
0x2102ec15 0x00000003 0x30100c19 0xc4100780
|
||||||
0x30020205 0xc4100780 0x30000a01 0xac000780
|
0x30030205 0xc4000780 0x3005080d 0xc4000780
|
||||||
0x2000c805 0x04204780 0xd00e0201 0xa0c00781
|
0x60044c09 0x00218780 0x20038204 0x20008400
|
||||||
|
0x20000001 0x04004780 0xd4103805 0x20000780
|
||||||
|
0x30020005 0xc4100780 0x1500e000 0x2101e804
|
||||||
|
0xd00e0201 0xa0c00781
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
code {
|
code {
|
||||||
@@ -1199,25 +1457,27 @@ code {
|
|||||||
0x1000f801 0x0403c100 0x301f0005 0xec100780
|
0x1000f801 0x0403c100 0x301f0005 0xec100780
|
||||||
0x30010001 0xc4100780 0xd0000201 0x04008780
|
0x30010001 0xc4100780 0xd0000201 0x04008780
|
||||||
0x00000e05 0xc0000780 0x30820001 0xac400780
|
0x00000e05 0xc0000780 0x30820001 0xac400780
|
||||||
0x04001401 0xe4200780 0xd0105009 0x20000780
|
0x04001401 0xe4200780 0x861ffe03 0x00000000
|
||||||
0x20018c05 0x00000003 0x1000d001 0x0423c780
|
0x307cd1fd 0x6c20c7c8 0x1000f815 0x0403c780
|
||||||
0x1100f014 0x40010424 0x3800c015 0x04214780
|
0x1004a003 0x00000280 0x1000d001 0x0423c780
|
||||||
0x60000629 0x00024780 0x400b0425 0x00000780
|
0x40090005 0x00000780 0x60080205 0x00004780
|
||||||
0x30101429 0xc4100780 0x600a0625 0x00024780
|
0x30100205 0xc4100780 0x60080001 0x00004780
|
||||||
0x200005fd 0x040107c8 0x60000401 0x00028780
|
0x200a8005 0x00000003 0x00020205 0xc0000780
|
||||||
0x30101209 0xc4100780 0x600a0401 0x00008100
|
0xa0049003 0x00000000 0x2000d005 0x04200780
|
||||||
|
0x20018001 0x00000003 0x3606c225 0xec200780
|
||||||
|
0x300101fd 0x6c0147c8 0x20000a15 0x04024780
|
||||||
|
0x10044003 0x00000280 0xf0000001 0xe0000002
|
||||||
|
0x200005fd 0x040107c8 0xa005d003 0x00000000
|
||||||
|
0x20018c05 0x00000003 0x10057003 0x00000280
|
||||||
|
0xd0105005 0x20000780 0x1400c001 0x0423c780
|
||||||
|
0x2040d001 0x04200780 0x40010409 0x00000780
|
||||||
|
0x60000609 0x00008780 0x30100409 0xc4100780
|
||||||
|
0x60000401 0x00008780 0x20000001 0x04014780
|
||||||
|
0x1005d003 0x00000780 0x1000d001 0x0423c780
|
||||||
|
0x40010409 0x00000780 0x60000609 0x00008780
|
||||||
|
0x30100409 0xc4100780 0x60000401 0x00008780
|
||||||
|
0x20000001 0x04014780 0x00000e05 0xc0000782
|
||||||
0x04021401 0xe4200780 0x861ffe03 0x00000000
|
0x04021401 0xe4200780 0x861ffe03 0x00000000
|
||||||
0x307cd1fd 0x6c20c7c8 0x1005b003 0x00000280
|
|
||||||
0x1000d001 0x0423c780 0x40090005 0x00000780
|
|
||||||
0x60080205 0x00004780 0x30100205 0xc4100780
|
|
||||||
0x60080005 0x00004780 0x00000e05 0xc0000780
|
|
||||||
0x200a8201 0x00000003 0xd4085009 0x20000780
|
|
||||||
0x00020005 0xc0000780 0xa005a003 0x00000000
|
|
||||||
0x1900e000 0x2101f008 0x3606c215 0xec200780
|
|
||||||
0x20018205 0x00000003 0x20000001 0x04014780
|
|
||||||
0x00000e09 0xc0000780 0x300203fd 0x6c0147c8
|
|
||||||
0x08021401 0xe4200780 0x10053003 0x00000280
|
|
||||||
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
|
||||||
0x300607fd 0x640107c8 0x308309fd 0x6440c2c8
|
0x300607fd 0x640107c8 0x308309fd 0x6440c2c8
|
||||||
0x30000003 0x00000100 0x2101ee05 0x00000003
|
0x30000003 0x00000100 0x2101ee05 0x00000003
|
||||||
0x100f8001 0x00000003 0x30010001 0xc4000780
|
0x100f8001 0x00000003 0x30010001 0xc4000780
|
||||||
@@ -1271,63 +1531,61 @@ code {
|
|||||||
bincode {
|
bincode {
|
||||||
0xd0800205 0x00400780 0xa000020d 0x04000780
|
0xd0800205 0x00400780 0xa000020d 0x04000780
|
||||||
0xa0000001 0x04000780 0x30040605 0xc4100780
|
0xa0000001 0x04000780 0x30040605 0xc4100780
|
||||||
0x20000005 0x04004780 0x308103fd 0x644107c8
|
0x20000009 0x04004780 0x308105fd 0x644107c8
|
||||||
0xa0011003 0x00000000 0x30020211 0xc4100780
|
0xa0011003 0x00000000 0x30020411 0xc4100780
|
||||||
0x10011003 0x00000280 0xa0004e09 0x04200780
|
0x10011003 0x00000280 0xa0004e05 0x04200780
|
||||||
0x30070415 0xc4100780 0x30060409 0xc4100780
|
0x30070215 0xc4100780 0x30060205 0xc4100780
|
||||||
0x20028a08 0x2102ec08 0x20000809 0x04008780
|
0x20018a04 0x2101ec04 0x20000805 0x04004780
|
||||||
0xd00e0409 0x80c00780 0x00000805 0xc0000780
|
0xd00e0205 0x80c00780 0x00000805 0xc0000780
|
||||||
0x04041401 0xe4208780 0xf0000001 0xe0000002
|
0x04041401 0xe4204780 0xf0000001 0xe0000002
|
||||||
0x861ffe03 0x00000000 0x00000805 0xc0000780
|
0x861ffe03 0x00000000 0x307cd1fd 0x6c20c7c8
|
||||||
0x307cd1fd 0x6c20c7c8 0x04021401 0xe43f0780
|
0x1000f819 0x0403c780 0x10046003 0x00000280
|
||||||
0x1004d003 0x00000280 0xa0004c15 0x04200780
|
0xa0004c15 0x04200780 0x1000d005 0x0423c780
|
||||||
0x1000d009 0x0423c780 0x400a0a19 0x00000780
|
0x400a061d 0x00000780 0x30100e1d 0xc4100780
|
||||||
0x30100c19 0xc4100780 0x600a0809 0x00018780
|
0x600a0405 0x0001c780 0x3082d015 0xac600780
|
||||||
0x3082d015 0xac600780 0x2000d01d 0x04208780
|
0x2000d021 0x04204780 0xa0045003 0x00000000
|
||||||
0xa004c003 0x00000000 0x307c0bfd 0x6c0107c8
|
0x30000bfd 0x6c0107c8 0x2001841c 0x20088420
|
||||||
0x20028218 0x2007821c 0xd0105005 0x20000780
|
0x10000405 0x0403c780 0xd0105005 0x20000780
|
||||||
0x3406c1fd 0x6c20c7d8 0x3001d1fd 0x6c2112d8
|
0x3407c1fd 0x6c20c7d8 0x3001d1fd 0x6c2112d8
|
||||||
0xa002d003 0x00000000 0x1002c003 0x00001100
|
0xa002d003 0x00000000 0x1002c003 0x00001100
|
||||||
0xd010a005 0x20000780 0x2400c009 0x04218780
|
0xd010a005 0x20000780 0x2400c009 0x0421c780
|
||||||
0x30020409 0xc4100780 0x2000ca09 0x04208780
|
0x30020409 0xc4100780 0x2000ca09 0x04208780
|
||||||
0xd00e0409 0x80c00780 0x1002d003 0x00000780
|
0xd00e0425 0x80c00780 0x1002d003 0x00000780
|
||||||
0x1000f809 0x0403c780 0x301f0421 0xec100782
|
0x1000f825 0x0403c780 0x301f1209 0xec100782
|
||||||
0x30010409 0xc4100780 0xd0021009 0x04008780
|
0x30011225 0xc4100780 0xd0090409 0x04008780
|
||||||
0x00000805 0xc0000780 0x30830409 0xac400780
|
0x00000805 0xc0000780 0x30830409 0xac400780
|
||||||
0x04001401 0xe4208780 0x861ffe03 0x00000000
|
0x04001401 0xe4208780 0x861ffe03 0x00000000
|
||||||
0x10043003 0x00000100 0x200a8009 0x00000003
|
0xa003f003 0x00000000 0x10000009 0x0403c780
|
||||||
0x00020405 0xc0000780 0xa0042003 0x00000000
|
0x1003f003 0x00000100 0x200a8025 0x00000003
|
||||||
0x10008008 0x20058020 0x00000809 0xc0000780
|
0x00021205 0xc0000780 0x20108409 0x00000003
|
||||||
0x3403c025 0xec200780 0xd808500d 0x20000780
|
0x3403c025 0xec200780 0x30020bfd 0x6c0107d8
|
||||||
0x20108409 0x00000003 0x2c00c025 0x04224780
|
0x20000c19 0x04024780 0xd4008005 0x20000780
|
||||||
0x300805fd 0x6c0047d8 0x08021401 0xe4224780
|
0x10039003 0x00001280 0xf0000001 0xe0000002
|
||||||
0xd4008005 0x20000780 0x10039003 0x00001280
|
0x861ffe03 0x00000000 0x20008e1d 0x00000013
|
||||||
0xf0000001 0xe0000002 0x00000805 0xc0000780
|
0x30080ffd 0x6c0047d8 0x20008205 0x00000013
|
||||||
0xd4085009 0x20000780 0x3883c009 0xac600780
|
0x10021003 0x00001280 0xf0000001 0xe0000002
|
||||||
0x04021401 0xe4208780 0x861ffe03 0x00000000
|
0x30830c05 0xac400780 0x00000805 0xc0000780
|
||||||
0x20008c19 0x00000013 0x30070dfd 0x6c0047d8
|
0x04021401 0xe4204780 0xd4085009 0x20000780
|
||||||
0x20008205 0x00000013 0x10021003 0x00001280
|
0x1900f004 0x2901e004 0x04021401 0xe4204780
|
||||||
0xf0000001 0xe0000002 0x00000805 0xc0000780
|
0x1900e804 0x2901e004 0x04021401 0xe4204780
|
||||||
0xd4085009 0x20000780 0x1900f004 0x2901e004
|
0x1900e404 0x2901e004 0x04021401 0xe4204780
|
||||||
0x04021401 0xe4204780 0x1900e804 0x2901e004
|
0x1800c205 0x0423c780 0x307c01fd 0x640087c8
|
||||||
0x04021401 0xe4204780 0x1900e404 0x2901e004
|
0x2800c001 0x04204780 0x308407fd 0x6440c2c8
|
||||||
0x04021401 0xe4204780 0x1800c205 0x0423c780
|
0x04021401 0xe4200780 0x30000003 0x00000100
|
||||||
0x307c01fd 0x640087c8 0x2800c001 0x04204780
|
0xd0105005 0x20000780 0x2101ee19 0x00000003
|
||||||
0x308407fd 0x6440c2c8 0x04021401 0xe4200780
|
0x100f8005 0x00000003 0x20018609 0x00000003
|
||||||
0x30000003 0x00000100 0xd0105005 0x20000780
|
0x1000d001 0x0423c780 0x30060205 0xc4000780
|
||||||
0x2101ee19 0x00000003 0x100f8005 0x00000003
|
0x1100f014 0x41032e1c 0x40000a24 0x3505e014
|
||||||
0x20018609 0x00000003 0x1000d001 0x0423c780
|
0x30100e21 0xc4100780 0x6001081d 0x00024780
|
||||||
0x30060205 0xc4000780 0x1100f014 0x41032e1c
|
0x3006060d 0xc4000780 0x400b0819 0x00000780
|
||||||
0x40000a24 0x3505e014 0x30100e21 0xc4100780
|
0x60024e05 0x00220780 0x00000805 0xc0000780
|
||||||
0x6001081d 0x00024780 0x3006060d 0xc4000780
|
0x30100e1d 0xc4100780 0x600a0a11 0x00018780
|
||||||
0x400b0819 0x00000780 0x60024e05 0x00220780
|
0x2000020d 0x0400c780 0xa0004c05 0x042007c0
|
||||||
0x00000805 0xc0000780 0x30100e1d 0xc4100780
|
0x60000801 0x0001c780 0x30100811 0xc4100780
|
||||||
0x600a0a11 0x00018780 0x2000020d 0x0400c780
|
0xd4085005 0x20000780 0x20000205 0x0400c780
|
||||||
0xa0004c05 0x042007c0 0x60000801 0x0001c780
|
0x600a0801 0x00010100 0x3483c009 0xac600780
|
||||||
0x30100811 0xc4100780 0xd4085005 0x20000780
|
0x30020205 0xc4100780 0x20028000 0x2101e804
|
||||||
0x20000205 0x0400c780 0x600a0801 0x00010100
|
0xd00e0201 0xa0c00781
|
||||||
0x3483c009 0xac600780 0x30020205 0xc4100780
|
|
||||||
0x20028000 0x2101e804 0xd00e0201 0xa0c00781
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
code {
|
code {
|
||||||
|
|||||||
Reference in New Issue
Block a user