trying to do rice partitioning on gpu

This commit is contained in:
chudov
2009-10-01 01:35:44 +00:00
parent a02f79a3f8
commit 556edb7127
3 changed files with 607 additions and 10 deletions

View File

@@ -969,11 +969,37 @@ namespace CUETools.Codecs.FlaCuda
lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift); lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
else if (encode_on_cpu) else if (encode_on_cpu)
lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift); lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32 || encode_on_cpu)
int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order); {
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order); int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits + 6; int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order); uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits + 6;
//uint oldsize = frame.subframes[ch].best.size;
frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
//if (frame.subframes[ch].best.size > frame.subframes[ch].obits * (uint)frame.blocksize &&
// oldsize <= frame.subframes[ch].obits * (uint)frame.blocksize)
// throw new Exception("oops");
}
else
{
// residual
int len = frame.subframes[ch].best.order * (int)frame.subframes[ch].obits + 6 +
4 + 5 + frame.subframes[ch].best.order * frame.subframes[ch].best.cbits +
(4 << frame.subframes[ch].best.rc.porder);
int j = frame.subframes[ch].best.order;
int psize = frame.blocksize >> frame.subframes[ch].best.rc.porder;
for (int p = 0; p < (1 << frame.subframes[ch].best.rc.porder); p++)
{
int k = frame.subframes[ch].best.rc.rparams[p];
int cnt = p == 0 ? psize - frame.subframes[ch].best.order : psize;
len += (k + 1) * cnt;
for (int i = j; i < j + cnt; i++)
len += (((frame.subframes[ch].best.residual[i] << 1) ^ (frame.subframes[ch].best.residual[i] >> 31)) >> k);
j += cnt;
}
if (len != frame.subframes[ch].best.size)
throw new Exception(string.Format("length mismatch: {0} vs {1}", len, frame.subframes[ch].best.size));
}
} }
break; break;
} }
@@ -1031,8 +1057,35 @@ namespace CUETools.Codecs.FlaCuda
frame.subframes[ch].samples[i] >>= (int)frame.subframes[ch].wbits; frame.subframes[ch].samples[i] >>= (int)frame.subframes[ch].wbits;
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++) for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
if (!encode_on_cpu) if (!encode_on_cpu && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
{
frame.subframes[ch].best.size = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 6;
if (frame.subframes[ch].best.type == SubframeType.LPC)
frame.subframes[ch].best.size += 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits;
AudioSamples.MemCpy(frame.subframes[ch].best.residual + frame.subframes[ch].best.order, (int*)task.residualBufferPtr + task.BestResidualTasks[index].residualOffs, frame.blocksize - frame.subframes[ch].best.order); AudioSamples.MemCpy(frame.subframes[ch].best.residual + frame.subframes[ch].best.order, (int*)task.residualBufferPtr + task.BestResidualTasks[index].residualOffs, frame.blocksize - frame.subframes[ch].best.order);
int* riceParams = ((int*)task.riceParamsPtr) + (4 << task.max_porder) * index;
int* partLengths = ((int*)task.riceParamsPtr) + (4 << task.max_porder) * index + (2 << task.max_porder);
int opt_porder = task.max_porder;
int opt_pos = 0;
int opt_bits = 0xfffffff;
for (int porder = task.max_porder; porder >= 0; porder--)
{
int in_pos = (2 << task.max_porder) - (2 << porder);
int sum = (1 << porder) * 4;
for (int p = 0; p < (1 << porder); p++)
sum += partLengths[in_pos + p];// +(riceParams[in_pos + p] + 1) * ((frame.blocksize >> porder) - (p != 0 ? 0 : frame.subframes[ch].best.order));
if (sum < opt_bits)
{
opt_bits = sum;
opt_porder = porder;
opt_pos = in_pos;
}
}
frame.subframes[ch].best.rc.porder = opt_porder;
for (int i = 0; i < (1 << opt_porder); i++)
frame.subframes[ch].best.rc.rparams[i] = riceParams[opt_pos + i];
frame.subframes[ch].best.size += (uint)opt_bits;
}
} }
} }
} }
@@ -1069,7 +1122,16 @@ namespace CUETools.Codecs.FlaCuda
if (residualPartCount > maxResidualParts) if (residualPartCount > maxResidualParts)
throw new Exception("invalid combination of block size and LPC order"); throw new Exception("invalid combination of block size and LPC order");
int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
int psize = task.frameSize >> max_porder;
while (psize < 16 && max_porder > 0)
{
psize <<= 1;
max_porder--;
}
CUfunction cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : task.cudaChannelDecorr; CUfunction cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : task.cudaChannelDecorr;
CUfunction cudaCalcPartition = psize >= 128 ? task.cudaCalcLargePartition : task.cudaCalcPartition;
cuda.SetParameter(cudaChannelDecorr, 0 * sizeof(uint), (uint)task.cudaSamples.Pointer); cuda.SetParameter(cudaChannelDecorr, 0 * sizeof(uint), (uint)task.cudaSamples.Pointer);
cuda.SetParameter(cudaChannelDecorr, 1 * sizeof(uint), (uint)task.cudaSamplesBytes.Pointer); cuda.SetParameter(cudaChannelDecorr, 1 * sizeof(uint), (uint)task.cudaSamplesBytes.Pointer);
@@ -1145,6 +1207,24 @@ namespace CUETools.Codecs.FlaCuda
cuda.SetParameterSize(task.cudaEncodeResidual, sizeof(uint) * 3U); cuda.SetParameterSize(task.cudaEncodeResidual, sizeof(uint) * 3U);
cuda.SetFunctionBlockShape(task.cudaEncodeResidual, residualPartSize, 1, 1); cuda.SetFunctionBlockShape(task.cudaEncodeResidual, residualPartSize, 1, 1);
cuda.SetParameter(cudaCalcPartition, 0, (uint)task.cudaPartitions.Pointer);
cuda.SetParameter(cudaCalcPartition, 1 * sizeof(uint), (uint)task.cudaResidual.Pointer);
cuda.SetParameter(cudaCalcPartition, 2 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer);
cuda.SetParameter(cudaCalcPartition, 3 * sizeof(uint), (uint)max_porder);
cuda.SetParameterSize(cudaCalcPartition, 4U * sizeof(uint));
cuda.SetFunctionBlockShape(cudaCalcPartition, 16, 16, 1);
cuda.SetParameter(task.cudaSumPartition, 0, (uint)task.cudaPartitions.Pointer);
cuda.SetParameter(task.cudaSumPartition, 1 * sizeof(uint), (uint)max_porder);
cuda.SetParameterSize(task.cudaSumPartition, 2U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaSumPartition, 256, 1, 1);
cuda.SetParameter(task.cudaFindRiceParameter, 0, (uint)task.cudaRiceParams.Pointer);
cuda.SetParameter(task.cudaFindRiceParameter, 1 * sizeof(uint), (uint)task.cudaPartitions.Pointer);
cuda.SetParameter(task.cudaFindRiceParameter, 2 * sizeof(uint), (uint)max_porder);
cuda.SetParameterSize(task.cudaFindRiceParameter, 3U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaFindRiceParameter, 16, 16, 1);
// issue work to the GPU // issue work to the GPU
cuda.LaunchAsync(cudaChannelDecorr, (task.frameCount * task.frameSize + 255) / 256, channels == 2 ? 1 : channels, task.stream); cuda.LaunchAsync(cudaChannelDecorr, (task.frameCount * task.frameSize + 255) / 256, channels == 2 ? 1 : channels, task.stream);
if (task.frameSize <= 512 && eparams.max_prediction_order <= 12) if (task.frameSize <= 512 && eparams.max_prediction_order <= 12)
@@ -1163,10 +1243,18 @@ namespace CUETools.Codecs.FlaCuda
else else
cuda.LaunchAsync(task.cudaCopyBestMethod, 1, channels * task.frameCount, task.stream); cuda.LaunchAsync(task.cudaCopyBestMethod, 1, channels * task.frameCount, task.stream);
if (!encode_on_cpu) if (!encode_on_cpu)
{
int bsz = (psize >= 128) ? psize : (256 / psize) * psize;
cuda.LaunchAsync(task.cudaEncodeResidual, residualPartCount, channels * task.frameCount, task.stream); cuda.LaunchAsync(task.cudaEncodeResidual, residualPartCount, channels * task.frameCount, task.stream);
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * channels * task.frameCount), task.stream); cuda.LaunchAsync(cudaCalcPartition, (task.frameSize + bsz - 1) / bsz, channels * task.frameCount, task.stream);
if (!encode_on_cpu) if (max_porder > 0)
cuda.LaunchAsync(task.cudaSumPartition, Flake.MAX_RICE_PARAM + 1, channels * task.frameCount, task.stream);
cuda.LaunchAsync(task.cudaFindRiceParameter, ((2 << max_porder) + 15) / 16, channels * task.frameCount, task.stream);
cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * MAX_BLOCKSIZE * channels), task.stream); cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * MAX_BLOCKSIZE * channels), task.stream);
cuda.CopyDeviceToHostAsync(task.cudaRiceParams, task.riceParamsPtr, (uint)(sizeof(int) * (4 << max_porder) * channels * task.frameCount), task.stream);
task.max_porder = max_porder;
}
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * channels * task.frameCount), task.stream);
} }
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task) unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task)
@@ -1856,9 +1944,15 @@ namespace CUETools.Codecs.FlaCuda
public CUfunction cudaCopyBestMethod; public CUfunction cudaCopyBestMethod;
public CUfunction cudaCopyBestMethodStereo; public CUfunction cudaCopyBestMethodStereo;
public CUfunction cudaEncodeResidual; public CUfunction cudaEncodeResidual;
public CUfunction cudaCalcPartition;
public CUfunction cudaCalcLargePartition;
public CUfunction cudaSumPartition;
public CUfunction cudaFindRiceParameter;
public CUdeviceptr cudaSamplesBytes; public CUdeviceptr cudaSamplesBytes;
public CUdeviceptr cudaSamples; public CUdeviceptr cudaSamples;
public CUdeviceptr cudaResidual; public CUdeviceptr cudaResidual;
public CUdeviceptr cudaPartitions;
public CUdeviceptr cudaRiceParams;
public CUdeviceptr cudaAutocorTasks; public CUdeviceptr cudaAutocorTasks;
public CUdeviceptr cudaAutocorOutput; public CUdeviceptr cudaAutocorOutput;
public CUdeviceptr cudaResidualTasks; public CUdeviceptr cudaResidualTasks;
@@ -1867,6 +1961,7 @@ namespace CUETools.Codecs.FlaCuda
public IntPtr samplesBytesPtr = IntPtr.Zero; public IntPtr samplesBytesPtr = IntPtr.Zero;
public IntPtr samplesBufferPtr = IntPtr.Zero; public IntPtr samplesBufferPtr = IntPtr.Zero;
public IntPtr residualBufferPtr = IntPtr.Zero; public IntPtr residualBufferPtr = IntPtr.Zero;
public IntPtr riceParamsPtr = IntPtr.Zero;
public IntPtr autocorTasksPtr = IntPtr.Zero; public IntPtr autocorTasksPtr = IntPtr.Zero;
public IntPtr residualTasksPtr = IntPtr.Zero; public IntPtr residualTasksPtr = IntPtr.Zero;
public IntPtr bestResidualTasksPtr = IntPtr.Zero; public IntPtr bestResidualTasksPtr = IntPtr.Zero;
@@ -1883,6 +1978,7 @@ namespace CUETools.Codecs.FlaCuda
public int nAutocorTasks = 0; public int nAutocorTasks = 0;
public int nResidualTasksPerChannel = 0; public int nResidualTasksPerChannel = 0;
public int nAutocorTasksPerChannel = 0; public int nAutocorTasksPerChannel = 0;
public int max_porder = 0;
unsafe public FlaCudaTask(CUDA _cuda, int channelCount) unsafe public FlaCudaTask(CUDA _cuda, int channelCount)
{ {
@@ -1892,10 +1988,14 @@ namespace CUETools.Codecs.FlaCuda
residualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames; residualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames; bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames;
samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount; samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FlaCudaWriter.maxFrames;
int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FlaCudaWriter.maxFrames;
cudaSamplesBytes = cuda.Allocate((uint)samplesBufferLen / 2); cudaSamplesBytes = cuda.Allocate((uint)samplesBufferLen / 2);
cudaSamples = cuda.Allocate((uint)samplesBufferLen); cudaSamples = cuda.Allocate((uint)samplesBufferLen);
cudaResidual = cuda.Allocate((uint)samplesBufferLen); cudaResidual = cuda.Allocate((uint)samplesBufferLen);
cudaPartitions = cuda.Allocate((uint)partitionsLen);
cudaRiceParams = cuda.Allocate((uint)riceParamsLen);
cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen); cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen);
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames))); cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen); cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
@@ -1908,6 +2008,8 @@ namespace CUETools.Codecs.FlaCuda
cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen); cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen);
if (cuErr == CUResult.Success) if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen); cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref riceParamsPtr, (uint)riceParamsLen);
if (cuErr == CUResult.Success) if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)autocorTasksLen); cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)autocorTasksLen);
if (cuErr == CUResult.Success) if (cuErr == CUResult.Success)
@@ -1919,6 +2021,7 @@ namespace CUETools.Codecs.FlaCuda
if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero; if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero;
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero; if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero; if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero;
if (riceParamsPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(riceParamsPtr); riceParamsPtr = IntPtr.Zero;
if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero; if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero; if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero; if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero;
@@ -1937,6 +2040,10 @@ namespace CUETools.Codecs.FlaCuda
cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod"); cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod");
cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo"); cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo");
cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual"); cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
cudaCalcPartition = cuda.GetModuleFunction("cudaCalcPartition");
cudaCalcLargePartition = cuda.GetModuleFunction("cudaCalcLargePartition");
cudaSumPartition = cuda.GetModuleFunction("cudaSumPartition");
cudaFindRiceParameter = cuda.GetModuleFunction("cudaFindRiceParameter");
stream = cuda.CreateStream(); stream = cuda.CreateStream();
verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; // should be channels, not channelCount. And should null if not doing verify! verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; // should be channels, not channelCount. And should null if not doing verify!
@@ -1948,6 +2055,7 @@ namespace CUETools.Codecs.FlaCuda
cuda.Free(cudaSamples); cuda.Free(cudaSamples);
cuda.Free(cudaSamplesBytes); cuda.Free(cudaSamplesBytes);
cuda.Free(cudaResidual); cuda.Free(cudaResidual);
cuda.Free(cudaPartitions);
cuda.Free(cudaAutocorTasks); cuda.Free(cudaAutocorTasks);
cuda.Free(cudaAutocorOutput); cuda.Free(cudaAutocorOutput);
cuda.Free(cudaResidualTasks); cuda.Free(cudaResidualTasks);
@@ -1956,6 +2064,7 @@ namespace CUETools.Codecs.FlaCuda
CUDADriver.cuMemFreeHost(samplesBytesPtr); CUDADriver.cuMemFreeHost(samplesBytesPtr);
CUDADriver.cuMemFreeHost(samplesBufferPtr); CUDADriver.cuMemFreeHost(samplesBufferPtr);
CUDADriver.cuMemFreeHost(residualBufferPtr); CUDADriver.cuMemFreeHost(residualBufferPtr);
CUDADriver.cuMemFreeHost(riceParamsPtr);
CUDADriver.cuMemFreeHost(residualTasksPtr); CUDADriver.cuMemFreeHost(residualTasksPtr);
CUDADriver.cuMemFreeHost(bestResidualTasksPtr); CUDADriver.cuMemFreeHost(bestResidualTasksPtr);
CUDADriver.cuMemFreeHost(autocorTasksPtr); CUDADriver.cuMemFreeHost(autocorTasksPtr);

View File

@@ -56,7 +56,8 @@ typedef struct
int coefs[32]; int coefs[32];
} encodeResidualTaskStruct; } encodeResidualTaskStruct;
#define SUM32(buf,tid,op) buf[tid] op buf[tid + 16]; buf[tid] op buf[tid + 8]; buf[tid] op buf[tid + 4]; buf[tid] op buf[tid + 2]; buf[tid] op buf[tid + 1]; #define SUM16(buf,tid,op) buf[tid] op buf[tid + 8]; buf[tid] op buf[tid + 4]; buf[tid] op buf[tid + 2]; buf[tid] op buf[tid + 1];
#define SUM32(buf,tid,op) buf[tid] op buf[tid + 16]; SUM16(buf,tid,op)
#define SUM64(buf,tid,op) if (tid < 32) buf[tid] op buf[tid + 32]; __syncthreads(); if (tid < 32) { SUM32(buf,tid,op) } #define SUM64(buf,tid,op) if (tid < 32) buf[tid] op buf[tid + 32]; __syncthreads(); if (tid < 32) { SUM32(buf,tid,op) }
#define SUM128(buf,tid,op) if (tid < 64) buf[tid] op buf[tid + 64]; __syncthreads(); SUM64(buf,tid,op) #define SUM128(buf,tid,op) if (tid < 64) buf[tid] op buf[tid + 64]; __syncthreads(); SUM64(buf,tid,op)
#define SUM256(buf,tid,op) if (tid < 128) buf[tid] op buf[tid + 128]; __syncthreads(); SUM128(buf,tid,op) #define SUM256(buf,tid,op) if (tid < 128) buf[tid] op buf[tid + 128]; __syncthreads(); SUM128(buf,tid,op)
@@ -804,4 +805,152 @@ extern "C" __global__ void cudaEncodeResidual(
if (tid < residualLen) if (tid < residualLen)
output[shared.task.residualOffs + pos + tid] = shared.data[tid + shared.task.residualOrder] - (sum >> shared.task.shift); output[shared.task.residualOffs + pos + tid] = shared.data[tid + shared.task.residualOrder] - (sum >> shared.task.shift);
} }
extern "C" __global__ void cudaCalcPartition(
int* partition_lengths,
int* residual,
encodeResidualTaskStruct *tasks,
int max_porder // <= 8
)
{
__shared__ struct {
int data[256];
int length[256];
encodeResidualTaskStruct task;
} shared;
const int tid = threadIdx.x + (threadIdx.y << 4);
if (tid < sizeof(shared.task) / sizeof(int))
((int*)&shared.task)[tid] = ((int*)(&tasks[blockIdx.y]))[tid];
__syncthreads();
const int psize = (shared.task.blocksize >> max_porder); // 18
const int parts_per_block = 256 / psize; // 14
const int parts = min(parts_per_block, (1 << max_porder) - blockIdx.x * parts_per_block);
// fetch residual
shared.data[tid] = ((blockIdx.x != 0 || tid >= shared.task.residualOrder) && tid < parts * psize) ? residual[shared.task.residualOffs + blockIdx.x * psize * parts_per_block + tid - shared.task.residualOrder] : 0;
// convert to unsigned
shared.data[tid] = (shared.data[tid] << 1) ^ (shared.data[tid] >> 31);
__syncthreads();
// calc number of unary bits for each residual part with each rice paramater
shared.length[tid] = (psize - shared.task.residualOrder * (threadIdx.y + blockIdx.x == 0)) * (threadIdx.x + 1);
if (threadIdx.y < parts)
for (int i = 0; i < psize; i++)
// for part (threadIdx.y) with this rice paramater (threadIdx.x)
shared.length[tid] = min(0xfffff, shared.length[tid] + (shared.data[threadIdx.y * psize + i] >> threadIdx.x));
__syncthreads();
// output length (transposed: k is now threadIdx.y)
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (threadIdx.y << (max_porder + 1));
if (threadIdx.y <= 14 && threadIdx.x < parts)
partition_lengths[pos + blockIdx.x * parts_per_block + threadIdx.x] = shared.length[threadIdx.y + (threadIdx.x << 4)];
}
extern "C" __global__ void cudaCalcLargePartition(
int* partition_lengths,
int* residual,
encodeResidualTaskStruct *tasks,
int max_porder // <= 8
)
{
__shared__ struct {
int data[256];
int length[256];
encodeResidualTaskStruct task;
} shared;
const int tid = threadIdx.x + (threadIdx.y << 4);
if (tid < sizeof(shared.task) / sizeof(int))
((int*)&shared.task)[tid] = ((int*)(&tasks[blockIdx.y]))[tid];
__syncthreads();
const int psize = (shared.task.blocksize >> max_porder); // 18
shared.length[tid] = 0;
for (int pos = 0; pos < psize; pos += 256)
{
// fetch residual
shared.data[tid] = ((blockIdx.x != 0 || pos + tid >= shared.task.residualOrder) && pos + tid < psize) ? residual[shared.task.residualOffs + blockIdx.x * psize + pos + tid - shared.task.residualOrder] : 0;
// convert to unsigned
shared.data[tid] = (shared.data[tid] << 1) ^ (shared.data[tid] >> 31);
__syncthreads();
// calc number of unary bits for each residual sample with each rice paramater
for (int i = 0; i < 256; i += 16)
// for sample (i + threadIdx.x) with this rice paramater (threadIdx.y)
shared.length[tid] = min(0xfffff, shared.length[tid] + (shared.data[i + threadIdx.x] >> threadIdx.y));
__syncthreads();
}
__syncthreads();
SUM16(shared.length,tid,+=);
__syncthreads();
// output length
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (threadIdx.y << (max_porder + 1));
if (threadIdx.y <= 14 && threadIdx.x == 0)
partition_lengths[pos + blockIdx.x] = shared.length[tid] + (psize - shared.task.residualOrder * (blockIdx.x == 0)) * (threadIdx.y + 1);
}
// Sums partition lengths for a certain k == blockIdx.x
// Requires 256 threads
extern "C" __global__ void cudaSumPartition(
int* partition_lengths,
int max_porder
)
{
__shared__ struct {
int data[512];
} shared;
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (blockIdx.x << (max_porder + 1));
// fetch residual
shared.data[threadIdx.x] = threadIdx.x < (1 << max_porder) ? partition_lengths[pos + threadIdx.x] : 0;
__syncthreads();
for (int porder = max_porder - 1; porder >= 0; porder--)
{
const int in_pos = (2 << max_porder) - (4 << porder);
const int out_pos = (2 << max_porder) - (2 << porder);
if (threadIdx.x < (1 << porder)) shared.data[out_pos + threadIdx.x] = shared.data[in_pos + (threadIdx.x << 1)] + shared.data[in_pos + (threadIdx.x << 1) + 1];
__syncthreads();
}
if (threadIdx.x < (1 << max_porder))
partition_lengths[pos + (1 << max_porder) + threadIdx.x] = shared.data[(1 << max_porder) + threadIdx.x];
}
// Finds optimal rice parameter for up to 16 partitions at a time.
// Requires 16x16 threads
extern "C" __global__ void cudaFindRiceParameter(
int* output,
int* partition_lengths,
int max_porder
)
{
__shared__ struct {
int length[256];
int tmp[256];
} shared;
const int tid = threadIdx.x + (threadIdx.y << 4);
const int parts = min(16, 2 << max_porder);
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (threadIdx.y << (max_porder + 1));
// read length for 16 partitions
shared.length[tid] = (threadIdx.y <= 14 && threadIdx.x < parts) ? partition_lengths[pos + blockIdx.x * 16 + threadIdx.x] : 0xffffff;
__syncthreads();
// transpose
shared.tmp[tid] = threadIdx.y + (threadIdx.x << 4);
// find best rice parameter
shared.tmp[tid] = BEST_INDEX(shared.tmp[tid], shared.tmp[tid + 8]);
shared.tmp[tid] = BEST_INDEX(shared.tmp[tid], shared.tmp[tid + 4]);
shared.tmp[tid] = BEST_INDEX(shared.tmp[tid], shared.tmp[tid + 2]);
shared.tmp[tid] = BEST_INDEX(shared.tmp[tid], shared.tmp[tid + 1]);
__syncthreads();
// output rice parameter
if (threadIdx.x == 0 && threadIdx.y < parts)
output[(blockIdx.y << (max_porder + 2)) + blockIdx.x * parts + threadIdx.y] = shared.tmp[tid] >> 4;
// output length
if (threadIdx.x == 0 && threadIdx.y < parts)
output[(blockIdx.y << (max_porder + 2)) + (1 << (max_porder + 1)) + blockIdx.x * parts + threadIdx.y] = shared.length[shared.tmp[tid]];
}
#endif #endif

View File

@@ -380,6 +380,57 @@ code {
0xf0000001 0xe0000001 0xf0000001 0xe0000001
} }
} }
code {
name = cudaSumPartition
lmem = 0
smem = 2072
reg = 6
bar = 1
const {
segname = const
segnum = 1
offset = 0
bytes = 4
mem {
0xffffffff
}
}
bincode {
0x1000ca05 0x0423c780 0xa0000009 0x04000780
0x30010401 0xe40007d0 0x307c01fd 0x640087c8
0xa0014003 0x00000000 0x10013003 0x00001680
0x2101ea05 0x00000003 0x100f8001 0x00000003
0x30010001 0xc4000780 0x40014e11 0x00200780
0xa0004c0d 0x04200780 0x30100811 0xc4100780
0x30010605 0xc4000780 0x60004e01 0x00210780
0x20018000 0x20008400 0x30020001 0xc4100780
0x2000c801 0x04200780 0xd00e0001 0x80c00780
0x10014003 0x00000780 0x1000f801 0x0403c780
0x00020405 0xc0000782 0x04000c01 0xe4200780
0x861ffe03 0x00000000 0x2100ca05 0x046007d0
0x1002f003 0x00001980 0x300105fd 0xe40007d8
0xa002a003 0x00000000 0x1002a003 0x00001280
0x1002800d 0x00000003 0x1000ca01 0x0423c780
0x10048011 0x00000003 0x30000601 0xc4000780
0x30010811 0xc4000780 0x30010415 0xc4100780
0x20400011 0x04010780 0x3001060d 0xc4000780
0x20058810 0x20438000 0x00020805 0xc0000780
0x2000840c 0x1500ee00 0x00020609 0xc0000780
0x2400cc01 0x04200780 0x08000c01 0xe4200780
0xf0000001 0xe0000002 0x861ffe03 0x00000000
0x203f8205 0x0fffffff 0x308003fd 0x6c4147d8
0x10019003 0x00001280 0x30000003 0x00000100
0x2101ea0d 0x00000003 0x100f8001 0x00000003
0x30030005 0xc4000780 0x40034e11 0x00200780
0xa0004c01 0x04200780 0x30100815 0xc4100780
0x30030011 0xc4000780 0x1001800d 0x00000003
0x1000ca01 0x0423c780 0x60024e05 0x00214780
0x30000601 0xc4000780 0x20048204 0x20018004
0x2000840c 0x20018400 0x00020605 0xc0000780
0x30020005 0xc4100780 0x1500ec00 0x2101e804
0xd00e0201 0xa0c00781
}
}
code { code {
name = cudaEstimateResidual name = cudaEstimateResidual
lmem = 0 lmem = 0
@@ -759,6 +810,77 @@ code {
0xd00e0401 0x80c00780 0xd00e0201 0xa0c00781 0xd00e0401 0x80c00780 0xd00e0201 0xa0c00781
} }
} }
code {
name = cudaFindRiceParameter
lmem = 0
smem = 2076
reg = 7
bar = 1
const {
segname = const
segnum = 1
offset = 0
bytes = 16
mem {
0x00000010 0x000003ff 0x0000000e 0x00000001
}
}
bincode {
0x10028009 0x00000003 0x1000cc05 0x0423c780
0x30010409 0xc4000780 0x10000005 0x0403c780
0x30800409 0xac400780 0xa0000401 0x04000780
0xd0820609 0x00400780 0x300005fd 0x640107c8
0xa0000411 0x04000780 0x308209fd 0x6440c2c8
0xa001b003 0x00000000 0x1001a003 0x00000100
0x2101ec0d 0x00000003 0x100f8005 0x00000003
0x30030205 0xc4000780 0x40034e15 0x00200780
0x30100a15 0xc4100780 0x60024e15 0x00214780
0x30030805 0xc4000780 0x20000a0d 0x04004780
0x60804c05 0x00600780 0x20000205 0x0400c780
0x30020205 0xc4100780 0x2000ca05 0x04204780
0xd00e0205 0x80c00780 0x1001b003 0x00000780
0x103f8005 0x000fffff 0x3004080d 0xc4100782
0x2000060d 0x04000780 0x00020605 0xc0000780
0x04000e01 0xe4204780 0x861ffe03 0x00000000
0x30040005 0xc4100780 0x20000205 0x04010780
0xd4087809 0x20000780 0x04020e01 0xe4204780
0x0002020d 0xc0000780 0x0802c009 0xc0200780
0x1c00ce0d 0x0423c780 0x3803cffd 0x6c2187c8
0xd4087809 0x20000500 0x1800c005 0x0423c500
0xd4085809 0x20000780 0x04020e01 0xe4204780
0x0802c00d 0xc0200780 0x00020209 0xc0000780
0x1c00ce0d 0x0423c780 0x3803cffd 0x6c20c7c8
0xd4085809 0x20000500 0x1800c005 0x0423c500
0xd4084809 0x20000780 0x04020e01 0xe4204780
0x0802c00d 0xc0200780 0x00020209 0xc0000780
0x1c00ce0d 0x0423c780 0x3803cffd 0x6c20c7c8
0xd4084809 0x20000500 0x1800c005 0x0423c500
0xd4084009 0x20000780 0x04020e01 0xe4204780
0x0802c00d 0xc0200780 0x00020209 0xc0000780
0x1c00ce0d 0x0423c780 0x3803cffd 0x6c20c7c8
0xd4084009 0x20000500 0x1800c005 0x0423c500
0x04020e01 0xe4204780 0x861ffe03 0x00000000
0x307c0001 0x64008780 0x30040405 0x64010780
0xd0830001 0x04400780 0xd0830205 0x04400780
0xd0010001 0x040007c0 0xa0057003 0x00000000
0x10057003 0x00000100 0x40054c15 0x00200780
0xa0004e05 0x04200780 0x2102ec0d 0x00000003
0x30100a15 0xc4100780 0x30030205 0xc4000780
0x60044c0d 0x00214780 0x20018604 0x20018804
0xd4083809 0x20000780 0x3002020d 0xc4100780
0x3804c005 0xec300780 0x2000c80d 0x0420c780
0xd00e0605 0xa0c00780 0x307c01fd 0x6c0087ca
0x30000003 0x00000280 0x2101ec05 0x00000003
0x40054c19 0x00200780 0x10018001 0x00000003
0xa0004e0d 0x04200780 0x2102ec15 0x00000003
0x30100c19 0xc4100780 0x30010001 0xc4000780
0x30050605 0xc4000780 0x60044c09 0x00218780
0x20018000 0x20048404 0xd4083805 0x20000780
0x20000201 0x04000780 0x0402c005 0xc0200780
0x30020005 0xc4100780 0x1500ee00 0x2101e804
0xd00e0201 0xa0c00781
}
}
code { code {
name = cudaFindWastedBits name = cudaFindWastedBits
lmem = 0 lmem = 0
@@ -1025,6 +1147,101 @@ code {
0xf0000001 0xe0000001 0xf0000001 0xe0000001
} }
} }
code {
name = cudaCalcPartition
lmem = 0
smem = 2272
reg = 12
bar = 1
const {
segname = const
segnum = 1
offset = 0
bytes = 16
mem {
0x000003ff 0x0000002f 0x000fffff 0x0000000e
}
}
bincode {
0xd0800205 0x00400780 0xa000020d 0x04000780
0xa0000019 0x04000780 0x30040601 0xc4100780
0x20000c25 0x04000780 0x308113fd 0x644107c8
0xa0011003 0x00000000 0x3002121d 0xc4100780
0x10011003 0x00000280 0xa0004e01 0x04200780
0x30070005 0xc4100780 0x30060001 0xc4100780
0x20008200 0x2100ec00 0x20000e01 0x04000780
0xd00e0001 0x80c00780 0x00000e05 0xc0000780
0x04041001 0xe4200780 0xf0000001 0xe0000002
0x861ffe03 0x00000000 0xd0104005 0x20000780
0x1000ce01 0x0423c780 0x3400ce05 0xec200780
0x10008001 0x00000013 0x10000209 0x0403c780
0x20078003 0x00000780 0xa0004c09 0x04200780
0x40040215 0x00000780 0x10018021 0x00000003
0x1000ce11 0x0423c780 0x30100a15 0xc4100780
0x30041011 0xc4000780 0x60040021 0x00014780
0x30001011 0x04010780 0x30040011 0xa4000780
0x40090415 0x00000780 0x60080615 0x00014780
0x3409c1fd 0x6c20c7c8 0x30100a15 0xc4100780
0x307c05fd 0x64014148 0x60080415 0x00014780
0x300513fd 0x6c0042c8 0xa003b003 0x00000000
0x1003a003 0x00000100 0x40050415 0x00000780
0x60040615 0x00014780 0x30100a15 0xc4100780
0x60040415 0x00014780 0x400b0029 0x00000780
0x600a0229 0x00028780 0x30101429 0xc4100780
0x600a0001 0x00028780 0xd0104005 0x20000780
0x2500f400 0x20009200 0x3400c001 0x04200780
0x30020001 0xc4100780 0x2000ca01 0x04200780
0xd00e0001 0x80c00780 0x1003b003 0x00000780
0x1000f801 0x0403c780 0x301f0015 0xec100782
0x30010001 0xc4100780 0xd0000a01 0x04008780
0x00000e05 0xc0000780 0x04001001 0xe4200780
0x861ffe03 0x00000000 0xd0104005 0x20000780
0x20018c01 0x00000003 0x3400c015 0x04204780
0x40020228 0x400b0024 0x60030029 0x00028780
0x600a0225 0x00024780 0x30101429 0xc4100780
0x200005fd 0x0400c7c8 0x30101225 0xc4100780
0x60020009 0x00028780 0x600a0009 0x00024100
0x00000e05 0xc0000780 0x300309fd 0x6400c7c8
0xa0062003 0x00000000 0x04021001 0xe4208780
0x10062003 0x00000280 0x307c03fd 0x6c00c7c8
0x10062003 0x00000280 0x40070401 0x00000780
0x60060601 0x00000780 0x30100001 0xc4100780
0x60060401 0x00000780 0x20088015 0x00000003
0x00020a05 0xc0000780 0x20000005 0x04004780
0x3606c215 0xec200780 0x20000409 0x04014780
0x20018001 0x00000003 0x00000e09 0xc0000780
0x30820409 0xac400780 0x300101fd 0x6c0147c8
0x08021001 0xe4208780 0x1005a003 0x00000280
0xf0000001 0xe0000002 0x861ffe03 0x00000000
0x300609fd 0x640107c8 0x308307fd 0x6440c2c8
0x30000003 0x00000100 0x2101ee05 0x00000003
0x100f8001 0x00000003 0x30010001 0xc4000780
0x40014e09 0x00200780 0x30100409 0xc4100780
0x60004e09 0x00208780 0x30040c11 0xc4100780
0x30010605 0xc4000780 0x20069000 0x2004860c
0x20000405 0x04004780 0x00020605 0xc0000780
0x20000001 0x04004780 0xd4084005 0x20000780
0x30020005 0xc4100780 0x1500e000 0x2101e804
0xd00e0201 0xa0c00780 0x30000003 0x00000780
0xa0000411 0x04114780 0xa0000815 0x44004780
0xa0000021 0x04114780 0x90000a29 0x00000780
0xa0001015 0x44064780 0x203e9429 0x0fffffff
0xc00a0a15 0x0000c7c0 0xa0000a15 0x84064780
0x400b102d 0x00000780 0x600a122d 0x0002c780
0x3010162d 0xc4100780 0x600a102d 0x0002c780
0x2040102d 0x0402c780 0xa000162d 0x44064780
0xc00a1629 0x0000c7c0 0xa0001429 0x84064780
0x20000a15 0x04028780 0x40081629 0x00000780
0x60091429 0x00028780 0x30101429 0xc4100780
0x60081429 0x00028780 0x30001421 0x04020780
0x30080811 0x6400c780 0xd0000401 0x04008780
0x301f0001 0xe4100780 0x30000815 0x04014780
0xa0000011 0x2c014780 0xd0050811 0x04008780
0x307c05fd 0x6c0147c8 0x20000001 0x04010780
0xd0020001 0x0402c500 0x30000003 0x00000780
0xf0000001 0xe0000001
}
}
code { code {
name = cudaStereoDecorr name = cudaStereoDecorr
lmem = 0 lmem = 0
@@ -1047,6 +1264,128 @@ code {
0x2000c801 0x04210780 0xd00e0005 0xa0c00781 0x2000c801 0x04210780 0xd00e0005 0xa0c00781
} }
} }
code {
name = cudaCalcLargePartition
lmem = 0
smem = 2272
reg = 23
bar = 1
const {
segname = const
segnum = 1
offset = 0
bytes = 20
mem {
0x000003ff 0x0000002f 0x00000001 0x000fffff
0x0000000e
}
}
bincode {
0xd0800205 0x00400780 0xa0000219 0x04000780
0xa000001d 0x04000780 0x30040c01 0xc4100780
0x20000e21 0x04000780 0x308111fd 0x644107c8
0xa0011003 0x00000000 0x30021025 0xc4100780
0x10011003 0x00000280 0xa0004e01 0x04200780
0x30070005 0xc4100780 0x30060001 0xc4100780
0x20008200 0x2100ec00 0x20001201 0x04000780
0xd00e0001 0x80c00780 0x00001205 0xc0000780
0x04041001 0xe4200780 0xf0000001 0xe0000002
0x861ffe03 0x00000000 0xd0107805 0x20000780
0x1000ce01 0x0423c780 0x3400c001 0xec200780
0x00001205 0xc0000780 0x307c01fd 0x6c00c7c8
0x04021001 0xe43f0780 0x1009f003 0x00000280
0x20108e51 0x00000003 0x20208e4d 0x00000003
0x20308e49 0x00000003 0x20008e45 0x00000007
0x20108e41 0x00000007 0x20208e3d 0x00000007
0x20308e39 0x00000007 0x20008e35 0x0000000b
0x20108e31 0x0000000b 0x20208e2d 0x0000000b
0x20308e29 0x0000000b 0x20008e15 0x0000000f
0x20108e11 0x0000000f 0x20208e05 0x0000000f
0x20308e09 0x0000000f 0x00020205 0xc0000780
0x1000f80d 0x0403c780 0xa0004c05 0x042007c0
0x00020409 0xc0000780 0x1000f809 0x0403c780
0x1000040d 0x2440c280 0xd010400d 0x20000780
0x20000455 0x04020780 0x3c15c059 0x6c20c780
0x30002a55 0x6c004780 0xa0002c59 0x2c014780
0xa0002a55 0x2c014780 0xd0032c59 0x04004780
0xd0162bfd 0x040007c8 0xa0045003 0x00000000
0x10044003 0x00000100 0x40030055 0x00000780
0x60020255 0x00054780 0x30102a55 0xc4100780
0xd010400d 0x20000780 0x60020059 0x00054780
0x20088454 0x2d16f458 0x2016aa54 0x3d15e054
0x30022a55 0xc4100780 0x2000ca55 0x04254780
0xd00e2a55 0x80c00780 0x10045003 0x00000780
0x1000f855 0x0403c780 0x301f2a59 0xec100782
0x30012a55 0xc4100780 0xd0152c55 0x04008780
0x0000120d 0xc0000780 0x0c001001 0xe4254780
0x861ffe03 0x00000000 0x0000120d 0xc0000780
0x00020e11 0xc0000780 0x3006d055 0xec200784
0xdc084011 0x20000780 0x2000c055 0x04254784
0x30832a55 0xac400780 0x0c021001 0xe4254780
0x00022811 0xc0000780 0x3006d059 0xec200784
0x20002a55 0x04058780 0x30832a55 0xac400780
0x0c021001 0xe4254780 0x00022611 0xc0000780
0x3006d059 0xec200784 0x20002a55 0x04058780
0x30832a55 0xac400780 0x0c021001 0xe4254780
0x00022411 0xc0000780 0x3006d059 0xec200784
0x20002a55 0x04058780 0x30832a55 0xac400780
0x0c021001 0xe4254780 0x00022211 0xc0000780
0x3006d059 0xec200784 0x20002a55 0x04058780
0x30832a55 0xac400780 0x0c021001 0xe4254780
0x00022011 0xc0000780 0x3006d059 0xec200784
0x20002a55 0x04058780 0x30832a55 0xac400780
0x0c021001 0xe4254780 0x00021e11 0xc0000780
0x3006d059 0xec200784 0x20002a55 0x04058780
0x30832a55 0xac400780 0x0c021001 0xe4254780
0x00021c11 0xc0000780 0x3006d059 0xec200784
0x20002a55 0x04058780 0x30832a55 0xac400780
0x0c021001 0xe4254780 0x00021a11 0xc0000780
0x3006d059 0xec200784 0x20002a55 0x04058780
0x30832a55 0xac400780 0x0c021001 0xe4254780
0x00021811 0xc0000780 0x3006d059 0xec200784
0x20002a55 0x04058780 0x30832a55 0xac400780
0x0c021001 0xe4254780 0x00021611 0xc0000780
0x3006d059 0xec200784 0x20002a55 0x04058780
0x30832a55 0xac400780 0x0c021001 0xe4254780
0x00021411 0xc0000780 0x3006d059 0xec200784
0x20002a55 0x04058780 0x30832a55 0xac400780
0x0c021001 0xe4254780 0x00020a11 0xc0000780
0x3006d059 0xec200784 0x20002a55 0x04058780
0x30832a55 0xac400780 0x0c021001 0xe4254780
0x00020811 0xc0000780 0x3006d059 0xec200784
0x20002a55 0x04058780 0x30832a55 0xac400780
0x0c021001 0xe4254780 0x3406d059 0xec200780
0x20002a55 0x04058780 0x30832a55 0xac400780
0x0c021001 0xe4254780 0x3806d059 0xec200780
0x20002a55 0x04058780 0x30832a55 0xac400780
0x0c021001 0xe4254780 0x861ffe03 0x00000000
0x20008409 0x00000013 0x300201fd 0x6c0107c8
0x1002f003 0x00000280 0x861ffe03 0x00000000
0x00001205 0xc0000780 0xd4084009 0x20000780
0x1900f004 0x2901e004 0x04021001 0xe4204780
0x2800c805 0x04204780 0x04021001 0xe4204780
0x2800c405 0x04204780 0x04021001 0xe4204780
0x2800c205 0x04204780 0x04021001 0xe4204780
0x861ffe03 0x00000000 0x307c0ffd 0x640087c8
0x30840dfd 0x6440c2c8 0x30000003 0x00000100
0xa0004c09 0x042007c0 0x20018c05 0x00000003
0x100bb003 0x00000280 0xd0104005 0x20000780
0x3400c001 0x04200780 0x4001040d 0x00000780
0x6000060d 0x0000c780 0x3010060d 0xc4100780
0x00001205 0xc0000780 0x60000401 0x0000c780
0xd4084005 0x20000780 0x2400c005 0x04200780
0x100c2003 0x00000780 0x4003000d 0x00000780
0x6002020d 0x0000c780 0x3010060d 0xc4100780
0x00001205 0xc0000780 0x60020001 0x0000c780
0xd4084005 0x20000780 0x2400c005 0x04200780
0x2101ee0d 0x00000003 0x100f8001 0x00000003
0x30030001 0xc4000780 0x40014e11 0x00200780
0x30100811 0xc4100780 0x30030c0d 0xc4000780
0x60004e01 0x00210780 0x20038000 0x20008400
0x30020001 0xc4100780 0x2000c801 0x04200780
0xd00e0005 0xa0c00781
}
}
code { code {
name = cudaCopyBestMethodStereo name = cudaCopyBestMethodStereo
lmem = 0 lmem = 0