mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
trying to do rice partitioning on gpu
This commit is contained in:
@@ -969,11 +969,37 @@ namespace CUETools.Codecs.FlaCuda
|
||||
lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
|
||||
else if (encode_on_cpu)
|
||||
lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
|
||||
|
||||
int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||
uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits + 6;
|
||||
frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
|
||||
if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32 || encode_on_cpu)
|
||||
{
|
||||
int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||
uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits + 6;
|
||||
//uint oldsize = frame.subframes[ch].best.size;
|
||||
frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
|
||||
//if (frame.subframes[ch].best.size > frame.subframes[ch].obits * (uint)frame.blocksize &&
|
||||
// oldsize <= frame.subframes[ch].obits * (uint)frame.blocksize)
|
||||
// throw new Exception("oops");
|
||||
}
|
||||
else
|
||||
{
|
||||
// residual
|
||||
int len = frame.subframes[ch].best.order * (int)frame.subframes[ch].obits + 6 +
|
||||
4 + 5 + frame.subframes[ch].best.order * frame.subframes[ch].best.cbits +
|
||||
(4 << frame.subframes[ch].best.rc.porder);
|
||||
int j = frame.subframes[ch].best.order;
|
||||
int psize = frame.blocksize >> frame.subframes[ch].best.rc.porder;
|
||||
for (int p = 0; p < (1 << frame.subframes[ch].best.rc.porder); p++)
|
||||
{
|
||||
int k = frame.subframes[ch].best.rc.rparams[p];
|
||||
int cnt = p == 0 ? psize - frame.subframes[ch].best.order : psize;
|
||||
len += (k + 1) * cnt;
|
||||
for (int i = j; i < j + cnt; i++)
|
||||
len += (((frame.subframes[ch].best.residual[i] << 1) ^ (frame.subframes[ch].best.residual[i] >> 31)) >> k);
|
||||
j += cnt;
|
||||
}
|
||||
if (len != frame.subframes[ch].best.size)
|
||||
throw new Exception(string.Format("length mismatch: {0} vs {1}", len, frame.subframes[ch].best.size));
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -1031,8 +1057,35 @@ namespace CUETools.Codecs.FlaCuda
|
||||
frame.subframes[ch].samples[i] >>= (int)frame.subframes[ch].wbits;
|
||||
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
|
||||
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
|
||||
if (!encode_on_cpu)
|
||||
if (!encode_on_cpu && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
|
||||
{
|
||||
frame.subframes[ch].best.size = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 6;
|
||||
if (frame.subframes[ch].best.type == SubframeType.LPC)
|
||||
frame.subframes[ch].best.size += 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits;
|
||||
AudioSamples.MemCpy(frame.subframes[ch].best.residual + frame.subframes[ch].best.order, (int*)task.residualBufferPtr + task.BestResidualTasks[index].residualOffs, frame.blocksize - frame.subframes[ch].best.order);
|
||||
int* riceParams = ((int*)task.riceParamsPtr) + (4 << task.max_porder) * index;
|
||||
int* partLengths = ((int*)task.riceParamsPtr) + (4 << task.max_porder) * index + (2 << task.max_porder);
|
||||
int opt_porder = task.max_porder;
|
||||
int opt_pos = 0;
|
||||
int opt_bits = 0xfffffff;
|
||||
for (int porder = task.max_porder; porder >= 0; porder--)
|
||||
{
|
||||
int in_pos = (2 << task.max_porder) - (2 << porder);
|
||||
int sum = (1 << porder) * 4;
|
||||
for (int p = 0; p < (1 << porder); p++)
|
||||
sum += partLengths[in_pos + p];// +(riceParams[in_pos + p] + 1) * ((frame.blocksize >> porder) - (p != 0 ? 0 : frame.subframes[ch].best.order));
|
||||
if (sum < opt_bits)
|
||||
{
|
||||
opt_bits = sum;
|
||||
opt_porder = porder;
|
||||
opt_pos = in_pos;
|
||||
}
|
||||
}
|
||||
frame.subframes[ch].best.rc.porder = opt_porder;
|
||||
for (int i = 0; i < (1 << opt_porder); i++)
|
||||
frame.subframes[ch].best.rc.rparams[i] = riceParams[opt_pos + i];
|
||||
frame.subframes[ch].best.size += (uint)opt_bits;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1069,7 +1122,16 @@ namespace CUETools.Codecs.FlaCuda
|
||||
if (residualPartCount > maxResidualParts)
|
||||
throw new Exception("invalid combination of block size and LPC order");
|
||||
|
||||
int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
|
||||
int psize = task.frameSize >> max_porder;
|
||||
while (psize < 16 && max_porder > 0)
|
||||
{
|
||||
psize <<= 1;
|
||||
max_porder--;
|
||||
}
|
||||
|
||||
CUfunction cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : task.cudaChannelDecorr;
|
||||
CUfunction cudaCalcPartition = psize >= 128 ? task.cudaCalcLargePartition : task.cudaCalcPartition;
|
||||
|
||||
cuda.SetParameter(cudaChannelDecorr, 0 * sizeof(uint), (uint)task.cudaSamples.Pointer);
|
||||
cuda.SetParameter(cudaChannelDecorr, 1 * sizeof(uint), (uint)task.cudaSamplesBytes.Pointer);
|
||||
@@ -1145,6 +1207,24 @@ namespace CUETools.Codecs.FlaCuda
|
||||
cuda.SetParameterSize(task.cudaEncodeResidual, sizeof(uint) * 3U);
|
||||
cuda.SetFunctionBlockShape(task.cudaEncodeResidual, residualPartSize, 1, 1);
|
||||
|
||||
cuda.SetParameter(cudaCalcPartition, 0, (uint)task.cudaPartitions.Pointer);
|
||||
cuda.SetParameter(cudaCalcPartition, 1 * sizeof(uint), (uint)task.cudaResidual.Pointer);
|
||||
cuda.SetParameter(cudaCalcPartition, 2 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer);
|
||||
cuda.SetParameter(cudaCalcPartition, 3 * sizeof(uint), (uint)max_porder);
|
||||
cuda.SetParameterSize(cudaCalcPartition, 4U * sizeof(uint));
|
||||
cuda.SetFunctionBlockShape(cudaCalcPartition, 16, 16, 1);
|
||||
|
||||
cuda.SetParameter(task.cudaSumPartition, 0, (uint)task.cudaPartitions.Pointer);
|
||||
cuda.SetParameter(task.cudaSumPartition, 1 * sizeof(uint), (uint)max_porder);
|
||||
cuda.SetParameterSize(task.cudaSumPartition, 2U * sizeof(uint));
|
||||
cuda.SetFunctionBlockShape(task.cudaSumPartition, 256, 1, 1);
|
||||
|
||||
cuda.SetParameter(task.cudaFindRiceParameter, 0, (uint)task.cudaRiceParams.Pointer);
|
||||
cuda.SetParameter(task.cudaFindRiceParameter, 1 * sizeof(uint), (uint)task.cudaPartitions.Pointer);
|
||||
cuda.SetParameter(task.cudaFindRiceParameter, 2 * sizeof(uint), (uint)max_porder);
|
||||
cuda.SetParameterSize(task.cudaFindRiceParameter, 3U * sizeof(uint));
|
||||
cuda.SetFunctionBlockShape(task.cudaFindRiceParameter, 16, 16, 1);
|
||||
|
||||
// issue work to the GPU
|
||||
cuda.LaunchAsync(cudaChannelDecorr, (task.frameCount * task.frameSize + 255) / 256, channels == 2 ? 1 : channels, task.stream);
|
||||
if (task.frameSize <= 512 && eparams.max_prediction_order <= 12)
|
||||
@@ -1163,10 +1243,18 @@ namespace CUETools.Codecs.FlaCuda
|
||||
else
|
||||
cuda.LaunchAsync(task.cudaCopyBestMethod, 1, channels * task.frameCount, task.stream);
|
||||
if (!encode_on_cpu)
|
||||
{
|
||||
int bsz = (psize >= 128) ? psize : (256 / psize) * psize;
|
||||
cuda.LaunchAsync(task.cudaEncodeResidual, residualPartCount, channels * task.frameCount, task.stream);
|
||||
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * channels * task.frameCount), task.stream);
|
||||
if (!encode_on_cpu)
|
||||
cuda.LaunchAsync(cudaCalcPartition, (task.frameSize + bsz - 1) / bsz, channels * task.frameCount, task.stream);
|
||||
if (max_porder > 0)
|
||||
cuda.LaunchAsync(task.cudaSumPartition, Flake.MAX_RICE_PARAM + 1, channels * task.frameCount, task.stream);
|
||||
cuda.LaunchAsync(task.cudaFindRiceParameter, ((2 << max_porder) + 15) / 16, channels * task.frameCount, task.stream);
|
||||
cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * MAX_BLOCKSIZE * channels), task.stream);
|
||||
cuda.CopyDeviceToHostAsync(task.cudaRiceParams, task.riceParamsPtr, (uint)(sizeof(int) * (4 << max_porder) * channels * task.frameCount), task.stream);
|
||||
task.max_porder = max_porder;
|
||||
}
|
||||
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * channels * task.frameCount), task.stream);
|
||||
}
|
||||
|
||||
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task)
|
||||
@@ -1856,9 +1944,15 @@ namespace CUETools.Codecs.FlaCuda
|
||||
public CUfunction cudaCopyBestMethod;
|
||||
public CUfunction cudaCopyBestMethodStereo;
|
||||
public CUfunction cudaEncodeResidual;
|
||||
public CUfunction cudaCalcPartition;
|
||||
public CUfunction cudaCalcLargePartition;
|
||||
public CUfunction cudaSumPartition;
|
||||
public CUfunction cudaFindRiceParameter;
|
||||
public CUdeviceptr cudaSamplesBytes;
|
||||
public CUdeviceptr cudaSamples;
|
||||
public CUdeviceptr cudaResidual;
|
||||
public CUdeviceptr cudaPartitions;
|
||||
public CUdeviceptr cudaRiceParams;
|
||||
public CUdeviceptr cudaAutocorTasks;
|
||||
public CUdeviceptr cudaAutocorOutput;
|
||||
public CUdeviceptr cudaResidualTasks;
|
||||
@@ -1867,6 +1961,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
public IntPtr samplesBytesPtr = IntPtr.Zero;
|
||||
public IntPtr samplesBufferPtr = IntPtr.Zero;
|
||||
public IntPtr residualBufferPtr = IntPtr.Zero;
|
||||
public IntPtr riceParamsPtr = IntPtr.Zero;
|
||||
public IntPtr autocorTasksPtr = IntPtr.Zero;
|
||||
public IntPtr residualTasksPtr = IntPtr.Zero;
|
||||
public IntPtr bestResidualTasksPtr = IntPtr.Zero;
|
||||
@@ -1883,6 +1978,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
public int nAutocorTasks = 0;
|
||||
public int nResidualTasksPerChannel = 0;
|
||||
public int nAutocorTasksPerChannel = 0;
|
||||
public int max_porder = 0;
|
||||
|
||||
unsafe public FlaCudaTask(CUDA _cuda, int channelCount)
|
||||
{
|
||||
@@ -1892,10 +1988,14 @@ namespace CUETools.Codecs.FlaCuda
|
||||
residualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
|
||||
bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames;
|
||||
samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
|
||||
int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FlaCudaWriter.maxFrames;
|
||||
int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FlaCudaWriter.maxFrames;
|
||||
|
||||
cudaSamplesBytes = cuda.Allocate((uint)samplesBufferLen / 2);
|
||||
cudaSamples = cuda.Allocate((uint)samplesBufferLen);
|
||||
cudaResidual = cuda.Allocate((uint)samplesBufferLen);
|
||||
cudaPartitions = cuda.Allocate((uint)partitionsLen);
|
||||
cudaRiceParams = cuda.Allocate((uint)riceParamsLen);
|
||||
cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen);
|
||||
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
|
||||
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
|
||||
@@ -1907,7 +2007,9 @@ namespace CUETools.Codecs.FlaCuda
|
||||
if (cuErr == CUResult.Success)
|
||||
cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen);
|
||||
if (cuErr == CUResult.Success)
|
||||
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
|
||||
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
|
||||
if (cuErr == CUResult.Success)
|
||||
cuErr = CUDADriver.cuMemAllocHost(ref riceParamsPtr, (uint)riceParamsLen);
|
||||
if (cuErr == CUResult.Success)
|
||||
cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)autocorTasksLen);
|
||||
if (cuErr == CUResult.Success)
|
||||
@@ -1919,6 +2021,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero;
|
||||
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
|
||||
if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero;
|
||||
if (riceParamsPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(riceParamsPtr); riceParamsPtr = IntPtr.Zero;
|
||||
if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
|
||||
if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
|
||||
if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero;
|
||||
@@ -1937,6 +2040,10 @@ namespace CUETools.Codecs.FlaCuda
|
||||
cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod");
|
||||
cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo");
|
||||
cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
|
||||
cudaCalcPartition = cuda.GetModuleFunction("cudaCalcPartition");
|
||||
cudaCalcLargePartition = cuda.GetModuleFunction("cudaCalcLargePartition");
|
||||
cudaSumPartition = cuda.GetModuleFunction("cudaSumPartition");
|
||||
cudaFindRiceParameter = cuda.GetModuleFunction("cudaFindRiceParameter");
|
||||
|
||||
stream = cuda.CreateStream();
|
||||
verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; // should be channels, not channelCount. And should null if not doing verify!
|
||||
@@ -1948,6 +2055,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
cuda.Free(cudaSamples);
|
||||
cuda.Free(cudaSamplesBytes);
|
||||
cuda.Free(cudaResidual);
|
||||
cuda.Free(cudaPartitions);
|
||||
cuda.Free(cudaAutocorTasks);
|
||||
cuda.Free(cudaAutocorOutput);
|
||||
cuda.Free(cudaResidualTasks);
|
||||
@@ -1956,6 +2064,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
CUDADriver.cuMemFreeHost(samplesBytesPtr);
|
||||
CUDADriver.cuMemFreeHost(samplesBufferPtr);
|
||||
CUDADriver.cuMemFreeHost(residualBufferPtr);
|
||||
CUDADriver.cuMemFreeHost(riceParamsPtr);
|
||||
CUDADriver.cuMemFreeHost(residualTasksPtr);
|
||||
CUDADriver.cuMemFreeHost(bestResidualTasksPtr);
|
||||
CUDADriver.cuMemFreeHost(autocorTasksPtr);
|
||||
|
||||
@@ -56,7 +56,8 @@ typedef struct
|
||||
int coefs[32];
|
||||
} encodeResidualTaskStruct;
|
||||
|
||||
#define SUM32(buf,tid,op) buf[tid] op buf[tid + 16]; buf[tid] op buf[tid + 8]; buf[tid] op buf[tid + 4]; buf[tid] op buf[tid + 2]; buf[tid] op buf[tid + 1];
|
||||
#define SUM16(buf,tid,op) buf[tid] op buf[tid + 8]; buf[tid] op buf[tid + 4]; buf[tid] op buf[tid + 2]; buf[tid] op buf[tid + 1];
|
||||
#define SUM32(buf,tid,op) buf[tid] op buf[tid + 16]; SUM16(buf,tid,op)
|
||||
#define SUM64(buf,tid,op) if (tid < 32) buf[tid] op buf[tid + 32]; __syncthreads(); if (tid < 32) { SUM32(buf,tid,op) }
|
||||
#define SUM128(buf,tid,op) if (tid < 64) buf[tid] op buf[tid + 64]; __syncthreads(); SUM64(buf,tid,op)
|
||||
#define SUM256(buf,tid,op) if (tid < 128) buf[tid] op buf[tid + 128]; __syncthreads(); SUM128(buf,tid,op)
|
||||
@@ -804,4 +805,152 @@ extern "C" __global__ void cudaEncodeResidual(
|
||||
if (tid < residualLen)
|
||||
output[shared.task.residualOffs + pos + tid] = shared.data[tid + shared.task.residualOrder] - (sum >> shared.task.shift);
|
||||
}
|
||||
|
||||
extern "C" __global__ void cudaCalcPartition(
|
||||
int* partition_lengths,
|
||||
int* residual,
|
||||
encodeResidualTaskStruct *tasks,
|
||||
int max_porder // <= 8
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
int data[256];
|
||||
int length[256];
|
||||
encodeResidualTaskStruct task;
|
||||
} shared;
|
||||
const int tid = threadIdx.x + (threadIdx.y << 4);
|
||||
if (tid < sizeof(shared.task) / sizeof(int))
|
||||
((int*)&shared.task)[tid] = ((int*)(&tasks[blockIdx.y]))[tid];
|
||||
__syncthreads();
|
||||
|
||||
const int psize = (shared.task.blocksize >> max_porder); // 18
|
||||
const int parts_per_block = 256 / psize; // 14
|
||||
const int parts = min(parts_per_block, (1 << max_porder) - blockIdx.x * parts_per_block);
|
||||
|
||||
// fetch residual
|
||||
shared.data[tid] = ((blockIdx.x != 0 || tid >= shared.task.residualOrder) && tid < parts * psize) ? residual[shared.task.residualOffs + blockIdx.x * psize * parts_per_block + tid - shared.task.residualOrder] : 0;
|
||||
// convert to unsigned
|
||||
shared.data[tid] = (shared.data[tid] << 1) ^ (shared.data[tid] >> 31);
|
||||
__syncthreads();
|
||||
|
||||
// calc number of unary bits for each residual part with each rice paramater
|
||||
shared.length[tid] = (psize - shared.task.residualOrder * (threadIdx.y + blockIdx.x == 0)) * (threadIdx.x + 1);
|
||||
if (threadIdx.y < parts)
|
||||
for (int i = 0; i < psize; i++)
|
||||
// for part (threadIdx.y) with this rice paramater (threadIdx.x)
|
||||
shared.length[tid] = min(0xfffff, shared.length[tid] + (shared.data[threadIdx.y * psize + i] >> threadIdx.x));
|
||||
__syncthreads();
|
||||
|
||||
// output length (transposed: k is now threadIdx.y)
|
||||
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (threadIdx.y << (max_porder + 1));
|
||||
if (threadIdx.y <= 14 && threadIdx.x < parts)
|
||||
partition_lengths[pos + blockIdx.x * parts_per_block + threadIdx.x] = shared.length[threadIdx.y + (threadIdx.x << 4)];
|
||||
}
|
||||
|
||||
extern "C" __global__ void cudaCalcLargePartition(
|
||||
int* partition_lengths,
|
||||
int* residual,
|
||||
encodeResidualTaskStruct *tasks,
|
||||
int max_porder // <= 8
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
int data[256];
|
||||
int length[256];
|
||||
encodeResidualTaskStruct task;
|
||||
} shared;
|
||||
const int tid = threadIdx.x + (threadIdx.y << 4);
|
||||
if (tid < sizeof(shared.task) / sizeof(int))
|
||||
((int*)&shared.task)[tid] = ((int*)(&tasks[blockIdx.y]))[tid];
|
||||
__syncthreads();
|
||||
|
||||
const int psize = (shared.task.blocksize >> max_porder); // 18
|
||||
|
||||
shared.length[tid] = 0;
|
||||
for (int pos = 0; pos < psize; pos += 256)
|
||||
{
|
||||
// fetch residual
|
||||
shared.data[tid] = ((blockIdx.x != 0 || pos + tid >= shared.task.residualOrder) && pos + tid < psize) ? residual[shared.task.residualOffs + blockIdx.x * psize + pos + tid - shared.task.residualOrder] : 0;
|
||||
// convert to unsigned
|
||||
shared.data[tid] = (shared.data[tid] << 1) ^ (shared.data[tid] >> 31);
|
||||
__syncthreads();
|
||||
|
||||
// calc number of unary bits for each residual sample with each rice paramater
|
||||
for (int i = 0; i < 256; i += 16)
|
||||
// for sample (i + threadIdx.x) with this rice paramater (threadIdx.y)
|
||||
shared.length[tid] = min(0xfffff, shared.length[tid] + (shared.data[i + threadIdx.x] >> threadIdx.y));
|
||||
__syncthreads();
|
||||
}
|
||||
__syncthreads();
|
||||
SUM16(shared.length,tid,+=);
|
||||
__syncthreads();
|
||||
|
||||
// output length
|
||||
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (threadIdx.y << (max_porder + 1));
|
||||
if (threadIdx.y <= 14 && threadIdx.x == 0)
|
||||
partition_lengths[pos + blockIdx.x] = shared.length[tid] + (psize - shared.task.residualOrder * (blockIdx.x == 0)) * (threadIdx.y + 1);
|
||||
}
|
||||
|
||||
// Sums partition lengths for a certain k == blockIdx.x
|
||||
// Requires 256 threads
|
||||
extern "C" __global__ void cudaSumPartition(
|
||||
int* partition_lengths,
|
||||
int max_porder
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
int data[512];
|
||||
} shared;
|
||||
|
||||
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (blockIdx.x << (max_porder + 1));
|
||||
|
||||
// fetch residual
|
||||
shared.data[threadIdx.x] = threadIdx.x < (1 << max_porder) ? partition_lengths[pos + threadIdx.x] : 0;
|
||||
__syncthreads();
|
||||
for (int porder = max_porder - 1; porder >= 0; porder--)
|
||||
{
|
||||
const int in_pos = (2 << max_porder) - (4 << porder);
|
||||
const int out_pos = (2 << max_porder) - (2 << porder);
|
||||
if (threadIdx.x < (1 << porder)) shared.data[out_pos + threadIdx.x] = shared.data[in_pos + (threadIdx.x << 1)] + shared.data[in_pos + (threadIdx.x << 1) + 1];
|
||||
__syncthreads();
|
||||
}
|
||||
if (threadIdx.x < (1 << max_porder))
|
||||
partition_lengths[pos + (1 << max_porder) + threadIdx.x] = shared.data[(1 << max_porder) + threadIdx.x];
|
||||
}
|
||||
|
||||
// Finds optimal rice parameter for up to 16 partitions at a time.
|
||||
// Requires 16x16 threads
|
||||
extern "C" __global__ void cudaFindRiceParameter(
|
||||
int* output,
|
||||
int* partition_lengths,
|
||||
int max_porder
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
int length[256];
|
||||
int tmp[256];
|
||||
} shared;
|
||||
const int tid = threadIdx.x + (threadIdx.y << 4);
|
||||
const int parts = min(16, 2 << max_porder);
|
||||
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (threadIdx.y << (max_porder + 1));
|
||||
|
||||
// read length for 16 partitions
|
||||
shared.length[tid] = (threadIdx.y <= 14 && threadIdx.x < parts) ? partition_lengths[pos + blockIdx.x * 16 + threadIdx.x] : 0xffffff;
|
||||
__syncthreads();
|
||||
// transpose
|
||||
shared.tmp[tid] = threadIdx.y + (threadIdx.x << 4);
|
||||
// find best rice parameter
|
||||
shared.tmp[tid] = BEST_INDEX(shared.tmp[tid], shared.tmp[tid + 8]);
|
||||
shared.tmp[tid] = BEST_INDEX(shared.tmp[tid], shared.tmp[tid + 4]);
|
||||
shared.tmp[tid] = BEST_INDEX(shared.tmp[tid], shared.tmp[tid + 2]);
|
||||
shared.tmp[tid] = BEST_INDEX(shared.tmp[tid], shared.tmp[tid + 1]);
|
||||
__syncthreads();
|
||||
// output rice parameter
|
||||
if (threadIdx.x == 0 && threadIdx.y < parts)
|
||||
output[(blockIdx.y << (max_porder + 2)) + blockIdx.x * parts + threadIdx.y] = shared.tmp[tid] >> 4;
|
||||
// output length
|
||||
if (threadIdx.x == 0 && threadIdx.y < parts)
|
||||
output[(blockIdx.y << (max_porder + 2)) + (1 << (max_porder + 1)) + blockIdx.x * parts + threadIdx.y] = shared.length[shared.tmp[tid]];
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -380,6 +380,57 @@ code {
|
||||
0xf0000001 0xe0000001
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaSumPartition
|
||||
lmem = 0
|
||||
smem = 2072
|
||||
reg = 6
|
||||
bar = 1
|
||||
const {
|
||||
segname = const
|
||||
segnum = 1
|
||||
offset = 0
|
||||
bytes = 4
|
||||
mem {
|
||||
0xffffffff
|
||||
}
|
||||
}
|
||||
bincode {
|
||||
0x1000ca05 0x0423c780 0xa0000009 0x04000780
|
||||
0x30010401 0xe40007d0 0x307c01fd 0x640087c8
|
||||
0xa0014003 0x00000000 0x10013003 0x00001680
|
||||
0x2101ea05 0x00000003 0x100f8001 0x00000003
|
||||
0x30010001 0xc4000780 0x40014e11 0x00200780
|
||||
0xa0004c0d 0x04200780 0x30100811 0xc4100780
|
||||
0x30010605 0xc4000780 0x60004e01 0x00210780
|
||||
0x20018000 0x20008400 0x30020001 0xc4100780
|
||||
0x2000c801 0x04200780 0xd00e0001 0x80c00780
|
||||
0x10014003 0x00000780 0x1000f801 0x0403c780
|
||||
0x00020405 0xc0000782 0x04000c01 0xe4200780
|
||||
0x861ffe03 0x00000000 0x2100ca05 0x046007d0
|
||||
0x1002f003 0x00001980 0x300105fd 0xe40007d8
|
||||
0xa002a003 0x00000000 0x1002a003 0x00001280
|
||||
0x1002800d 0x00000003 0x1000ca01 0x0423c780
|
||||
0x10048011 0x00000003 0x30000601 0xc4000780
|
||||
0x30010811 0xc4000780 0x30010415 0xc4100780
|
||||
0x20400011 0x04010780 0x3001060d 0xc4000780
|
||||
0x20058810 0x20438000 0x00020805 0xc0000780
|
||||
0x2000840c 0x1500ee00 0x00020609 0xc0000780
|
||||
0x2400cc01 0x04200780 0x08000c01 0xe4200780
|
||||
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
||||
0x203f8205 0x0fffffff 0x308003fd 0x6c4147d8
|
||||
0x10019003 0x00001280 0x30000003 0x00000100
|
||||
0x2101ea0d 0x00000003 0x100f8001 0x00000003
|
||||
0x30030005 0xc4000780 0x40034e11 0x00200780
|
||||
0xa0004c01 0x04200780 0x30100815 0xc4100780
|
||||
0x30030011 0xc4000780 0x1001800d 0x00000003
|
||||
0x1000ca01 0x0423c780 0x60024e05 0x00214780
|
||||
0x30000601 0xc4000780 0x20048204 0x20018004
|
||||
0x2000840c 0x20018400 0x00020605 0xc0000780
|
||||
0x30020005 0xc4100780 0x1500ec00 0x2101e804
|
||||
0xd00e0201 0xa0c00781
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaEstimateResidual
|
||||
lmem = 0
|
||||
@@ -759,6 +810,77 @@ code {
|
||||
0xd00e0401 0x80c00780 0xd00e0201 0xa0c00781
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaFindRiceParameter
|
||||
lmem = 0
|
||||
smem = 2076
|
||||
reg = 7
|
||||
bar = 1
|
||||
const {
|
||||
segname = const
|
||||
segnum = 1
|
||||
offset = 0
|
||||
bytes = 16
|
||||
mem {
|
||||
0x00000010 0x000003ff 0x0000000e 0x00000001
|
||||
}
|
||||
}
|
||||
bincode {
|
||||
0x10028009 0x00000003 0x1000cc05 0x0423c780
|
||||
0x30010409 0xc4000780 0x10000005 0x0403c780
|
||||
0x30800409 0xac400780 0xa0000401 0x04000780
|
||||
0xd0820609 0x00400780 0x300005fd 0x640107c8
|
||||
0xa0000411 0x04000780 0x308209fd 0x6440c2c8
|
||||
0xa001b003 0x00000000 0x1001a003 0x00000100
|
||||
0x2101ec0d 0x00000003 0x100f8005 0x00000003
|
||||
0x30030205 0xc4000780 0x40034e15 0x00200780
|
||||
0x30100a15 0xc4100780 0x60024e15 0x00214780
|
||||
0x30030805 0xc4000780 0x20000a0d 0x04004780
|
||||
0x60804c05 0x00600780 0x20000205 0x0400c780
|
||||
0x30020205 0xc4100780 0x2000ca05 0x04204780
|
||||
0xd00e0205 0x80c00780 0x1001b003 0x00000780
|
||||
0x103f8005 0x000fffff 0x3004080d 0xc4100782
|
||||
0x2000060d 0x04000780 0x00020605 0xc0000780
|
||||
0x04000e01 0xe4204780 0x861ffe03 0x00000000
|
||||
0x30040005 0xc4100780 0x20000205 0x04010780
|
||||
0xd4087809 0x20000780 0x04020e01 0xe4204780
|
||||
0x0002020d 0xc0000780 0x0802c009 0xc0200780
|
||||
0x1c00ce0d 0x0423c780 0x3803cffd 0x6c2187c8
|
||||
0xd4087809 0x20000500 0x1800c005 0x0423c500
|
||||
0xd4085809 0x20000780 0x04020e01 0xe4204780
|
||||
0x0802c00d 0xc0200780 0x00020209 0xc0000780
|
||||
0x1c00ce0d 0x0423c780 0x3803cffd 0x6c20c7c8
|
||||
0xd4085809 0x20000500 0x1800c005 0x0423c500
|
||||
0xd4084809 0x20000780 0x04020e01 0xe4204780
|
||||
0x0802c00d 0xc0200780 0x00020209 0xc0000780
|
||||
0x1c00ce0d 0x0423c780 0x3803cffd 0x6c20c7c8
|
||||
0xd4084809 0x20000500 0x1800c005 0x0423c500
|
||||
0xd4084009 0x20000780 0x04020e01 0xe4204780
|
||||
0x0802c00d 0xc0200780 0x00020209 0xc0000780
|
||||
0x1c00ce0d 0x0423c780 0x3803cffd 0x6c20c7c8
|
||||
0xd4084009 0x20000500 0x1800c005 0x0423c500
|
||||
0x04020e01 0xe4204780 0x861ffe03 0x00000000
|
||||
0x307c0001 0x64008780 0x30040405 0x64010780
|
||||
0xd0830001 0x04400780 0xd0830205 0x04400780
|
||||
0xd0010001 0x040007c0 0xa0057003 0x00000000
|
||||
0x10057003 0x00000100 0x40054c15 0x00200780
|
||||
0xa0004e05 0x04200780 0x2102ec0d 0x00000003
|
||||
0x30100a15 0xc4100780 0x30030205 0xc4000780
|
||||
0x60044c0d 0x00214780 0x20018604 0x20018804
|
||||
0xd4083809 0x20000780 0x3002020d 0xc4100780
|
||||
0x3804c005 0xec300780 0x2000c80d 0x0420c780
|
||||
0xd00e0605 0xa0c00780 0x307c01fd 0x6c0087ca
|
||||
0x30000003 0x00000280 0x2101ec05 0x00000003
|
||||
0x40054c19 0x00200780 0x10018001 0x00000003
|
||||
0xa0004e0d 0x04200780 0x2102ec15 0x00000003
|
||||
0x30100c19 0xc4100780 0x30010001 0xc4000780
|
||||
0x30050605 0xc4000780 0x60044c09 0x00218780
|
||||
0x20018000 0x20048404 0xd4083805 0x20000780
|
||||
0x20000201 0x04000780 0x0402c005 0xc0200780
|
||||
0x30020005 0xc4100780 0x1500ee00 0x2101e804
|
||||
0xd00e0201 0xa0c00781
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaFindWastedBits
|
||||
lmem = 0
|
||||
@@ -1025,6 +1147,101 @@ code {
|
||||
0xf0000001 0xe0000001
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaCalcPartition
|
||||
lmem = 0
|
||||
smem = 2272
|
||||
reg = 12
|
||||
bar = 1
|
||||
const {
|
||||
segname = const
|
||||
segnum = 1
|
||||
offset = 0
|
||||
bytes = 16
|
||||
mem {
|
||||
0x000003ff 0x0000002f 0x000fffff 0x0000000e
|
||||
}
|
||||
}
|
||||
bincode {
|
||||
0xd0800205 0x00400780 0xa000020d 0x04000780
|
||||
0xa0000019 0x04000780 0x30040601 0xc4100780
|
||||
0x20000c25 0x04000780 0x308113fd 0x644107c8
|
||||
0xa0011003 0x00000000 0x3002121d 0xc4100780
|
||||
0x10011003 0x00000280 0xa0004e01 0x04200780
|
||||
0x30070005 0xc4100780 0x30060001 0xc4100780
|
||||
0x20008200 0x2100ec00 0x20000e01 0x04000780
|
||||
0xd00e0001 0x80c00780 0x00000e05 0xc0000780
|
||||
0x04041001 0xe4200780 0xf0000001 0xe0000002
|
||||
0x861ffe03 0x00000000 0xd0104005 0x20000780
|
||||
0x1000ce01 0x0423c780 0x3400ce05 0xec200780
|
||||
0x10008001 0x00000013 0x10000209 0x0403c780
|
||||
0x20078003 0x00000780 0xa0004c09 0x04200780
|
||||
0x40040215 0x00000780 0x10018021 0x00000003
|
||||
0x1000ce11 0x0423c780 0x30100a15 0xc4100780
|
||||
0x30041011 0xc4000780 0x60040021 0x00014780
|
||||
0x30001011 0x04010780 0x30040011 0xa4000780
|
||||
0x40090415 0x00000780 0x60080615 0x00014780
|
||||
0x3409c1fd 0x6c20c7c8 0x30100a15 0xc4100780
|
||||
0x307c05fd 0x64014148 0x60080415 0x00014780
|
||||
0x300513fd 0x6c0042c8 0xa003b003 0x00000000
|
||||
0x1003a003 0x00000100 0x40050415 0x00000780
|
||||
0x60040615 0x00014780 0x30100a15 0xc4100780
|
||||
0x60040415 0x00014780 0x400b0029 0x00000780
|
||||
0x600a0229 0x00028780 0x30101429 0xc4100780
|
||||
0x600a0001 0x00028780 0xd0104005 0x20000780
|
||||
0x2500f400 0x20009200 0x3400c001 0x04200780
|
||||
0x30020001 0xc4100780 0x2000ca01 0x04200780
|
||||
0xd00e0001 0x80c00780 0x1003b003 0x00000780
|
||||
0x1000f801 0x0403c780 0x301f0015 0xec100782
|
||||
0x30010001 0xc4100780 0xd0000a01 0x04008780
|
||||
0x00000e05 0xc0000780 0x04001001 0xe4200780
|
||||
0x861ffe03 0x00000000 0xd0104005 0x20000780
|
||||
0x20018c01 0x00000003 0x3400c015 0x04204780
|
||||
0x40020228 0x400b0024 0x60030029 0x00028780
|
||||
0x600a0225 0x00024780 0x30101429 0xc4100780
|
||||
0x200005fd 0x0400c7c8 0x30101225 0xc4100780
|
||||
0x60020009 0x00028780 0x600a0009 0x00024100
|
||||
0x00000e05 0xc0000780 0x300309fd 0x6400c7c8
|
||||
0xa0062003 0x00000000 0x04021001 0xe4208780
|
||||
0x10062003 0x00000280 0x307c03fd 0x6c00c7c8
|
||||
0x10062003 0x00000280 0x40070401 0x00000780
|
||||
0x60060601 0x00000780 0x30100001 0xc4100780
|
||||
0x60060401 0x00000780 0x20088015 0x00000003
|
||||
0x00020a05 0xc0000780 0x20000005 0x04004780
|
||||
0x3606c215 0xec200780 0x20000409 0x04014780
|
||||
0x20018001 0x00000003 0x00000e09 0xc0000780
|
||||
0x30820409 0xac400780 0x300101fd 0x6c0147c8
|
||||
0x08021001 0xe4208780 0x1005a003 0x00000280
|
||||
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
||||
0x300609fd 0x640107c8 0x308307fd 0x6440c2c8
|
||||
0x30000003 0x00000100 0x2101ee05 0x00000003
|
||||
0x100f8001 0x00000003 0x30010001 0xc4000780
|
||||
0x40014e09 0x00200780 0x30100409 0xc4100780
|
||||
0x60004e09 0x00208780 0x30040c11 0xc4100780
|
||||
0x30010605 0xc4000780 0x20069000 0x2004860c
|
||||
0x20000405 0x04004780 0x00020605 0xc0000780
|
||||
0x20000001 0x04004780 0xd4084005 0x20000780
|
||||
0x30020005 0xc4100780 0x1500e000 0x2101e804
|
||||
0xd00e0201 0xa0c00780 0x30000003 0x00000780
|
||||
0xa0000411 0x04114780 0xa0000815 0x44004780
|
||||
0xa0000021 0x04114780 0x90000a29 0x00000780
|
||||
0xa0001015 0x44064780 0x203e9429 0x0fffffff
|
||||
0xc00a0a15 0x0000c7c0 0xa0000a15 0x84064780
|
||||
0x400b102d 0x00000780 0x600a122d 0x0002c780
|
||||
0x3010162d 0xc4100780 0x600a102d 0x0002c780
|
||||
0x2040102d 0x0402c780 0xa000162d 0x44064780
|
||||
0xc00a1629 0x0000c7c0 0xa0001429 0x84064780
|
||||
0x20000a15 0x04028780 0x40081629 0x00000780
|
||||
0x60091429 0x00028780 0x30101429 0xc4100780
|
||||
0x60081429 0x00028780 0x30001421 0x04020780
|
||||
0x30080811 0x6400c780 0xd0000401 0x04008780
|
||||
0x301f0001 0xe4100780 0x30000815 0x04014780
|
||||
0xa0000011 0x2c014780 0xd0050811 0x04008780
|
||||
0x307c05fd 0x6c0147c8 0x20000001 0x04010780
|
||||
0xd0020001 0x0402c500 0x30000003 0x00000780
|
||||
0xf0000001 0xe0000001
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaStereoDecorr
|
||||
lmem = 0
|
||||
@@ -1047,6 +1264,128 @@ code {
|
||||
0x2000c801 0x04210780 0xd00e0005 0xa0c00781
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaCalcLargePartition
|
||||
lmem = 0
|
||||
smem = 2272
|
||||
reg = 23
|
||||
bar = 1
|
||||
const {
|
||||
segname = const
|
||||
segnum = 1
|
||||
offset = 0
|
||||
bytes = 20
|
||||
mem {
|
||||
0x000003ff 0x0000002f 0x00000001 0x000fffff
|
||||
0x0000000e
|
||||
}
|
||||
}
|
||||
bincode {
|
||||
0xd0800205 0x00400780 0xa0000219 0x04000780
|
||||
0xa000001d 0x04000780 0x30040c01 0xc4100780
|
||||
0x20000e21 0x04000780 0x308111fd 0x644107c8
|
||||
0xa0011003 0x00000000 0x30021025 0xc4100780
|
||||
0x10011003 0x00000280 0xa0004e01 0x04200780
|
||||
0x30070005 0xc4100780 0x30060001 0xc4100780
|
||||
0x20008200 0x2100ec00 0x20001201 0x04000780
|
||||
0xd00e0001 0x80c00780 0x00001205 0xc0000780
|
||||
0x04041001 0xe4200780 0xf0000001 0xe0000002
|
||||
0x861ffe03 0x00000000 0xd0107805 0x20000780
|
||||
0x1000ce01 0x0423c780 0x3400c001 0xec200780
|
||||
0x00001205 0xc0000780 0x307c01fd 0x6c00c7c8
|
||||
0x04021001 0xe43f0780 0x1009f003 0x00000280
|
||||
0x20108e51 0x00000003 0x20208e4d 0x00000003
|
||||
0x20308e49 0x00000003 0x20008e45 0x00000007
|
||||
0x20108e41 0x00000007 0x20208e3d 0x00000007
|
||||
0x20308e39 0x00000007 0x20008e35 0x0000000b
|
||||
0x20108e31 0x0000000b 0x20208e2d 0x0000000b
|
||||
0x20308e29 0x0000000b 0x20008e15 0x0000000f
|
||||
0x20108e11 0x0000000f 0x20208e05 0x0000000f
|
||||
0x20308e09 0x0000000f 0x00020205 0xc0000780
|
||||
0x1000f80d 0x0403c780 0xa0004c05 0x042007c0
|
||||
0x00020409 0xc0000780 0x1000f809 0x0403c780
|
||||
0x1000040d 0x2440c280 0xd010400d 0x20000780
|
||||
0x20000455 0x04020780 0x3c15c059 0x6c20c780
|
||||
0x30002a55 0x6c004780 0xa0002c59 0x2c014780
|
||||
0xa0002a55 0x2c014780 0xd0032c59 0x04004780
|
||||
0xd0162bfd 0x040007c8 0xa0045003 0x00000000
|
||||
0x10044003 0x00000100 0x40030055 0x00000780
|
||||
0x60020255 0x00054780 0x30102a55 0xc4100780
|
||||
0xd010400d 0x20000780 0x60020059 0x00054780
|
||||
0x20088454 0x2d16f458 0x2016aa54 0x3d15e054
|
||||
0x30022a55 0xc4100780 0x2000ca55 0x04254780
|
||||
0xd00e2a55 0x80c00780 0x10045003 0x00000780
|
||||
0x1000f855 0x0403c780 0x301f2a59 0xec100782
|
||||
0x30012a55 0xc4100780 0xd0152c55 0x04008780
|
||||
0x0000120d 0xc0000780 0x0c001001 0xe4254780
|
||||
0x861ffe03 0x00000000 0x0000120d 0xc0000780
|
||||
0x00020e11 0xc0000780 0x3006d055 0xec200784
|
||||
0xdc084011 0x20000780 0x2000c055 0x04254784
|
||||
0x30832a55 0xac400780 0x0c021001 0xe4254780
|
||||
0x00022811 0xc0000780 0x3006d059 0xec200784
|
||||
0x20002a55 0x04058780 0x30832a55 0xac400780
|
||||
0x0c021001 0xe4254780 0x00022611 0xc0000780
|
||||
0x3006d059 0xec200784 0x20002a55 0x04058780
|
||||
0x30832a55 0xac400780 0x0c021001 0xe4254780
|
||||
0x00022411 0xc0000780 0x3006d059 0xec200784
|
||||
0x20002a55 0x04058780 0x30832a55 0xac400780
|
||||
0x0c021001 0xe4254780 0x00022211 0xc0000780
|
||||
0x3006d059 0xec200784 0x20002a55 0x04058780
|
||||
0x30832a55 0xac400780 0x0c021001 0xe4254780
|
||||
0x00022011 0xc0000780 0x3006d059 0xec200784
|
||||
0x20002a55 0x04058780 0x30832a55 0xac400780
|
||||
0x0c021001 0xe4254780 0x00021e11 0xc0000780
|
||||
0x3006d059 0xec200784 0x20002a55 0x04058780
|
||||
0x30832a55 0xac400780 0x0c021001 0xe4254780
|
||||
0x00021c11 0xc0000780 0x3006d059 0xec200784
|
||||
0x20002a55 0x04058780 0x30832a55 0xac400780
|
||||
0x0c021001 0xe4254780 0x00021a11 0xc0000780
|
||||
0x3006d059 0xec200784 0x20002a55 0x04058780
|
||||
0x30832a55 0xac400780 0x0c021001 0xe4254780
|
||||
0x00021811 0xc0000780 0x3006d059 0xec200784
|
||||
0x20002a55 0x04058780 0x30832a55 0xac400780
|
||||
0x0c021001 0xe4254780 0x00021611 0xc0000780
|
||||
0x3006d059 0xec200784 0x20002a55 0x04058780
|
||||
0x30832a55 0xac400780 0x0c021001 0xe4254780
|
||||
0x00021411 0xc0000780 0x3006d059 0xec200784
|
||||
0x20002a55 0x04058780 0x30832a55 0xac400780
|
||||
0x0c021001 0xe4254780 0x00020a11 0xc0000780
|
||||
0x3006d059 0xec200784 0x20002a55 0x04058780
|
||||
0x30832a55 0xac400780 0x0c021001 0xe4254780
|
||||
0x00020811 0xc0000780 0x3006d059 0xec200784
|
||||
0x20002a55 0x04058780 0x30832a55 0xac400780
|
||||
0x0c021001 0xe4254780 0x3406d059 0xec200780
|
||||
0x20002a55 0x04058780 0x30832a55 0xac400780
|
||||
0x0c021001 0xe4254780 0x3806d059 0xec200780
|
||||
0x20002a55 0x04058780 0x30832a55 0xac400780
|
||||
0x0c021001 0xe4254780 0x861ffe03 0x00000000
|
||||
0x20008409 0x00000013 0x300201fd 0x6c0107c8
|
||||
0x1002f003 0x00000280 0x861ffe03 0x00000000
|
||||
0x00001205 0xc0000780 0xd4084009 0x20000780
|
||||
0x1900f004 0x2901e004 0x04021001 0xe4204780
|
||||
0x2800c805 0x04204780 0x04021001 0xe4204780
|
||||
0x2800c405 0x04204780 0x04021001 0xe4204780
|
||||
0x2800c205 0x04204780 0x04021001 0xe4204780
|
||||
0x861ffe03 0x00000000 0x307c0ffd 0x640087c8
|
||||
0x30840dfd 0x6440c2c8 0x30000003 0x00000100
|
||||
0xa0004c09 0x042007c0 0x20018c05 0x00000003
|
||||
0x100bb003 0x00000280 0xd0104005 0x20000780
|
||||
0x3400c001 0x04200780 0x4001040d 0x00000780
|
||||
0x6000060d 0x0000c780 0x3010060d 0xc4100780
|
||||
0x00001205 0xc0000780 0x60000401 0x0000c780
|
||||
0xd4084005 0x20000780 0x2400c005 0x04200780
|
||||
0x100c2003 0x00000780 0x4003000d 0x00000780
|
||||
0x6002020d 0x0000c780 0x3010060d 0xc4100780
|
||||
0x00001205 0xc0000780 0x60020001 0x0000c780
|
||||
0xd4084005 0x20000780 0x2400c005 0x04200780
|
||||
0x2101ee0d 0x00000003 0x100f8001 0x00000003
|
||||
0x30030001 0xc4000780 0x40014e11 0x00200780
|
||||
0x30100811 0xc4100780 0x30030c0d 0xc4000780
|
||||
0x60004e01 0x00210780 0x20038000 0x20008400
|
||||
0x30020001 0xc4100780 0x2000c801 0x04200780
|
||||
0xd00e0005 0xa0c00781
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaCopyBestMethodStereo
|
||||
lmem = 0
|
||||
|
||||
Reference in New Issue
Block a user