diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs index 20bd6a6..7ba58e2 100644 --- a/CUETools.FlaCuda/FlaCudaWriter.cs +++ b/CUETools.FlaCuda/FlaCudaWriter.cs @@ -424,6 +424,9 @@ namespace CUETools.Codecs.FlaCuda int* s = ((int*)task.samplesBufferPtr) + samplesInBuffer; fixed (int *src = &samples[pos, 0]) { + short* dst = ((short*)task.samplesBytesPtr) + samplesInBuffer * channels; + for (int i = 0; i < block * channels; i++) + dst[i] = (short)src[i]; if (channels == 2 && eparams.do_midside) channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE, s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, src, block); @@ -947,7 +950,7 @@ namespace CUETools.Codecs.FlaCuda csum += (ulong)Math.Abs(coefs[i - 1]); if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32) lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift); - else if (encode_on_cpu) + else// if (encode_on_cpu) lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift); int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order); @@ -1022,7 +1025,10 @@ namespace CUETools.Codecs.FlaCuda if (blocksize <= 4) return; - compute_autocorellation(blocksize, channelsCount, max_order, nFrames, task); + int autocorPartSize = 256 + 128;// (2 * 256 - max_order) & ~31; + int autocorPartCount = (blocksize + autocorPartSize - 1) / autocorPartSize; + if (autocorPartCount > maxAutocorParts) + throw new Exception("internal error"); int threads_y; if (task.nResidualTasksPerChannel >= 4 && task.nResidualTasksPerChannel <= 8) @@ -1039,25 +1045,58 @@ namespace CUETools.Codecs.FlaCuda threads_y = 4; else throw new Exception("invalid LPC order"); - int partSize = 32 * threads_y; - int partCount = (blocksize + partSize - 1) / partSize; + int residualPartSize = 32 * threads_y; + int residualPartCount = (blocksize + residualPartSize - 1) / residualPartSize; - if (partCount > maxResidualParts) + if (residualPartCount > maxResidualParts) throw new Exception("invalid combination of block size and LPC order"); + CUfunction cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : task.cudaChannelDecorr; + + cuda.SetParameter(cudaChannelDecorr, 0, (uint)task.cudaSamples.Pointer); + cuda.SetParameter(cudaChannelDecorr, 1 * sizeof(uint), (uint)task.cudaSamplesBytes.Pointer); + cuda.SetParameter(cudaChannelDecorr, 2 * sizeof(uint), (uint)MAX_BLOCKSIZE); + cuda.SetParameterSize(cudaChannelDecorr, sizeof(uint) * 3U); + cuda.SetFunctionBlockShape(cudaChannelDecorr, 256, 1, 1); + + cuda.SetParameter(task.cudaFindWastedBits, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer); + cuda.SetParameter(task.cudaFindWastedBits, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer); + cuda.SetParameter(task.cudaFindWastedBits, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel); + cuda.SetParameter(task.cudaFindWastedBits, 3 * sizeof(uint), (uint)blocksize); + cuda.SetParameterSize(task.cudaFindWastedBits, sizeof(uint) * 4U); + cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1); + + cuda.SetParameter(task.cudaComputeAutocor, 0, (uint)task.cudaAutocorOutput.Pointer); + cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint), (uint)task.cudaSamples.Pointer); + cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer); + cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 3, (uint)task.cudaAutocorTasks.Pointer); + cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order); + cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)blocksize); + cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)autocorPartSize); + cuda.SetParameterSize(task.cudaComputeAutocor, sizeof(uint) * 7U); + cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1); + + cuda.SetParameter(task.cudaComputeLPC, 0, (uint)task.cudaResidualTasks.Pointer); + cuda.SetParameter(task.cudaComputeLPC, sizeof(uint), (uint)task.cudaAutocorOutput.Pointer); + cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 2, (uint)task.cudaAutocorTasks.Pointer); + cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3, (uint)max_order); + cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)autocorPartCount); + cuda.SetParameterSize(task.cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2); + cuda.SetFunctionBlockShape(task.cudaComputeLPC, (autocorPartCount + 31) & ~31, 1, 1); + cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 0, (uint)task.cudaResidualOutput.Pointer); cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 1, (uint)task.cudaSamples.Pointer); cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 2, (uint)task.cudaResidualTasks.Pointer); cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 3, (uint)max_order); cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 4, (uint)blocksize); - cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 5, (uint)partSize); + cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 5, (uint)residualPartSize); cuda.SetParameterSize(task.cudaEstimateResidual, sizeof(uint) * 6); cuda.SetFunctionBlockShape(task.cudaEstimateResidual, 32, threads_y, 1); int nBestTasks = task.nResidualTasks / task.nResidualTasksPerChannel; cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer); cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer); - cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)partCount); + cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)residualPartCount); cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)task.nResidualTasksPerChannel); cuda.SetParameterSize(task.cudaChooseBestMethod, sizeof(uint) * 4U); cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 16, 1); @@ -1078,10 +1117,14 @@ namespace CUETools.Codecs.FlaCuda cuda.SetParameter(task.cudaEncodeResidual, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer); cuda.SetParameter(task.cudaEncodeResidual, 2 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer); cuda.SetParameterSize(task.cudaEncodeResidual, sizeof(uint) * 3U); - cuda.SetFunctionBlockShape(task.cudaEncodeResidual, partSize, 1, 1); + cuda.SetFunctionBlockShape(task.cudaEncodeResidual, residualPartSize, 1, 1); // issue work to the GPU - cuda.LaunchAsync(task.cudaEstimateResidual, partCount, (task.nResidualTasks / threads_y * nFrames) / maxFrames, task.stream); + cuda.LaunchAsync(cudaChannelDecorr, (nFrames * blocksize + 255) / 256, channels == 2 ? 1 : channels, task.stream); + cuda.LaunchAsync(task.cudaFindWastedBits, (task.nResidualTasks / task.nResidualTasksPerChannel * nFrames) / maxFrames, 1, task.stream); + cuda.LaunchAsync(task.cudaComputeAutocor, autocorPartCount, (task.nAutocorTasks * nFrames) / maxFrames, task.stream); + cuda.LaunchAsync(task.cudaComputeLPC, 1, (task.nAutocorTasks * nFrames) / maxFrames, task.stream); + cuda.LaunchAsync(task.cudaEstimateResidual, residualPartCount, (task.nResidualTasks / threads_y * nFrames) / maxFrames, task.stream); cuda.LaunchAsync(task.cudaChooseBestMethod, 1, (nBestTasks * nFrames) / maxFrames, task.stream); if (channels == 2 && channelsCount == 4) { @@ -1091,60 +1134,12 @@ namespace CUETools.Codecs.FlaCuda else cuda.LaunchAsync(task.cudaCopyBestMethod, 1, (nBestTasks * nFrames) / maxFrames, task.stream); if (!encode_on_cpu) - cuda.LaunchAsync(task.cudaEncodeResidual, partCount, (nBestTasks * nFrames) / maxFrames, task.stream); + cuda.LaunchAsync(task.cudaEncodeResidual, residualPartCount, (nBestTasks * nFrames) / maxFrames, task.stream); cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nBestTasks * nFrames) / maxFrames)), task.stream); if (!encode_on_cpu) cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channels), task.stream); } - unsafe void compute_autocorellation(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task) - { - if (blocksize <= 4) - return; - - int partSize = 256 + 128;// (2 * 256 - max_order) & ~31; - int partCount = (blocksize + partSize - 1) / partSize; - if (partCount > maxAutocorParts) - throw new Exception("internal error"); - - cuda.SetParameter(task.cudaStereoDecorr, 0, (uint)task.cudaSamples.Pointer); - cuda.SetParameter(task.cudaStereoDecorr, sizeof(uint), (uint)MAX_BLOCKSIZE); - cuda.SetParameterSize(task.cudaStereoDecorr, sizeof(uint) * 2U); - cuda.SetFunctionBlockShape(task.cudaStereoDecorr, 256, 1, 1); - - cuda.SetParameter(task.cudaFindWastedBits, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer); - cuda.SetParameter(task.cudaFindWastedBits, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer); - cuda.SetParameter(task.cudaFindWastedBits, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel); - cuda.SetParameter(task.cudaFindWastedBits, 3 * sizeof(uint), (uint)blocksize); - cuda.SetParameterSize(task.cudaFindWastedBits, sizeof(uint) * 4U); - cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1); - - cuda.SetParameter(task.cudaComputeAutocor, 0, (uint)task.cudaAutocorOutput.Pointer); - cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint), (uint)task.cudaSamples.Pointer); - cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer); - cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 3, (uint)task.cudaAutocorTasks.Pointer); - cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order); - cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)blocksize); - cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)partSize); - cuda.SetParameterSize(task.cudaComputeAutocor, sizeof(uint) * 7U); - cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1); - - cuda.SetParameter(task.cudaComputeLPC, 0, (uint)task.cudaResidualTasks.Pointer); - cuda.SetParameter(task.cudaComputeLPC, sizeof(uint), (uint)task.cudaAutocorOutput.Pointer); - cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 2, (uint)task.cudaAutocorTasks.Pointer); - cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3, (uint)max_order); - cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)partCount); - cuda.SetParameterSize(task.cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2); - cuda.SetFunctionBlockShape(task.cudaComputeLPC, (partCount + 31) & ~31, 1, 1); - - // issue work to the GPU - if (channels == 2 && channelsCount == 4) - cuda.LaunchAsync(task.cudaStereoDecorr, MAX_BLOCKSIZE / 256, 1, task.stream); - cuda.LaunchAsync(task.cudaFindWastedBits, (task.nResidualTasks / task.nResidualTasksPerChannel * nFrames) / maxFrames, 1, task.stream); - cuda.LaunchAsync(task.cudaComputeAutocor, partCount, (task.nAutocorTasks * nFrames) / maxFrames, task.stream); - cuda.LaunchAsync(task.cudaComputeLPC, 1, (task.nAutocorTasks * nFrames) / maxFrames, task.stream); - } - unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task) { fixed (int* r = residualBuffer) @@ -1186,7 +1181,8 @@ namespace CUETools.Codecs.FlaCuda bool doMidside = channels == 2 && eparams.do_midside; int channelCount = doMidside ? 2 * channels : channels; - cuda.CopyHostToDeviceAsync(task.cudaSamples, task.samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channels), task.stream); + //cuda.CopyHostToDeviceAsync(task.cudaSamples, task.samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channels), task.stream); + cuda.CopyHostToDeviceAsync(task.cudaSamplesBytes, task.samplesBytesPtr, (uint)(sizeof(short) * channels * eparams.block_size * nFrames), task.stream); } unsafe void run_GPU_task(int nFrames, FlaCudaTask task) @@ -1261,7 +1257,7 @@ namespace CUETools.Codecs.FlaCuda { int decoded = verify.DecodeFrame(frame_buffer, 0, fs); if (decoded != fs || verify.Remaining != (ulong)eparams.block_size) - throw new Exception("validation failed!"); + throw new Exception("validation failed! frame size mismatch"); fixed (int* s = task.verifyBuffer, r = verify.Samples) { for (int ch = 0; ch < channels; ch++) @@ -1341,6 +1337,7 @@ namespace CUETools.Codecs.FlaCuda int* s2 = (int*)task2.samplesBufferPtr; for (int ch = 0; ch < channelCount; ch++) AudioSamples.MemCpy(s2 + ch * FlaCudaWriter.MAX_BLOCKSIZE, s1 + bs + ch * FlaCudaWriter.MAX_BLOCKSIZE, samplesInBuffer - bs); + AudioSamples.MemCpy(((short*)task2.samplesBytesPtr), ((short*)task1.samplesBytesPtr) + bs * channels, (samplesInBuffer - bs) * channels); } samplesInBuffer -= bs; have_data = nFrames; @@ -1790,6 +1787,8 @@ namespace CUETools.Codecs.FlaCuda { CUDA cuda; public CUfunction cudaStereoDecorr; + public CUfunction cudaChannelDecorr; + public CUfunction cudaChannelDecorr2; public CUfunction cudaFindWastedBits; public CUfunction cudaComputeAutocor; public CUfunction cudaComputeLPC; @@ -1799,6 +1798,7 @@ namespace CUETools.Codecs.FlaCuda public CUfunction cudaCopyBestMethodStereo; public CUfunction cudaEncodeResidual; public CUdeviceptr cudaSamples; + public CUdeviceptr cudaSamplesBytes; public CUdeviceptr cudaResidual; public CUdeviceptr cudaAutocorTasks; public CUdeviceptr cudaAutocorOutput; @@ -1806,6 +1806,7 @@ namespace CUETools.Codecs.FlaCuda public CUdeviceptr cudaResidualOutput; public CUdeviceptr cudaBestResidualTasks; public IntPtr samplesBufferPtr = IntPtr.Zero; + public IntPtr samplesBytesPtr = IntPtr.Zero; public IntPtr residualBufferPtr = IntPtr.Zero; public IntPtr autocorTasksPtr = IntPtr.Zero; public IntPtr residualTasksPtr = IntPtr.Zero; @@ -1831,6 +1832,7 @@ namespace CUETools.Codecs.FlaCuda bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames; samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount; + cudaSamplesBytes = cuda.Allocate((uint)samplesBufferLen / 2); cudaSamples = cuda.Allocate((uint)samplesBufferLen); cudaResidual = cuda.Allocate((uint)samplesBufferLen); cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen); @@ -1839,6 +1841,8 @@ namespace CUETools.Codecs.FlaCuda cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen); cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * FlaCudaWriter.maxResidualParts * FlaCudaWriter.maxFrames)); CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen); + if (cuErr == CUResult.Success) + cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, (uint)samplesBufferLen / 2); if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen); if (cuErr == CUResult.Success) @@ -1849,6 +1853,7 @@ namespace CUETools.Codecs.FlaCuda cuErr = CUDADriver.cuMemAllocHost(ref bestResidualTasksPtr, (uint)bestResidualTasksLen); if (cuErr != CUResult.Success) { + if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero; if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero; if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero; if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero; @@ -1859,6 +1864,8 @@ namespace CUETools.Codecs.FlaCuda cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor"); cudaStereoDecorr = cuda.GetModuleFunction("cudaStereoDecorr"); + cudaChannelDecorr = cuda.GetModuleFunction("cudaChannelDecorr"); + cudaChannelDecorr2 = cuda.GetModuleFunction("cudaChannelDecorr2"); cudaFindWastedBits = cuda.GetModuleFunction("cudaFindWastedBits"); cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC"); cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual"); @@ -1875,12 +1882,14 @@ namespace CUETools.Codecs.FlaCuda public void Dispose() { cuda.Free(cudaSamples); + cuda.Free(cudaSamplesBytes); cuda.Free(cudaResidual); cuda.Free(cudaAutocorTasks); cuda.Free(cudaAutocorOutput); cuda.Free(cudaResidualTasks); cuda.Free(cudaResidualOutput); cuda.Free(cudaBestResidualTasks); + CUDADriver.cuMemFreeHost(samplesBytesPtr); CUDADriver.cuMemFreeHost(samplesBufferPtr); CUDADriver.cuMemFreeHost(residualBufferPtr); CUDADriver.cuMemFreeHost(residualTasksPtr); diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu index d88049d..f421013 100644 --- a/CUETools.FlaCuda/flacuda.cu +++ b/CUETools.FlaCuda/flacuda.cu @@ -57,19 +57,47 @@ typedef struct extern "C" __global__ void cudaStereoDecorr( int *samples, + short2 *src, int offset ) { const int pos = blockIdx.x * blockDim.x + threadIdx.x; if (pos < offset) { - int l = samples[pos]; - int r = samples[offset + pos]; - samples[2 * offset + pos] = (l + r) >> 1; - samples[3 * offset + pos] = l - r; + short2 s = src[pos]; + samples[pos] = s.x; + samples[1 * offset + pos] = s.y; + samples[2 * offset + pos] = (s.x + s.y) >> 1; + samples[3 * offset + pos] = s.x - s.y; } } +extern "C" __global__ void cudaChannelDecorr2( + int *samples, + short2 *src, + int offset +) +{ + const int pos = blockIdx.x * blockDim.x + threadIdx.x; + if (pos < offset) + { + short2 s = src[pos]; + samples[pos] = s.x; + samples[1 * offset + pos] = s.y; + } +} + +extern "C" __global__ void cudaChannelDecorr( + int *samples, + short *src, + int offset +) +{ + const int pos = blockIdx.x * blockDim.x + threadIdx.x; + if (pos < offset) + samples[blockIdx.y * offset + pos] = src[pos * gridDim.y + blockIdx.y]; +} + extern "C" __global__ void cudaFindWastedBits( encodeResidualTaskStruct *tasks, int *samples, @@ -471,7 +499,7 @@ extern "C" __global__ void cudaEncodeResidual( encodeResidualTaskStruct task; } shared; const int tid = threadIdx.x; - if (threadIdx.x < sizeof(encodeResidualTaskStruct)) + if (threadIdx.x < sizeof(shared.task) / sizeof(int)) ((int*)&shared.task)[threadIdx.x] = ((int*)(&tasks[blockIdx.y]))[threadIdx.x]; __syncthreads(); const int partSize = blockDim.x; diff --git a/CUETools.FlaCuda/flacuda.cubin b/CUETools.FlaCuda/flacuda.cubin index c30bd6f..9ddae1f 100644 --- a/CUETools.FlaCuda/flacuda.cubin +++ b/CUETools.FlaCuda/flacuda.cubin @@ -427,7 +427,7 @@ code { offset = 0 bytes = 8 mem { - 0x000000bf 0x0000001f + 0x0000002f 0x0000001f } } bincode { @@ -725,22 +725,24 @@ code { code { name = cudaStereoDecorr lmem = 0 - smem = 24 - reg = 6 + smem = 28 + reg = 7 bar = 0 bincode { 0x10004205 0x0023c780 0xa0000005 0x04000780 - 0x60014c01 0x00204780 0x3000cbfd 0x6c20c7c8 - 0x30000003 0x00000280 0x2000ca05 0x04200780 - 0x30020009 0xc4100780 0x3002020d 0xc4100780 - 0x2000c805 0x04208780 0xd00e0205 0x80c00780 - 0x2000c809 0x0420c780 0xd00e0409 0x80c00780 - 0x3001ca0d 0xc4300780 0x20038010 0x2103ea0c - 0x30020815 0xc4100780 0x20028210 0x20038000 - 0x2000c815 0x04214780 0x3001080d 0xec100780 - 0x30020011 0xc4100780 0x20400201 0x04008780 - 0xd00e0a0d 0xa0c00780 0x2000c805 0x04210780 - 0xd00e0201 0xa0c00781 + 0x60014c05 0x00204780 0x3001cdfd 0x6c20c7c8 + 0x30000003 0x00000280 0x30020209 0xc4100780 + 0x2000ca01 0x04208780 0xd00e0001 0x80c00780 + 0x2000c80d 0x04208780 0xa0000009 0x0c010780 + 0x2000cc11 0x04204780 0x3001cc15 0xc4300780 + 0xd00e0609 0xa0c00780 0x3002080d 0xc4100780 + 0x20000211 0x04014780 0xa0000201 0x0c010780 + 0x2105ec18 0x2103e80c 0x30020811 0xc4100780 + 0x20008414 0x20068218 0xd00e0601 0xa0c00780 + 0x2000c805 0x04210780 0x30010a0d 0xec100780 + 0x30020c11 0xc4100780 0x20400409 0x04000780 + 0xd00e020d 0xa0c00780 0x2000c801 0x04210780 + 0xd00e0009 0xa0c00781 } } code { @@ -823,3 +825,39 @@ code { 0xd00e0201 0xa0c00781 } } +code { + name = cudaChannelDecorr + lmem = 0 + smem = 28 + reg = 5 + bar = 0 + bincode { + 0x10004205 0x0023c780 0xa0000005 0x04000780 + 0x60014c05 0x00204780 0x3001cdfd 0x6c20c7c8 + 0x30000003 0x00000280 0xa0004e09 0x04200780 + 0x1000cc01 0x0423c780 0x41032a0c 0x40040210 + 0x3010060d 0xc4100780 0x30100811 0xc4100780 + 0x60024a0d 0x0020c780 0x60040011 0x00010780 + 0x20038400 0x20018804 0x30010001 0xc4100780 + 0x30020205 0xc4100780 0x2000ca01 0x04200780 + 0xd00e0001 0x80600780 0x2000c805 0x04204780 + 0xd00e0201 0xa0c00781 + } +} +code { + name = cudaChannelDecorr2 + lmem = 0 + smem = 28 + reg = 4 + bar = 0 + bincode { + 0x10004205 0x0023c780 0xa0000005 0x04000780 + 0x60014c05 0x00204780 0x3001cdfd 0x6c20c7c8 + 0x30000003 0x00000280 0x30020209 0xc4100780 + 0x2000ca01 0x04208780 0xd00e0001 0x80c00780 + 0x2102e808 0x2101ec0c 0xa0000005 0x0c010780 + 0x3002060d 0xc4100780 0xd00e0405 0xa0c00780 + 0x2000c805 0x0420c780 0xa0000201 0x0c010780 + 0xd00e0201 0xa0c00781 + } +}