diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs index cab4894..4a41810 100644 --- a/CUETools.FlaCuda/FlaCudaWriter.cs +++ b/CUETools.FlaCuda/FlaCudaWriter.cs @@ -98,9 +98,6 @@ namespace CUETools.Codecs.FlaCuda CUdeviceptr cudaWindow; - int nResidualTasks = 0; - int nAutocorTasks = 0; - bool encode_on_cpu = true; public const int MAX_BLOCKSIZE = 4608 * 4; @@ -717,7 +714,7 @@ namespace CUETools.Codecs.FlaCuda { int n = frame.blocksize; for (int i = 0; i < n; i++) - bitwriter.writebits_signed(sub.obits, sub.samples[i]); + bitwriter.writebits_signed(sub.obits, sub.samples[i]); // Don't use residual here, because we don't copy samples to residual for verbatim frames. } @@ -820,8 +817,9 @@ namespace CUETools.Codecs.FlaCuda unsafe void initialize_autocorTasks(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task) { computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)task.autocorTasksPtr; - nAutocorTasks = 0; - nResidualTasks = 0; + task.nAutocorTasks = 0; + task.nResidualTasks = 0; + task.nResidualTasksPerChannel = (_windowcount * max_order + 6 + 7) & ~7; for (int iFrame = 0; iFrame < nFrames; iFrame++) { for (int ch = 0; ch < channelsCount; ch++) @@ -829,83 +827,94 @@ namespace CUETools.Codecs.FlaCuda for (int iWindow = 0; iWindow < _windowcount; iWindow++) { // Autocorelation task - autocorTasks[nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize; - autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE; - autocorTasks[nAutocorTasks].residualOffs = max_order * (iWindow + (_windowcount + 1) * (ch + iFrame * channelsCount)); - autocorTasks[nAutocorTasks].blocksize = blocksize; - nAutocorTasks++; + autocorTasks[task.nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize; + autocorTasks[task.nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE; + autocorTasks[task.nAutocorTasks].residualOffs = max_order * iWindow + task.nResidualTasksPerChannel * (ch + iFrame * channelsCount); + autocorTasks[task.nAutocorTasks].blocksize = blocksize; + task.nAutocorTasks++; // LPC tasks for (int order = 1; order <= max_order; order++) { - task.ResidualTasks[nResidualTasks].type = (int)SubframeType.LPC; - task.ResidualTasks[nResidualTasks].channel = ch; - task.ResidualTasks[nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); - task.ResidualTasks[nResidualTasks].blocksize = blocksize; - task.ResidualTasks[nResidualTasks].residualOrder = order <= max_order ? order : 0; - task.ResidualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize; - task.ResidualTasks[nResidualTasks].residualOffs = task.ResidualTasks[nResidualTasks].samplesOffs; - nResidualTasks++; + task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.LPC; + task.ResidualTasks[task.nResidualTasks].channel = ch; + task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); + task.ResidualTasks[task.nResidualTasks].blocksize = blocksize; + task.ResidualTasks[task.nResidualTasks].residualOrder = order <= max_order ? order : 0; + task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize; + task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; + task.nResidualTasks++; } } - // Fixed prediction - for (int order = 1; order <= max_order; order++) + // Constant frames { - task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Verbatim; - task.ResidualTasks[nResidualTasks].channel = ch; - task.ResidualTasks[nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); - task.ResidualTasks[nResidualTasks].blocksize = blocksize; - task.ResidualTasks[nResidualTasks].residualOrder = 0; - task.ResidualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize; - task.ResidualTasks[nResidualTasks].residualOffs = task.ResidualTasks[nResidualTasks].samplesOffs; - task.ResidualTasks[nResidualTasks].shift = 0; + task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Constant; + task.ResidualTasks[task.nResidualTasks].channel = ch; + task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); + task.ResidualTasks[task.nResidualTasks].blocksize = blocksize; + task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize; + task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; + task.ResidualTasks[task.nResidualTasks].residualOrder = 1; + task.ResidualTasks[task.nResidualTasks].shift = 0; + task.ResidualTasks[task.nResidualTasks].coefs[0] = 1; + task.nResidualTasks++; + } + // Fixed prediction + for (int order = 0; order < 5; order++) + { + task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Fixed; + task.ResidualTasks[task.nResidualTasks].channel = ch; + task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); + task.ResidualTasks[task.nResidualTasks].blocksize = blocksize; + task.ResidualTasks[task.nResidualTasks].residualOrder = order; + task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize; + task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; + task.ResidualTasks[task.nResidualTasks].shift = 0; switch (order) { + case 0: + break; case 1: - task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Constant; - task.ResidualTasks[nResidualTasks].residualOrder = 1; - task.ResidualTasks[nResidualTasks].coefs[0] = 1; - break; - case 3: - task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed; - task.ResidualTasks[nResidualTasks].residualOrder = 0; - break; - case 4: - task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed; - task.ResidualTasks[nResidualTasks].residualOrder = 1; - task.ResidualTasks[nResidualTasks].coefs[0] = 1; + task.ResidualTasks[task.nResidualTasks].coefs[0] = 1; break; case 2: - task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed; - task.ResidualTasks[nResidualTasks].residualOrder = 2; - task.ResidualTasks[nResidualTasks].coefs[1] = 2; - task.ResidualTasks[nResidualTasks].coefs[0] = -1; + task.ResidualTasks[task.nResidualTasks].coefs[1] = 2; + task.ResidualTasks[task.nResidualTasks].coefs[0] = -1; break; - case 5: - task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed; - task.ResidualTasks[nResidualTasks].residualOrder = 3; - task.ResidualTasks[nResidualTasks].coefs[2] = 3; - task.ResidualTasks[nResidualTasks].coefs[1] = -3; - task.ResidualTasks[nResidualTasks].coefs[0] = 1; + case 3: + task.ResidualTasks[task.nResidualTasks].coefs[2] = 3; + task.ResidualTasks[task.nResidualTasks].coefs[1] = -3; + task.ResidualTasks[task.nResidualTasks].coefs[0] = 1; break; - case 6: - task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed; - task.ResidualTasks[nResidualTasks].residualOrder = 4; - task.ResidualTasks[nResidualTasks].coefs[3] = 4; - task.ResidualTasks[nResidualTasks].coefs[2] = -6; - task.ResidualTasks[nResidualTasks].coefs[1] = 4; - task.ResidualTasks[nResidualTasks].coefs[0] = -1; + case 4: + task.ResidualTasks[task.nResidualTasks].coefs[3] = 4; + task.ResidualTasks[task.nResidualTasks].coefs[2] = -6; + task.ResidualTasks[task.nResidualTasks].coefs[1] = 4; + task.ResidualTasks[task.nResidualTasks].coefs[0] = -1; break; } - nResidualTasks++; + task.nResidualTasks++; + } + // Filler + while ((task.nResidualTasks % task.nResidualTasksPerChannel) != 0) + { + task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Verbatim; + task.ResidualTasks[task.nResidualTasks].channel = ch; + task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); + task.ResidualTasks[task.nResidualTasks].blocksize = blocksize; + task.ResidualTasks[task.nResidualTasks].residualOrder = 0; + task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize; + task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; + task.ResidualTasks[task.nResidualTasks].shift = 0; + task.nResidualTasks++; } } } - if (sizeof(encodeResidualTaskStruct) * nResidualTasks > task.residualTasksLen) + if (sizeof(encodeResidualTaskStruct) * task.nResidualTasks > task.residualTasksLen) throw new Exception("oops"); - if (sizeof(computeAutocorTaskStruct) * nAutocorTasks > task.autocorTasksLen) + if (sizeof(computeAutocorTaskStruct) * task.nAutocorTasks > task.autocorTasksLen) throw new Exception("oops"); - cuda.CopyHostToDeviceAsync(task.cudaAutocorTasks, task.autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), task.stream); - cuda.CopyHostToDeviceAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), task.stream); + cuda.CopyHostToDeviceAsync(task.cudaAutocorTasks, task.autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * task.nAutocorTasks), task.stream); + cuda.CopyHostToDeviceAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * task.nResidualTasks), task.stream); task.blocksize = blocksize; } @@ -983,6 +992,7 @@ namespace CUETools.Codecs.FlaCuda { frame.subframes[ch].best.type = SubframeType.Verbatim; frame.subframes[ch].best.size = frame.subframes[ch].obits * (uint)frame.blocksize; + frame.subframes[ch].wbits = 0; int index = ch + iFrame * channels; if (task.BestResidualTasks[index].size < 0) @@ -994,6 +1004,11 @@ namespace CUETools.Codecs.FlaCuda frame.subframes[ch].best.order = task.BestResidualTasks[index].residualOrder; frame.subframes[ch].best.cbits = task.BestResidualTasks[index].cbits; frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift; + frame.subframes[ch].obits -= (uint)task.BestResidualTasks[index].wbits; + frame.subframes[ch].wbits = (uint)task.BestResidualTasks[index].wbits; + if (frame.subframes[ch].wbits != 0) + for (int i = 0; i < frame.blocksize; i++) + frame.subframes[ch].samples[i] >>= (int)frame.subframes[ch].wbits; for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++) frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; if (!encode_on_cpu) @@ -1010,17 +1025,17 @@ namespace CUETools.Codecs.FlaCuda compute_autocorellation(blocksize, channelsCount, max_order, nFrames, task); int threads_y; - if (max_order >= 4 && max_order <= 8) - threads_y = max_order; - else if ((max_order % 8) == 0) + if (task.nResidualTasksPerChannel >= 4 && task.nResidualTasksPerChannel <= 8) + threads_y = task.nResidualTasksPerChannel; + else if ((task.nResidualTasksPerChannel % 8) == 0) threads_y = 8; - else if ((max_order % 7) == 0) + else if ((task.nResidualTasksPerChannel % 7) == 0) threads_y = 7; - else if ((max_order % 6) == 0) + else if ((task.nResidualTasksPerChannel % 6) == 0) threads_y = 6; - else if ((max_order % 5) == 0) + else if ((task.nResidualTasksPerChannel % 5) == 0) threads_y = 5; - else if ((max_order % 4) == 0) + else if ((task.nResidualTasksPerChannel % 4) == 0) threads_y = 4; else throw new Exception("invalid LPC order"); @@ -1039,30 +1054,23 @@ namespace CUETools.Codecs.FlaCuda cuda.SetParameterSize(task.cudaEstimateResidual, sizeof(uint) * 6); cuda.SetFunctionBlockShape(task.cudaEstimateResidual, 32, threads_y, 1); - cuda.SetParameter(task.cudaSumResidual, 0, (uint)task.cudaResidualTasks.Pointer); - cuda.SetParameter(task.cudaSumResidual, sizeof(uint), (uint)task.cudaResidualOutput.Pointer); - cuda.SetParameter(task.cudaSumResidual, 2 * sizeof(uint), (uint)partCount); - cuda.SetParameterSize(task.cudaSumResidual, sizeof(uint) * 3U); - cuda.SetFunctionBlockShape(task.cudaSumResidual, 64, 1, 1); - - int tasksPerChannel = (_windowcount + 1) * max_order; - int nBestTasks = nResidualTasks / tasksPerChannel; + int nBestTasks = task.nResidualTasks / task.nResidualTasksPerChannel; cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer); cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer); cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)partCount); - cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)tasksPerChannel); + cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)task.nResidualTasksPerChannel); cuda.SetParameterSize(task.cudaChooseBestMethod, sizeof(uint) * 4U); cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 16, 1); cuda.SetParameter(task.cudaCopyBestMethod, 0, (uint)task.cudaBestResidualTasks.Pointer); cuda.SetParameter(task.cudaCopyBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer); - cuda.SetParameter(task.cudaCopyBestMethod, 2 * sizeof(uint), (uint)tasksPerChannel); + cuda.SetParameter(task.cudaCopyBestMethod, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel); cuda.SetParameterSize(task.cudaCopyBestMethod, sizeof(uint) * 3U); cuda.SetFunctionBlockShape(task.cudaCopyBestMethod, 64, 1, 1); cuda.SetParameter(task.cudaCopyBestMethodStereo, 0, (uint)task.cudaBestResidualTasks.Pointer); cuda.SetParameter(task.cudaCopyBestMethodStereo, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer); - cuda.SetParameter(task.cudaCopyBestMethodStereo, 2 * sizeof(uint), (uint)tasksPerChannel); + cuda.SetParameter(task.cudaCopyBestMethodStereo, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel); cuda.SetParameterSize(task.cudaCopyBestMethodStereo, sizeof(uint) * 3U); cuda.SetFunctionBlockShape(task.cudaCopyBestMethodStereo, 64, 1, 1); @@ -1073,8 +1081,7 @@ namespace CUETools.Codecs.FlaCuda cuda.SetFunctionBlockShape(task.cudaEncodeResidual, partSize, 1, 1); // issue work to the GPU - cuda.LaunchAsync(task.cudaEstimateResidual, partCount, (nResidualTasks / threads_y * nFrames) / maxFrames, task.stream); - //cuda.LaunchAsync(task.cudaSumResidual, 1, (nResidualTasks * nFrames) / maxFrames, task.stream); + cuda.LaunchAsync(task.cudaEstimateResidual, partCount, (task.nResidualTasks / threads_y * nFrames) / maxFrames, task.stream); cuda.LaunchAsync(task.cudaChooseBestMethod, 1, (nBestTasks * nFrames) / maxFrames, task.stream); if (channels == 2 && channelsCount == 4) { @@ -1103,6 +1110,18 @@ namespace CUETools.Codecs.FlaCuda if (blocksize <= 4) return; + cuda.SetParameter(task.cudaStereoDecorr, 0, (uint)task.cudaSamples.Pointer); + cuda.SetParameter(task.cudaStereoDecorr, sizeof(uint), (uint)MAX_BLOCKSIZE); + cuda.SetParameterSize(task.cudaStereoDecorr, sizeof(uint) * 2U); + cuda.SetFunctionBlockShape(task.cudaStereoDecorr, 256, 1, 1); + + cuda.SetParameter(task.cudaFindWastedBits, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer); + cuda.SetParameter(task.cudaFindWastedBits, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer); + cuda.SetParameter(task.cudaFindWastedBits, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel); + cuda.SetParameter(task.cudaFindWastedBits, 3 * sizeof(uint), (uint)blocksize); + cuda.SetParameterSize(task.cudaFindWastedBits, sizeof(uint) * 4U); + cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1); + cuda.SetParameter(task.cudaComputeAutocor, 0, (uint)task.cudaAutocorOutput.Pointer); cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint), (uint)task.cudaSamples.Pointer); cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer); @@ -1122,8 +1141,11 @@ namespace CUETools.Codecs.FlaCuda cuda.SetFunctionBlockShape(task.cudaComputeLPC, (partCount + 31) & ~31, 1, 1); // issue work to the GPU - cuda.LaunchAsync(task.cudaComputeAutocor, partCount, (nAutocorTasks * nFrames) / maxFrames, task.stream); - cuda.LaunchAsync(task.cudaComputeLPC, 1, (nAutocorTasks * nFrames) / maxFrames, task.stream); + if (channels == 2 && channelsCount == 4) + cuda.LaunchAsync(task.cudaStereoDecorr, MAX_BLOCKSIZE / 256, 1, task.stream); + cuda.LaunchAsync(task.cudaFindWastedBits, (task.nResidualTasks / task.nResidualTasksPerChannel * nFrames) / maxFrames, 1, task.stream); + cuda.LaunchAsync(task.cudaComputeAutocor, partCount, (task.nAutocorTasks * nFrames) / maxFrames, task.stream); + cuda.LaunchAsync(task.cudaComputeLPC, 1, (task.nAutocorTasks * nFrames) / maxFrames, task.stream); } unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task) @@ -1167,7 +1189,7 @@ namespace CUETools.Codecs.FlaCuda bool doMidside = channels == 2 && eparams.do_midside; int channelCount = doMidside ? 2 * channels : channels; - cuda.CopyHostToDeviceAsync(task.cudaSamples, task.samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount), task.stream); + cuda.CopyHostToDeviceAsync(task.cudaSamples, task.samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channels), task.stream); } unsafe void run_GPU_task(int nFrames, FlaCudaTask task) @@ -1761,21 +1783,22 @@ namespace CUETools.Codecs.FlaCuda public int best_index; public int channel; public int residualOffs; - public fixed int reserved[5]; + public int wbits; + public fixed int reserved[4]; public fixed int coefs[32]; }; internal class FlaCudaTask { CUDA cuda; + public CUfunction cudaStereoDecorr; + public CUfunction cudaFindWastedBits; public CUfunction cudaComputeAutocor; public CUfunction cudaComputeLPC; public CUfunction cudaEstimateResidual; public CUfunction cudaChooseBestMethod; public CUfunction cudaCopyBestMethod; public CUfunction cudaCopyBestMethodStereo; - //public CUfunction cudaSumResidualChunks; - public CUfunction cudaSumResidual; public CUfunction cudaEncodeResidual; public CUdeviceptr cudaSamples; public CUdeviceptr cudaResidual; @@ -1797,6 +1820,9 @@ namespace CUETools.Codecs.FlaCuda public int residualTasksLen; public int bestResidualTasksLen; public int samplesBufferLen; + public int nResidualTasks = 0; + public int nAutocorTasks = 0; + public int nResidualTasksPerChannel = 0; unsafe public FlaCudaTask(CUDA _cuda, int channelCount) { @@ -1834,14 +1860,14 @@ namespace CUETools.Codecs.FlaCuda } cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor"); + cudaStereoDecorr = cuda.GetModuleFunction("cudaStereoDecorr"); + cudaFindWastedBits = cuda.GetModuleFunction("cudaFindWastedBits"); cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC"); cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual"); - cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual"); cudaChooseBestMethod = cuda.GetModuleFunction("cudaChooseBestMethod"); cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod"); cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo"); cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual"); - //cudaSumResidualChunks = cuda.GetModuleFunction("cudaSumResidualChunks"); stream = cuda.CreateStream(); verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; // should be channels, not channelCount. And should null if not doing verify! diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu index cd4cd88..ef4f39c 100644 --- a/CUETools.FlaCuda/flacuda.cu +++ b/CUETools.FlaCuda/flacuda.cu @@ -49,10 +49,59 @@ typedef struct int best_index; int channel; int residualOffs; - int reserved[5]; + int wbits; + int reserved[4]; int coefs[32]; } encodeResidualTaskStruct; +extern "C" __global__ void cudaStereoDecorr( + int *samples, + int offset +) +{ + const int pos = blockIdx.x * blockDim.x + threadIdx.x; + if (pos < offset) + { + int l = samples[pos]; + int r = samples[offset + pos]; + samples[2 * offset + pos] = (l + r) >> 1; + samples[3 * offset + pos] = l - r; + } +} + +extern "C" __global__ void cudaFindWastedBits( + encodeResidualTaskStruct *tasks, + int *samples, + int tasksPerChannel, + int blocksize +) +{ + __shared__ struct { + volatile int wbits[256]; + encodeResidualTaskStruct task; + } shared; + + if (threadIdx.x < 16) + ((int*)&shared.task)[threadIdx.x] = ((int*)(&tasks[blockIdx.x * tasksPerChannel]))[threadIdx.x]; + shared.wbits[threadIdx.x] = 0; + __syncthreads(); + + for (int pos = 0; pos < blocksize; pos += blockDim.x) + shared.wbits[threadIdx.x] |= pos + threadIdx.x < blocksize ? samples[shared.task.samplesOffs + pos + threadIdx.x] : 0; + __syncthreads(); + + if (threadIdx.x < 128) shared.wbits[threadIdx.x] |= shared.wbits[threadIdx.x + 128]; __syncthreads(); + if (threadIdx.x < 64) shared.wbits[threadIdx.x] |= shared.wbits[threadIdx.x + 64]; __syncthreads(); + if (threadIdx.x < 32) shared.wbits[threadIdx.x] |= shared.wbits[threadIdx.x + 32]; __syncthreads(); + shared.wbits[threadIdx.x] |= shared.wbits[threadIdx.x + 16]; + shared.wbits[threadIdx.x] |= shared.wbits[threadIdx.x + 8]; + shared.wbits[threadIdx.x] |= shared.wbits[threadIdx.x + 4]; + shared.wbits[threadIdx.x] |= shared.wbits[threadIdx.x + 2]; + shared.wbits[threadIdx.x] |= shared.wbits[threadIdx.x + 1]; + if (threadIdx.x < tasksPerChannel) + tasks[blockIdx.x * tasksPerChannel + threadIdx.x].wbits = max(0,__ffs(shared.wbits[0]) - 1); +} + extern "C" __global__ void cudaComputeAutocor( float *output, const int *samples, @@ -231,8 +280,8 @@ extern "C" __global__ void cudaEstimateResidual( const int dataLen = min(frameSize - pos, partSize + max_order); // fetch samples - shared.data[tid] = tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] : 0; - if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task[0].samplesOffs + pos + tid + partSize] : 0; + shared.data[tid] = tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] >> shared.task[0].wbits : 0; + if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task[0].samplesOffs + pos + tid + partSize] >> shared.task[0].wbits : 0; const int residualLen = max(0,min(frameSize - pos - shared.task[threadIdx.y].residualOrder, partSize)); __syncthreads(); @@ -271,100 +320,6 @@ extern "C" __global__ void cudaEstimateResidual( output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid]; } -// blockDim.x == 256 -// gridDim.x = frameSize / chunkSize -extern "C" __global__ void cudaSumResidualChunks( - int *output, - encodeResidualTaskStruct *tasks, - int *residual, - int frameSize, - int chunkSize // <= blockDim.x(256) - ) -{ - __shared__ struct { - int residual[256]; - int rice[32]; - } shared; - - // fetch parameters - const int tid = threadIdx.x; - const int residualOrder = tasks[blockIdx.y].residualOrder; - const int chunkNumber = blockIdx.x; - const int pos = chunkNumber * chunkSize; - const int residualLen = min(frameSize - pos - residualOrder, chunkSize); - - // set upper residuals to zero, in case blockDim < 256 - shared.residual[255 - tid] = 0; - - // read residual - int res = (tid < residualLen) ? residual[blockIdx.y * 8192 + pos + tid] : 0; - - // convert to unsigned - shared.residual[tid] = (2 * res) ^ (res >> 31); - __syncthreads(); - - // residual sum: reduction in shared mem - if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads(); - if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads(); - if (tid < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads(); - shared.residual[tid] += shared.residual[tid + 16]; - shared.residual[tid] += shared.residual[tid + 8]; - shared.residual[tid] += shared.residual[tid + 4]; - shared.residual[tid] += shared.residual[tid + 2]; - shared.residual[tid] += shared.residual[tid + 1]; - - if (tid < 32) - { - // rice parameter search - shared.rice[tid] = __mul24(tid >= 15, 0x7fffff) + residualLen * (tid + 1) + ((shared.residual[0] - (residualLen >> 1)) >> tid); - shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]); - shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]); - shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]); - shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]); - } - - // write output - if (tid == 0) - output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0]; -} - -extern "C" __global__ void cudaSumResidual( - encodeResidualTaskStruct *tasks, - int *residual, - int partCount // <= blockDim.y (256) - ) -{ - __shared__ struct { - volatile int partLen[256]; - encodeResidualTaskStruct task; - } shared; - - const int tid = threadIdx.x; - // fetch task data - if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int)) - ((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid]; - __syncthreads(); - - shared.partLen[tid] = (tid < partCount) ? residual[tid + partCount * blockIdx.y] : 0; - __syncthreads(); - // length sum: reduction in shared mem - //if (tid < 128) shared.partLen[tid] += shared.partLen[tid + 128]; __syncthreads(); - //if (tid < 64) shared.partLen[tid] += shared.partLen[tid + 64]; __syncthreads(); - if (tid < 32) shared.partLen[tid] += shared.partLen[tid + 32]; __syncthreads(); - shared.partLen[tid] += shared.partLen[tid + 16]; - shared.partLen[tid] += shared.partLen[tid + 8]; - shared.partLen[tid] += shared.partLen[tid + 4]; - shared.partLen[tid] += shared.partLen[tid + 2]; - shared.partLen[tid] += shared.partLen[tid + 1]; - // return sum - if (tid == 0) - tasks[blockIdx.y].size = min(shared.task.obits * shared.task.blocksize, - shared.task.type == Fixed ? shared.task.residualOrder * shared.task.obits + 6 + shared.partLen[0] : - shared.task.type == LPC ? shared.task.residualOrder * shared.task.obits + 4 + 5 + shared.task.residualOrder * shared.task.cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[0] : - shared.task.type == Constant ? shared.task.obits * (1 + shared.task.blocksize * (shared.partLen[0] != 0)) : - shared.task.obits * shared.task.blocksize); -} - #define BEST_INDEX(a,b) ((a) + ((b) - (a)) * (shared.length[b] < shared.length[a])) extern "C" __global__ void cudaChooseBestMethod( @@ -403,12 +358,13 @@ extern "C" __global__ void cudaChooseBestMethod( // return sum if (threadIdx.x == 0) { + int obits = shared.task[threadIdx.y].obits - shared.task[threadIdx.y].wbits; shared.length[task + threadIdx.y] = - min(shared.task[threadIdx.y].obits * shared.task[threadIdx.y].blocksize, - shared.task[threadIdx.y].type == Fixed ? shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].obits + 6 + shared.partLen[threadIdx.y * 32] : - shared.task[threadIdx.y].type == LPC ? shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].obits + 4 + 5 + shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[threadIdx.y * 32] : - shared.task[threadIdx.y].type == Constant ? shared.task[threadIdx.y].obits * (1 + shared.task[threadIdx.y].blocksize * (shared.partLen[threadIdx.y * 32] != 0)) : - shared.task[threadIdx.y].obits * shared.task[threadIdx.y].blocksize); + min(obits * shared.task[threadIdx.y].blocksize, + shared.task[threadIdx.y].type == Fixed ? shared.task[threadIdx.y].residualOrder * obits + 6 + shared.partLen[threadIdx.y * 32] : + shared.task[threadIdx.y].type == LPC ? shared.task[threadIdx.y].residualOrder * obits + 4 + 5 + shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[threadIdx.y * 32] : + shared.task[threadIdx.y].type == Constant ? obits * (1 + shared.task[threadIdx.y].blocksize * (shared.partLen[threadIdx.y * 32] != 0)) : + obits * shared.task[threadIdx.y].blocksize); } } //shared.index[threadIdx.x] = threadIdx.x; @@ -527,8 +483,8 @@ extern "C" __global__ void cudaEncodeResidual( const int dataLen = min(shared.task.blocksize - pos, partSize + shared.task.residualOrder); // fetch samples - shared.data[tid] = tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0; - if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task.samplesOffs + pos + tid + partSize] : 0; + shared.data[tid] = tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] >> shared.task.wbits : 0; + if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task.samplesOffs + pos + tid + partSize] >> shared.task.wbits : 0; const int residualLen = max(0,min(shared.task.blocksize - pos - shared.task.residualOrder, partSize)); __syncthreads(); diff --git a/CUETools.FlaCuda/flacuda.cubin b/CUETools.FlaCuda/flacuda.cubin index 1f47d59..d30925f 100644 --- a/CUETools.FlaCuda/flacuda.cubin +++ b/CUETools.FlaCuda/flacuda.cubin @@ -110,79 +110,81 @@ code { } } bincode { - 0xd0800205 0x00400780 0xa0000209 0x04000780 - 0x30070405 0xc4100780 0x3006040d 0xc4100780 - 0xa0000011 0x04000780 0x20000201 0x0400c780 - 0x30020805 0xc4100780 0x2000000d 0x04004780 + 0xd0800205 0x00400780 0xa0000205 0x04000780 + 0x30070209 0xc4100780 0x3006020d 0xc4100780 + 0xa0000011 0x04000780 0x20000401 0x0400c780 + 0x30020809 0xc4100780 0x2000000d 0x04008780 0x00000005 0xc0000780 0x308109fd 0x644107c8 0x00000609 0xc0000780 0xa0018003 0x00000000 0xa0004401 0x04200780 0x10018003 0x00000280 0x40014e0d 0x00200780 0x3010060d 0xc4100780 - 0x60004e01 0x0020c780 0x20000001 0x04008780 + 0x60004e01 0x0020c780 0x20000001 0x04004780 0x3007000d 0xc4100780 0x30060001 0xc4100780 - 0x20008600 0x2100ec00 0x20000201 0x04000780 + 0x20008600 0x2100ec00 0x20000401 0x04000780 0xd00e0001 0x80c00780 0x08045401 0xe4200780 0xf0000001 0xe0000002 0x861ffe03 0x00000000 0xa0004c0d 0x04200780 0x1000d201 0x0423c780 0x40060215 0x00000780 0x30100a15 0xc4100780 - 0x6006000d 0x00014780 0x40054201 0x00200780 - 0x30100015 0xc4100780 0x1000d201 0x0423c780 - 0x60044215 0x00214780 0x2100ee1c 0x2143f000 - 0x20000a19 0x04010780 0x3007001d 0xac000780 - 0x30060ffd 0x6c00c7c8 0xa0030003 0x00000000 - 0x1002f003 0x00000280 0xd011580d 0x20000780 - 0x2d03e020 0x20088c20 0x30021021 0xc4100780 - 0x2000ca21 0x04220780 0xd00e1021 0x80c00780 - 0x10030003 0x00000780 0x1000f821 0x0403c780 - 0x00020c0d 0xc0000782 0x0c001401 0xe4220780 - 0x30820dfd 0x6c4107c8 0xa0043003 0x00000000 - 0x10043003 0x00000280 0x2000d221 0x04218780 - 0x0002100d 0xc0000780 0x30080ffd 0x6c00c7c8 - 0xa0042003 0x00000000 0x10041003 0x00000280 - 0xd0115811 0x20000780 0x2000c00d 0x0420c784 - 0x2106f21c 0x2007860c 0x3002060d 0xc4100780 - 0x2000ca0d 0x0420c780 0xd00e060d 0x80c00780 - 0x10042003 0x00000780 0x1000f80d 0x0403c780 - 0x0c001401 0xe420c782 0xd411500d 0x20000782 - 0x3c00c00d 0x04200780 0xd4005005 0x20000780 + 0x6006001d 0x00014780 0x40034201 0x00200780 + 0x3010000d 0xc4100780 0x1000d201 0x0423c780 + 0x60024215 0x0020c780 0x2100ee00 0x2147f00c + 0x20000a19 0x04010780 0x30000621 0xac000780 + 0x300611fd 0x6c00c7c8 0xa0031003 0x00000000 + 0x10030003 0x00000280 0xd011580d 0x20000780 + 0x2d07e000 0x20008c00 0x30020001 0xc4100780 + 0x2100ea24 0x1d00f400 0xd00e1225 0x80c00780 + 0x30001201 0xec000780 0x10031003 0x00000780 + 0x1000f801 0x0403c780 0x00020c0d 0xc0000782 + 0x0c001401 0xe4200780 0x30820dfd 0x6c4107c8 + 0xa0047003 0x00000000 0x10047003 0x00000280 + 0x2000d201 0x04218780 0x0002000d 0xc0000780 + 0x300011fd 0x6c00c7c8 0xa0046003 0x00000000 + 0x10045003 0x00000280 0xd0115811 0x20000780 + 0x2000d221 0x04218780 0x2000c001 0x0421c784 + 0x2000001d 0x04020780 0x1000d401 0x0423c784 + 0x30020e1d 0xc4100780 0x2000ca1d 0x0421c780 + 0xd00e0e1d 0x80c00780 0x30000e01 0xec000780 + 0x10046003 0x00000780 0x1000f801 0x0403c780 + 0x0c001401 0xe4200782 0xd411500d 0x20000782 + 0x3c00c00d 0x0420c780 0xd4005005 0x20000780 0x861ffe03 0x00000000 0x3004cffd 0x6420c7c8 - 0xa0059003 0x00000000 0x00020c0d 0xc0000780 - 0x0c025401 0xe43f0780 0x10058003 0x00000280 + 0xa005d003 0x00000000 0x00020c0d 0xc0000780 + 0x0c025401 0xe43f0780 0x1005c003 0x00000280 0xa0004401 0x04200780 0x40014e1d 0x00200780 0x30100e1d 0xc4100780 0x60004e01 0x0021c780 - 0x20000001 0x04008780 0x3007001d 0xc4100780 + 0x20000001 0x04004780 0x3007001d 0xc4100780 0x30060001 0xc4100780 0x20008e00 0x2100ec00 - 0x20000201 0x04000780 0x20008001 0x00000007 - 0xd00e0001 0x80c00780 0x10059003 0x00000780 + 0x20000401 0x04000780 0x20008001 0x00000007 + 0xd00e0001 0x80c00780 0x1005d003 0x00000780 0x1000f801 0x0403c780 0x08047401 0xe4200782 - 0xd4112809 0x20000780 0x3883c005 0x6c608780 - 0xa0004401 0x04200780 0xd001001d 0x04000780 - 0x30000ffd 0x640187c8 0xa0094003 0x00000000 - 0x10092003 0x00000280 0x3003d201 0xac200780 + 0xd4112809 0x20000780 0x3883c009 0x6c608780 + 0xa0004401 0x04200780 0xd002001d 0x04000780 + 0x30000ffd 0x640187c8 0xa0098003 0x00000000 + 0x10096003 0x00000280 0x3003d201 0xac200780 0x307c000d 0x8c000780 0xd4110009 0x20000780 - 0x387cc1fd 0x6c20c7c8 0xa007c003 0x00000000 - 0x1000f801 0x0403c780 0x1000f825 0x0403c780 - 0x1007a003 0x00000280 0x30050e05 0xc4100780 - 0x20000221 0x04010780 0x200a9005 0x00000003 - 0x0002020d 0xc0000780 0xa0077003 0x00000000 + 0x387cc1fd 0x6c20c7c8 0xa0080003 0x00000000 + 0x1000f801 0x0403c780 0x1000f821 0x0403c780 + 0x1007e003 0x00000280 0x30050e09 0xc4100780 + 0x20000425 0x04010780 0x200a9209 0x00000003 + 0x0002040d 0xc0000780 0xa007b003 0x00000000 0xd4000009 0x20000780 0xd8118011 0x20000780 - 0x1000c005 0x0423c784 0x6e01c225 0x80224780 + 0x1000c009 0x0423c784 0x6e02c221 0x80220780 0x20018001 0x00000003 0xd4110011 0x20000780 0x3000c1fd 0x6c2147cc 0xd8000809 0x20000780 - 0x1006f003 0x00000280 0xd4110009 0x20000782 - 0x1800c001 0x0423c780 0x1007c003 0x00000780 - 0x30050e05 0xc4100780 0x20000221 0x04010780 - 0x20001001 0x04000782 0x00020009 0xc0000780 + 0x10073003 0x00000280 0xd4110009 0x20000782 + 0x1800c001 0x0423c780 0x10080003 0x00000780 + 0x30050e09 0xc4100780 0x20000425 0x04010780 + 0x20001201 0x04000782 0x00020009 0xc0000780 0xd411100d 0x20000780 0x1c00c001 0x0423c780 - 0x30001201 0xec000780 0x2840d401 0x04200780 - 0x301f0005 0xec100780 0x30010001 0xc4100780 - 0xd0000205 0x04008780 0x30080601 0x6c010780 + 0x30001001 0xec000780 0x2840d401 0x04200780 + 0x301f0009 0xec100780 0x30010001 0xc4100780 + 0xd0000421 0x04008780 0x30090601 0x6c010780 0x00020c0d 0xc0000780 0xdc095009 0x20000780 - 0xa0000021 0x2c014780 0x1800c001 0x0423c780 - 0x30840205 0xac400780 0x60011005 0x80000780 + 0xa0000009 0x2c014780 0x1800c001 0x0423c780 + 0x30841021 0xac400780 0x60080409 0x80000780 0x20018e1d 0x00000003 0xa0004401 0x04200780 - 0x30000ffd 0x640147c8 0x0c025401 0xe4204780 - 0x10063003 0x00000280 0x10094003 0x00000780 + 0x30000ffd 0x640147c8 0x0c025401 0xe4208780 + 0x10067003 0x00000280 0x10098003 0x00000780 0x3003d201 0xac200780 0x307c000d 0x8c000780 0x00020c0d 0xc0000782 0xdc095009 0x20000780 0x1800e001 0x0423c780 0x2800c001 0x04200780 @@ -191,21 +193,21 @@ code { 0x0c025401 0xe4200780 0x1900e400 0x2900e000 0x0c025401 0xe4200780 0x1900e200 0x2900e000 0x0c025401 0xe4200780 0xd4112805 0x20000780 - 0x347cc1fd 0x6c2147c8 0xa00ac003 0x00000000 - 0x100a9003 0x00000280 0x00020a05 0xc0000780 + 0x347cc1fd 0x6c2147c8 0xa00b0003 0x00000000 + 0x100ad003 0x00000280 0x00020a05 0xc0000780 0xd4095005 0x20000780 0x347cc1fd 0x6c2087c8 - 0x100ab003 0x00000280 0x10018001 0x00000003 - 0x100ac003 0x00000780 0x1000f801 0x0403c780 - 0xf0000001 0xe0000002 0x20018805 0x00000003 - 0x40030c1d 0x00000780 0x60020e1d 0x0001c780 + 0x100af003 0x00000280 0x10018001 0x00000003 + 0x100b0003 0x00000780 0x1000f801 0x0403c780 + 0xf0000001 0xe0000002 0x20018809 0x00000003 + 0x40050c1d 0x00000780 0x60040e1d 0x0001c780 0x00020a05 0xc0000780 0xd4095005 0x20000780 0x30010615 0xec100780 0x30100e1d 0xc4100780 - 0x2440c015 0x04214780 0x60020c0d 0x0001c780 - 0x307c09fd 0x640147c8 0x30040a05 0xec000780 + 0x2440c015 0x04214780 0x60040c0d 0x0001c780 + 0x307c09fd 0x640147c8 0x30040a09 0xec000780 0x30850811 0x64410780 0xa0000811 0x2c014780 - 0x407f8811 0x0007ffff 0x2004860c 0x20038204 - 0x4003000d 0x00000780 0x6002020d 0x0000c780 - 0x3010060d 0xc4100780 0x60020001 0x0000c780 + 0x407f8811 0x0007ffff 0x2004860c 0x20038408 + 0x4005000d 0x00000780 0x6004020d 0x0000c780 + 0x3010060d 0xc4100780 0x60040001 0x0000c780 0x00020c09 0xc0000780 0x08025401 0xe4200780 0xd8095005 0x20000780 0x1400d001 0x0423c780 0x3400c001 0xac200780 0x08025401 0xe4200780 @@ -214,9 +216,9 @@ code { 0x3400c001 0xac200780 0x08025401 0xe4200780 0x1400c201 0x0423c780 0x3400c001 0xac200780 0x08025401 0xe4200780 0x30000003 0x00000280 - 0xa0004401 0x04200780 0x40014e05 0x00200780 - 0x30100205 0xc4100780 0x60004e01 0x00204780 - 0x20000001 0x04008780 0x30060001 0xc4100780 + 0xa0004401 0x04200780 0x40014e09 0x00200780 + 0x30100409 0xc4100780 0x60004e01 0x00208780 + 0x20000001 0x04004780 0x30060001 0xc4100780 0xa0004c0d 0x04200780 0x20000601 0x04000780 0x00020c0d 0xc0000780 0xdc095005 0x20000780 0x30020005 0xc4100780 0x1500e000 0x2101e804 @@ -250,9 +252,9 @@ code { 0x30050205 0xc4100780 0x20000409 0x04004780 0x00020405 0xc0000780 0x103f8005 0x07ffffff 0x04051001 0xe4204780 0x307ccffd 0x6c20c7ca - 0x100a3003 0x00000280 0x1000f811 0x0403c780 + 0x100a2003 0x00000280 0x1000f811 0x0403c780 0x20000815 0x0400c780 0x3005cffd 0x6420c7c8 - 0xa009f003 0x00000000 0x1009f003 0x00000280 + 0xa009e003 0x00000000 0x1009e003 0x00000280 0x1000ce05 0x0423c780 0x40034e09 0x00200780 0x30100409 0xc4100780 0x60024e1d 0x00208780 0x30070605 0xc4100780 0x30060609 0xc4100780 @@ -286,133 +288,132 @@ code { 0x0c011001 0xe4204780 0x1000c205 0x0423c784 0x2000c005 0x04204784 0xa0000019 0x04000780 0x307c0dfd 0x640147c8 0x0c011001 0xe4204780 - 0x1009f003 0x00000280 0xd41c680d 0x20000780 - 0x1d00e204 0x1d00e408 0x40050419 0x00000780 - 0x60040619 0x00018780 0x30100c19 0xc4100780 - 0x3c82c1fd 0x6c6147c8 0x60040419 0x00018780 - 0xa009c003 0x00000000 0x10070003 0x00000280 - 0xd41c4005 0x20000780 0x1500e004 0x1500ec08 - 0x4005041d 0x00000780 0x6004061d 0x0001c780 - 0x30100e1d 0xc4100780 0x60040405 0x0001c780 - 0xd8044005 0x20000780 0x2400c005 0x04204780 - 0x20068205 0x00000003 0x1009c003 0x00000780 - 0xd41c680d 0x20000780 0x3c83c1fd 0x6c6147c8 - 0xa009b003 0x00000000 0x10085003 0x00000280 - 0x3002cc05 0xc4300780 0x301f0209 0xec100780 - 0xd0840409 0x04400780 0x2000041d 0x04004780 - 0xd41c4005 0x20000780 0x1500ec08 0x1500e004 - 0x2400c609 0x04208780 0x40050421 0x00000780 + 0x1009e003 0x00000280 0xd41c680d 0x20000780 + 0x1d00ec08 0x1d00e404 0x2c40c209 0x04208780 + 0x40050419 0x00000780 0x60040619 0x00018780 + 0x30100c19 0xc4100780 0x3c82c1fd 0x6c6147c8 + 0x60040419 0x00018780 0xa009b003 0x00000000 + 0x10071003 0x00000280 0xd41c4005 0x20000780 + 0x1400c005 0x0423c780 0x4005041d 0x00000780 + 0x6004061d 0x0001c780 0x30100e1d 0xc4100780 + 0x60040405 0x0001c780 0xd8044005 0x20000780 + 0x2400c005 0x04204780 0x20068205 0x00000003 + 0x1009b003 0x00000780 0xd41c680d 0x20000780 + 0x3c83c1fd 0x6c6147c8 0xa009a003 0x00000000 + 0x10085003 0x00000280 0x3002cc05 0xc4300780 + 0x301f021d 0xec100780 0xd0840e1d 0x04400780 + 0x20000e1d 0x04004780 0xd41c4005 0x20000780 + 0x2502e608 0x1500e004 0x40050421 0x00000780 0x60040621 0x00020780 0x30101021 0xc4100780 0x60040405 0x00020780 0x30010e09 0xec100780 0x20000205 0x04008780 0xd8044005 0x20000780 0x2400c005 0x04204780 0x200f8205 0x00000003 - 0x1009b003 0x00000780 0xd41c680d 0x20000780 - 0x3c7cc1fd 0x6c2147c8 0xa009a003 0x00000000 - 0x10094003 0x00000280 0xd804400d 0x20000780 - 0xd41c7005 0x20000780 0x3c7cc1fd 0x6c2087c8 - 0x2501e209 0x00000003 0x1400c005 0x0423c780 - 0x10000809 0x2440c280 0x4005041d 0x00000780 - 0x6004061d 0x0001c780 0x30100e1d 0xc4100780 - 0x60040405 0x0001c780 0x1009a003 0x00000780 - 0xd41c7005 0x20000780 0x1500e004 0x1500e208 + 0x1009a003 0x00000780 0xd41c680d 0x20000780 + 0x3c7cc1fd 0x6c2147c8 0xa0099003 0x00000000 + 0x10093003 0x00000280 0xd804400d 0x20000780 + 0xd41c7805 0x20000780 0x3c7cc1fd 0x6c2087c8 + 0x2501e005 0x00000003 0x10000805 0x2440c280 0x4005041d 0x00000780 0x6004061d 0x0001c780 0x30100e1d 0xc4100780 0x60040405 0x0001c780 - 0xf0000001 0xe0000002 0xf0000001 0xe0000002 - 0x30060205 0xac000782 0x00020a05 0xc0000780 - 0x04051001 0xe4204780 0xa0004405 0x04200782 - 0x20000811 0x04004780 0x3004cffd 0x6c2107c8 - 0x10016003 0x00000280 0x861ffe03 0x00000000 - 0xa0000009 0x04000780 0xd0800209 0x00400780 - 0xa0000405 0x04000780 0x30050205 0xc4100780 - 0x20000409 0x04004780 0x308505fd 0x6c4107c8 - 0xa00be003 0x00000000 0x100be003 0x00000280 - 0xa0000009 0x04000780 0xd0800209 0x00400780 - 0xa0000405 0x04000780 0x30050205 0xc4100780 - 0x20000409 0x04004780 0x00020405 0xc0000780 - 0xd418400d 0x20000780 0xd4144009 0x20000780 - 0x1c00c005 0x0423c780 0x3801c1fd 0x6c2107c8 - 0xa0000009 0x04000780 0xd0800209 0x00400780 - 0xa0000405 0x04000780 0x30050205 0xc4100780 - 0x20000409 0x04004780 0x20008405 0x0000000b - 0x10000405 0x0403c500 0x04001001 0xe4204780 - 0xf0000001 0xe0000002 0x861ffe03 0x00000000 - 0xa0000009 0x04000780 0xd0800209 0x00400780 - 0xa0000405 0x04000780 0x30050205 0xc4100780 - 0x20000409 0x04004780 0x308605fd 0x6c4107c8 - 0xa00d9003 0x00000000 0x100d9003 0x00000280 - 0xa0000009 0x04000780 0xd0800209 0x00400780 - 0xa0000405 0x04000780 0x30050205 0xc4100780 - 0x20000409 0x04004780 0x00020405 0xc0000780 - 0xd4024009 0x20000780 0x0802c00d 0xc0200780 - 0x0402d011 0xc0200780 0xdc14400d 0x20000780 - 0x1400d005 0x0423c780 0xd0144011 0x20000784 - 0x1d00e008 0x2941e004 0x3002c009 0x6c20c784 - 0xd0020205 0x04020780 0x2400d005 0x04204780 + 0x10099003 0x00000780 0xd41c7805 0x20000780 + 0x1400c005 0x0423c780 0x4005041d 0x00000780 + 0x6004061d 0x0001c780 0x30100e1d 0xc4100780 + 0x60040405 0x0001c780 0xf0000001 0xe0000002 + 0xf0000001 0xe0000002 0x30060205 0xac000782 + 0x00020a05 0xc0000780 0x04051001 0xe4204780 + 0xa0004405 0x04200782 0x20000811 0x04004780 + 0x3004cffd 0x6c2107c8 0x10016003 0x00000280 + 0x861ffe03 0x00000000 0xa0000009 0x04000780 + 0xd0800209 0x00400780 0xa0000405 0x04000780 + 0x30050205 0xc4100780 0x20000409 0x04004780 + 0x308505fd 0x6c4107c8 0xa00bd003 0x00000000 + 0x100bd003 0x00000280 0xa0000009 0x04000780 + 0xd0800209 0x00400780 0xa0000405 0x04000780 + 0x30050205 0xc4100780 0x20000409 0x04004780 + 0x00020405 0xc0000780 0xd418400d 0x20000780 + 0xd4144009 0x20000780 0x1c00c005 0x0423c780 + 0x3801c1fd 0x6c2107c8 0xa0000009 0x04000780 + 0xd0800209 0x00400780 0xa0000405 0x04000780 + 0x30050205 0xc4100780 0x20000409 0x04004780 + 0x20008405 0x0000000b 0x10000405 0x0403c500 0x04001001 0xe4204780 0xf0000001 0xe0000002 0x861ffe03 0x00000000 0xa0000009 0x04000780 0xd0800209 0x00400780 0xa0000405 0x04000780 0x30050205 0xc4100780 0x20000409 0x04004780 - 0x308705fd 0x6c4107c8 0xa0128003 0x00000000 - 0x10128003 0x00000280 0xa0000009 0x04000780 + 0x308605fd 0x6c4107c8 0xa00d8003 0x00000000 + 0x100d8003 0x00000280 0xa0000009 0x04000780 0xd0800209 0x00400780 0xa0000405 0x04000780 0x30050205 0xc4100780 0x20000409 0x04004780 - 0x00020405 0xc0000780 0xd4014009 0x20000780 + 0x00020405 0xc0000780 0xd4024009 0x20000780 0x0802c00d 0xc0200780 0x0402d011 0xc0200780 0xdc14400d 0x20000780 0x1400d005 0x0423c780 0xd0144011 0x20000784 0x1d00e008 0x2941e004 0x3002c009 0x6c20c784 0xd0020205 0x04020780 0x2400d005 0x04204780 0x04001001 0xe4204780 - 0x0402f00d 0xc0200780 0x0402d009 0xc0200780 - 0xdc14400d 0x20000780 0x1400d005 0x0423c780 - 0xd8144009 0x20000780 0x1c00c009 0x0423c780 - 0x2440f005 0x04204780 0x3802c009 0x6c20c780 - 0xd0020205 0x04020780 0x2400d005 0x04204780 - 0x04001001 0xe4204780 0x0402e00d 0xc0200780 - 0x0402d009 0xc0200780 0xdc14400d 0x20000780 - 0x1400d005 0x0423c780 0xd8144009 0x20000780 - 0x1c00c009 0x0423c780 0x2440e005 0x04204780 - 0x3802c009 0x6c20c780 0xd0020205 0x04020780 - 0x2400d005 0x04204780 0x04001001 0xe4204780 - 0x0402d80d 0xc0200780 0x0402d009 0xc0200780 - 0xdc14400d 0x20000780 0x1400d005 0x0423c780 - 0xd8144009 0x20000780 0x1d00e008 0x2541f804 - 0x3802c009 0x6c20c780 0xd0020205 0x04020780 - 0x2400d005 0x04204780 0x04001001 0xe4204780 - 0x0402d40d 0xc0200780 0x0402d009 0xc0200780 - 0xdc14400d 0x20000780 0x1400d005 0x0423c780 - 0xd8144009 0x20000780 0x1d00e008 0x2541f404 - 0x3802c009 0x6c20c780 0xd0020205 0x04020780 - 0x2400d005 0x04204780 0x04001001 0xe4204780 - 0x0402d20d 0xc0200780 0x0402d009 0xc0200780 - 0xdc14400d 0x20000780 0x1400d005 0x0423c780 - 0xd8144009 0x20000780 0x1d00e008 0x2541f204 - 0x3802c009 0x6c20c780 0xd0020205 0x04020780 - 0x2400d005 0x04204780 0x04001001 0xe4204780 0xf0000001 0xe0000002 0x861ffe03 0x00000000 0xa0000009 0x04000780 0xd0800209 0x00400780 0xa0000405 0x04000780 0x30050205 0xc4100780 - 0x20000409 0x04004780 0x307c05fd 0x6c0147c8 - 0xa013c003 0x00000000 0x1013c003 0x00000280 + 0x20000409 0x04004780 0x308705fd 0x6c4107c8 + 0xa0127003 0x00000000 0x10127003 0x00000280 + 0xa0000009 0x04000780 0xd0800209 0x00400780 + 0xa0000405 0x04000780 0x30050205 0xc4100780 + 0x20000409 0x04004780 0x00020405 0xc0000780 + 0xd4014009 0x20000780 0x0802c00d 0xc0200780 + 0x0402d011 0xc0200780 0xdc14400d 0x20000780 + 0x1400d005 0x0423c780 0xd0144011 0x20000784 + 0x1d00e008 0x2941e004 0x3002c009 0x6c20c784 + 0xd0020205 0x04020780 0x2400d005 0x04204780 + 0x04001001 0xe4204780 0x0402f00d 0xc0200780 + 0x0402d009 0xc0200780 0xdc14400d 0x20000780 + 0x1400d005 0x0423c780 0xd8144009 0x20000780 + 0x1c00c009 0x0423c780 0x2440f005 0x04204780 + 0x3802c009 0x6c20c780 0xd0020205 0x04020780 + 0x2400d005 0x04204780 0x04001001 0xe4204780 + 0x0402e00d 0xc0200780 0x0402d009 0xc0200780 + 0xdc14400d 0x20000780 0x1400d005 0x0423c780 + 0xd8144009 0x20000780 0x1c00c009 0x0423c780 + 0x2440e005 0x04204780 0x3802c009 0x6c20c780 + 0xd0020205 0x04020780 0x2400d005 0x04204780 + 0x04001001 0xe4204780 0x0402d80d 0xc0200780 + 0x0402d009 0xc0200780 0xdc14400d 0x20000780 + 0x1400d005 0x0423c780 0xd8144009 0x20000780 + 0x1d00e008 0x2541f804 0x3802c009 0x6c20c780 + 0xd0020205 0x04020780 0x2400d005 0x04204780 + 0x04001001 0xe4204780 0x0402d40d 0xc0200780 + 0x0402d009 0xc0200780 0xdc14400d 0x20000780 + 0x1400d005 0x0423c780 0xd8144009 0x20000780 + 0x1d00e008 0x2541f404 0x3802c009 0x6c20c780 + 0xd0020205 0x04020780 0x2400d005 0x04204780 + 0x04001001 0xe4204780 0x0402d20d 0xc0200780 + 0x0402d009 0xc0200780 0xdc14400d 0x20000780 + 0x1400d005 0x0423c780 0xd8144009 0x20000780 + 0x1d00e008 0x2541f204 0x3802c009 0x6c20c780 + 0xd0020205 0x04020780 0x2400d005 0x04204780 + 0x04001001 0xe4204780 0xf0000001 0xe0000002 + 0x861ffe03 0x00000000 0xa0000009 0x04000780 + 0xd0800209 0x00400780 0xa0000405 0x04000780 + 0x30050205 0xc4100780 0x20000409 0x04004780 + 0x307c05fd 0x6c0147c8 0xa013b003 0x00000000 + 0x1013b003 0x00000280 0x1000ce05 0x0423c780 + 0x40034e09 0x00200780 0x30100409 0xc4100780 + 0x60024e05 0x00208780 0x30070209 0xc4100780 + 0x3006020d 0xc4100780 0x20038408 0x2102e808 + 0x2000d00d 0x04204780 0x20208405 0x00000003 + 0xd00e020d 0xa0c00780 0xa0000009 0x04000782 + 0xd0800209 0x00400780 0xa0000405 0x04000780 + 0x30050205 0xc4100780 0x20000409 0x04004780 + 0x3002cffd 0x6c20c7c8 0x30000003 0x00000280 0x1000ce05 0x0423c780 0x40034e09 0x00200780 0x30100409 0xc4100780 0x60024e05 0x00208780 - 0x30070209 0xc4100780 0x3006020d 0xc4100780 - 0x20038408 0x2102e808 0x2000d00d 0x04204780 - 0x20208405 0x00000003 0xd00e020d 0xa0c00780 - 0xa0000009 0x04000782 0xd0800209 0x00400780 - 0xa0000405 0x04000780 0x30050205 0xc4100780 - 0x20000409 0x04004780 0x3002cffd 0x6c20c7c8 - 0x30000003 0x00000280 0x1000ce05 0x0423c780 - 0x40034e09 0x00200780 0x30100409 0xc4100780 - 0x60024e05 0x00208780 0xa0000009 0x04000780 - 0xd0800219 0x00400780 0xa0000c0d 0x04000780 - 0x3005060d 0xc4100780 0x20038408 0x20028204 - 0x3007020d 0xc4100780 0x30060205 0xc4100780 - 0xa0000009 0x04000780 0xd0800201 0x00400780 - 0xa0000001 0x04000780 0x30050001 0xc4100780 - 0x20000409 0x04000780 0x00020405 0xc0000780 - 0x20000601 0x04004780 0xd4144005 0x20000780 - 0x2100e804 0x1500e000 0x20108205 0x00000003 - 0xd00e0201 0xa0c00781 + 0xa0000009 0x04000780 0xd0800219 0x00400780 + 0xa0000c0d 0x04000780 0x3005060d 0xc4100780 + 0x20038408 0x20028204 0x3007020d 0xc4100780 + 0x30060205 0xc4100780 0xa0000009 0x04000780 + 0xd0800201 0x00400780 0xa0000001 0x04000780 + 0x30050001 0xc4100780 0x20000409 0x04000780 + 0x00020405 0xc0000780 0x20000601 0x04004780 + 0xd4144005 0x20000780 0x2100e804 0x1500e000 + 0x20108205 0x00000003 0xd00e0201 0xa0c00781 } } code { @@ -431,48 +432,50 @@ code { } } bincode { - 0xa0000005 0x04000780 0x308003fd 0x644107c8 + 0xa0000009 0x04000780 0x308005fd 0x644107c8 0xa000d003 0x00000000 0x1000d003 0x00000280 - 0xa0004e01 0x04200780 0x30070009 0xc4100780 - 0x30060001 0xc4100780 0x20000401 0x04000780 - 0x30020209 0xc4100780 0x2100ec00 0x20008400 - 0xd00e0001 0x80c00780 0x00020205 0xc0000780 + 0xa0004e01 0x04200780 0x30070005 0xc4100780 + 0x30060001 0xc4100780 0x20000201 0x04000780 + 0x30020405 0xc4100780 0x2100ec00 0x20008200 + 0xd00e0001 0x80c00780 0x00020405 0xc0000780 0x04024e01 0xe4200780 0xf0000001 0xe0000002 - 0x861ffe03 0x00000000 0xa0004201 0x04200780 - 0x40014c09 0x00200780 0x30100409 0xc4100780 - 0xd0093805 0x20000780 0x60004c09 0x00208780 - 0x2500e00c 0x2542ee10 0x3004060d 0xac000780 - 0x300107fd 0x6c00c7c8 0xa0020003 0x00000000 - 0x1001f003 0x00000280 0xd0094005 0x20000780 - 0x2502e010 0x20048210 0x30020811 0xc4100780 - 0x2000ca11 0x04210780 0xd00e0811 0x80c00780 - 0x10020003 0x00000780 0x1000f811 0x0403c780 - 0x00020205 0xc0000782 0x308103fd 0x6c4107c8 - 0x04000e01 0xe4210780 0xa0033003 0x00000000 - 0x10033003 0x00000280 0x20000011 0x04004780 - 0x300309fd 0x6c0187c8 0x00020805 0xc0000780 - 0xa0032003 0x00000000 0x10031003 0x00000280 - 0xd0094009 0x20000780 0x2001800c 0x2902e010 - 0x2000060d 0x04010780 0x3002060d 0xc4100780 - 0x2000ca0d 0x0420c780 0xd00e060d 0x80c00780 - 0x10032003 0x00000780 0x1000f80d 0x0403c780 - 0x04000e01 0xe420c782 0xd0093805 0x20000782 - 0x2542ee0c 0x3503e00c 0x30030001 0xac000780 + 0x861ffe03 0x00000000 0xa0004205 0x04200780 + 0x40034c01 0x00200780 0x30100001 0xc4100780 + 0xd0093805 0x20000780 0x60024c0d 0x00200780 + 0x2501e000 0x2543ee10 0x30040011 0xac000780 + 0x300209fd 0x6c00c7c8 0xa0022003 0x00000000 + 0x10021003 0x00000280 0xd0094005 0x20000780 + 0x2503e000 0x20008400 0x30020001 0xc4100780 + 0x2000ca01 0x04200780 0xd00e0015 0x80c00780 + 0x1400d401 0x0423c780 0x30000a01 0xec000780 + 0x10022003 0x00000780 0x1000f801 0x0403c780 + 0x00020405 0xc0000782 0x308105fd 0x6c4107c8 + 0x04000e01 0xe4200780 0xa0037003 0x00000000 + 0x10037003 0x00000280 0x20000201 0x04008780 + 0x300401fd 0x6c0187c8 0x00020005 0xc0000780 + 0xa0036003 0x00000000 0x10035003 0x00000280 + 0xd0094009 0x20000780 0x20028200 0x2903e010 + 0x20000001 0x04010780 0x30020001 0xc4100780 + 0x2000ca01 0x04200780 0xd00e0011 0x80c00780 + 0x1800d401 0x0423c780 0x30000801 0xec000780 + 0x10036003 0x00000780 0x1000f801 0x0403c780 + 0x04000e01 0xe4200782 0xd0093805 0x20000782 + 0x2543ee00 0x3500e000 0x30000201 0xac000780 0x307c0011 0x8c000780 0x861ffe03 0x00000000 0xd0093805 0x20000780 0x347cc1fd 0x6c20c7c8 - 0x1000f80d 0x0403c780 0x1400c001 0x0423c780 - 0x1004b003 0x00000280 0x101c8001 0x00000003 + 0x1000f805 0x0403c780 0x1400c001 0x0423c780 + 0x1004f003 0x00000280 0x101c8001 0x00000003 0x00000005 0xc0000780 0x1000f815 0x0403c780 - 0x20000a01 0x04004780 0xd409800d 0x20000780 + 0x20000a01 0x04008780 0xd409800d 0x20000780 0x00020009 0xc0000780 0xd0093811 0x20000780 0x20018a15 0x00000003 0x1c00c001 0x0423c780 - 0x3005c1fd 0x6c2147cc 0x6800ce0d 0x8020c780 + 0x3005c1fd 0x6c2147cc 0x6800ce05 0x80204780 0xd4000805 0x20000780 0x1000c001 0x0423c784 - 0x10040003 0x00000280 0x300109fd 0x6c00c7c8 + 0x10044003 0x00000280 0x300209fd 0x6c00c7c8 0x30000003 0x00000280 0xd0094805 0x20000780 - 0x2502f008 0x20008210 0x1500e000 0x20028204 - 0x00020805 0xc0000780 0x30000609 0xec000780 - 0x30020201 0xc4100780 0x2542ee04 0x2100e800 + 0x2503f00c 0x20008410 0x1500e000 0x20038408 + 0x00020805 0xc0000780 0x30000205 0xec000780 + 0x30020401 0xc4100780 0x2541ee04 0x2100e800 0xd00e0005 0xa0c00781 } } @@ -510,71 +513,73 @@ code { } } code { - name = cudaSumResidualChunks + name = cudaFindWastedBits lmem = 0 - smem = 1188 - reg = 8 + smem = 1248 + reg = 5 bar = 1 const { segname = const segnum = 1 offset = 0 - bytes = 20 + bytes = 24 mem { - 0x0000007f 0x0000003f 0x0000001f 0x0000000e - 0x007fffff + 0x0000000f 0x0000007f 0x0000003f 0x0000001f + 0x00000020 0x0000009e } } bincode { - 0x10000005 0x0403c780 0xa0004c09 0x04200780 - 0x1000d001 0x0423c780 0xa0004e0d 0x04200780 - 0x40050015 0x00000780 0x30070619 0xc4100780 - 0x3006061d 0xc4100780 0xa0000411 0x04000780 - 0x60040215 0x00014780 0x20000c19 0x0401c780 - 0x30008805 0x00000003 0x30100a15 0xc4100780 - 0x00020205 0xc0000780 0x60040001 0x00014780 - 0x2000ca05 0x04218780 0xd00e0205 0x80c00780 - 0x04021001 0xe43f0780 0x2140ee14 0x20418a04 - 0x3001d005 0xac200780 0x300403fd 0x6c00c7c8 - 0x300d0615 0xc4100500 0x20000001 0x04014500 - 0x20000801 0x04000500 0x30020001 0xc4100500 - 0x2000cc01 0x04200500 0xd00e0001 0x80c00500 - 0x1000f801 0x0403c280 0x301f0015 0xec100780 - 0x30010001 0xc4100780 0xd0000a01 0x04008780 - 0x00020805 0xc0000780 0x04001201 0xe4200780 - 0x861ffe03 0x00000000 0x308009fd 0x6c4107c8 - 0xd4044809 0x20000500 0x1800c001 0x0423c500 - 0x2400d201 0x04200500 0x04001201 0xe4200500 - 0x861ffe03 0x00000000 0x308109fd 0x6c4107c8 - 0xd4024809 0x20000500 0x1800c001 0x0423c500 - 0x2400d201 0x04200500 0x04001201 0xe4200500 - 0x861ffe03 0x00000000 0x30820801 0x6c40c7d0 - 0xa00001fd 0x0c0147c8 0xd4014809 0x20001680 - 0x1800c001 0x0423d680 0x2400d201 0x04201680 - 0x04001201 0xe4201680 0x861ffe03 0x00000000 - 0x1400f201 0x0423c780 0x2400d201 0x04200780 - 0x04001201 0xe4200780 0x2400e201 0x04200780 - 0x04001201 0xe4200780 0x2400da01 0x04200780 - 0x04001201 0xe4200780 0x2400d601 0x04200780 - 0x04001201 0xe4200780 0x2400d401 0x04200780 - 0x04001201 0xe4200780 0xa0057003 0x00000000 - 0x10057003 0x00000100 0x20018801 0x00000003 - 0x40010415 0x00000780 0x60000615 0x00014780 - 0x30100a19 0xc4100780 0x30830815 0x6c410780 - 0x3001021d 0xec100780 0x60000401 0x00018780 - 0xa0000a05 0x2c014780 0x2040d215 0x0421c780 - 0x60840201 0x80400780 0x30040a05 0xec000780 - 0x20000001 0x04004780 0xd4085009 0x20000780 - 0x04021201 0xe4200780 0x3800ce01 0xac200780 - 0x04021201 0xe4200780 0x3800c601 0xac200780 - 0x04021201 0xe4200780 0x3800c201 0xac200780 - 0x04021201 0xe4200780 0x3800c001 0xac200780 - 0x04021201 0xe4200780 0x307c09fd 0x6c0147ca - 0x30000003 0x00000280 0x40074801 0x00200780 - 0x30100001 0xc4100780 0x60064801 0x00200780 - 0x20000001 0x04008780 0xd0084805 0x20000780 - 0x30020005 0xc4100780 0x1500e000 0x2101e804 - 0xd00e0201 0xa0c00781 + 0xa0000005 0x04000780 0x308003fd 0x644107c8 + 0xa0010003 0x00000000 0x30020209 0xc4100780 + 0x10010003 0x00000280 0x1000cc01 0x0423c780 + 0x40014c0d 0x00200780 0x3010060d 0xc4100780 + 0x60004c01 0x0020c780 0x3007000d 0xc4100780 + 0x30060001 0xc4100780 0x20008600 0x2100e800 + 0x20000401 0x04000780 0xd00e0001 0x80c00780 + 0x00000405 0xc0000780 0x04021001 0xe4200780 + 0x00000405 0xc0000782 0x04001001 0xe43f0780 + 0x861ffe03 0x00000000 0x307ccffd 0x6c20c7c8 + 0x1002a003 0x00000280 0xa000420d 0x04200780 + 0x1000f811 0x0403c780 0x20000801 0x04004780 + 0x3000cffd 0x6420c7c8 0xa0025003 0x00000000 + 0x10023003 0x00000280 0xd0084805 0x20000780 + 0x2504e000 0x20008200 0x30020001 0xc4100780 + 0x2000ca01 0x04200780 0xd00e0001 0x80c00780 + 0x00000405 0xc0000780 0xd400d001 0x04204780 + 0x10025003 0x00000780 0x00000405 0xc0000780 + 0x1400d001 0x0423c780 0x20000811 0x0400c782 + 0x00000405 0xc0000780 0x3004cffd 0x6c2107c8 + 0x04001001 0xe4200780 0x10017003 0x00000280 + 0x861ffe03 0x00000000 0x308103fd 0x644107c8 + 0x00000405 0xc0000500 0xd4044009 0x20000500 + 0x1800c001 0x0423c500 0xd400d001 0x04204500 + 0x04001001 0xe4200500 0x861ffe03 0x00000000 + 0x308203fd 0x644107c8 0x00000405 0xc0000500 + 0xd4024009 0x20000500 0x1800c001 0x0423c500 + 0xd400d001 0x04204500 0x04001001 0xe4200500 + 0x861ffe03 0x00000000 0x308303fd 0x644107c8 + 0x00000405 0xc0000500 0xd4014009 0x20000500 + 0x1800c001 0x0423c500 0xd400d001 0x04204500 + 0x04001001 0xe4200500 0x861ffe03 0x00000000 + 0x00000405 0xc0000780 0x1400f001 0x0423c780 + 0xd400d001 0x04204780 0x04001001 0xe4200780 + 0x1400e001 0x0423c780 0xd400d001 0x04204780 + 0x04001001 0xe4200780 0x1400d801 0x0423c780 + 0xd400d001 0x04204780 0x04001001 0xe4200780 + 0x1400d401 0x0423c780 0xd400d001 0x04204780 + 0x04001001 0xe4200780 0x1400d201 0x0423c780 + 0xd400d001 0x04204780 0x3001cdfd 0x6420c7c8 + 0x04001001 0xe4200780 0x30000003 0x00000280 + 0x3100f001 0x00000003 0xd000d001 0x042007c0 + 0xa0000001 0x44064680 0x30170001 0xec100680 + 0x31000009 0x04414680 0x10000809 0x2440c100 + 0x1000cc01 0x0423c780 0x40014c0d 0x00200780 + 0x3010060d 0xc4100780 0x60004c01 0x0020c780 + 0x20000001 0x04004780 0x30070005 0xc4100780 + 0x30060001 0xc4100780 0x20000201 0x04000780 + 0x301f8405 0x00000003 0x2000c801 0x04200780 + 0x307c0205 0x8c000780 0x202c8001 0x00000003 + 0xd00e0005 0xa0c00781 } } code { @@ -718,6 +723,27 @@ code { 0xf0000001 0xe0000001 } } +code { + name = cudaStereoDecorr + lmem = 0 + smem = 24 + reg = 6 + bar = 0 + bincode { + 0x10004205 0x0023c780 0xa0000005 0x04000780 + 0x60014c01 0x00204780 0x3000cbfd 0x6c20c7c8 + 0x30000003 0x00000280 0x2000ca05 0x04200780 + 0x30020009 0xc4100780 0x3002020d 0xc4100780 + 0x2000c805 0x04208780 0xd00e0205 0x80c00780 + 0x2000c809 0x0420c780 0xd00e0409 0x80c00780 + 0x3001ca0d 0xc4300780 0x20038010 0x2103ea0c + 0x30020815 0xc4100780 0x20028210 0x20038000 + 0x2000c815 0x04214780 0x3001080d 0xec100780 + 0x30020011 0xc4100780 0x20400201 0x04008780 + 0xd00e0a0d 0xa0c00780 0x2000c805 0x04210780 + 0xd00e0201 0xa0c00781 + } +} code { name = cudaCopyBestMethodStereo lmem = 0 @@ -798,82 +824,3 @@ code { 0xd00e0201 0xa0c00781 } } -code { - name = cudaSumResidual - lmem = 0 - smem = 1244 - reg = 7 - bar = 1 - const { - segname = const - segnum = 1 - offset = 0 - bytes = 20 - mem { - 0x0000002f 0x0000001f 0x00000008 0x00000020 - 0x00000001 - } - } - bincode { - 0xa0000005 0x04000780 0x308003fd 0x644107c8 - 0xa000d003 0x00000000 0x30020209 0xc4100780 - 0x1000d003 0x00000280 0xa0004e01 0x04200780 - 0x3007000d 0xc4100780 0x30060001 0xc4100780 - 0x20008600 0x2100e800 0x20000401 0x04000780 - 0xd00e0001 0x80c00780 0x00000405 0xc0000780 - 0x04020e01 0xe4200780 0xf0000001 0xe0000002 - 0x861ffe03 0x00000000 0x3001cdfd 0x6c20c7c8 - 0xa001c003 0x00000000 0x1001b003 0x00000280 - 0x1000cc01 0x0423c780 0x40014e0d 0x00200780 - 0x3010060d 0xc4100780 0x60004e01 0x0020c780 - 0x20000001 0x04004780 0x30020001 0xc4100780 - 0x2000ca01 0x04200780 0xd00e0001 0x80c00780 - 0x1001c003 0x00000780 0x1000f801 0x0403c780 - 0x00000405 0xc0000782 0x04000e01 0xe4200780 - 0x861ffe03 0x00000000 0x308103fd 0x6c4107c8 - 0x00000405 0xc0000500 0xd4013809 0x20000500 - 0x1800c001 0x0423c500 0x2400ce01 0x04200500 - 0x04000e01 0xe4200500 0x861ffe03 0x00000000 - 0x00000405 0xc0000780 0x1400ee01 0x0423c780 - 0x2400ce01 0x04200780 0x04000e01 0xe4200780 - 0x1500fe00 0x2500ee00 0x04000e01 0xe4200780 - 0x1500f600 0x2500ee00 0x04000e01 0xe4200780 - 0x1500f200 0x2500ee00 0x04000e01 0xe4200780 - 0x1500f000 0x2500ee00 0x307c03fd 0x6c0147c8 - 0x04000e01 0xe4200780 0x30000003 0x00000280 - 0xd0086005 0x20000780 0x3482c1fd 0x6c6147c8 - 0x10044003 0x00000280 0xd0083805 0x20000780 - 0x1500ec00 0x1500e008 0x4005000c 0x1500ee04 - 0x6004020d 0x0000c780 0x40020211 0x00000780 - 0x3010060d 0xc4100780 0x60030011 0x00010780 - 0x60040009 0x0000c780 0x3010080d 0xc4100780 - 0x2000ce09 0x04208780 0x60020001 0x0000c780 - 0x20068405 0x00000003 0x10070003 0x00000780 - 0xd0086005 0x20000780 0x3483c1fd 0x6c6147c8 - 0x1005c003 0x00000280 0xd0083805 0x20000780 - 0x1500ec00 0x1500e008 0x2400c60d 0x04200780 - 0x3002cc15 0xc4300780 0x40070805 0x00000780 - 0x301f0a19 0xec100780 0x60060a11 0x00004780 - 0x1400ce05 0x0423c780 0xd0840c19 0x04400780 - 0x30100811 0xc4100780 0x20000c15 0x04014780 - 0x60060809 0x00010780 0x40010411 0x00000780 - 0x30010a0d 0xec100780 0x60000611 0x00010780 - 0x20000409 0x0400c780 0x3010080d 0xc4100780 - 0x2000ce09 0x04208780 0x60000401 0x0000c780 - 0x200f8405 0x00000003 0x10070003 0x00000780 - 0xd0086005 0x20000780 0x1500e400 0x1500e204 - 0x40030009 0x00000780 0x60020209 0x00008780 - 0x30100409 0xc4100780 0x347cc1fd 0x6c2147c8 - 0x60020001 0x00008780 0x1006f003 0x00000280 - 0xd0086805 0x20000780 0x2501e209 0x00000003 - 0x1400c005 0x0423c780 0x4005040d 0x00000780 - 0x6004060d 0x0000c780 0x3010060d 0xc4100780 - 0x60040409 0x0000c780 0x307ccffd 0x6c2147c8 - 0x1400c005 0x0423c780 0x10000405 0x0403c280 - 0x10070003 0x00000780 0x10000005 0x0403c780 - 0xa0004e09 0x04200780 0x3007040d 0xc4100780 - 0x30060409 0xc4100780 0x20028608 0x2102e808 - 0x30000201 0xac000780 0x20108405 0x00000003 - 0xd00e0201 0xa0c00781 - } -}