diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs index 31be0de..b526416 100644 --- a/CUETools.FlaCuda/FlaCudaWriter.cs +++ b/CUETools.FlaCuda/FlaCudaWriter.cs @@ -1095,7 +1095,7 @@ namespace CUETools.Codecs.FlaCuda cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 4, (uint)frame.blocksize); cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 5, (uint)partSize); cuda.SetParameterSize(cudaEstimateResidual, sizeof(uint) * 6); - cuda.SetFunctionBlockShape(cudaEstimateResidual, 64, 4, 1); + cuda.SetFunctionBlockShape(cudaEstimateResidual, 32, 8, 1); //cuda.SetParameter(cudaSumResidualChunks, 0, (uint)cudaResidualSums.Pointer); //cuda.SetParameter(cudaSumResidualChunks, sizeof(uint), (uint)cudaResidualTasks.Pointer); @@ -1113,7 +1113,7 @@ namespace CUETools.Codecs.FlaCuda cuda.SetFunctionBlockShape(cudaSumResidual, 64, 1, 1); // issue work to the GPU - cuda.LaunchAsync(cudaEstimateResidual, partCount, nResidualTasks / 4, cudaStream); + cuda.LaunchAsync(cudaEstimateResidual, partCount, nResidualTasks / 8, cudaStream); //cuda.LaunchAsync(cudaSumResidualChunks, partCount, nResidualTasks, cudaStream); cuda.LaunchAsync(cudaSumResidual, 1, nResidualTasks, cudaStream); cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream); @@ -1709,23 +1709,23 @@ namespace CUETools.Codecs.FlaCuda case 0: do_midside = false; window_function = WindowFunction.Bartlett; - max_prediction_order = 8; - max_partition_order = 4; + max_prediction_order = 4; + max_partition_order = 2; break; case 1: do_midside = false; - window_function = WindowFunction.Bartlett; - max_prediction_order = 8; - max_partition_order = 4; + max_prediction_order = 4; + max_partition_order = 3; break; case 2: do_midside = false; window_function = WindowFunction.Bartlett; max_partition_order = 4; + max_prediction_order = 8; break; case 3: window_function = WindowFunction.Bartlett; - max_prediction_order = 8; + max_prediction_order = 6; break; case 4: window_function = WindowFunction.Bartlett; @@ -1733,27 +1733,24 @@ namespace CUETools.Codecs.FlaCuda break; case 5: window_function = WindowFunction.Bartlett; + max_prediction_order = 10; break; - case 6: - //max_prediction_order = 10; + case 6: + window_function = WindowFunction.Bartlett; break; - case 7: + case 7: + max_prediction_order = 10; break; case 8: - lpc_max_precision_search = 2; break; case 9: - window_function = WindowFunction.Bartlett; - max_prediction_order = 32; + max_prediction_order = 16; break; case 10: - max_prediction_order = 32; - //lpc_max_precision_search = 2; + max_prediction_order = 24; break; case 11: max_prediction_order = 32; - //lpc_max_precision_search = 2; - variable_block_size = 4; break; } diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu index a9ed84a..b0a6c99 100644 --- a/CUETools.FlaCuda/flacuda.cu +++ b/CUETools.FlaCuda/flacuda.cu @@ -197,56 +197,61 @@ extern "C" __global__ void cudaEstimateResidual( int data[256]; int residual[256]; int rice[256]; - int sums[8]; encodeResidualTaskStruct task[8]; } shared; const int tid = threadIdx.x + threadIdx.y * blockDim.x; - // fetch task data (8 * 64 == 512 elements); + // fetch task data (4 * 64 == 256 elements or 8 * 64 == 512 elements); ((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y * blockDim.y))[tid]; ((int*)&shared.task)[tid + 256] = ((int*)(tasks + blockIdx.y * blockDim.y))[tid + 256]; __syncthreads(); - const int residualOrder = shared.task[threadIdx.y].residualOrder; const int partNumber = blockIdx.x; const int pos = partNumber * partSize; - const int dataLen = min(frameSize - pos, partSize + max_order) * (residualOrder != 0); + const int dataLen = min(frameSize - pos, partSize + max_order); // fetch samples shared.data[tid] = (tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] : 0); - if (tid < blockDim.y) shared.sums[tid] = 0; + + __syncthreads(); + + //if (tid < blockDim.y) shared.sums[tid] = 0; + shared.rice[tid] = 0; // set upper residuals to zero, in case blockDim < 256 //shared.residual[255 - tid] = 0; - const int residualLen = min(frameSize - pos - residualOrder, partSize) * (residualOrder != 0); + const int residualLen = max(0,min(frameSize - pos - shared.task[threadIdx.y].residualOrder, partSize)) * (shared.task[threadIdx.y].residualOrder != 0); // reverse coefs - if (threadIdx.x < residualOrder) shared.task[threadIdx.y].coefs[threadIdx.x] = shared.task[threadIdx.y].coefs[residualOrder - 1 - threadIdx.x]; - - __syncthreads(); + if (threadIdx.x < shared.task[threadIdx.y].residualOrder) shared.task[threadIdx.y].coefs[threadIdx.x] = shared.task[threadIdx.y].coefs[shared.task[threadIdx.y].residualOrder - 1 - threadIdx.x]; - for (int i = 0; i < residualLen; i += blockDim.x) + for (int i = threadIdx.x; i - threadIdx.x < residualLen; i += blockDim.x) // += 32 { + const int residualOrder = shared.task[threadIdx.y].residualOrder; // compute residual long sum = 0; for (int c = 0; c < residualOrder; c++) - sum += __mul24(shared.data[i + threadIdx.x + c], shared.task[threadIdx.y].coefs[c]); - int res = shared.data[i + threadIdx.x + residualOrder] - (sum >> shared.task[threadIdx.y].shift); - shared.residual[tid] = __mul24(i + threadIdx.x < residualLen, (2 * res) ^ (res >> 31)); - __syncthreads(); if (threadIdx.x < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads(); + sum += __mul24(shared.data[i + c], shared.task[threadIdx.y].coefs[c]); + int res = shared.data[i + residualOrder] - (sum >> shared.task[threadIdx.y].shift); + shared.residual[tid] = __mul24(i < residualLen, (2 * res) ^ (res >> 31)); + // enable this line when using blockDim.y == 4 + //__syncthreads(); if (threadIdx.x < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads(); shared.residual[tid] += shared.residual[tid + 16]; shared.residual[tid] += shared.residual[tid + 8]; shared.residual[tid] += shared.residual[tid + 4]; shared.residual[tid] += shared.residual[tid + 2]; - if (threadIdx.x == 0) shared.sums[threadIdx.y] += shared.residual[tid] + shared.residual[tid + 1]; + //if (threadIdx.x == 0) shared.sums[threadIdx.y] += shared.residual[tid] + shared.residual[tid + 1]; + shared.rice[tid] += shared.residual[tid] + shared.residual[tid + 1]; } // rice parameter search - shared.rice[tid] = __mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.sums[threadIdx.y] - (residualLen >> 1)) >> threadIdx.x); + //shared.rice[tid] = __mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.sums[threadIdx.y] - (residualLen >> 1)) >> threadIdx.x); + shared.rice[tid] = __mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.rice[threadIdx.y * blockDim.x] - (residualLen >> 1)) >> threadIdx.x); shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]); shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]); shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]); - if (threadIdx.x == 0 && residualOrder != 0) - output[(blockIdx.y * blockDim.y + threadIdx.y) * gridDim.x + blockIdx.x] = min(shared.rice[tid], shared.rice[tid + 1]); + shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]); + if (threadIdx.x == 0 && shared.task[threadIdx.y].residualOrder != 0) + output[(blockIdx.y * blockDim.y + threadIdx.y) * gridDim.x + blockIdx.x] = shared.rice[tid]; } // blockDim.x == 256 diff --git a/CUETools.FlaCuda/flacuda.cubin b/CUETools.FlaCuda/flacuda.cubin new file mode 100644 index 0000000..12103c0 --- /dev/null +++ b/CUETools.FlaCuda/flacuda.cubin @@ -0,0 +1,453 @@ +architecture {sm_10} +abiversion {1} +modname {cubin} +code { + name = cudaComputeAutocor + lmem = 0 + smem = 3256 + reg = 10 + bar = 1 + const { + segname = const + segnum = 1 + offset = 0 + bytes = 16 + mem { + 0x00000001 0x0000007f 0x0000003f 0x0000001f + } + } + bincode { + 0xa0000009 0x04000780 0x308005fd 0x644107c8 + 0xa000b003 0x00000000 0x3002040d 0xc4100780 + 0x1000b003 0x00000280 0xa0004e01 0x04200780 + 0x30030001 0xc4100780 0x2100ee00 0x20008600 + 0xd00e0001 0x80c00780 0x00000605 0xc0000780 + 0x04065801 0xe4200780 0xf0000001 0xe0000002 + 0x861ffe03 0x00000000 0xa0004c05 0x04200780 + 0x1000d401 0x0423c780 0x40020211 0x00000780 + 0x30100811 0xc4100780 0x60020001 0x00010780 + 0x2140f210 0x3104f010 0x3004d411 0xac200780 + 0x2000d015 0x04210780 0x30020bfd 0x6c00c7c8 + 0xa0024003 0x00000000 0x10023003 0x00000280 + 0xd0196005 0x20000780 0x2500e018 0x2500e21c + 0x20068418 0x2007841c 0x30020c19 0xc4100780 + 0x30020e1d 0xc4100780 0x2106ea18 0x2107ec20 + 0xd00e0c1d 0x80c00780 0xd00e1019 0x80c00780 + 0xa0000e1d 0x44014780 0xc0060e19 0x00000780 + 0x10024003 0x00000780 0x1000f819 0x0403c780 + 0x00000605 0xc0000782 0x04001601 0xe4218780 + 0x20008419 0x00000013 0x30050dfd 0x6c0187c8 + 0xa0039003 0x00000000 0x10038003 0x00000280 + 0xd0196005 0x20000780 0x2500e014 0x2500e21c + 0x20058400 0x20078414 0x20008001 0x00000013 + 0x20008a15 0x00000013 0x30020001 0xc4100780 + 0x30020a1d 0xc4100780 0x2000ca01 0x04200780 + 0xd00e0015 0x80c00780 0x2000cc01 0x0421c780 + 0xd00e0001 0x80c00780 0xa0000a15 0x44014780 + 0xc0000a01 0x00000780 0x10039003 0x00000780 + 0x1000f801 0x0403c780 0x00020c05 0xc0000782 + 0x04001601 0xe4200780 0x861ffe03 0x00000000 + 0x307cd1fd 0x6c2047c8 0x10085003 0x00000280 + 0x300209fd 0x6c00c7e8 0x30040dfd 0x6c0187f8 + 0x308105fd 0x6c40c7c8 0x00000019 0x20000780 + 0x2101f011 0x00000003 0x1000f815 0x0403c780 + 0x308205fd 0x6c40c7c8 0x0000001d 0x20000780 + 0x308305fd 0x6c40c7c8 0x00000021 0x20000780 + 0x307c05fd 0x6c0087d8 0x20000a25 0x04008780 + 0x20009201 0x00000013 0x00020009 0xc0000780 + 0x1800d601 0x0423c780 0x0002120d 0xc0000780 + 0xc400d625 0x00200780 0x00000609 0xc0000780 + 0x1c00d601 0x0423c780 0x1000f825 0x0403f280 + 0xe800d601 0x00224780 0x10001201 0x0403e280 + 0x08041601 0xe4200780 0x861ffe03 0x00000000 + 0x00000c01 0xa00007c0 0x00000609 0xc0000680 + 0xd8145811 0x20000680 0xd810580d 0x20000680 + 0x1000c001 0x0423c684 0xbc00c001 0x00200680 + 0x08041601 0xe4200680 0x861ffe03 0x00000000 + 0x00000e01 0xa00007c0 0x00000609 0xc0000680 + 0xd8125811 0x20000680 0xd810580d 0x20000680 + 0x1000c001 0x0423c684 0xbc00c001 0x00200680 + 0x08041601 0xe4200680 0x861ffe03 0x00000000 + 0x00001001 0xa00007c0 0x00000609 0xc0000680 + 0xd8115811 0x20000680 0xd810580d 0x20000680 + 0x1000c001 0x0423c684 0xbc00c001 0x00200680 + 0x08041601 0xe4200680 0x861ffe03 0x00000000 + 0x00000609 0xc0000780 0xd810580d 0x20000780 + 0x1c00e001 0x0423c780 0xbc00c001 0x00200780 + 0x08041601 0xe4200780 0xbc00d001 0x00200780 + 0x08041601 0xe4200780 0xbc00c801 0x00200780 + 0x08041601 0xe4200780 0xbc00c401 0x00200780 + 0x08041601 0xe4200780 0xa0080003 0x00000000 + 0x10080003 0x00001100 0xd010580d 0x20000780 + 0x1c00c201 0x0423c780 0x00020a09 0xc0000780 + 0xbc00c001 0x00200780 0x08061601 0xe4200780 + 0xf0000001 0xe0000002 0x861ffe03 0x00000000 + 0x20018a15 0x00000003 0x30040bfd 0x6c0147c8 + 0x10049003 0x00000280 0x3002d1fd 0x6c2047c8 + 0x30000003 0x00000280 0x10004e01 0x0023c780 + 0x60004805 0x00204780 0x2101f001 0x00000003 + 0x40030011 0x00000780 0x60020211 0x00010780 + 0x30100811 0xc4100780 0x60020001 0x00010780 + 0x00000605 0xc0000780 0x20000001 0x04008780 + 0xd4185805 0x20000780 0x30020005 0xc4100780 + 0x1500e000 0x2101e804 0xd00e0201 0xa0c00781 + } +} +code { + name = cudaEstimateResidual + lmem = 0 + smem = 4648 + reg = 10 + bar = 1 + const { + segname = const + segnum = 1 + offset = 0 + bytes = 12 + mem { + 0x000003ff 0x00000001 0x0000000e + } + } + bincode { + 0x10004409 0x0023c780 0xd0800205 0x00400780 + 0xa000420d 0x04200780 0x40024e05 0x00200780 + 0xa0000211 0x04000780 0xa0000001 0x04000780 + 0x30070209 0xc4100780 0x30060205 0xc4100780 + 0x40080c1c 0x20018404 0x20008e18 0x2101ec08 + 0x30020c05 0xc4100780 0x20000205 0x04008780 + 0xd00e0209 0x80c00780 0x00020c05 0xc0000780 + 0x20008205 0x00000043 0xd00e0205 0x80c00780 + 0x04061401 0xe4208780 0x04081401 0xe4204780 + 0x861ffe03 0x00000000 0xa0004c09 0x04200780 + 0x1000d205 0x0423c780 0x4005040d 0x00000780 + 0x6004060d 0x0000c780 0x3010060d 0xc4100780 + 0x60040409 0x0000c780 0x2101ee0c 0x2142f004 + 0x3003020d 0xac000780 0x30030dfd 0x6c0187c8 + 0xa0025003 0x00000000 0x10024003 0x00000280 + 0xd0185805 0x20000780 0x2502e008 0x20028c08 + 0x30020409 0xc4100780 0x2000ca09 0x04208780 + 0xd00e0409 0x80c00780 0x10025003 0x00000780 + 0x1000f809 0x0403c780 0x00020c05 0xc0000782 + 0x04001401 0xe4208780 0x861ffe03 0x00000000 + 0x30070809 0xc4100780 0x3006080d 0xc4100780 + 0x20000409 0x0400c780 0x2028840d 0x00000003 + 0x00000605 0xc0000780 0x00020c09 0xc0000780 + 0x0000040d 0xc0000780 0x08041401 0xe43f0780 + 0xdc185009 0x20000780 0x3800c005 0x04204780 + 0x3001d205 0xac200780 0x387cc00d 0x6c208780 + 0x307c0205 0x8c000780 0x3800c1fd 0x6420c7c8 + 0xd0030215 0x04020780 0xa0044003 0x00000000 + 0x10044003 0x00000280 0xd4180009 0x20000780 + 0x2840c005 0x04200780 0x30020205 0xc4100780 + 0x20000405 0x04004780 0x00000209 0xc0000780 + 0x30020005 0xc4100780 0x20000405 0x04004780 + 0x0000020d 0xc0000780 0xd818c809 0x20000780 + 0x1800c005 0x0423c780 0x0c063401 0xe4204780 + 0x307c0bfd 0x640087ca 0xa007b003 0x00000000 + 0x10000005 0x0403c780 0x1007b003 0x00000280 + 0x00020c09 0xc0000780 0xd8105009 0x20000780 + 0x1800c009 0x0423c780 0xd4180009 0x20000780 + 0x387cc1fd 0x6c20c7c8 0xa005e003 0x00000000 + 0x1000f821 0x0403c780 0x1800c00d 0x0423c780 + 0x1005e003 0x00000280 0x1000f825 0x0403c780 + 0xd4000009 0x20000780 0x2000020d 0x04024780 + 0x0002060d 0xc0000780 0xd8188011 0x20000780 + 0x1000c00d 0x0423c784 0x6c03d421 0x80220780 + 0xd418000d 0x20000780 0x20019225 0x00000003 + 0x3c09c1fd 0x6c2147c8 0xd8000809 0x20000780 + 0x1c00c00d 0x0423c780 0x10053003 0x00000280 + 0x2000020d 0x0400c782 0x00020609 0xc0000780 + 0xd418100d 0x20000780 0x1c00c00d 0x0423c780 + 0x3003100d 0xec000780 0x2840d40d 0x0420c780 + 0x301f0621 0xec100780 0x3001060d 0xc4100780 + 0xd0031021 0x04008780 0x30010a0d 0x6c010780 + 0xd081060d 0x04400780 0x4008060d 0x00018780 + 0x00020c09 0xc0000780 0xd808580d 0x20000780 + 0x08021401 0xe420c780 0x2c00de0d 0x0420c780 + 0x08021401 0xe420c780 0x2c00ce0d 0x0420c780 + 0x08021401 0xe420c780 0x2c00c60d 0x0420c780 + 0x08021401 0xe420c780 0x2c00c20d 0x0420c780 + 0x08021401 0xe420c780 0x2d03e00c 0x20038408 + 0xa000420d 0x04200780 0x20038204 0x2040820c + 0x30030bfd 0x640107c8 0x08041401 0xe4208780 + 0x1004b003 0x00000280 0xf0000001 0xe0000002 + 0x20018005 0x00000003 0x00020c0d 0xc0000780 + 0x40031409 0x00000780 0x60021609 0x00008780 + 0x30100409 0xc4100780 0x60021409 0x00008780 + 0x30820005 0x64410780 0x00020e09 0xc0000780 + 0xd8105009 0x20000780 0xa000020d 0x2c014780 + 0x30010a05 0xec100780 0x407f860d 0x0007ffff + 0x2941e004 0x20038408 0x30000205 0xec000780 + 0x20000205 0x04008780 0xdc105809 0x20000780 + 0x0c041401 0xe4204780 0x3801ce05 0xac200780 + 0x0c041401 0xe4204780 0x3801c605 0xac200780 + 0x0c041401 0xe4204780 0x3801c205 0xac200780 + 0x0c041401 0xe4204780 0x3801c00d 0xac200780 + 0x307c01fd 0x640147c8 0x0c041401 0xe420c780 + 0x30000003 0x00000280 0xd4180005 0x20000780 + 0x347cc1fd 0x6c2087c8 0x30000003 0x00000280 + 0x11002408 0x41022e04 0x20000201 0x04010780 + 0x40014805 0x00200780 0x30100205 0xc4100780 + 0x60004801 0x00204780 0xa0004c09 0x04200780 + 0x20000001 0x04008780 0x30020001 0xc4100780 + 0x2000c801 0x04200780 0xd00e000d 0xa0c00781 + } +} +code { + name = cudaEncodeResidual + lmem = 0 + smem = 36 + reg = 0 + bar = 0 + bincode { + 0xf0000001 0xe0000001 + } +} +code { + name = cudaSumResidualChunks + lmem = 0 + smem = 1188 + reg = 8 + bar = 1 + const { + segname = const + segnum = 1 + offset = 0 + bytes = 20 + mem { + 0x0000007f 0x0000003f 0x0000001f 0x0000000e + 0x007fffff + } + } + bincode { + 0x10000005 0x0403c780 0xa0004c09 0x04200780 + 0x1000d001 0x0423c780 0xa0004e0d 0x04200780 + 0x40050015 0x00000780 0x30070619 0xc4100780 + 0x3006061d 0xc4100780 0xa0000411 0x04000780 + 0x60040215 0x00014780 0x20000c19 0x0401c780 + 0x30008805 0x00000003 0x30100a15 0xc4100780 + 0x00020205 0xc0000780 0x60040001 0x00014780 + 0x2000ca05 0x04218780 0xd00e0205 0x80c00780 + 0x04021001 0xe43f0780 0x2140ee14 0x20418a04 + 0x3001d005 0xac200780 0x300403fd 0x6c00c7c8 + 0x300d0615 0xc4100500 0x20000001 0x04014500 + 0x20000801 0x04000500 0x30020001 0xc4100500 + 0x2000cc01 0x04200500 0xd00e0001 0x80c00500 + 0x1000f801 0x0403c280 0x301f0015 0xec100780 + 0x30010001 0xc4100780 0xd0000a01 0x04008780 + 0x00020805 0xc0000780 0x04001201 0xe4200780 + 0x861ffe03 0x00000000 0x308009fd 0x6c4107c8 + 0xd4044809 0x20000500 0x1800c001 0x0423c500 + 0x2400d201 0x04200500 0x04001201 0xe4200500 + 0x861ffe03 0x00000000 0x308109fd 0x6c4107c8 + 0xd4024809 0x20000500 0x1800c001 0x0423c500 + 0x2400d201 0x04200500 0x04001201 0xe4200500 + 0x861ffe03 0x00000000 0x30820801 0x6c40c7d0 + 0xa00001fd 0x0c0147c8 0xd4014809 0x20001680 + 0x1800c001 0x0423d680 0x2400d201 0x04201680 + 0x04001201 0xe4201680 0x861ffe03 0x00000000 + 0x1400f201 0x0423c780 0x2400d201 0x04200780 + 0x04001201 0xe4200780 0x2400e201 0x04200780 + 0x04001201 0xe4200780 0x2400da01 0x04200780 + 0x04001201 0xe4200780 0x2400d601 0x04200780 + 0x04001201 0xe4200780 0x2400d401 0x04200780 + 0x04001201 0xe4200780 0xa0057003 0x00000000 + 0x10057003 0x00000100 0x20018801 0x00000003 + 0x40010415 0x00000780 0x60000615 0x00014780 + 0x30100a19 0xc4100780 0x30830815 0x6c410780 + 0x3001021d 0xec100780 0x60000401 0x00018780 + 0xa0000a05 0x2c014780 0x2040d215 0x0421c780 + 0x60840201 0x80400780 0x30040a05 0xec000780 + 0x20000001 0x04004780 0xd4085009 0x20000780 + 0x04021201 0xe4200780 0x3800ce01 0xac200780 + 0x04021201 0xe4200780 0x3800c601 0xac200780 + 0x04021201 0xe4200780 0x3800c201 0xac200780 + 0x04021201 0xe4200780 0x3800c001 0xac200780 + 0x04021201 0xe4200780 0x307c09fd 0x6c0147ca + 0x30000003 0x00000280 0x40074801 0x00200780 + 0x30100001 0xc4100780 0x60064801 0x00200780 + 0x20000001 0x04008780 0xd0084805 0x20000780 + 0x30020005 0xc4100780 0x1500e000 0x2101e804 + 0xd00e0201 0xa0c00781 + } +} +code { + name = cudaComputeLPC + lmem = 0 + smem = 564 + reg = 9 + bar = 1 + const { + segname = const + segnum = 1 + offset = 0 + bytes = 44 + mem { + 0x00000001 0x0000001f 0x7e800000 0x3f800000 + 0x0000000f 0x00001fff 0xffffe000 0xfffffff8 + 0x3e800000 0x00000020 0x0000009e + } + } + bincode { + 0xa000000d 0x04000780 0x308007fd 0x644107c8 + 0xa000b003 0x00000000 0x1000b003 0x00000280 + 0xa0004e01 0x04200780 0x30030001 0xc4100780 + 0x30020605 0xc4100780 0x2100ec00 0x20008200 + 0xd00e0001 0x80c00780 0x00020605 0xc0000780 + 0x04001201 0xe4200780 0x3003ce01 0x6c2187d2 + 0xa00001fd 0x0c0147c8 0x00020605 0xc0001680 + 0x0400d601 0xe43f1680 0x861ffe03 0x00000000 + 0x307cd1fd 0x6c20c7d8 0x1002b003 0x00001280 + 0x1000f805 0x0403c780 0xa0027003 0x00000000 + 0x10027003 0x00000100 0x1000d001 0x0423c780 + 0x40014e09 0x00200780 0x30100409 0xc4100780 + 0x60004e09 0x00208780 0x2101ee01 0x00000003 + 0x20000409 0x04004780 0x40010811 0x00000780 + 0x60000a11 0x00010780 0x30100811 0xc4100780 + 0x60000801 0x00010780 0x20000601 0x04000780 + 0x30020001 0xc4100780 0x00020605 0xc0000780 + 0x2000ca01 0x04200780 0xd00e0001 0x80c00780 + 0xd4035809 0x20000780 0xb800c001 0x00200780 + 0x0400d601 0xe4200780 0xf0000001 0xe0000002 + 0x20018205 0x00000003 0x3001d1fd 0x6c2147d8 + 0x10013003 0x00001280 0x861ffe03 0x00000000 + 0x30810601 0x6c40c7d0 0xa00001fd 0x0c0147c8 + 0x00020605 0xc0001680 0x04001601 0xe43f1680 + 0xd0035805 0x20000780 0x307ccffd 0x6c20c7d8 + 0x1400c001 0x0423c780 0x30000003 0x00001280 + 0x10248005 0x00000003 0x00000205 0xc0000780 + 0x30020611 0xc4100780 0x1000f815 0x0403c780 + 0x20400a05 0x0400c780 0x00020209 0xc0000780 + 0xa004e003 0x00000000 0x30030bfd 0x6c00c7d8 + 0x1004e003 0x00000100 0xd8035811 0x20000780 + 0x0000080d 0xc0000780 0x1000c005 0x0423c784 + 0xcc01d605 0x00200780 0x1000f805 0x0403d280 + 0xdc016011 0x20000780 0x0c005601 0xe4204780 + 0xb000de05 0x00204784 0x0c005601 0xe4204780 + 0xb000ce05 0x00204784 0x0c005601 0xe4204780 + 0xb000c605 0x00204784 0x0c005601 0xe4204780 + 0xb000c205 0x00204784 0x0c005601 0xe4204780 + 0xb000c005 0x00204784 0x0c005601 0xe4204780 + 0xf0000001 0xe0000002 0x861ffe03 0x00000000 + 0xd0015811 0x20000780 0xd403180d 0x20000780 + 0x1000c005 0x0423c784 0xbc00c009 0x00204780 + 0xb08201fd 0x605107e8 0x10000005 0x0403c780 + 0xa0000409 0xe4004780 0xc0880409 0x00402680 + 0xc0880205 0x00402680 0x90000204 0xc0010418 + 0xc806d409 0x00200780 0x1000f809 0x0403d280 + 0x30030bfd 0x6c0147d8 0xb0000c05 0x00008780 + 0x10000405 0x0403d280 0x00000809 0xc0000780 + 0xe1060c09 0x0440c780 0xb9017604 0xc0020000 + 0xa00d9003 0x00000000 0x08001601 0xe4204780 + 0x100d9003 0x00000100 0xa0000209 0xc4104780 + 0xc0000409 0x04700003 0xa0000409 0x8c0047d0 + 0xa0000409 0x44065680 0x30170409 0xec101680 + 0x31000409 0x04429680 0x10001209 0x2440d100 + 0x30030a19 0x6c0187d0 0x30148409 0x00000003 + 0xd0800c19 0x04400780 0x00000809 0xc0000780 + 0x40060409 0x00018780 0xd802600d 0x20000780 + 0x08009601 0xe4208780 0x3c02de09 0x8c200780 + 0x08009601 0xe4208780 0x3c02ce09 0x8c200780 + 0x08009601 0xe4208780 0x3c02c609 0x8c200780 + 0x08009601 0xe4208780 0x3c02c209 0x8c200780 + 0x08009601 0xe4208780 0x3c02c009 0x8c200780 + 0x08009601 0xe4208780 0xd0025809 0x20000780 + 0x390fe009 0x00000003 0x30840409 0xac400780 + 0x10018019 0x00000003 0x307c041d 0x8c000780 + 0x30070c09 0xc4000780 0xa0000409 0x44014780 + 0xc0010405 0x00000780 0xa0000205 0xac004780 + 0x30850205 0xac400780 0xa009a003 0x00000000 + 0x30860219 0x8c400780 0x1009a003 0x00001100 + 0xa0004c09 0x04200780 0x10004e09 0x0023c780 + 0x60024805 0x00208780 0x2107ee09 0x00000003 + 0xd0870409 0x04400780 0x40050421 0x00000780 + 0x60040621 0x00020780 0x30101021 0xc4100780 + 0x60040405 0x00020780 0x20000205 0x04014780 + 0x30070209 0xc4100780 0x30060205 0xc4100780 + 0x20018404 0x2101e804 0x20000805 0x04004780 + 0x20008205 0x00000007 0xd00e0219 0xa0c00780 + 0x307c0605 0x6c0087e2 0xa00003fd 0x0c0147d8 + 0xa00ad003 0x00000000 0x100ad003 0x00002100 + 0xa0004c09 0x04200780 0x10004e09 0x0023c780 + 0x60024805 0x00208780 0x2107ee09 0x00000003 + 0xd0870409 0x04400780 0x40050421 0x00000780 + 0x60040621 0x00020780 0x30101021 0xc4100780 + 0x60040405 0x00020780 0x20000205 0x04014780 + 0x30070209 0xc4100780 0x30060205 0xc4100780 + 0x20018404 0x2101e804 0x20088205 0x00000003 + 0xd00e021d 0xa0c00780 0x307c0dfd 0x6c0087ea + 0xa0000c05 0x44066500 0x30170205 0xec102500 + 0x31000205 0x0442a500 0x10001205 0x2440e280 + 0xd0060009 0x0402c780 0x307c05fd 0x6c0087e8 + 0xa0000409 0x44066500 0x30170409 0xec102500 + 0x31000409 0x0442a500 0x10001209 0x2440e280 + 0x30020205 0x8c000780 0x00000809 0xc0000780 + 0x30218205 0x00000003 0xd802600d 0x20000780 + 0x08009601 0xe4204780 0x3c01de05 0x8c200780 + 0x08009601 0xe4204780 0x3c01ce05 0x8c200780 + 0x08009601 0xe4204780 0x3c01c605 0x8c200780 + 0x08009601 0xe4204780 0x3c01c205 0x8c200780 + 0x08009601 0xe4204780 0x3c01c005 0x8c200780 + 0x08009601 0xe4204780 0x100d9003 0x00001100 + 0xa0004c09 0x04200780 0x10004e09 0x0023c780 + 0x2107ee19 0x00000003 0x60024805 0x00208780 + 0xd0870c09 0x04400780 0x40050419 0x00000780 + 0x60040619 0x00018780 0x30100c19 0xc4100780 + 0x60040405 0x00018780 0x20000205 0x04014780 + 0x30070209 0xc4100780 0x30060205 0xc4100780 + 0x20000405 0x04004780 0xd0025809 0x20000780 + 0x2101e808 0x1900e004 0x200c8409 0x00000003 + 0xd00e0405 0xa0c00780 0xf0000001 0xe0000002 + 0x861ffe03 0x00000000 0x20018a15 0x00000003 + 0x3005cffd 0x6c2147d8 0xd4000805 0x20000780 + 0x10038003 0x00001280 0xf0000001 0xe0000001 + } +} +code { + name = cudaSumResidual + lmem = 0 + smem = 1248 + reg = 4 + bar = 1 + const { + segname = const + segnum = 1 + offset = 0 + bytes = 8 + mem { + 0x0000002f 0x0000001f + } + } + bincode { + 0xa0000005 0x04000780 0x308003fd 0x644107c8 + 0xa000d003 0x00000000 0x30020209 0xc4100780 + 0x1000d003 0x00000280 0xa0004e01 0x04200780 + 0x3007000d 0xc4100780 0x30060001 0xc4100780 + 0x20008600 0x2100e800 0x20000401 0x04000780 + 0xd00e0001 0x80c00780 0x00000405 0xc0000780 + 0x04021001 0xe4200780 0xf0000001 0xe0000002 + 0x861ffe03 0x00000000 0x3001cffd 0x6c20c7c8 + 0xa001c003 0x00000000 0x1001b003 0x00000280 + 0x1000ce01 0x0423c780 0x40014e0d 0x00200780 + 0x3010060d 0xc4100780 0x60004e01 0x0020c780 + 0x20000001 0x04004780 0x30020001 0xc4100780 + 0x2000ca01 0x04200780 0xd00e0001 0x80c00780 + 0x1001c003 0x00000780 0x1000f801 0x0403c780 + 0x00000405 0xc0000782 0x04001001 0xe4200780 + 0x861ffe03 0x00000000 0x308103fd 0x6c4107c8 + 0x00000405 0xc0000500 0xd4014009 0x20000500 + 0x1800c001 0x0423c500 0x2400d001 0x04200500 + 0x04001001 0xe4200500 0x861ffe03 0x00000000 + 0x00000405 0xc0000780 0x1400f001 0x0423c780 + 0x2400d001 0x04200780 0x04001001 0xe4200780 + 0x2400e001 0x04200780 0x04001001 0xe4200780 + 0x2400d801 0x04200780 0x04001001 0xe4200780 + 0x2400d401 0x04200780 0x04001001 0xe4200780 + 0x2400d201 0x04200780 0x307c03fd 0x6c0147c8 + 0x04001001 0xe4200780 0x30000003 0x00000280 + 0xa0004e01 0x04200780 0x30070005 0xc4100780 + 0x30060001 0xc4100780 0x20008200 0x2100e804 + 0x1000d001 0x0423c780 0x20108205 0x00000003 + 0xd00e0201 0xa0c00781 + } +}