diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs index 502d271..cbdda4e 100644 --- a/CUETools.FlaCuda/FlaCudaWriter.cs +++ b/CUETools.FlaCuda/FlaCudaWriter.cs @@ -100,8 +100,8 @@ namespace CUETools.Codecs.FlaCuda bool encode_on_cpu = true; - public const int MAX_BLOCKSIZE = 4608 * 16; - internal const int maxFrames = 32; + public const int MAX_BLOCKSIZE = 4096 * 16; + internal const int maxFrames = 128; internal const int maxResidualParts = 64; // not (MAX_BLOCKSIZE + 255) / 256!! 64 is hardcoded in cudaEstimateResidual. It's per block. internal const int maxAutocorParts = (MAX_BLOCKSIZE + 255) / 256; @@ -704,13 +704,14 @@ namespace CUETools.Codecs.FlaCuda // residual int j = sub.best.order; + fixed (byte* fixbuf = &frame_buffer[0]) for (int p = 0; p < (1 << porder); p++) { int k = sub.best.rc.rparams[p]; bitwriter.writebits(4, k); if (p == 1) res_cnt = psize; int cnt = Math.Min(res_cnt, frame.blocksize - j); - bitwriter.write_rice_block_signed(k, sub.best.residual + j, cnt); + bitwriter.write_rice_block_signed(fixbuf, k, sub.best.residual + j, cnt); j += cnt; } } @@ -980,6 +981,7 @@ namespace CUETools.Codecs.FlaCuda // oldsize <= frame.subframes[ch].obits * (uint)frame.blocksize) // throw new Exception("oops"); } +#if DEBUG else { // residual @@ -1000,6 +1002,7 @@ namespace CUETools.Codecs.FlaCuda if (len != frame.subframes[ch].best.size) throw new Exception(string.Format("length mismatch: {0} vs {1}", len, frame.subframes[ch].best.size)); } +#endif } break; } @@ -1123,15 +1126,16 @@ namespace CUETools.Codecs.FlaCuda throw new Exception("invalid combination of block size and LPC order"); int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order); - int psize = task.frameSize >> max_porder; - while (psize < 16 && max_porder > 0) + int calcPartitionPartSize = task.frameSize >> max_porder; + while (calcPartitionPartSize < 16 && max_porder > 0) { - psize <<= 1; + calcPartitionPartSize <<= 1; max_porder--; } + int calcPartitionPartCount = (calcPartitionPartSize >= 64) ? 1 : (256 / calcPartitionPartSize); CUfunction cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : task.cudaChannelDecorr; - CUfunction cudaCalcPartition = psize >= 128 ? task.cudaCalcLargePartition : task.cudaCalcPartition; + CUfunction cudaCalcPartition = calcPartitionPartSize >= 64 ? task.cudaCalcLargePartition : task.cudaCalcPartition; cuda.SetParameter(cudaChannelDecorr, 0 * sizeof(uint), (uint)task.cudaSamples.Pointer); cuda.SetParameter(cudaChannelDecorr, 1 * sizeof(uint), (uint)task.cudaSamplesBytes.Pointer); @@ -1211,7 +1215,9 @@ namespace CUETools.Codecs.FlaCuda cuda.SetParameter(cudaCalcPartition, 1 * sizeof(uint), (uint)task.cudaResidual.Pointer); cuda.SetParameter(cudaCalcPartition, 2 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer); cuda.SetParameter(cudaCalcPartition, 3 * sizeof(uint), (uint)max_porder); - cuda.SetParameterSize(cudaCalcPartition, 4U * sizeof(uint)); + cuda.SetParameter(cudaCalcPartition, 4 * sizeof(uint), (uint)calcPartitionPartSize); + cuda.SetParameter(cudaCalcPartition, 5 * sizeof(uint), (uint)calcPartitionPartCount); + cuda.SetParameterSize(cudaCalcPartition, 6U * sizeof(uint)); cuda.SetFunctionBlockShape(cudaCalcPartition, 16, 16, 1); cuda.SetParameter(task.cudaSumPartition, 0, (uint)task.cudaPartitions.Pointer); @@ -1244,7 +1250,7 @@ namespace CUETools.Codecs.FlaCuda cuda.LaunchAsync(task.cudaCopyBestMethod, 1, channels * task.frameCount, task.stream); if (!encode_on_cpu) { - int bsz = (psize >= 128) ? psize : (256 / psize) * psize; + int bsz = calcPartitionPartCount * calcPartitionPartSize; cuda.LaunchAsync(task.cudaEncodeResidual, residualPartCount, channels * task.frameCount, task.stream); cuda.LaunchAsync(cudaCalcPartition, (task.frameSize + bsz - 1) / bsz, channels * task.frameCount, task.stream); if (max_porder > 0) @@ -1432,12 +1438,6 @@ namespace CUETools.Codecs.FlaCuda if (nFrames >= max_frames) do_output_frames(nFrames); } - //if (task2.frameCount > 0) - //{ - // cuda.SynchronizeStream(task2.stream); - // process_result(task2); - // task2.frameCount = 0; - //} } public unsafe void do_output_frames(int nFrames) @@ -1469,7 +1469,7 @@ namespace CUETools.Codecs.FlaCuda public string Path { get { return _path; } } - string vendor_string = "FlaCuda#0.5"; + string vendor_string = "FlaCuda#0.7"; int select_blocksize(int samplerate, int time_ms) { @@ -1820,7 +1820,7 @@ namespace CUETools.Codecs.FlaCuda window_function = WindowFunction.Flattop | WindowFunction.Tukey; do_midside = true; block_size = 0; - block_time_ms = 105; + block_time_ms = 100; min_fixed_order = 0; max_fixed_order = 4; min_prediction_order = 1; diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu index b871745..0a22977 100644 --- a/CUETools.FlaCuda/flacuda.cu +++ b/CUETools.FlaCuda/flacuda.cu @@ -810,7 +810,9 @@ extern "C" __global__ void cudaCalcPartition( int* partition_lengths, int* residual, encodeResidualTaskStruct *tasks, - int max_porder // <= 8 + int max_porder, // <= 8 + int psize, // == (shared.task.blocksize >> max_porder), < 256 + int parts_per_block // == 256 / psize, > 0, <= 16 ) { __shared__ struct { @@ -823,22 +825,24 @@ extern "C" __global__ void cudaCalcPartition( ((int*)&shared.task)[tid] = ((int*)(&tasks[blockIdx.y]))[tid]; __syncthreads(); - const int psize = (shared.task.blocksize >> max_porder); // 18 - const int parts_per_block = 256 / psize; // 14 const int parts = min(parts_per_block, (1 << max_porder) - blockIdx.x * parts_per_block); // fetch residual - shared.data[tid] = ((blockIdx.x != 0 || tid >= shared.task.residualOrder) && tid < parts * psize) ? residual[shared.task.residualOffs + blockIdx.x * psize * parts_per_block + tid - shared.task.residualOrder] : 0; + shared.length[tid] = (tid < parts * psize - shared.task.residualOrder) ? + residual[shared.task.residualOffs + blockIdx.x * psize * parts_per_block + tid] : 0; + __syncthreads(); + shared.data[tid] = (tid >= shared.task.residualOrder) ? + shared.length[tid - shared.task.residualOrder] : blockIdx.x != 0 ? + residual[shared.task.residualOffs + blockIdx.x * psize * parts_per_block + tid - shared.task.residualOrder] : 0; // convert to unsigned - shared.data[tid] = (shared.data[tid] << 1) ^ (shared.data[tid] >> 31); + shared.data[tid] = min(0xfffff, (shared.data[tid] << 1) ^ (shared.data[tid] >> 31)); __syncthreads(); // calc number of unary bits for each residual part with each rice paramater shared.length[tid] = (psize - shared.task.residualOrder * (threadIdx.y + blockIdx.x == 0)) * (threadIdx.x + 1); - if (threadIdx.y < parts) - for (int i = 0; i < psize; i++) - // for part (threadIdx.y) with this rice paramater (threadIdx.x) - shared.length[tid] = min(0xfffff, shared.length[tid] + (shared.data[threadIdx.y * psize + i] >> threadIdx.x)); + for (int i = 0; i < psize; i++) + // for part (threadIdx.y) with this rice paramater (threadIdx.x) + shared.length[tid] += shared.data[threadIdx.y * psize + i] >> threadIdx.x; __syncthreads(); // output length (transposed: k is now threadIdx.y) @@ -851,12 +855,15 @@ extern "C" __global__ void cudaCalcLargePartition( int* partition_lengths, int* residual, encodeResidualTaskStruct *tasks, - int max_porder // <= 8 + int max_porder, // <= 8 + int psize, // == >= 128 + int parts_per_block // == 1 ) { __shared__ struct { + int data1[256]; int data[256]; - int length[256]; + volatile int length[256]; encodeResidualTaskStruct task; } shared; const int tid = threadIdx.x + (threadIdx.y << 4); @@ -864,31 +871,35 @@ extern "C" __global__ void cudaCalcLargePartition( ((int*)&shared.task)[tid] = ((int*)(&tasks[blockIdx.y]))[tid]; __syncthreads(); - const int psize = (shared.task.blocksize >> max_porder); // 18 - shared.length[tid] = 0; for (int pos = 0; pos < psize; pos += 256) { // fetch residual - shared.data[tid] = ((blockIdx.x != 0 || pos + tid >= shared.task.residualOrder) && pos + tid < psize) ? residual[shared.task.residualOffs + blockIdx.x * psize + pos + tid - shared.task.residualOrder] : 0; + //shared.data[tid] = ((blockIdx.x != 0 || pos + tid >= shared.task.residualOrder) && pos + tid < psize) ? residual[shared.task.residualOffs + blockIdx.x * psize + pos + tid - shared.task.residualOrder] : 0; + shared.data1[tid] = (pos + tid < psize - shared.task.residualOrder) ? + residual[shared.task.residualOffs + blockIdx.x * psize + pos + tid] : 0; + __syncthreads(); + shared.data[tid] = (tid >= shared.task.residualOrder) ? + shared.data1[tid - shared.task.residualOrder] : ((pos != 0) || blockIdx.x != 0) && (pos + tid < psize) ? + residual[shared.task.residualOffs + blockIdx.x * psize + pos + tid - shared.task.residualOrder] : 0; // convert to unsigned - shared.data[tid] = (shared.data[tid] << 1) ^ (shared.data[tid] >> 31); + shared.data[tid] = min(0xfffff, (shared.data[tid] << 1) ^ (shared.data[tid] >> 31)); __syncthreads(); // calc number of unary bits for each residual sample with each rice paramater - for (int i = 0; i < 256; i += 16) +#pragma unroll 1 + for (int i = 0; i < min(psize,256); i += 16) // for sample (i + threadIdx.x) with this rice paramater (threadIdx.y) - shared.length[tid] = min(0xfffff, shared.length[tid] + (shared.data[i + threadIdx.x] >> threadIdx.y)); + shared.length[tid] += shared.data[i + threadIdx.x] >> threadIdx.y; + shared.length[tid] = min(0xfffff, shared.length[tid]); __syncthreads(); } - __syncthreads(); SUM16(shared.length,tid,+=); - __syncthreads(); // output length const int pos = (15 << (max_porder + 1)) * blockIdx.y + (threadIdx.y << (max_porder + 1)); if (threadIdx.y <= 14 && threadIdx.x == 0) - partition_lengths[pos + blockIdx.x] = shared.length[tid] + (psize - shared.task.residualOrder * (blockIdx.x == 0)) * (threadIdx.y + 1); + partition_lengths[pos + blockIdx.x] = min(0xfffff,shared.length[tid]) + (psize - shared.task.residualOrder * (blockIdx.x == 0)) * (threadIdx.y + 1); } // Sums partition lengths for a certain k == blockIdx.x @@ -927,30 +938,36 @@ extern "C" __global__ void cudaFindRiceParameter( ) { __shared__ struct { - int length[256]; - int tmp[256]; + volatile int length[256]; + volatile int index[256]; } shared; const int tid = threadIdx.x + (threadIdx.y << 4); const int parts = min(16, 2 << max_porder); const int pos = (15 << (max_porder + 1)) * blockIdx.y + (threadIdx.y << (max_porder + 1)); // read length for 16 partitions - shared.length[tid] = (threadIdx.y <= 14 && threadIdx.x < parts) ? partition_lengths[pos + blockIdx.x * 16 + threadIdx.x] : 0xffffff; + shared.index[tid] = (threadIdx.y <= 14 && threadIdx.x < parts) ? partition_lengths[pos + blockIdx.x * 16 + threadIdx.x] : 0xffffff; __syncthreads(); // transpose - shared.tmp[tid] = threadIdx.y + (threadIdx.x << 4); - // find best rice parameter - shared.tmp[tid] = BEST_INDEX(shared.tmp[tid], shared.tmp[tid + 8]); - shared.tmp[tid] = BEST_INDEX(shared.tmp[tid], shared.tmp[tid + 4]); - shared.tmp[tid] = BEST_INDEX(shared.tmp[tid], shared.tmp[tid + 2]); - shared.tmp[tid] = BEST_INDEX(shared.tmp[tid], shared.tmp[tid + 1]); + shared.length[tid] = shared.index[threadIdx.y + (threadIdx.x << 4)]; __syncthreads(); + // find best rice parameter + int cmp = 8 * (shared.length[tid + 8] < shared.length[tid]); + shared.index[tid] = threadIdx.x + cmp; + shared.length[tid] = shared.length[tid + cmp]; + cmp = 4 * (shared.length[tid + 4] < shared.length[tid]); + shared.index[tid] = shared.index[tid + cmp]; + shared.length[tid] = shared.length[tid + cmp]; + cmp = 2 * (shared.length[tid + 2] < shared.length[tid]); + shared.index[tid] = shared.index[tid + cmp]; + shared.length[tid] = shared.length[tid + cmp]; + cmp = (shared.length[tid + 1] < shared.length[tid]); // output rice parameter if (threadIdx.x == 0 && threadIdx.y < parts) - output[(blockIdx.y << (max_porder + 2)) + blockIdx.x * parts + threadIdx.y] = shared.tmp[tid] >> 4; + output[(blockIdx.y << (max_porder + 2)) + blockIdx.x * parts + threadIdx.y] = shared.index[tid + cmp]; // output length if (threadIdx.x == 0 && threadIdx.y < parts) - output[(blockIdx.y << (max_porder + 2)) + (1 << (max_porder + 1)) + blockIdx.x * parts + threadIdx.y] = shared.length[shared.tmp[tid]]; + output[(blockIdx.y << (max_porder + 2)) + (1 << (max_porder + 1)) + blockIdx.x * parts + threadIdx.y] = shared.length[tid + cmp]; } #endif diff --git a/CUETools.FlaCuda/flacuda.cubin b/CUETools.FlaCuda/flacuda.cubin index c458935..2e177ab 100644 --- a/CUETools.FlaCuda/flacuda.cubin +++ b/CUETools.FlaCuda/flacuda.cubin @@ -814,15 +814,16 @@ code { name = cudaFindRiceParameter lmem = 0 smem = 2076 - reg = 7 + reg = 9 bar = 1 const { segname = const segnum = 1 offset = 0 - bytes = 16 + bytes = 28 mem { - 0x00000010 0x000003ff 0x0000000e 0x00000001 + 0x00000010 0x000003ff 0x0000000e 0x00000008 + 0x00000004 0x00000002 0x00000001 } } bincode { @@ -830,53 +831,55 @@ code { 0x30010409 0xc4000780 0x10000005 0x0403c780 0x30800409 0xac400780 0xa0000401 0x04000780 0xd0820609 0x00400780 0x300005fd 0x640107c8 - 0xa0000411 0x04000780 0x308209fd 0x6440c2c8 + 0xa000040d 0x04000780 0x308207fd 0x6440c2c8 0xa001b003 0x00000000 0x1001a003 0x00000100 - 0x2101ec0d 0x00000003 0x100f8005 0x00000003 - 0x30030205 0xc4000780 0x40034e15 0x00200780 + 0x2101ec11 0x00000003 0x100f8005 0x00000003 + 0x30040205 0xc4000780 0x40034e15 0x00200780 0x30100a15 0xc4100780 0x60024e15 0x00214780 - 0x30030805 0xc4000780 0x20000a0d 0x04004780 - 0x60804c05 0x00600780 0x20000205 0x0400c780 + 0x30040605 0xc4000780 0x20000a11 0x04004780 + 0x60804c05 0x00600780 0x20000205 0x04010780 0x30020205 0xc4100780 0x2000ca05 0x04204780 0xd00e0205 0x80c00780 0x1001b003 0x00000780 - 0x103f8005 0x000fffff 0x3004080d 0xc4100782 - 0x2000060d 0x04000780 0x00020605 0xc0000780 - 0x04000e01 0xe4204780 0x861ffe03 0x00000000 - 0x30040005 0xc4100780 0x20000205 0x04010780 - 0xd4087809 0x20000780 0x04020e01 0xe4204780 - 0x0002020d 0xc0000780 0x0802c009 0xc0200780 - 0x1c00ce0d 0x0423c780 0x3803cffd 0x6c2187c8 - 0xd4087809 0x20000500 0x1800c005 0x0423c500 - 0xd4085809 0x20000780 0x04020e01 0xe4204780 - 0x0802c00d 0xc0200780 0x00020209 0xc0000780 - 0x1c00ce0d 0x0423c780 0x3803cffd 0x6c20c7c8 - 0xd4085809 0x20000500 0x1800c005 0x0423c500 - 0xd4084809 0x20000780 0x04020e01 0xe4204780 - 0x0802c00d 0xc0200780 0x00020209 0xc0000780 - 0x1c00ce0d 0x0423c780 0x3803cffd 0x6c20c7c8 - 0xd4084809 0x20000500 0x1800c005 0x0423c500 - 0xd4084009 0x20000780 0x04020e01 0xe4204780 - 0x0802c00d 0xc0200780 0x00020209 0xc0000780 - 0x1c00ce0d 0x0423c780 0x3803cffd 0x6c20c7c8 - 0xd4084009 0x20000500 0x1800c005 0x0423c500 + 0x103f8005 0x000fffff 0x30040611 0xc4100782 + 0x20000811 0x04000780 0x00020805 0xc0000780 0x04020e01 0xe4204780 0x861ffe03 0x00000000 - 0x307c0001 0x64008780 0x30040405 0x64010780 - 0xd0830001 0x04400780 0xd0830205 0x04400780 - 0xd0010001 0x040007c0 0xa0057003 0x00000000 - 0x10057003 0x00000100 0x40054c15 0x00200780 - 0xa0004e05 0x04200780 0x2102ec0d 0x00000003 - 0x30100a15 0xc4100780 0x30030205 0xc4000780 - 0x60044c0d 0x00214780 0x20018604 0x20018804 - 0xd4083809 0x20000780 0x3002020d 0xc4100780 - 0x3804c005 0xec300780 0x2000c80d 0x0420c780 - 0xd00e0605 0xa0c00780 0x307c01fd 0x6c0087ca - 0x30000003 0x00000280 0x2101ec05 0x00000003 - 0x40054c19 0x00200780 0x10018001 0x00000003 - 0xa0004e0d 0x04200780 0x2102ec15 0x00000003 - 0x30100c19 0xc4100780 0x30010001 0xc4000780 - 0x30050605 0xc4000780 0x60044c09 0x00218780 - 0x20018000 0x20048404 0xd4083805 0x20000780 - 0x20000201 0x04000780 0x0402c005 0xc0200780 + 0x30040005 0xc4100780 0x20000605 0x04004780 + 0x00020205 0xc0000780 0xd4083805 0x20000780 + 0x00020809 0xc0000780 0x1400c005 0x0423c780 + 0x08000e01 0xe4204780 0x861ffe03 0x00000000 + 0x00020805 0xc0000780 0x1400de05 0x0423c780 + 0x3401ce05 0x6c210780 0xd0830205 0x04400780 + 0x20008214 0x20018804 0x04020e01 0xe4214780 + 0x00020209 0xc0000780 0x1800ce05 0x0423c780 + 0x04000e01 0xe4204780 0x1400d605 0x0423c780 + 0x3401ce05 0x6c210780 0xd0840205 0x04400780 + 0x20000805 0x04004780 0x00020209 0xc0000780 + 0xd808380d 0x20000780 0x1c00c005 0x0423c780 + 0x04020e01 0xe4204780 0x1800ce05 0x0423c780 + 0x04000e01 0xe4204780 0x1400d205 0x0423c780 + 0x3401ce05 0x6c210780 0xd0850205 0x04400780 + 0x20000805 0x04004780 0x00020209 0xc0000780 + 0xd808380d 0x20000780 0x1c00c005 0x0423c780 + 0x04020e01 0xe4204780 0x1800ce05 0x0423c780 + 0x307c0001 0x64008780 0x04000e01 0xe4204780 + 0x30030415 0x64010780 0xd0860005 0x04400780 + 0x1400d001 0x0423c780 0xd0860a15 0x04400780 + 0x3400ce01 0x6c210780 0xd0050205 0x040007c0 + 0xa005c003 0x00000000 0xa0000015 0x2c014780 + 0x1005c003 0x00000100 0x40054c1d 0x00200780 + 0xa0004e01 0x04200780 0x2102ec19 0x00000003 + 0x30100e1d 0xc4100780 0x30060001 0xc4000780 + 0x60044c19 0x0021c780 0x20048a1c 0x20008c00 + 0x00020e05 0xc0000780 0x20000601 0x04000780 + 0xd4083805 0x20000780 0x30020019 0xc4100780 + 0x1500e000 0x2106e818 0xd00e0c01 0xa0c00780 + 0x307c03fd 0x6c0087ca 0x30000003 0x00000280 + 0x2101ec05 0x00000003 0x40054c21 0x00200780 + 0x10018001 0x00000003 0xa0004e19 0x04200780 + 0x2102ec1d 0x00000003 0x30101021 0xc4100780 + 0x30010001 0xc4000780 0x30070c05 0xc4000780 + 0x60044c09 0x00220780 0x20018000 0x20038404 + 0x20048a08 0x20008200 0x00020405 0xc0000780 0x30020005 0xc4100780 0x1500ee00 0x2101e804 0xd00e0201 0xa0c00781 } @@ -1150,8 +1153,8 @@ code { code { name = cudaCalcPartition lmem = 0 - smem = 2272 - reg = 12 + smem = 2280 + reg = 11 bar = 1 const { segname = const @@ -1163,83 +1166,82 @@ code { } } bincode { - 0xd0800205 0x00400780 0xa000020d 0x04000780 - 0xa0000019 0x04000780 0x30040601 0xc4100780 - 0x20000c25 0x04000780 0x308113fd 0x644107c8 - 0xa0011003 0x00000000 0x3002121d 0xc4100780 + 0xd0800205 0x00400780 0xa0000211 0x04000780 + 0xa0000019 0x04000780 0x30040801 0xc4100780 + 0x20000c15 0x04000780 0x30810bfd 0x644107c8 + 0xa0011003 0x00000000 0x30020a1d 0xc4100780 0x10011003 0x00000280 0xa0004e01 0x04200780 0x30070005 0xc4100780 0x30060001 0xc4100780 0x20008200 0x2100ec00 0x20000e01 0x04000780 0xd00e0001 0x80c00780 0x00000e05 0xc0000780 - 0x04041001 0xe4200780 0xf0000001 0xe0000002 - 0x861ffe03 0x00000000 0xd0104005 0x20000780 - 0x1000ce01 0x0423c780 0x3400ce05 0xec200780 - 0x10008001 0x00000013 0x10000209 0x0403c780 - 0x20078003 0x00000780 0xa0004c09 0x04200780 - 0x40040215 0x00000780 0x10018021 0x00000003 - 0x1000ce11 0x0423c780 0x30100a15 0xc4100780 - 0x30041011 0xc4000780 0x60040021 0x00014780 - 0x30001011 0x04010780 0x30040011 0xa4000780 - 0x40090415 0x00000780 0x60080615 0x00014780 - 0x3409c1fd 0x6c20c7c8 0x30100a15 0xc4100780 - 0x307c05fd 0x64014148 0x60080415 0x00014780 - 0x300513fd 0x6c0042c8 0xa003b003 0x00000000 - 0x1003a003 0x00000100 0x40050415 0x00000780 - 0x60040615 0x00014780 0x30100a15 0xc4100780 - 0x60040415 0x00014780 0x400b0029 0x00000780 - 0x600a0229 0x00028780 0x30101429 0xc4100780 - 0x600a0001 0x00028780 0xd0104005 0x20000780 - 0x2500f400 0x20009200 0x3400c001 0x04200780 + 0x04041401 0xe4200780 0xf0000001 0xe0000002 + 0x861ffe03 0x00000000 0xa0004c0d 0x04200780 + 0x1000d201 0x0423c780 0x40060209 0x00000780 + 0x10018021 0x00000003 0x1000ce05 0x0423c780 + 0x30100409 0xc4100780 0x30011005 0xc4000780 + 0x60060021 0x00008780 0x30019004 0x1100f000 + 0x3001d205 0xa4200780 0x40030009 0x00000780 + 0x60020209 0x00008780 0x30100409 0xc4100780 + 0x60020001 0x00008780 0xd0105005 0x20000780 + 0x3400c001 0x04200780 0x30000bfd 0x6c0187c8 + 0xa0037003 0x00000000 0x10036003 0x00000280 + 0x1000d001 0x0423c780 0x40070009 0x00000780 + 0x60060209 0x00008780 0x30100409 0xc4100780 + 0x60060009 0x00008780 0x1000d201 0x0423c780 + 0x40050025 0x00000780 0x60040225 0x00024780 + 0x30101225 0xc4100780 0x60040001 0x00024780 + 0xd010a005 0x20000780 0x2500e000 0x20008a00 0x30020001 0xc4100780 0x2000ca01 0x04200780 - 0xd00e0001 0x80c00780 0x1003b003 0x00000780 - 0x1000f801 0x0403c780 0x301f0015 0xec100782 - 0x30010001 0xc4100780 0xd0000a01 0x04008780 - 0x00000e05 0xc0000780 0x04001001 0xe4200780 - 0x861ffe03 0x00000000 0xd0104005 0x20000780 - 0x20018c01 0x00000003 0x3400c015 0x04204780 - 0x40020228 0x400b0024 0x60030029 0x00028780 - 0x600a0225 0x00024780 0x30101429 0xc4100780 - 0x200005fd 0x0400c7c8 0x30101225 0xc4100780 - 0x60020009 0x00028780 0x600a0009 0x00024100 - 0x00000e05 0xc0000780 0x300309fd 0x6400c7c8 - 0xa0062003 0x00000000 0x04021001 0xe4208780 - 0x10062003 0x00000280 0x307c03fd 0x6c00c7c8 - 0x10062003 0x00000280 0x40070401 0x00000780 - 0x60060601 0x00000780 0x30100001 0xc4100780 - 0x60060401 0x00000780 0x20088015 0x00000003 - 0x00020a05 0xc0000780 0x20000005 0x04004780 - 0x3606c215 0xec200780 0x20000409 0x04014780 - 0x20018001 0x00000003 0x00000e09 0xc0000780 - 0x30820409 0xac400780 0x300101fd 0x6c0147c8 - 0x08021001 0xe4208780 0x1005a003 0x00000280 + 0xd00e0001 0x80c00780 0x10037003 0x00000780 + 0x1000f801 0x0403c780 0x00000e05 0xc0000782 + 0x04021401 0xe4200780 0x861ffe03 0x00000000 + 0xd0105005 0x20000780 0x3405c1fd 0x6c2107c8 + 0xa005a003 0x00000000 0x10044003 0x00000280 + 0xd0105005 0x20000780 0x3400c001 0x04214780 + 0x00020005 0xc0000780 0xd4085005 0x20000780 + 0x1400c001 0x0423c780 0x1005a003 0x00000780 + 0x307c07fd 0x640087c8 0xa0059003 0x00000000 + 0x10058003 0x00000280 0x1000d001 0x0423c780 + 0x40070009 0x00000780 0x60060209 0x00008780 + 0x30100409 0xc4100780 0x60060009 0x00008780 + 0x1000d201 0x0423c780 0x40050025 0x00000780 + 0x60040225 0x00024780 0x30101225 0xc4100780 + 0x60040001 0x00024780 0xd0105005 0x20000780 + 0x2500f400 0x20008a00 0x3400c001 0x04200780 + 0x30020001 0xc4100780 0x2000ca01 0x04200780 + 0xd00e0001 0x80c00780 0x10059003 0x00000780 + 0x1000f801 0x0403c780 0xf0000001 0xe0000002 + 0x301f0009 0xec100782 0x30010001 0xc4100780 + 0xd0000401 0x04008780 0x00000e05 0xc0000780 + 0x30820001 0xac400780 0x04001401 0xe4200780 + 0x861ffe03 0x00000000 0xd0105005 0x20000780 + 0x20018c09 0x00000003 0x1100f000 0x1100f014 + 0x3505e014 0x40000a24 0x60010825 0x00024780 + 0x400b0829 0x00000780 0x30101225 0xc4100780 + 0x600a0a29 0x00028780 0x200007fd 0x040107c8 + 0x60000801 0x00024780 0x3010140d 0xc4100780 + 0x600a0801 0x0000c100 0x00000e05 0xc0000780 + 0x307cd1fd 0x6c20c7c8 0x04021401 0xe4200780 + 0x10000009 0x0403c780 0x10083003 0x00000280 + 0x1000d001 0x0423c780 0x4009000d 0x00000780 + 0x6008020d 0x0000c780 0x3010060d 0xc4100780 + 0x60080001 0x0000c780 0x200a800d 0x00000003 + 0x00020605 0xc0000780 0xa0082003 0x00000000 + 0x2000d00d 0x04200780 0x3606c215 0xec200780 + 0x20018001 0x00000003 0x20000409 0x04014780 + 0x00000e09 0xc0000780 0x300301fd 0x6c0147c8 + 0x08021401 0xe4208780 0x1007b003 0x00000280 0xf0000001 0xe0000002 0x861ffe03 0x00000000 - 0x300609fd 0x640107c8 0x308307fd 0x6440c2c8 + 0x300603fd 0x640107c8 0x308309fd 0x6440c2c8 0x30000003 0x00000100 0x2101ee05 0x00000003 0x100f8001 0x00000003 0x30010001 0xc4000780 0x40014e09 0x00200780 0x30100409 0xc4100780 - 0x60004e09 0x00208780 0x30040c11 0xc4100780 - 0x30010605 0xc4000780 0x20069000 0x2004860c + 0x60004e09 0x00208780 0x30040c0d 0xc4100780 + 0x30010805 0xc4000780 0x20069000 0x2003880c 0x20000405 0x04004780 0x00020605 0xc0000780 - 0x20000001 0x04004780 0xd4084005 0x20000780 + 0x20000001 0x04004780 0xd4085005 0x20000780 0x30020005 0xc4100780 0x1500e000 0x2101e804 - 0xd00e0201 0xa0c00780 0x30000003 0x00000780 - 0xa0000411 0x04114780 0xa0000815 0x44004780 - 0xa0000021 0x04114780 0x90000a29 0x00000780 - 0xa0001015 0x44064780 0x203e9429 0x0fffffff - 0xc00a0a15 0x0000c7c0 0xa0000a15 0x84064780 - 0x400b102d 0x00000780 0x600a122d 0x0002c780 - 0x3010162d 0xc4100780 0x600a102d 0x0002c780 - 0x2040102d 0x0402c780 0xa000162d 0x44064780 - 0xc00a1629 0x0000c7c0 0xa0001429 0x84064780 - 0x20000a15 0x04028780 0x40081629 0x00000780 - 0x60091429 0x00028780 0x30101429 0xc4100780 - 0x60081429 0x00028780 0x30001421 0x04020780 - 0x30080811 0x6400c780 0xd0000401 0x04008780 - 0x301f0001 0xe4100780 0x30000815 0x04014780 - 0xa0000011 0x2c014780 0xd0050811 0x04008780 - 0x307c05fd 0x6c0147c8 0x20000001 0x04010780 - 0xd0020001 0x0402c500 0x30000003 0x00000780 - 0xf0000001 0xe0000001 + 0xd00e0201 0xa0c00781 } } code { @@ -1267,8 +1269,8 @@ code { code { name = cudaCalcLargePartition lmem = 0 - smem = 2272 - reg = 23 + smem = 3304 + reg = 10 bar = 1 const { segname = const @@ -1276,114 +1278,85 @@ code { offset = 0 bytes = 20 mem { - 0x000003ff 0x0000002f 0x00000001 0x000fffff + 0x000003ff 0x0000002f 0x00000100 0x000fffff 0x0000000e } } bincode { - 0xd0800205 0x00400780 0xa0000219 0x04000780 - 0xa000001d 0x04000780 0x30040c01 0xc4100780 - 0x20000e21 0x04000780 0x308111fd 0x644107c8 - 0xa0011003 0x00000000 0x30021025 0xc4100780 + 0xd0800205 0x00400780 0xa000020d 0x04000780 + 0xa0000005 0x04000780 0x30040601 0xc4100780 + 0x20000215 0x04000780 0x30810bfd 0x644107c8 + 0xa0011003 0x00000000 0x30020a11 0xc4100780 0x10011003 0x00000280 0xa0004e01 0x04200780 - 0x30070005 0xc4100780 0x30060001 0xc4100780 - 0x20008200 0x2100ec00 0x20001201 0x04000780 - 0xd00e0001 0x80c00780 0x00001205 0xc0000780 - 0x04041001 0xe4200780 0xf0000001 0xe0000002 - 0x861ffe03 0x00000000 0xd0107805 0x20000780 - 0x1000ce01 0x0423c780 0x3400c001 0xec200780 - 0x00001205 0xc0000780 0x307c01fd 0x6c00c7c8 - 0x04021001 0xe43f0780 0x1009f003 0x00000280 - 0x20108e51 0x00000003 0x20208e4d 0x00000003 - 0x20308e49 0x00000003 0x20008e45 0x00000007 - 0x20108e41 0x00000007 0x20208e3d 0x00000007 - 0x20308e39 0x00000007 0x20008e35 0x0000000b - 0x20108e31 0x0000000b 0x20208e2d 0x0000000b - 0x20308e29 0x0000000b 0x20008e15 0x0000000f - 0x20108e11 0x0000000f 0x20208e05 0x0000000f - 0x20308e09 0x0000000f 0x00020205 0xc0000780 - 0x1000f80d 0x0403c780 0xa0004c05 0x042007c0 - 0x00020409 0xc0000780 0x1000f809 0x0403c780 - 0x1000040d 0x2440c280 0xd010400d 0x20000780 - 0x20000455 0x04020780 0x3c15c059 0x6c20c780 - 0x30002a55 0x6c004780 0xa0002c59 0x2c014780 - 0xa0002a55 0x2c014780 0xd0032c59 0x04004780 - 0xd0162bfd 0x040007c8 0xa0045003 0x00000000 - 0x10044003 0x00000100 0x40030055 0x00000780 - 0x60020255 0x00054780 0x30102a55 0xc4100780 - 0xd010400d 0x20000780 0x60020059 0x00054780 - 0x20088454 0x2d16f458 0x2016aa54 0x3d15e054 - 0x30022a55 0xc4100780 0x2000ca55 0x04254780 - 0xd00e2a55 0x80c00780 0x10045003 0x00000780 - 0x1000f855 0x0403c780 0x301f2a59 0xec100782 - 0x30012a55 0xc4100780 0xd0152c55 0x04008780 - 0x0000120d 0xc0000780 0x0c001001 0xe4254780 - 0x861ffe03 0x00000000 0x0000120d 0xc0000780 - 0x00020e11 0xc0000780 0x3006d055 0xec200784 - 0xdc084011 0x20000780 0x2000c055 0x04254784 - 0x30832a55 0xac400780 0x0c021001 0xe4254780 - 0x00022811 0xc0000780 0x3006d059 0xec200784 - 0x20002a55 0x04058780 0x30832a55 0xac400780 - 0x0c021001 0xe4254780 0x00022611 0xc0000780 - 0x3006d059 0xec200784 0x20002a55 0x04058780 - 0x30832a55 0xac400780 0x0c021001 0xe4254780 - 0x00022411 0xc0000780 0x3006d059 0xec200784 - 0x20002a55 0x04058780 0x30832a55 0xac400780 - 0x0c021001 0xe4254780 0x00022211 0xc0000780 - 0x3006d059 0xec200784 0x20002a55 0x04058780 - 0x30832a55 0xac400780 0x0c021001 0xe4254780 - 0x00022011 0xc0000780 0x3006d059 0xec200784 - 0x20002a55 0x04058780 0x30832a55 0xac400780 - 0x0c021001 0xe4254780 0x00021e11 0xc0000780 - 0x3006d059 0xec200784 0x20002a55 0x04058780 - 0x30832a55 0xac400780 0x0c021001 0xe4254780 - 0x00021c11 0xc0000780 0x3006d059 0xec200784 - 0x20002a55 0x04058780 0x30832a55 0xac400780 - 0x0c021001 0xe4254780 0x00021a11 0xc0000780 - 0x3006d059 0xec200784 0x20002a55 0x04058780 - 0x30832a55 0xac400780 0x0c021001 0xe4254780 - 0x00021811 0xc0000780 0x3006d059 0xec200784 - 0x20002a55 0x04058780 0x30832a55 0xac400780 - 0x0c021001 0xe4254780 0x00021611 0xc0000780 - 0x3006d059 0xec200784 0x20002a55 0x04058780 - 0x30832a55 0xac400780 0x0c021001 0xe4254780 - 0x00021411 0xc0000780 0x3006d059 0xec200784 - 0x20002a55 0x04058780 0x30832a55 0xac400780 - 0x0c021001 0xe4254780 0x00020a11 0xc0000780 - 0x3006d059 0xec200784 0x20002a55 0x04058780 - 0x30832a55 0xac400780 0x0c021001 0xe4254780 - 0x00020811 0xc0000780 0x3006d059 0xec200784 - 0x20002a55 0x04058780 0x30832a55 0xac400780 - 0x0c021001 0xe4254780 0x3406d059 0xec200780 - 0x20002a55 0x04058780 0x30832a55 0xac400780 - 0x0c021001 0xe4254780 0x3806d059 0xec200780 - 0x20002a55 0x04058780 0x30832a55 0xac400780 - 0x0c021001 0xe4254780 0x861ffe03 0x00000000 - 0x20008409 0x00000013 0x300201fd 0x6c0107c8 - 0x1002f003 0x00000280 0x861ffe03 0x00000000 - 0x00001205 0xc0000780 0xd4084009 0x20000780 - 0x1900f004 0x2901e004 0x04021001 0xe4204780 - 0x2800c805 0x04204780 0x04021001 0xe4204780 - 0x2800c405 0x04204780 0x04021001 0xe4204780 - 0x2800c205 0x04204780 0x04021001 0xe4204780 - 0x861ffe03 0x00000000 0x307c0ffd 0x640087c8 - 0x30840dfd 0x6440c2c8 0x30000003 0x00000100 - 0xa0004c09 0x042007c0 0x20018c05 0x00000003 - 0x100bb003 0x00000280 0xd0104005 0x20000780 - 0x3400c001 0x04200780 0x4001040d 0x00000780 - 0x6000060d 0x0000c780 0x3010060d 0xc4100780 - 0x00001205 0xc0000780 0x60000401 0x0000c780 - 0xd4084005 0x20000780 0x2400c005 0x04200780 - 0x100c2003 0x00000780 0x4003000d 0x00000780 - 0x6002020d 0x0000c780 0x3010060d 0xc4100780 - 0x00001205 0xc0000780 0x60020001 0x0000c780 - 0xd4084005 0x20000780 0x2400c005 0x04200780 - 0x2101ee0d 0x00000003 0x100f8001 0x00000003 - 0x30030001 0xc4000780 0x40014e11 0x00200780 - 0x30100811 0xc4100780 0x30030c0d 0xc4000780 - 0x60004e01 0x00210780 0x20038000 0x20008400 - 0x30020001 0xc4100780 0x2000c801 0x04200780 - 0xd00e0005 0xa0c00781 + 0x30070009 0xc4100780 0x30060001 0xc4100780 + 0x20008400 0x2100ec00 0x20000801 0x04000780 + 0xd00e0001 0x80c00780 0x00000805 0xc0000780 + 0x04061401 0xe4200780 0xf0000001 0xe0000002 + 0x861ffe03 0x00000000 0x00000805 0xc0000780 + 0x307cd1fd 0x6c20c7c8 0x04041401 0xe43f0780 + 0x1006b003 0x00000280 0x3082d019 0xac600780 + 0x1000f81d 0x0403c780 0x307c0dfd 0x6c0107c8 + 0xd0185005 0x20000780 0x1500e000 0x20058e20 + 0x2040d001 0x04200780 0x300011fd 0x6c0187d8 + 0xa002c003 0x00000000 0x1002b003 0x00001280 + 0x1000d001 0x0423c780 0x40014c09 0x00200780 + 0x30100409 0xc4100780 0x60004c09 0x00208780 + 0xd018a005 0x20000780 0x20058e00 0x2502e008 + 0x20000001 0x04008780 0x30020001 0xc4100780 + 0x2000ca01 0x04200780 0xd00e0001 0x80c00780 + 0x1002c003 0x00000780 0x1000f801 0x0403c780 + 0x00000805 0xc0000782 0x04001401 0xe4200780 + 0x861ffe03 0x00000000 0xd0185005 0x20000780 + 0x3405c1fd 0x6c2107d8 0xa004c003 0x00000000 + 0x10038003 0x00001280 0xd0185005 0x20000780 + 0x3400c001 0x04214780 0x00020005 0xc0000780 + 0x1400d401 0x0423c780 0x1004c003 0x00000780 + 0x307c0ffd 0x6c0147d8 0xa0004c09 0x04200780 + 0x307c05fd 0x64015158 0x3008d1fd 0x6c2112d8 + 0xa004b003 0x00000000 0x1004a003 0x00001100 + 0x1000d001 0x0423c780 0x40050021 0x00000780 + 0x60040221 0x00020780 0x30101021 0xc4100780 + 0x60040009 0x00020780 0xd0185005 0x20000780 + 0x20058e00 0x2502f408 0x20028000 0x3500e000 + 0x30020001 0xc4100780 0x2000ca01 0x04200780 + 0xd00e0001 0x80c00780 0x1004b003 0x00000780 + 0x1000f801 0x0403c780 0xf0000001 0xe0000002 + 0x301f0009 0xec100782 0x30010001 0xc4100780 + 0xd0000401 0x04008780 0x00000805 0xc0000780 + 0x30830001 0xac400780 0x04021401 0xe4200780 + 0x861ffe03 0x00000000 0x10063003 0x00000100 + 0x200a8201 0x00000003 0x00020005 0xc0000780 + 0xa0062003 0x00000000 0x10008200 0x20068208 + 0x00000809 0xc0000780 0xd4080011 0x20000780 + 0xd810500d 0x20000780 0x3003c021 0xec200784 + 0x20108001 0x00000003 0x2c00c021 0x04220780 + 0x300201fd 0x6c0047d8 0x08041401 0xe4220780 + 0xd4008005 0x20000780 0x10058003 0x00001280 + 0xf0000001 0xe0000002 0x00000805 0xc0000780 + 0xd4105009 0x20000780 0x3883c001 0xac600780 + 0x04041401 0xe4200780 0x861ffe03 0x00000000 + 0x20008e1d 0x00000013 0x3007d1fd 0x6c2107d8 + 0x1001a003 0x00001280 0x00000805 0xc0000780 + 0xd4105009 0x20000780 0x1900f000 0x2900e000 + 0x04041401 0xe4200780 0x1900e800 0x2900e000 + 0x04041401 0xe4200780 0x1900e400 0x2900e000 + 0x04041401 0xe4200780 0x1800c201 0x0423c780 + 0x307c03fd 0x640087c8 0x2800c001 0x04200780 + 0x308407fd 0x6440c2c8 0x04041401 0xe4200780 + 0x30000003 0x00000100 0xd0185005 0x20000780 + 0x2101ee19 0x00000003 0x100f8005 0x00000003 + 0x20018609 0x00000003 0x1000d001 0x0423c780 + 0x30060205 0xc4000780 0x1100f014 0x41032e1c + 0x40000a24 0x3505e014 0x30100e21 0xc4100780 + 0x6001081d 0x00024780 0x3006060d 0xc4000780 + 0x400b0819 0x00000780 0x60024e05 0x00220780 + 0x00000805 0xc0000780 0x30100e1d 0xc4100780 + 0x600a0a11 0x00018780 0x2000020d 0x0400c780 + 0xa0004c05 0x042007c0 0x60000801 0x0001c780 + 0x30100811 0xc4100780 0xd4105005 0x20000780 + 0x20000205 0x0400c780 0x600a0801 0x00010100 + 0x3483c009 0xac600780 0x30020205 0xc4100780 + 0x20028000 0x2101e804 0xd00e0201 0xa0c00781 } } code {