diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs index cbdda4e..ba7e587 100644 --- a/CUETools.FlaCuda/FlaCudaWriter.cs +++ b/CUETools.FlaCuda/FlaCudaWriter.cs @@ -1065,7 +1065,7 @@ namespace CUETools.Codecs.FlaCuda frame.subframes[ch].best.size = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 6; if (frame.subframes[ch].best.type == SubframeType.LPC) frame.subframes[ch].best.size += 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits; - AudioSamples.MemCpy(frame.subframes[ch].best.residual + frame.subframes[ch].best.order, (int*)task.residualBufferPtr + task.BestResidualTasks[index].residualOffs, frame.blocksize - frame.subframes[ch].best.order); + AudioSamples.MemCpy(frame.subframes[ch].best.residual, (int*)task.residualBufferPtr + task.BestResidualTasks[index].residualOffs, frame.blocksize); int* riceParams = ((int*)task.riceParamsPtr) + (4 << task.max_porder) * index; int* partLengths = ((int*)task.riceParamsPtr) + (4 << task.max_porder) * index + (2 << task.max_porder); int opt_porder = task.max_porder; @@ -1191,7 +1191,7 @@ namespace CUETools.Codecs.FlaCuda cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)residualPartCount); cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)task.nResidualTasksPerChannel); cuda.SetParameterSize(task.cudaChooseBestMethod, sizeof(uint) * 4U); - cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 16, 1); + cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 8, 1); cuda.SetParameter(task.cudaCopyBestMethod, 0, (uint)task.cudaBestResidualTasks.Pointer); cuda.SetParameter(task.cudaCopyBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer); @@ -1223,7 +1223,7 @@ namespace CUETools.Codecs.FlaCuda cuda.SetParameter(task.cudaSumPartition, 0, (uint)task.cudaPartitions.Pointer); cuda.SetParameter(task.cudaSumPartition, 1 * sizeof(uint), (uint)max_porder); cuda.SetParameterSize(task.cudaSumPartition, 2U * sizeof(uint)); - cuda.SetFunctionBlockShape(task.cudaSumPartition, 256, 1, 1); + cuda.SetFunctionBlockShape(task.cudaSumPartition, Math.Max(64, 1 << max_porder), 1, 1); cuda.SetParameter(task.cudaFindRiceParameter, 0, (uint)task.cudaRiceParams.Pointer); cuda.SetParameter(task.cudaFindRiceParameter, 1 * sizeof(uint), (uint)task.cudaPartitions.Pointer); diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu index 0a22977..43eef05 100644 --- a/CUETools.FlaCuda/flacuda.cu +++ b/CUETools.FlaCuda/flacuda.cu @@ -629,8 +629,6 @@ extern "C" __global__ void cudaEstimateResidual( output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid]; } -#define BEST_INDEX(a,b) ((a) + ((b) - (a)) * (shared.length[b] < shared.length[a])) - extern "C" __global__ void cudaChooseBestMethod( encodeResidualTaskStruct *tasks, int *residual, @@ -640,13 +638,13 @@ extern "C" __global__ void cudaChooseBestMethod( { __shared__ struct { volatile int index[128]; - volatile int partLen[512]; - int length[256]; - volatile encodeResidualTaskStruct task[16]; + volatile int length[256]; + volatile int partLen[256]; + volatile encodeResidualTaskStruct task[8]; } shared; const int tid = threadIdx.x + threadIdx.y * 32; - if (tid < 256) shared.length[tid] = 0x7fffffff; + shared.length[tid] = 0x7fffffff; for (int task = 0; task < taskCount; task += blockDim.y) if (task + threadIdx.y < taskCount) { @@ -681,25 +679,37 @@ extern "C" __global__ void cudaChooseBestMethod( __syncthreads(); - //if (tid < 128) shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 128]); __syncthreads(); - if (tid < 128) shared.index[tid] = BEST_INDEX(tid, tid + 128); __syncthreads(); - if (tid < 64) shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 64]); __syncthreads(); - if (tid < 32) - { - shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 32]); - shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 16]); - shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 8]); - shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 4]); - shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 2]); - shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 1]); - } - __syncthreads(); - // if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int)) - //((int*)(tasks_out + blockIdx.y))[threadIdx.x] = ((int*)(tasks + taskCount * blockIdx.y + shared.index[0]))[threadIdx.x]; - if (tid == 0) - tasks[taskCount * blockIdx.y].best_index = taskCount * blockIdx.y + shared.index[0]; if (tid < taskCount) tasks[tid + taskCount * blockIdx.y].size = shared.length[tid]; + + __syncthreads(); + int l1 = shared.length[tid]; + if (tid < 128) + { + int l2 = shared.length[tid + 128]; + shared.index[tid] = tid + ((l2 < l1) << 7); + shared.length[tid] = l1 = min(l1, l2); + } + __syncthreads(); + if (tid < 64) + { + int l2 = shared.length[tid + 64]; + shared.index[tid] = shared.index[tid + ((l2 < l1) << 6)]; + shared.length[tid] = l1 = min(l1, l2); + } + __syncthreads(); + if (tid < 32) + { +#pragma unroll 5 + for (int sh = 5; sh > 0; sh --) + { + int l2 = shared.length[tid + (1 << sh)]; + shared.index[tid] = shared.index[tid + ((l2 < l1) << sh)]; + shared.length[tid] = l1 = min(l1, l2); + } + if (tid == 0) + tasks[taskCount * blockIdx.y].best_index = taskCount * blockIdx.y + shared.index[shared.length[1] < shared.length[0]]; + } } extern "C" __global__ void cudaCopyBestMethod( @@ -796,14 +806,18 @@ extern "C" __global__ void cudaEncodeResidual( if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task.samplesOffs + pos + tid + partSize] >> shared.task.wbits : 0; const int residualLen = max(0,min(shared.task.blocksize - pos - shared.task.residualOrder, partSize)); - __syncthreads(); - + __syncthreads(); // compute residual int sum = 0; for (int c = 0; c < shared.task.residualOrder; c++) sum += __mul24(shared.data[tid + c], shared.task.coefs[c]); - if (tid < residualLen) - output[shared.task.residualOffs + pos + tid] = shared.data[tid + shared.task.residualOrder] - (sum >> shared.task.shift); + __syncthreads(); + shared.data[tid + shared.task.residualOrder] -= (sum >> shared.task.shift); + __syncthreads(); + if (tid >= shared.task.residualOrder && tid < residualLen + shared.task.residualOrder) + output[shared.task.residualOffs + pos + tid] = shared.data[tid]; + if (tid + 256 < residualLen + shared.task.residualOrder) + output[shared.task.residualOffs + pos + tid + 256] = shared.data[tid + 256]; } extern "C" __global__ void cudaCalcPartition( @@ -828,18 +842,14 @@ extern "C" __global__ void cudaCalcPartition( const int parts = min(parts_per_block, (1 << max_porder) - blockIdx.x * parts_per_block); // fetch residual - shared.length[tid] = (tid < parts * psize - shared.task.residualOrder) ? - residual[shared.task.residualOffs + blockIdx.x * psize * parts_per_block + tid] : 0; - __syncthreads(); - shared.data[tid] = (tid >= shared.task.residualOrder) ? - shared.length[tid - shared.task.residualOrder] : blockIdx.x != 0 ? - residual[shared.task.residualOffs + blockIdx.x * psize * parts_per_block + tid - shared.task.residualOrder] : 0; + int offs = blockIdx.x * psize * parts_per_block + tid; + int s = (offs >= shared.task.residualOrder && tid < parts * psize) ? residual[shared.task.residualOffs + offs] : 0; // convert to unsigned - shared.data[tid] = min(0xfffff, (shared.data[tid] << 1) ^ (shared.data[tid] >> 31)); + shared.data[tid] = min(0xfffff, (s << 1) ^ (s >> 31)); + shared.length[tid] = (psize - shared.task.residualOrder * (threadIdx.y + blockIdx.x == 0)) * (threadIdx.x + 1); __syncthreads(); // calc number of unary bits for each residual part with each rice paramater - shared.length[tid] = (psize - shared.task.residualOrder * (threadIdx.y + blockIdx.x == 0)) * (threadIdx.x + 1); for (int i = 0; i < psize; i++) // for part (threadIdx.y) with this rice paramater (threadIdx.x) shared.length[tid] += shared.data[threadIdx.y * psize + i] >> threadIdx.x; @@ -861,7 +871,6 @@ extern "C" __global__ void cudaCalcLargePartition( ) { __shared__ struct { - int data1[256]; int data[256]; volatile int length[256]; encodeResidualTaskStruct task; @@ -875,19 +884,14 @@ extern "C" __global__ void cudaCalcLargePartition( for (int pos = 0; pos < psize; pos += 256) { // fetch residual - //shared.data[tid] = ((blockIdx.x != 0 || pos + tid >= shared.task.residualOrder) && pos + tid < psize) ? residual[shared.task.residualOffs + blockIdx.x * psize + pos + tid - shared.task.residualOrder] : 0; - shared.data1[tid] = (pos + tid < psize - shared.task.residualOrder) ? - residual[shared.task.residualOffs + blockIdx.x * psize + pos + tid] : 0; - __syncthreads(); - shared.data[tid] = (tid >= shared.task.residualOrder) ? - shared.data1[tid - shared.task.residualOrder] : ((pos != 0) || blockIdx.x != 0) && (pos + tid < psize) ? - residual[shared.task.residualOffs + blockIdx.x * psize + pos + tid - shared.task.residualOrder] : 0; + int offs = blockIdx.x * psize + pos + tid; + int s = (offs >= shared.task.residualOrder && pos + tid < psize) ? residual[shared.task.residualOffs + offs] : 0; // convert to unsigned - shared.data[tid] = min(0xfffff, (shared.data[tid] << 1) ^ (shared.data[tid] >> 31)); + shared.data[tid] = min(0xfffff, (s << 1) ^ (s >> 31)); __syncthreads(); // calc number of unary bits for each residual sample with each rice paramater -#pragma unroll 1 +#pragma unroll 0 for (int i = 0; i < min(psize,256); i += 16) // for sample (i + threadIdx.x) with this rice paramater (threadIdx.y) shared.length[tid] += shared.data[i + threadIdx.x] >> threadIdx.y; @@ -910,7 +914,7 @@ extern "C" __global__ void cudaSumPartition( ) { __shared__ struct { - int data[512]; + int data[512]; // max_porder <= 8, data length <= 1 << 9. } shared; const int pos = (15 << (max_porder + 1)) * blockIdx.y + (blockIdx.x << (max_porder + 1)); @@ -946,28 +950,32 @@ extern "C" __global__ void cudaFindRiceParameter( const int pos = (15 << (max_porder + 1)) * blockIdx.y + (threadIdx.y << (max_porder + 1)); // read length for 16 partitions - shared.index[tid] = (threadIdx.y <= 14 && threadIdx.x < parts) ? partition_lengths[pos + blockIdx.x * 16 + threadIdx.x] : 0xffffff; + shared.length[tid] = (threadIdx.y <= 14 && threadIdx.x < parts) ? partition_lengths[pos + blockIdx.x * 16 + threadIdx.x] : 0xffffff; __syncthreads(); // transpose - shared.length[tid] = shared.index[threadIdx.y + (threadIdx.x << 4)]; + //shared.length[tid] = shared.index[threadIdx.y + (threadIdx.x << 4)]; + int l1 = shared.length[threadIdx.y + (threadIdx.x << 4)]; + __syncthreads(); + shared.length[tid] = l1; __syncthreads(); // find best rice parameter - int cmp = 8 * (shared.length[tid + 8] < shared.length[tid]); - shared.index[tid] = threadIdx.x + cmp; - shared.length[tid] = shared.length[tid + cmp]; - cmp = 4 * (shared.length[tid + 4] < shared.length[tid]); - shared.index[tid] = shared.index[tid + cmp]; - shared.length[tid] = shared.length[tid + cmp]; - cmp = 2 * (shared.length[tid + 2] < shared.length[tid]); - shared.index[tid] = shared.index[tid + cmp]; - shared.length[tid] = shared.length[tid + cmp]; - cmp = (shared.length[tid + 1] < shared.length[tid]); + int l2 = shared.length[tid + 8]; + shared.index[tid] = threadIdx.x + ((l2 < l1) << 3); + shared.length[tid] = l1 = min(l1, l2); +#pragma unroll 2 + for (int sh = 2; sh > 0; sh --) + { + l2 = shared.length[tid + (1 << sh)]; + shared.index[tid] = shared.index[tid + ((l2 < l1) << sh)]; + shared.length[tid] = l1 = min(l1, l2); + } + l2 = shared.length[tid + 1]; // output rice parameter if (threadIdx.x == 0 && threadIdx.y < parts) - output[(blockIdx.y << (max_porder + 2)) + blockIdx.x * parts + threadIdx.y] = shared.index[tid + cmp]; + output[(blockIdx.y << (max_porder + 2)) + blockIdx.x * parts + threadIdx.y] = shared.index[tid + (l2 < l1)]; // output length if (threadIdx.x == 0 && threadIdx.y < parts) - output[(blockIdx.y << (max_porder + 2)) + (1 << (max_porder + 1)) + blockIdx.x * parts + threadIdx.y] = shared.length[tid + cmp]; + output[(blockIdx.y << (max_porder + 2)) + (1 << (max_porder + 1)) + blockIdx.x * parts + threadIdx.y] = min(l1, l2); } #endif diff --git a/CUETools.FlaCuda/flacuda.cubin b/CUETools.FlaCuda/flacuda.cubin index 2e177ab..61c3fb6 100644 --- a/CUETools.FlaCuda/flacuda.cubin +++ b/CUETools.FlaCuda/flacuda.cubin @@ -562,158 +562,155 @@ code { code { name = cudaChooseBestMethod lmem = 0 - smem = 6688 + smem = 4128 reg = 13 bar = 1 const { segname = const segnum = 1 offset = 0 - bytes = 36 + bytes = 28 mem { - 0x000003ff 0x000000ff 0x00000008 0x00000020 - 0x00000001 0x0000007f 0x0000003f 0x0000001f - 0x7fffffff + 0x000003ff 0x00000008 0x00000020 0x00000001 + 0x0000007f 0x0000003f 0x0000001f } } bincode { 0xd0800205 0x00400780 0xa000020d 0x04000780 0xa0000005 0x04000780 0x30050601 0xc4100780 - 0x20000211 0x04000780 0x308109fd 0x6c4107c8 - 0x00070609 0xc0000780 0x00020805 0xc0000500 - 0x10001001 0x2440c500 0x04051001 0xe4200500 - 0x307ccffd 0x6c20c7c8 0x10091003 0x00000280 - 0xa0004415 0x04200780 0x1000f819 0x0403c780 - 0x20000c1d 0x0400c780 0x3007cffd 0x6420c7c8 - 0xa008e003 0x00000000 0x1008e003 0x00000280 - 0x1000ce01 0x0423c780 0x40014e09 0x00200780 - 0x30100409 0xc4100780 0x60004e21 0x00208780 - 0x30070601 0xc4100780 0x30070c2d 0xc4100780 - 0x30060c31 0xc4100780 0x30060629 0xc4100780 - 0x30071025 0xc4100780 0x30061009 0xc4100780 - 0x200c962c 0x200a8000 0x20029224 0x210be828 - 0x30020209 0xc4100780 0x200a802c 0x20098428 - 0x20028024 0x200b9408 0xd00e0409 0x80c00780 - 0x00000005 0xc0000780 0x0000120d 0xc0000780 - 0x307ccdfd 0x6c20c7c8 0x0c071001 0xe4208780 - 0x1000f809 0x0403c780 0x10039003 0x00000280 - 0xa0038003 0x00000000 0x10008200 0x2101ec24 - 0xa0004229 0x04200780 0x3000cdfd 0x6420c7c8 - 0xa0035003 0x00000000 0x10035003 0x00000280 - 0x20000e2d 0x04020780 0x3006162d 0xc4100780 - 0x2000002d 0x0402c780 0x3002162d 0xc4100780 - 0x2000ca2d 0x0422c780 0xd00e162d 0x80c00780 - 0x20001609 0x04008780 0x20000001 0x04028782 - 0x300901fd 0x6c0047c8 0x1002b003 0x00000280 - 0xf0000001 0xe0000002 0x0002080d 0xc0000780 - 0x0c011001 0xe4208780 0xdc044011 0x20000780 - 0x1000e001 0x0423c784 0x2000c001 0x04200784 - 0x0c011001 0xe4200780 0x1000d001 0x0423c784 - 0x2000c001 0x04200784 0x0c011001 0xe4200780 - 0x1000c801 0x0423c784 0x2000c001 0x04200784 - 0x0c011001 0xe4200780 0x1000c401 0x0423c784 - 0x2000c001 0x04200784 0x0c011001 0xe4200780 - 0x1000c201 0x0423c784 0x2000c001 0x04200784 - 0x307c03fd 0x640147c8 0x0c011001 0xe4200780 - 0x1008e003 0x00000280 0xd41c680d 0x20000780 - 0x1d00ec08 0x1d00e400 0x2c40c209 0x04208780 - 0x40050021 0x00000780 0x60040221 0x00020780 - 0x30101021 0xc4100780 0x3c82c1fd 0x6c6147c8 - 0x60040021 0x00020780 0xa008b003 0x00000000 - 0x10061003 0x00000280 0xd41c4005 0x20000780 - 0x1400c001 0x0423c780 0x40050025 0x00000780 - 0x60040225 0x00024780 0x30101225 0xc4100780 - 0x60040001 0x00024780 0xd8044005 0x20000780 - 0x2400c001 0x04200780 0x20068001 0x00000003 - 0x1008b003 0x00000780 0xd41c680d 0x20000780 - 0x3c83c1fd 0x6c6147c8 0xa008a003 0x00000000 - 0x10075003 0x00000280 0xd41c4005 0x20000780 - 0x2502e608 0x1500e000 0x3002cc25 0xc4300780 - 0x40050029 0x00000780 0x301f122d 0xec100780 - 0x60040229 0x00028780 0xd084162d 0x04400780 - 0x30101429 0xc4100780 0x20001625 0x04024780 - 0x60040001 0x00028780 0x30011209 0xec100780 - 0x20000001 0x04008780 0xd8044005 0x20000780 - 0x2400c001 0x04200780 0x200f8001 0x00000003 - 0x1008a003 0x00000780 0xd41c680d 0x20000780 - 0x3c7cc1fd 0x6c2147c8 0xa0089003 0x00000000 - 0x10083003 0x00000280 0xd804400d 0x20000780 - 0xd41c7805 0x20000780 0x3c7cc1fd 0x6c2087c8 - 0x2501e001 0x00000003 0x10000801 0x2440c280 + 0x20000211 0x04000780 0x103f8001 0x07ffffff + 0x00020805 0xc0000780 0x307ccffd 0x6c20c7c8 + 0x04011001 0xe4200780 0x00070609 0xc0000780 + 0x10090003 0x00000280 0xa0004415 0x04200780 + 0x1000f819 0x0403c780 0x20000c1d 0x0400c780 + 0x3007cffd 0x6420c7c8 0xa008d003 0x00000000 + 0x1008d003 0x00000280 0x1000ce01 0x0423c780 + 0x40014e09 0x00200780 0x30100409 0xc4100780 + 0x60004e21 0x00208780 0x30070601 0xc4100780 + 0x30070c2d 0xc4100780 0x30060c31 0xc4100780 + 0x30060629 0xc4100780 0x30071025 0xc4100780 + 0x30061009 0xc4100780 0x200c962c 0x200a8000 + 0x20029224 0x210be828 0x30020209 0xc4100780 + 0x200a802c 0x20098428 0x20028024 0x200b9408 + 0xd00e0409 0x80c00780 0x00000005 0xc0000780 + 0x0000120d 0xc0000780 0x307ccdfd 0x6c20c7c8 + 0x0c051001 0xe4208780 0x1000f809 0x0403c780 + 0x10038003 0x00000280 0xa0037003 0x00000000 + 0x10008200 0x2101ec24 0xa0004229 0x04200780 + 0x3000cdfd 0x6420c7c8 0xa0034003 0x00000000 + 0x10034003 0x00000280 0x20000e2d 0x04020780 + 0x3006162d 0xc4100780 0x2000002d 0x0402c780 + 0x3002162d 0xc4100780 0x2000ca2d 0x0422c780 + 0xd00e162d 0x80c00780 0x20001609 0x04008780 + 0x20000001 0x04028782 0x300901fd 0x6c0047c8 + 0x1002a003 0x00000280 0xf0000001 0xe0000002 + 0x0002080d 0xc0000780 0x0c031001 0xe4208780 + 0xdc0c4011 0x20000780 0x1000e001 0x0423c784 + 0x2000c001 0x04200784 0x0c031001 0xe4200780 + 0x1000d001 0x0423c784 0x2000c001 0x04200784 + 0x0c031001 0xe4200780 0x1000c801 0x0423c784 + 0x2000c001 0x04200784 0x0c031001 0xe4200780 + 0x1000c401 0x0423c784 0x2000c001 0x04200784 + 0x0c031001 0xe4200780 0x1000c201 0x0423c784 + 0x2000c001 0x04200784 0x307c03fd 0x640147c8 + 0x0c031001 0xe4200780 0x1008d003 0x00000280 + 0xd414680d 0x20000780 0x1d00ec08 0x1d00e400 + 0x2c40c209 0x04208780 0x40050021 0x00000780 + 0x60040221 0x00020780 0x30101021 0xc4100780 + 0x3c81c1fd 0x6c6147c8 0x60040021 0x00020780 + 0xa008a003 0x00000000 0x10060003 0x00000280 + 0xd4144005 0x20000780 0x1400c001 0x0423c780 0x40050025 0x00000780 0x60040225 0x00024780 0x30101225 0xc4100780 0x60040001 0x00024780 - 0x10089003 0x00000780 0xd41c7805 0x20000780 - 0x1400c001 0x0423c780 0x40050025 0x00000780 + 0xd80c4005 0x20000780 0x2400c001 0x04200780 + 0x20068001 0x00000003 0x1008a003 0x00000780 + 0xd414680d 0x20000780 0x3c82c1fd 0x6c6147c8 + 0xa0089003 0x00000000 0x10074003 0x00000280 + 0xd4144005 0x20000780 0x2502e608 0x1500e000 + 0x3002cc25 0xc4300780 0x40050029 0x00000780 + 0x301f122d 0xec100780 0x60040229 0x00028780 + 0xd083162d 0x04400780 0x30101429 0xc4100780 + 0x20001625 0x04024780 0x60040001 0x00028780 + 0x30011209 0xec100780 0x20000001 0x04008780 + 0xd80c4005 0x20000780 0x2400c001 0x04200780 + 0x200f8001 0x00000003 0x10089003 0x00000780 + 0xd414680d 0x20000780 0x3c7cc1fd 0x6c2147c8 + 0xa0088003 0x00000000 0x10082003 0x00000280 + 0xd80c400d 0x20000780 0xd4147805 0x20000780 + 0x3c7cc1fd 0x6c2087c8 0x2501e001 0x00000003 + 0x10000601 0x2440c280 0x40050025 0x00000780 0x60040225 0x00024780 0x30101225 0xc4100780 - 0x60040001 0x00024780 0xf0000001 0xe0000002 - 0xf0000001 0xe0000002 0x30080001 0xac000782 - 0x00020e05 0xc0000780 0x04051001 0xe4200780 - 0x20000c19 0x04014782 0x3006cffd 0x6c2107c8 - 0x1000e003 0x00000280 0x861ffe03 0x00000000 - 0x308509fd 0x6c4107c8 0xa009d003 0x00000000 - 0x1009d003 0x00000280 0x00020805 0xc0000780 - 0xd418400d 0x20000780 0xd4144009 0x20000780 - 0x1c00c001 0x0423c780 0x3800c1fd 0x6c2107c8 - 0x20008801 0x0000000b 0x10000801 0x0403c500 - 0x04001001 0xe4200780 0xf0000001 0xe0000002 - 0x861ffe03 0x00000000 0x308609fd 0x6c4107c8 - 0xa00ae003 0x00000000 0x100ae003 0x00000280 - 0x00020805 0xc0000780 0xd4024009 0x20000780 - 0x0802c00d 0xc0200780 0x0402d011 0xc0200780 - 0xdc14400d 0x20000780 0x1400d001 0x0423c780 - 0xd0144011 0x20000784 0x1d00e004 0x2940e000 - 0x3001c005 0x6c20c784 0xd0010001 0x04020780 - 0x2400d001 0x04200780 0x04001001 0xe4200780 - 0xf0000001 0xe0000002 0x861ffe03 0x00000000 - 0x308709fd 0x6c4107c8 0xa00f3003 0x00000000 - 0x100f3003 0x00000280 0x00020805 0xc0000780 - 0xd4014009 0x20000780 0x0802c00d 0xc0200780 - 0x0402d011 0xc0200780 0xdc14400d 0x20000780 - 0x1400d001 0x0423c780 0xd0144011 0x20000784 - 0x1d00e004 0x2940e000 0x3001c005 0x6c20c784 - 0xd0010001 0x04020780 0x2400d001 0x04200780 - 0x04001001 0xe4200780 0x0402f00d 0xc0200780 - 0x0402d009 0xc0200780 0xdc14400d 0x20000780 - 0x1400d001 0x0423c780 0xd8144009 0x20000780 - 0x1c00c005 0x0423c780 0x2440f001 0x04200780 - 0x3801c005 0x6c20c780 0xd0010001 0x04020780 - 0x2400d001 0x04200780 0x04001001 0xe4200780 - 0x0402e00d 0xc0200780 0x0402d009 0xc0200780 - 0xdc14400d 0x20000780 0x1400d001 0x0423c780 - 0xd8144009 0x20000780 0x1c00c005 0x0423c780 - 0x2440e001 0x04200780 0x3801c005 0x6c20c780 - 0xd0010001 0x04020780 0x2400d001 0x04200780 - 0x04001001 0xe4200780 0x0402d80d 0xc0200780 - 0x0402d009 0xc0200780 0xdc14400d 0x20000780 - 0x1400d001 0x0423c780 0xd8144009 0x20000780 - 0x1d00e004 0x2540f800 0x3801c005 0x6c20c780 - 0xd0010001 0x04020780 0x2400d001 0x04200780 - 0x04001001 0xe4200780 0x0402d40d 0xc0200780 - 0x0402d009 0xc0200780 0xdc14400d 0x20000780 - 0x1400d001 0x0423c780 0xd8144009 0x20000780 - 0x1d00e004 0x2540f400 0x3801c005 0x6c20c780 - 0xd0010001 0x04020780 0x2400d001 0x04200780 - 0x04001001 0xe4200780 0x0402d20d 0xc0200780 - 0x0402d009 0xc0200780 0xdc14400d 0x20000780 - 0x1400d001 0x0423c780 0xd8144009 0x20000780 - 0x1d00e004 0x2540f200 0x3801c005 0x6c20c780 - 0xd0010001 0x04020780 0x2400d001 0x04200780 - 0x04001001 0xe4200780 0xf0000001 0xe0000002 - 0x861ffe03 0x00000000 0x307c09fd 0x6c0147c8 - 0xa0102003 0x00000000 0x10102003 0x00000280 - 0x1000ce01 0x0423c780 0x40014e05 0x00200780 - 0x30100205 0xc4100780 0x60004e01 0x00204780 - 0x30070005 0xc4100780 0x30060009 0xc4100780 - 0x20028204 0x2101e804 0x2000d009 0x04200780 - 0x20208201 0x00000003 0xd00e0009 0xa0c00780 - 0x3004cffd 0x6c20c7ca 0x30000003 0x00000280 + 0x60040001 0x00024780 0x10088003 0x00000780 + 0xd4147805 0x20000780 0x1400c001 0x0423c780 + 0x40050025 0x00000780 0x60040225 0x00024780 + 0x30101225 0xc4100780 0x60040001 0x00024780 + 0xf0000001 0xe0000002 0xf0000001 0xe0000002 + 0x30080001 0xac000782 0x00020e05 0xc0000780 + 0x04011001 0xe4200780 0x20000c19 0x04014782 + 0x3006cffd 0x6c2107c8 0x1000d003 0x00000280 + 0x861ffe03 0x00000000 0x3004cffd 0x6c20c7c8 + 0xa00a1003 0x00000000 0x100a1003 0x00000280 0x1000ce01 0x0423c780 0x40014e05 0x00200780 0x30100205 0xc4100780 0x60004e01 0x00204780 0x20000001 0x04010780 0x30070005 0xc4100780 0x30060001 0xc4100780 0x00020805 0xc0000780 - 0x20000201 0x04000780 0xd4144005 0x20000780 + 0x20000201 0x04000780 0xd4044005 0x20000780 0x2100e804 0x1500e000 0x20108205 0x00000003 - 0xd00e0201 0xa0c00781 + 0xd00e0201 0xa0c00780 0xf0000001 0xe0000002 + 0x861ffe03 0x00000000 0x00020805 0xc0000780 + 0xd4044005 0x20000780 0x308409fd 0x6c4107c8 + 0xa00b4003 0x00000000 0x1400c001 0x0423c780 + 0x100b4003 0x00000280 0x00020805 0xc0000780 + 0xd408400d 0x20000780 0xd4044009 0x20000780 + 0x1c00c001 0x0423c780 0x3800c1fd 0x6c2107c8 + 0x1c00c001 0x0423c780 0x20008805 0x0000000b + 0x3800c001 0xac200780 0x10000805 0x0403c500 + 0x04001001 0xe4204780 0x04011001 0xe4200780 + 0xf0000001 0xe0000002 0x861ffe03 0x00000000 + 0x308509fd 0x6c4107c8 0xa00c3003 0x00000000 + 0x100c3003 0x00000280 0x00020805 0xc0000780 + 0xd4064009 0x20000780 0x20008805 0x00000007 + 0x3800c1fd 0x6c2047c8 0x10000805 0x0403c500 + 0x0002020d 0xc0000780 0x3800c001 0xac200780 + 0x1c00d005 0x0423c780 0x04001001 0xe4204780 + 0x04011001 0xe4200780 0xf0000001 0xe0000002 + 0x861ffe03 0x00000000 0x308609fd 0x6c4107c8 + 0x30000003 0x00000280 0x20288805 0x0000000b + 0x00020209 0xc0000780 0x20208805 0x00000003 + 0x3800c1fd 0x6c2047c8 0x10000805 0x0403c500 + 0x0002020d 0xc0000780 0x00020805 0xc0000780 + 0x3800c005 0xac200780 0x1c00d001 0x0423c780 + 0x20188809 0x0000000b 0x04001001 0xe4200780 + 0x00020409 0xc0000780 0x04011001 0xe4204780 + 0x20108801 0x00000003 0x3801c1fd 0x6c2047c8 + 0x10000801 0x0403c500 0x0002000d 0xc0000780 + 0x3801c005 0xac200780 0x1c00d001 0x0423c780 + 0x20108809 0x0000000b 0x04001001 0xe4200780 + 0x00020409 0xc0000780 0x04011001 0xe4204780 + 0x20088801 0x00000003 0x3801c1fd 0x6c2047c8 + 0x10000801 0x0403c500 0x0002000d 0xc0000780 + 0x3801c005 0xac200780 0x1c00d001 0x0423c780 + 0x200c8809 0x0000000b 0x04001001 0xe4200780 + 0x00020409 0xc0000780 0x04011001 0xe4204780 + 0x20048801 0x00000003 0x3801c1fd 0x6c2047c8 + 0x10000801 0x0403c500 0x0002000d 0xc0000780 + 0x3801c005 0xac200780 0x1c00d001 0x0423c780 + 0x200a8809 0x0000000b 0x04001001 0xe4200780 + 0x00020409 0xc0000780 0x04011001 0xe4204780 + 0x20028801 0x00000003 0x3801c1fd 0x6c2047c8 + 0x10000801 0x0403c500 0x0002000d 0xc0000780 + 0x3801c005 0xac200780 0x1c00d001 0x0423c780 + 0x04001001 0xe4200780 0x307c09fd 0x6c0147c8 + 0x04011001 0xe4204780 0x30000003 0x00000280 + 0x1000ce01 0x0423c780 0x40014e05 0x00200780 + 0xd0044005 0x20000780 0x30100209 0xc4100780 + 0x1400c205 0x0423c780 0x60004e09 0x00208780 + 0x3401c1fd 0x6c2107c8 0x10048011 0x00000003 + 0x10208001 0x00000003 0x30070405 0xc4100780 + 0x3006040d 0xc4100780 0x21000801 0x04408280 + 0x20000205 0x0400c780 0x00000005 0xc0000780 + 0x2101e800 0x2502e004 0x20208001 0x00000003 + 0xd00e0005 0xa0c00781 } } code { @@ -737,45 +734,55 @@ code { 0xa0004e01 0x04200780 0x30070005 0xc4100780 0x30060001 0xc4100780 0x20000201 0x04000780 0x30020405 0xc4100780 0x2100ec00 0x20008200 - 0x00020405 0xc0000780 0xd00e0001 0x80c00780 + 0xd00e0001 0x80c00780 0x00020405 0xc0000780 0x04024e01 0xe4200780 0xf0000001 0xe0000002 0x861ffe03 0x00000000 0xa0004205 0x04200780 0x40034c01 0x00200780 0x30100001 0xc4100780 0xd0093805 0x20000780 0x60024c0d 0x00200780 0x2501e000 0x2543ee10 0x30040011 0xac000780 - 0x300209fd 0x6c00c7c8 0xa0021003 0x00000000 - 0x10020003 0x00000280 0xd0094005 0x20000780 + 0x300209fd 0x6c00c7c8 0xa0022003 0x00000000 + 0x10021003 0x00000280 0xd0094005 0x20000780 0x2503e000 0x20008400 0x30020001 0xc4100780 - 0x2100ea14 0x1500f400 0xd00e0a15 0x80c00780 - 0x30000a01 0xec000780 0x10021003 0x00000780 - 0x1000f801 0x0403c780 0x00020405 0xc0000782 - 0x308105fd 0x6c4107c8 0x04000e01 0xe4200780 - 0xa0035003 0x00000000 0x10035003 0x00000280 - 0x20000201 0x04008780 0x300401fd 0x6c0187c8 - 0x00020005 0xc0000780 0xa0034003 0x00000000 - 0x10033003 0x00000280 0xd0094009 0x20000780 - 0x20028200 0x2903e010 0x20000001 0x04010780 - 0x30020001 0xc4100780 0x2100ea10 0x1900f400 - 0xd00e0811 0x80c00780 0x30000801 0xec000780 - 0x10034003 0x00000780 0x1000f801 0x0403c780 - 0x04000e01 0xe4200782 0xd0093805 0x20000782 - 0x2543ee00 0x3500e000 0x30000201 0xac000780 - 0x307c0011 0x8c000780 0x861ffe03 0x00000000 - 0xd0093805 0x20000780 0x347cc1fd 0x6c20c7c8 - 0x1000f805 0x0403c780 0x1400c001 0x0423c780 - 0x1004d003 0x00000280 0x101c8001 0x00000003 - 0x00000005 0xc0000780 0x1000f815 0x0403c780 - 0x20000a01 0x04008780 0xd409800d 0x20000780 - 0x00020009 0xc0000780 0xd0093811 0x20000780 - 0x20018a15 0x00000003 0x1c00c001 0x0423c780 - 0x3005c1fd 0x6c2147cc 0x6800ce05 0x80204780 - 0xd4000805 0x20000780 0x1000c001 0x0423c784 - 0x10042003 0x00000280 0x300209fd 0x6c00c7c8 - 0x30000003 0x00000280 0xd0094805 0x20000780 - 0x2503f00c 0x20008410 0x1500e000 0x20038408 - 0x00020805 0xc0000780 0x30000205 0xec000780 - 0x30020401 0xc4100780 0x2541ee04 0x2100e800 - 0xd00e0005 0xa0c00781 + 0x2000ca01 0x04200780 0xd00e0015 0x80c00780 + 0x1400d401 0x0423c780 0x30000a01 0xec000780 + 0x10022003 0x00000780 0x1000f801 0x0403c780 + 0x00020405 0xc0000782 0x308105fd 0x6c4107c8 + 0x04000e01 0xe4200780 0xa0037003 0x00000000 + 0x10037003 0x00000280 0x20000201 0x04008780 + 0x300401fd 0x6c0187c8 0x00020009 0xc0000780 + 0xa0036003 0x00000000 0x10035003 0x00000280 + 0xd009400d 0x20000780 0x20028200 0x2d03e010 + 0x20000001 0x04010780 0x30020001 0xc4100780 + 0x2000ca01 0x04200780 0xd00e0011 0x80c00780 + 0x1c00d401 0x0423c780 0x30000801 0xec000780 + 0x10036003 0x00000780 0x1000f801 0x0403c780 + 0x08000e01 0xe4200782 0xd0093809 0x20000782 + 0x2943ee00 0x3900e000 0x30000201 0xac000780 + 0x307c0005 0x8c000780 0x861ffe03 0x00000000 + 0xd0093809 0x20000780 0x387cc1fd 0x6c20c7c8 + 0x1000f811 0x0403c780 0x1004d003 0x00000280 + 0x101c8001 0x00000003 0x0000000d 0xc0000780 + 0x1000f815 0x0403c780 0x20000a01 0x04008780 + 0xd0093811 0x20000780 0x20018a15 0x00000003 + 0xdc098009 0x20000780 0x3005c1fd 0x6c2147cc + 0x00020011 0xc0000780 0x1800c001 0x0423c780 + 0x6000ce11 0x80210784 0xdc00080d 0x20000780 + 0x10043003 0x00000280 0x861ffe03 0x00000000 + 0xd0093809 0x20000780 0x2902e014 0x1900e400 + 0x00020a09 0xc0000780 0x30000801 0xec000780 + 0x2840ce01 0x04200780 0x08000e01 0xe4200780 + 0x861ffe03 0x00000000 0xd0093809 0x20000780 + 0x2800c001 0x04204780 0x300201fd 0x6c0107c8 + 0x3802c1fd 0x6c20c2c8 0xa0061003 0x00000000 + 0x10061003 0x00000100 0xd0093809 0x20000780 + 0x2903f400 0x20008400 0x30020011 0xc4100780 + 0x1500ee00 0x2104e810 0xd00e0801 0xa0c00780 + 0x2800c001 0x04204780 0xf0000001 0xe0000002 + 0x20008405 0x00000013 0x300101fd 0x6c00c7c8 + 0x30000003 0x00000280 0xd0098809 0x20000780 + 0x2903e000 0x20008400 0x30020001 0xc4100780 + 0xd4083805 0x20000780 0x2100e804 0x1500e000 + 0x20008205 0x00000043 0xd00e0201 0xa0c00781 } } code { @@ -820,68 +827,67 @@ code { segname = const segnum = 1 offset = 0 - bytes = 28 + bytes = 16 mem { - 0x00000010 0x000003ff 0x0000000e 0x00000008 - 0x00000004 0x00000002 0x00000001 + 0x00000010 0x000003ff 0x0000000e 0x00000001 } } bincode { 0x10028009 0x00000003 0x1000cc05 0x0423c780 0x30010409 0xc4000780 0x10000005 0x0403c780 - 0x30800409 0xac400780 0xa0000401 0x04000780 - 0xd0820609 0x00400780 0x300005fd 0x640107c8 - 0xa000040d 0x04000780 0x308207fd 0x6440c2c8 + 0x3080040d 0xac400780 0xa0000401 0x04000780 + 0xd0820609 0x00400780 0x300007fd 0x640107c8 + 0xa0000411 0x04000780 0x308209fd 0x6440c2c8 0xa001b003 0x00000000 0x1001a003 0x00000100 - 0x2101ec11 0x00000003 0x100f8005 0x00000003 - 0x30040205 0xc4000780 0x40034e15 0x00200780 + 0x2101ec09 0x00000003 0x100f8005 0x00000003 + 0x30020205 0xc4000780 0x40034e15 0x00200780 0x30100a15 0xc4100780 0x60024e15 0x00214780 - 0x30040605 0xc4000780 0x20000a11 0x04004780 - 0x60804c05 0x00600780 0x20000205 0x04010780 + 0x30020805 0xc4000780 0x20000a09 0x04004780 + 0x60804c05 0x00600780 0x20000205 0x04008780 0x30020205 0xc4100780 0x2000ca05 0x04204780 - 0xd00e0205 0x80c00780 0x1001b003 0x00000780 - 0x103f8005 0x000fffff 0x30040611 0xc4100782 - 0x20000811 0x04000780 0x00020805 0xc0000780 - 0x04020e01 0xe4204780 0x861ffe03 0x00000000 - 0x30040005 0xc4100780 0x20000605 0x04004780 - 0x00020205 0xc0000780 0xd4083805 0x20000780 - 0x00020809 0xc0000780 0x1400c005 0x0423c780 - 0x08000e01 0xe4204780 0x861ffe03 0x00000000 - 0x00020805 0xc0000780 0x1400de05 0x0423c780 - 0x3401ce05 0x6c210780 0xd0830205 0x04400780 - 0x20008214 0x20018804 0x04020e01 0xe4214780 - 0x00020209 0xc0000780 0x1800ce05 0x0423c780 - 0x04000e01 0xe4204780 0x1400d605 0x0423c780 - 0x3401ce05 0x6c210780 0xd0840205 0x04400780 - 0x20000805 0x04004780 0x00020209 0xc0000780 - 0xd808380d 0x20000780 0x1c00c005 0x0423c780 - 0x04020e01 0xe4204780 0x1800ce05 0x0423c780 - 0x04000e01 0xe4204780 0x1400d205 0x0423c780 - 0x3401ce05 0x6c210780 0xd0850205 0x04400780 - 0x20000805 0x04004780 0x00020209 0xc0000780 - 0xd808380d 0x20000780 0x1c00c005 0x0423c780 - 0x04020e01 0xe4204780 0x1800ce05 0x0423c780 - 0x307c0001 0x64008780 0x04000e01 0xe4204780 - 0x30030415 0x64010780 0xd0860005 0x04400780 - 0x1400d001 0x0423c780 0xd0860a15 0x04400780 - 0x3400ce01 0x6c210780 0xd0050205 0x040007c0 - 0xa005c003 0x00000000 0xa0000015 0x2c014780 - 0x1005c003 0x00000100 0x40054c1d 0x00200780 - 0xa0004e01 0x04200780 0x2102ec19 0x00000003 - 0x30100e1d 0xc4100780 0x30060001 0xc4000780 - 0x60044c19 0x0021c780 0x20048a1c 0x20008c00 - 0x00020e05 0xc0000780 0x20000601 0x04000780 - 0xd4083805 0x20000780 0x30020019 0xc4100780 - 0x1500e000 0x2106e818 0xd00e0c01 0xa0c00780 - 0x307c03fd 0x6c0087ca 0x30000003 0x00000280 - 0x2101ec05 0x00000003 0x40054c21 0x00200780 - 0x10018001 0x00000003 0xa0004e19 0x04200780 - 0x2102ec1d 0x00000003 0x30101021 0xc4100780 - 0x30010001 0xc4000780 0x30070c05 0xc4000780 - 0x60044c09 0x00220780 0x20018000 0x20038404 - 0x20048a08 0x20008200 0x00020405 0xc0000780 - 0x30020005 0xc4100780 0x1500ee00 0x2101e804 - 0xd00e0201 0xa0c00781 + 0xd00e0209 0x80c00780 0x1001b003 0x00000780 + 0x103f8009 0x000fffff 0x30040805 0xc4100782 + 0x20000205 0x04000780 0x00020205 0xc0000780 + 0x04000e01 0xe4208780 0x861ffe03 0x00000000 + 0x30040009 0xc4100780 0x20000809 0x04008780 + 0x00020405 0xc0000780 0x1400ce09 0x0423c780 + 0x861ffe03 0x00000000 0x00020205 0xc0000780 + 0x04000e01 0xe4208780 0x861ffe03 0x00000000 + 0x00020205 0xc0000780 0x20088015 0x00000003 + 0x3402dffd 0x6c2047c8 0x3402de19 0xac200780 + 0x10000015 0x0403c500 0x2004821d 0x00000003 + 0x04020e01 0xe4214780 0x00020e09 0xc0000780 + 0x04000e01 0xe4218780 0x10000209 0x0403c780 + 0x3806cffd 0x6c2047c8 0x10000e09 0x0403c280 + 0x0002040d 0xc0000780 0xdc08380d 0x20000780 + 0x3806ce15 0xac200780 0x1c00c009 0x0423c780 + 0x20028219 0x00000003 0x04020e01 0xe4208780 + 0x00020c09 0xc0000780 0x04000e01 0xe4214780 + 0x10000209 0x0403c780 0x3805cffd 0x6c2047c8 + 0x10000c09 0x0403c280 0x0002040d 0xc0000780 + 0xdc08380d 0x20000780 0x3805ce15 0xac200780 + 0x1c00c009 0x0423c780 0x307c0001 0x64008780 + 0x30040619 0x64010780 0x04020e01 0xe4208780 + 0xd0830001 0x04400780 0xd0830c09 0x04400780 + 0x04000e01 0xe4214780 0xd0020009 0x040007c0 + 0xa005d003 0x00000000 0x1400d001 0x0423c780 + 0x1005d003 0x00000100 0x40074c01 0x00200780 + 0xa0004e19 0x04200780 0x2102ec1d 0x00000003 + 0x00020205 0xc0000780 0x30100001 0xc4100780 + 0x30070c19 0xc4000780 0x3405d01d 0x6c204780 + 0x60064c01 0x00200780 0x20478204 0x20068000 + 0x00020209 0xc0000780 0x20000801 0x04000780 + 0xd8083809 0x20000780 0x30020005 0xc4100780 + 0x1900e000 0x2101e804 0xd00e0201 0xa0c00780 + 0x1400d001 0x0423c780 0x307c05fd 0x6c0087ca + 0x30000003 0x00000280 0x2101ec09 0x00000003 + 0x40074c21 0x00200780 0x10018005 0x00000003 + 0xa0004e19 0x04200780 0x2102ec1d 0x00000003 + 0x30101021 0xc4100780 0x30020205 0xc4000780 + 0x30070c09 0xc4000780 0x60064c0d 0x00220780 + 0x20028204 0x20048608 0x20000405 0x04004780 + 0x30020205 0xc4100780 0x30000a01 0xac000780 + 0x2000c805 0x04204780 0xd00e0201 0xa0c00781 } } code { @@ -1154,7 +1160,7 @@ code { name = cudaCalcPartition lmem = 0 smem = 2280 - reg = 11 + reg = 12 bar = 1 const { segname = const @@ -1168,71 +1174,51 @@ code { bincode { 0xd0800205 0x00400780 0xa0000211 0x04000780 0xa0000019 0x04000780 0x30040801 0xc4100780 - 0x20000c15 0x04000780 0x30810bfd 0x644107c8 - 0xa0011003 0x00000000 0x30020a1d 0xc4100780 + 0x20000c25 0x04000780 0x308113fd 0x644107c8 + 0xa0011003 0x00000000 0x3002121d 0xc4100780 0x10011003 0x00000280 0xa0004e01 0x04200780 0x30070005 0xc4100780 0x30060001 0xc4100780 0x20008200 0x2100ec00 0x20000e01 0x04000780 0xd00e0001 0x80c00780 0x00000e05 0xc0000780 0x04041401 0xe4200780 0xf0000001 0xe0000002 - 0x861ffe03 0x00000000 0xa0004c0d 0x04200780 - 0x1000d201 0x0423c780 0x40060209 0x00000780 - 0x10018021 0x00000003 0x1000ce05 0x0423c780 - 0x30100409 0xc4100780 0x30011005 0xc4000780 - 0x60060021 0x00008780 0x30019004 0x1100f000 - 0x3001d205 0xa4200780 0x40030009 0x00000780 - 0x60020209 0x00008780 0x30100409 0xc4100780 - 0x60020001 0x00008780 0xd0105005 0x20000780 - 0x3400c001 0x04200780 0x30000bfd 0x6c0187c8 - 0xa0037003 0x00000000 0x10036003 0x00000280 - 0x1000d001 0x0423c780 0x40070009 0x00000780 - 0x60060209 0x00008780 0x30100409 0xc4100780 - 0x60060009 0x00008780 0x1000d201 0x0423c780 - 0x40050025 0x00000780 0x60040225 0x00024780 - 0x30101225 0xc4100780 0x60040001 0x00024780 - 0xd010a005 0x20000780 0x2500e000 0x20008a00 - 0x30020001 0xc4100780 0x2000ca01 0x04200780 - 0xd00e0001 0x80c00780 0x10037003 0x00000780 - 0x1000f801 0x0403c780 0x00000e05 0xc0000782 + 0x861ffe03 0x00000000 0xa0004c09 0x04200780 + 0x1100f004 0x1100f200 0x40040214 0x4003082c + 0x10018029 0x00000003 0x1000ce0d 0x0423c780 + 0x30100a21 0xc4100780 0x30101615 0xc4100780 + 0x3003140d 0xc4000780 0x60040021 0x00020780 + 0x60020815 0x00014780 0x3003900c 0x1100f004 + 0x400a0229 0x00000780 0x3003d20d 0xa4200780 + 0x600b0029 0x00028780 0x40030c2d 0x00000780 + 0x30101429 0xc4100780 0x60020e2d 0x0002c780 + 0x600a0001 0x00028780 0xd0105005 0x20000780 + 0x30101615 0xc4100780 0x20000001 0x04024780 + 0x60020c05 0x00014780 0x3400c1fd 0x6c20c7c8 + 0x300113fd 0x6c0042c8 0xd010a005 0x20000780 + 0x2400c001 0x04200680 0x30020001 0xc4100680 + 0x2000ca01 0x04200680 0xd00e0001 0x80c00680 + 0x1000f801 0x0403c100 0x301f0005 0xec100780 + 0x30010001 0xc4100780 0xd0000201 0x04008780 + 0x00000e05 0xc0000780 0x30820001 0xac400780 + 0x04001401 0xe4200780 0xd0105009 0x20000780 + 0x20018c05 0x00000003 0x1000d001 0x0423c780 + 0x1100f014 0x40010424 0x3800c015 0x04214780 + 0x60000629 0x00024780 0x400b0425 0x00000780 + 0x30101429 0xc4100780 0x600a0625 0x00024780 + 0x200005fd 0x040107c8 0x60000401 0x00028780 + 0x30101209 0xc4100780 0x600a0401 0x00008100 0x04021401 0xe4200780 0x861ffe03 0x00000000 - 0xd0105005 0x20000780 0x3405c1fd 0x6c2107c8 - 0xa005a003 0x00000000 0x10044003 0x00000280 - 0xd0105005 0x20000780 0x3400c001 0x04214780 - 0x00020005 0xc0000780 0xd4085005 0x20000780 - 0x1400c001 0x0423c780 0x1005a003 0x00000780 - 0x307c07fd 0x640087c8 0xa0059003 0x00000000 - 0x10058003 0x00000280 0x1000d001 0x0423c780 - 0x40070009 0x00000780 0x60060209 0x00008780 - 0x30100409 0xc4100780 0x60060009 0x00008780 - 0x1000d201 0x0423c780 0x40050025 0x00000780 - 0x60040225 0x00024780 0x30101225 0xc4100780 - 0x60040001 0x00024780 0xd0105005 0x20000780 - 0x2500f400 0x20008a00 0x3400c001 0x04200780 - 0x30020001 0xc4100780 0x2000ca01 0x04200780 - 0xd00e0001 0x80c00780 0x10059003 0x00000780 - 0x1000f801 0x0403c780 0xf0000001 0xe0000002 - 0x301f0009 0xec100782 0x30010001 0xc4100780 - 0xd0000401 0x04008780 0x00000e05 0xc0000780 - 0x30820001 0xac400780 0x04001401 0xe4200780 - 0x861ffe03 0x00000000 0xd0105005 0x20000780 - 0x20018c09 0x00000003 0x1100f000 0x1100f014 - 0x3505e014 0x40000a24 0x60010825 0x00024780 - 0x400b0829 0x00000780 0x30101225 0xc4100780 - 0x600a0a29 0x00028780 0x200007fd 0x040107c8 - 0x60000801 0x00024780 0x3010140d 0xc4100780 - 0x600a0801 0x0000c100 0x00000e05 0xc0000780 - 0x307cd1fd 0x6c20c7c8 0x04021401 0xe4200780 - 0x10000009 0x0403c780 0x10083003 0x00000280 - 0x1000d001 0x0423c780 0x4009000d 0x00000780 - 0x6008020d 0x0000c780 0x3010060d 0xc4100780 - 0x60080001 0x0000c780 0x200a800d 0x00000003 - 0x00020605 0xc0000780 0xa0082003 0x00000000 - 0x2000d00d 0x04200780 0x3606c215 0xec200780 - 0x20018001 0x00000003 0x20000409 0x04014780 - 0x00000e09 0xc0000780 0x300301fd 0x6c0147c8 - 0x08021401 0xe4208780 0x1007b003 0x00000280 + 0x307cd1fd 0x6c20c7c8 0x1005b003 0x00000280 + 0x1000d001 0x0423c780 0x40090005 0x00000780 + 0x60080205 0x00004780 0x30100205 0xc4100780 + 0x60080005 0x00004780 0x00000e05 0xc0000780 + 0x200a8201 0x00000003 0xd4085009 0x20000780 + 0x00020005 0xc0000780 0xa005a003 0x00000000 + 0x1900e000 0x2101f008 0x3606c215 0xec200780 + 0x20018205 0x00000003 0x20000001 0x04014780 + 0x00000e09 0xc0000780 0x300203fd 0x6c0147c8 + 0x08021401 0xe4200780 0x10053003 0x00000280 0xf0000001 0xe0000002 0x861ffe03 0x00000000 - 0x300603fd 0x640107c8 0x308309fd 0x6440c2c8 + 0x300607fd 0x640107c8 0x308309fd 0x6440c2c8 0x30000003 0x00000100 0x2101ee05 0x00000003 0x100f8001 0x00000003 0x30010001 0xc4000780 0x40014e09 0x00200780 0x30100409 0xc4100780 @@ -1269,7 +1255,7 @@ code { code { name = cudaCalcLargePartition lmem = 0 - smem = 3304 + smem = 2280 reg = 10 bar = 1 const { @@ -1284,66 +1270,51 @@ code { } bincode { 0xd0800205 0x00400780 0xa000020d 0x04000780 - 0xa0000005 0x04000780 0x30040601 0xc4100780 - 0x20000215 0x04000780 0x30810bfd 0x644107c8 - 0xa0011003 0x00000000 0x30020a11 0xc4100780 - 0x10011003 0x00000280 0xa0004e01 0x04200780 - 0x30070009 0xc4100780 0x30060001 0xc4100780 - 0x20008400 0x2100ec00 0x20000801 0x04000780 - 0xd00e0001 0x80c00780 0x00000805 0xc0000780 - 0x04061401 0xe4200780 0xf0000001 0xe0000002 + 0xa0000001 0x04000780 0x30040605 0xc4100780 + 0x20000005 0x04004780 0x308103fd 0x644107c8 + 0xa0011003 0x00000000 0x30020211 0xc4100780 + 0x10011003 0x00000280 0xa0004e09 0x04200780 + 0x30070415 0xc4100780 0x30060409 0xc4100780 + 0x20028a08 0x2102ec08 0x20000809 0x04008780 + 0xd00e0409 0x80c00780 0x00000805 0xc0000780 + 0x04041401 0xe4208780 0xf0000001 0xe0000002 0x861ffe03 0x00000000 0x00000805 0xc0000780 - 0x307cd1fd 0x6c20c7c8 0x04041401 0xe43f0780 - 0x1006b003 0x00000280 0x3082d019 0xac600780 - 0x1000f81d 0x0403c780 0x307c0dfd 0x6c0107c8 - 0xd0185005 0x20000780 0x1500e000 0x20058e20 - 0x2040d001 0x04200780 0x300011fd 0x6c0187d8 - 0xa002c003 0x00000000 0x1002b003 0x00001280 - 0x1000d001 0x0423c780 0x40014c09 0x00200780 - 0x30100409 0xc4100780 0x60004c09 0x00208780 - 0xd018a005 0x20000780 0x20058e00 0x2502e008 - 0x20000001 0x04008780 0x30020001 0xc4100780 - 0x2000ca01 0x04200780 0xd00e0001 0x80c00780 - 0x1002c003 0x00000780 0x1000f801 0x0403c780 - 0x00000805 0xc0000782 0x04001401 0xe4200780 - 0x861ffe03 0x00000000 0xd0185005 0x20000780 - 0x3405c1fd 0x6c2107d8 0xa004c003 0x00000000 - 0x10038003 0x00001280 0xd0185005 0x20000780 - 0x3400c001 0x04214780 0x00020005 0xc0000780 - 0x1400d401 0x0423c780 0x1004c003 0x00000780 - 0x307c0ffd 0x6c0147d8 0xa0004c09 0x04200780 - 0x307c05fd 0x64015158 0x3008d1fd 0x6c2112d8 - 0xa004b003 0x00000000 0x1004a003 0x00001100 - 0x1000d001 0x0423c780 0x40050021 0x00000780 - 0x60040221 0x00020780 0x30101021 0xc4100780 - 0x60040009 0x00020780 0xd0185005 0x20000780 - 0x20058e00 0x2502f408 0x20028000 0x3500e000 - 0x30020001 0xc4100780 0x2000ca01 0x04200780 - 0xd00e0001 0x80c00780 0x1004b003 0x00000780 - 0x1000f801 0x0403c780 0xf0000001 0xe0000002 - 0x301f0009 0xec100782 0x30010001 0xc4100780 - 0xd0000401 0x04008780 0x00000805 0xc0000780 - 0x30830001 0xac400780 0x04021401 0xe4200780 - 0x861ffe03 0x00000000 0x10063003 0x00000100 - 0x200a8201 0x00000003 0x00020005 0xc0000780 - 0xa0062003 0x00000000 0x10008200 0x20068208 - 0x00000809 0xc0000780 0xd4080011 0x20000780 - 0xd810500d 0x20000780 0x3003c021 0xec200784 - 0x20108001 0x00000003 0x2c00c021 0x04220780 - 0x300201fd 0x6c0047d8 0x08041401 0xe4220780 - 0xd4008005 0x20000780 0x10058003 0x00001280 + 0x307cd1fd 0x6c20c7c8 0x04021401 0xe43f0780 + 0x1004d003 0x00000280 0xa0004c15 0x04200780 + 0x1000d009 0x0423c780 0x400a0a19 0x00000780 + 0x30100c19 0xc4100780 0x600a0809 0x00018780 + 0x3082d015 0xac600780 0x2000d01d 0x04208780 + 0xa004c003 0x00000000 0x307c0bfd 0x6c0107c8 + 0x20028218 0x2007821c 0xd0105005 0x20000780 + 0x3406c1fd 0x6c20c7d8 0x3001d1fd 0x6c2112d8 + 0xa002d003 0x00000000 0x1002c003 0x00001100 + 0xd010a005 0x20000780 0x2400c009 0x04218780 + 0x30020409 0xc4100780 0x2000ca09 0x04208780 + 0xd00e0409 0x80c00780 0x1002d003 0x00000780 + 0x1000f809 0x0403c780 0x301f0421 0xec100782 + 0x30010409 0xc4100780 0xd0021009 0x04008780 + 0x00000805 0xc0000780 0x30830409 0xac400780 + 0x04001401 0xe4208780 0x861ffe03 0x00000000 + 0x10043003 0x00000100 0x200a8009 0x00000003 + 0x00020405 0xc0000780 0xa0042003 0x00000000 + 0x10008008 0x20058020 0x00000809 0xc0000780 + 0x3403c025 0xec200780 0xd808500d 0x20000780 + 0x20108409 0x00000003 0x2c00c025 0x04224780 + 0x300805fd 0x6c0047d8 0x08021401 0xe4224780 + 0xd4008005 0x20000780 0x10039003 0x00001280 0xf0000001 0xe0000002 0x00000805 0xc0000780 - 0xd4105009 0x20000780 0x3883c001 0xac600780 - 0x04041401 0xe4200780 0x861ffe03 0x00000000 - 0x20008e1d 0x00000013 0x3007d1fd 0x6c2107d8 - 0x1001a003 0x00001280 0x00000805 0xc0000780 - 0xd4105009 0x20000780 0x1900f000 0x2900e000 - 0x04041401 0xe4200780 0x1900e800 0x2900e000 - 0x04041401 0xe4200780 0x1900e400 0x2900e000 - 0x04041401 0xe4200780 0x1800c201 0x0423c780 - 0x307c03fd 0x640087c8 0x2800c001 0x04200780 - 0x308407fd 0x6440c2c8 0x04041401 0xe4200780 - 0x30000003 0x00000100 0xd0185005 0x20000780 + 0xd4085009 0x20000780 0x3883c009 0xac600780 + 0x04021401 0xe4208780 0x861ffe03 0x00000000 + 0x20008c19 0x00000013 0x30070dfd 0x6c0047d8 + 0x20008205 0x00000013 0x10021003 0x00001280 + 0xf0000001 0xe0000002 0x00000805 0xc0000780 + 0xd4085009 0x20000780 0x1900f004 0x2901e004 + 0x04021401 0xe4204780 0x1900e804 0x2901e004 + 0x04021401 0xe4204780 0x1900e404 0x2901e004 + 0x04021401 0xe4204780 0x1800c205 0x0423c780 + 0x307c01fd 0x640087c8 0x2800c001 0x04204780 + 0x308407fd 0x6440c2c8 0x04021401 0xe4200780 + 0x30000003 0x00000100 0xd0105005 0x20000780 0x2101ee19 0x00000003 0x100f8005 0x00000003 0x20018609 0x00000003 0x1000d001 0x0423c780 0x30060205 0xc4000780 0x1100f014 0x41032e1c @@ -1353,7 +1324,7 @@ code { 0x00000805 0xc0000780 0x30100e1d 0xc4100780 0x600a0a11 0x00018780 0x2000020d 0x0400c780 0xa0004c05 0x042007c0 0x60000801 0x0001c780 - 0x30100811 0xc4100780 0xd4105005 0x20000780 + 0x30100811 0xc4100780 0xd4085005 0x20000780 0x20000205 0x0400c780 0x600a0801 0x00010100 0x3483c009 0xac600780 0x30020205 0xc4100780 0x20028000 0x2101e804 0xd00e0201 0xa0c00781