diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu index fd166a1..cbe6ebb 100644 --- a/CUETools.FlaCuda/flacuda.cu +++ b/CUETools.FlaCuda/flacuda.cu @@ -51,8 +51,8 @@ extern "C" __global__ void cudaComputeAutocor( { __shared__ struct { float data[512]; - float product[256]; - float sum[33]; + volatile float product[256]; + volatile float sum[33]; computeAutocorTaskStruct task; } shared; const int tid = threadIdx.x; @@ -81,15 +81,17 @@ extern "C" __global__ void cudaComputeAutocor( //if (tid < 256) shared.product[tid] += shared.product[tid + 256]; __syncthreads(); if (tid < 128) shared.product[tid] += shared.product[tid + 128]; __syncthreads(); if (tid < 64) shared.product[tid] += shared.product[tid + 64]; __syncthreads(); - if (tid < 32) shared.product[tid] += shared.product[tid + 32]; __syncthreads(); - shared.product[tid] += shared.product[tid + 16]; - shared.product[tid] += shared.product[tid + 8]; - shared.product[tid] += shared.product[tid + 4]; - shared.product[tid] += shared.product[tid + 2]; - if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1]; + if (tid < 32) + { + shared.product[tid] += shared.product[tid + 32]; + shared.product[tid] += shared.product[tid + 16]; + shared.product[tid] += shared.product[tid + 8]; + shared.product[tid] += shared.product[tid + 4]; + shared.product[tid] += shared.product[tid + 2]; + if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1]; + } __syncthreads(); } - // return results if (tid <= max_order) output[(blockIdx.x + blockIdx.y * gridDim.x) * (max_order + 1) + tid] = shared.sum[tid]; @@ -152,17 +154,17 @@ extern "C" __global__ void cudaComputeLPC( // Schur recursion float reff = -shared.gen1[0] / error; //if (tid == 0) shared.reff[order] = reff; - error += shared.gen1[0] * reff; + error += __fmul_rz(shared.gen1[0], reff); if (tid < max_order - 1 - order) { - float g1 = shared.gen1[tid + 1] + reff * shared.gen0[tid]; - float g0 = shared.gen1[tid + 1] * reff + shared.gen0[tid]; + float g1 = shared.gen1[tid + 1] + __fmul_rz(reff, shared.gen0[tid]); + float g0 = __fmul_rz(shared.gen1[tid + 1], reff) + shared.gen0[tid]; shared.gen1[tid] = g1; shared.gen0[tid] = g0; } // Levinson-Durbin recursion - shared.ldr[tid] += (tid < order) * reff * shared.ldr[order - 1 - tid] + (tid == order) * reff; + shared.ldr[tid] += (tid < order) * __fmul_rz(reff, shared.ldr[order - 1 - tid]) + (tid == order) * reff; // Quantization int precision = 13; diff --git a/CUETools.FlaCuda/flacuda.cubin b/CUETools.FlaCuda/flacuda.cubin index c6c1117..101b459 100644 --- a/CUETools.FlaCuda/flacuda.cubin +++ b/CUETools.FlaCuda/flacuda.cubin @@ -5,7 +5,7 @@ code { name = cudaComputeAutocor lmem = 0 smem = 3264 - reg = 10 + reg = 9 bar = 1 const { segname = const @@ -47,18 +47,17 @@ code { 0xc0000a01 0x00000780 0x10039003 0x00000780 0x1000f801 0x0403c780 0x00020c05 0xc0000782 0x04001601 0xe4200780 0x861ffe03 0x00000000 - 0x307cd1fd 0x6c2047c8 0x10085003 0x00000280 + 0x307cd1fd 0x6c2047c8 0x10081003 0x00000280 0x300209fd 0x6c00c7e8 0x30040dfd 0x6c0187f8 0x308105fd 0x6c40c7c8 0x00000019 0x20000780 0x2101f011 0x00000003 0x1000f815 0x0403c780 0x308205fd 0x6c40c7c8 0x0000001d 0x20000780 - 0x308305fd 0x6c40c7c8 0x00000021 0x20000780 - 0x307c05fd 0x6c0087d8 0x20000a25 0x04008780 - 0x20009201 0x00000013 0x00020009 0xc0000780 - 0x1800d601 0x0423c780 0x0002120d 0xc0000780 - 0xc400d625 0x00200780 0x00000609 0xc0000780 - 0x1c00d601 0x0423c780 0x1000f825 0x0403f280 - 0xe800d601 0x00224780 0x10001201 0x0403e280 + 0x308305fd 0x6c40c7d8 0x20000a21 0x04008780 + 0x20009001 0x00000013 0x00020009 0xc0000780 + 0x1800d601 0x0423c780 0x0002100d 0xc0000780 + 0xc400d621 0x00200780 0x00000609 0xc0000780 + 0x1c00d601 0x0423c780 0x1000f821 0x0403f280 + 0xe800d601 0x00220780 0x10001001 0x0403e280 0x08041601 0xe4200780 0x861ffe03 0x00000000 0x00000c01 0xa00007c0 0x00000609 0xc0000680 0xd8145811 0x20000680 0xd810580d 0x20000680 @@ -68,22 +67,21 @@ code { 0xd8125811 0x20000680 0xd810580d 0x20000680 0x1000c001 0x0423c684 0xbc00c001 0x00200680 0x08041601 0xe4200680 0x861ffe03 0x00000000 - 0x00001001 0xa00007c0 0x00000609 0xc0000680 - 0xd8115811 0x20000680 0xd810580d 0x20000680 - 0x1000c001 0x0423c684 0xbc00c001 0x00200680 - 0x08041601 0xe4200680 0x861ffe03 0x00000000 - 0x00000609 0xc0000780 0xd810580d 0x20000780 + 0xa007c003 0x00000000 0x1007c003 0x00001100 + 0x00000609 0xc0000780 0xd8115811 0x20000780 + 0xd810580d 0x20000780 0x1000c001 0x0423c784 + 0xbc00c001 0x00200780 0x08041601 0xe4200780 0x1c00e001 0x0423c780 0xbc00c001 0x00200780 - 0x08041601 0xe4200780 0xbc00d001 0x00200780 - 0x08041601 0xe4200780 0xbc00c801 0x00200780 - 0x08041601 0xe4200780 0xbc00c401 0x00200780 - 0x08041601 0xe4200780 0xa0080003 0x00000000 - 0x10080003 0x00001100 0xd010580d 0x20000780 + 0x08041601 0xe4200780 0x1d00f000 0xbd006000 + 0x08041601 0xe4200780 0x1d00e800 0xbd006000 + 0x08041601 0xe4200780 0x1d00e400 0xbd006000 + 0x08041601 0xe4200780 0x307c05fd 0x6c0147c8 + 0x1007c003 0x00000280 0xd010580d 0x20000780 0x1c00c201 0x0423c780 0x00020a09 0xc0000780 0xbc00c001 0x00200780 0x08061601 0xe4200780 0xf0000001 0xe0000002 0x861ffe03 0x00000000 0x20018a15 0x00000003 0x30040bfd 0x6c0147c8 - 0x10049003 0x00000280 0x3002d1fd 0x6c2047c8 + 0x10047003 0x00000280 0x3002d1fd 0x6c2047c8 0x30000003 0x00000280 0x10004e01 0x0023c780 0x60004805 0x00204780 0x2101f001 0x00000003 0x40030011 0x00000780 0x60020211 0x00010780 @@ -342,32 +340,32 @@ code { 0x0400dc01 0xe4200780 0x1800c001 0x0423c780 0x04011c01 0xe4200780 0xd0026809 0x20000780 0x04001a01 0xe43f0780 0x307ccffd 0x6c20c7c8 - 0x30020411 0xc4100780 0x1800c001 0x0423c780 + 0x3002040d 0xc4100780 0x1800c001 0x0423c780 0x30000003 0x00000280 0x307c05fd 0x6c0087c8 - 0x213fee15 0x0fffffff 0x1000f819 0x0403c780 + 0x213fee11 0x0fffffff 0x1000f815 0x0403c780 0xd0047005 0x20000780 0xb08601fd 0x605107d8 - 0x10000005 0x0403c780 0xa400c00d 0xe4204780 - 0xc08a060d 0x00401680 0xc08a0205 0x00401680 - 0x90000204 0xc001061c 0x20400a05 0x04018780 - 0xd0047005 0x20000780 0x300105fd 0x6c0187d8 - 0xa0077003 0x00000000 0xe407c001 0x00200780 - 0x10077003 0x00001280 0x00000805 0xc0000780 - 0xd404780d 0x20000780 0xd4037009 0x20000780 - 0x1d00e004 0x1900e00c 0xe807c005 0x00204780 - 0xec07c00d 0x0020c780 0x04011c01 0xe4204780 - 0x0400dc01 0xe420c780 0x20400c05 0x04008782 - 0x00020205 0xc0000780 0x30020dfd 0x6c00c7d8 - 0xc407d80d 0x00200780 0x1000f80d 0x0403d280 - 0x30020dfd 0x6c0147d8 0xb0000605 0x0001c780 - 0x10000605 0x0403d280 0x00000809 0xc0000780 + 0x10000005 0x0403c780 0xa400c019 0xe4204780 + 0xc08a0c19 0x00401680 0xc08a0205 0x00401680 + 0x90000204 0xc0010c04 0xd0047005 0x20000780 + 0xc401c019 0x0020c780 0xb0060000 0x20458818 + 0x300605fd 0x6c0187d8 0xa0077003 0x00000000 + 0x10077003 0x00001280 0x00000605 0xc0000780 + 0xd403700d 0x20000780 0xd4047809 0x20000780 + 0xcc01c019 0x0020c780 0xc801c01d 0x0020c780 + 0xb9066018 0xbd07601c 0x04011c01 0xe4218780 + 0x0400dc01 0xe421c780 0x20400a19 0x04008782 + 0x00020c05 0xc0000780 0x30020bfd 0x6c00c7d8 + 0xc401d819 0x0020c780 0x1000f819 0x0403d280 + 0x30020bfd 0x6c0147d8 0xb0000c05 0x00004780 + 0x10000c05 0x0403d280 0x00000609 0xc0000780 0xb800da05 0x00204780 0x08001a01 0xe4204780 0xa800da05 0xc4304780 0xc0000205 0x04700003 - 0xa0000205 0x8c0047d0 0x2000d60d 0x04218780 + 0xa0000205 0x8c0047d0 0x2000d619 0x04214780 0xa0000205 0x44065680 0x30170205 0xec101680 0x31000205 0x0442d680 0x10000a05 0x2440d100 - 0x30020c1d 0x6c0187d0 0x30148205 0x00000003 + 0x30020a1d 0x6c0187d0 0x30148205 0x00000003 0xd0840e1d 0x04400780 0x40070205 0x00018780 - 0x00000809 0xc0000780 0x08005a01 0xe4204780 + 0x00000609 0xc0000780 0x08005a01 0xe4204780 0xd801680d 0x20000780 0x1c00e005 0x0423c780 0x3c01c005 0x8c200780 0x08005a01 0xe4204780 0x1c00d005 0x0423c780 0x3c01c005 0x8c200780 @@ -382,11 +380,11 @@ code { 0xa0000e1d 0x44014780 0xc407da1d 0x00200780 0xa0000e1d 0xac004780 0x30880e1d 0xac400780 0xa00b3003 0x00000000 0x30890e1d 0x8c400780 - 0x100b3003 0x00001100 0x30070621 0xc4100780 - 0x30060625 0xc4100780 0x20099020 0x2108e820 - 0x20000821 0x04020780 0x20009021 0x00000007 + 0x100b3003 0x00001100 0x30070c21 0xc4100780 + 0x30060c25 0xc4100780 0x20099020 0x2108e820 + 0x20000621 0x04020780 0x20009021 0x00000007 0xd00e101d 0xa0c00780 0xf0000001 0xe0000002 - 0x30070621 0xc4100680 0x30060625 0xc4100680 + 0x30070c21 0xc4100680 0x30060c25 0xc4100680 0x20001021 0x04024680 0x2000c821 0x04220680 0x21001021 0x04430680 0xd00e1005 0xa0c00680 0x307c0ffd 0x6c0087d8 0xa0000e05 0x44065500 @@ -395,7 +393,7 @@ code { 0x307c0ffd 0x6c0087d8 0xa0000e1d 0x44065500 0x30170e1d 0xec101500 0x31000e1d 0x0442d500 0x10000a1d 0x2440d280 0x30070205 0x8c000780 - 0x00000805 0xc0000780 0x30218205 0x00000003 + 0x00000605 0xc0000780 0x30218205 0x00000003 0x04005a01 0xe4204780 0xd4016809 0x20000780 0x1800e005 0x0423c780 0x3801c005 0x8c200780 0x04005a01 0xe4204780 0x1800d005 0x0423c780 @@ -405,12 +403,12 @@ code { 0x3801c005 0x8c200780 0x04005a01 0xe4204780 0x1800c205 0x0423c780 0x3801c005 0x8c200780 0x04005a01 0xe4204780 0xa00e2003 0x00000000 - 0x100e2003 0x00000100 0x30070605 0xc4100780 - 0x3006060d 0xc4100780 0x20000205 0x0400c780 - 0xd0016805 0x20000780 0x2101e80c 0x1500e004 - 0x200c860d 0x00000003 0xd00e0605 0xa0c00780 - 0xf0000001 0xe0000002 0x20018c19 0x00000003 - 0x3006cffd 0x6c2147d8 0x10062003 0x00001280 + 0x100e2003 0x00000100 0x30070c05 0xc4100780 + 0x30060c19 0xc4100780 0x20000205 0x04018780 + 0xd0016805 0x20000780 0x2101e818 0x1500e004 + 0x200c8c19 0x00000003 0xd00e0c05 0xa0c00780 + 0xf0000001 0xe0000002 0x20018a15 0x00000003 + 0x3005cffd 0x6c2147d8 0x10062003 0x00001280 0xf0000001 0xe0000001 } }