mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
optimizations
This commit is contained in:
@@ -51,8 +51,8 @@ extern "C" __global__ void cudaComputeAutocor(
|
||||
{
|
||||
__shared__ struct {
|
||||
float data[512];
|
||||
float product[256];
|
||||
float sum[33];
|
||||
volatile float product[256];
|
||||
volatile float sum[33];
|
||||
computeAutocorTaskStruct task;
|
||||
} shared;
|
||||
const int tid = threadIdx.x;
|
||||
@@ -81,15 +81,17 @@ extern "C" __global__ void cudaComputeAutocor(
|
||||
//if (tid < 256) shared.product[tid] += shared.product[tid + 256]; __syncthreads();
|
||||
if (tid < 128) shared.product[tid] += shared.product[tid + 128]; __syncthreads();
|
||||
if (tid < 64) shared.product[tid] += shared.product[tid + 64]; __syncthreads();
|
||||
if (tid < 32) shared.product[tid] += shared.product[tid + 32]; __syncthreads();
|
||||
shared.product[tid] += shared.product[tid + 16];
|
||||
shared.product[tid] += shared.product[tid + 8];
|
||||
shared.product[tid] += shared.product[tid + 4];
|
||||
shared.product[tid] += shared.product[tid + 2];
|
||||
if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1];
|
||||
if (tid < 32)
|
||||
{
|
||||
shared.product[tid] += shared.product[tid + 32];
|
||||
shared.product[tid] += shared.product[tid + 16];
|
||||
shared.product[tid] += shared.product[tid + 8];
|
||||
shared.product[tid] += shared.product[tid + 4];
|
||||
shared.product[tid] += shared.product[tid + 2];
|
||||
if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// return results
|
||||
if (tid <= max_order)
|
||||
output[(blockIdx.x + blockIdx.y * gridDim.x) * (max_order + 1) + tid] = shared.sum[tid];
|
||||
@@ -152,17 +154,17 @@ extern "C" __global__ void cudaComputeLPC(
|
||||
// Schur recursion
|
||||
float reff = -shared.gen1[0] / error;
|
||||
//if (tid == 0) shared.reff[order] = reff;
|
||||
error += shared.gen1[0] * reff;
|
||||
error += __fmul_rz(shared.gen1[0], reff);
|
||||
if (tid < max_order - 1 - order)
|
||||
{
|
||||
float g1 = shared.gen1[tid + 1] + reff * shared.gen0[tid];
|
||||
float g0 = shared.gen1[tid + 1] * reff + shared.gen0[tid];
|
||||
float g1 = shared.gen1[tid + 1] + __fmul_rz(reff, shared.gen0[tid]);
|
||||
float g0 = __fmul_rz(shared.gen1[tid + 1], reff) + shared.gen0[tid];
|
||||
shared.gen1[tid] = g1;
|
||||
shared.gen0[tid] = g0;
|
||||
}
|
||||
|
||||
// Levinson-Durbin recursion
|
||||
shared.ldr[tid] += (tid < order) * reff * shared.ldr[order - 1 - tid] + (tid == order) * reff;
|
||||
shared.ldr[tid] += (tid < order) * __fmul_rz(reff, shared.ldr[order - 1 - tid]) + (tid == order) * reff;
|
||||
|
||||
// Quantization
|
||||
int precision = 13;
|
||||
|
||||
@@ -5,7 +5,7 @@ code {
|
||||
name = cudaComputeAutocor
|
||||
lmem = 0
|
||||
smem = 3264
|
||||
reg = 10
|
||||
reg = 9
|
||||
bar = 1
|
||||
const {
|
||||
segname = const
|
||||
@@ -47,18 +47,17 @@ code {
|
||||
0xc0000a01 0x00000780 0x10039003 0x00000780
|
||||
0x1000f801 0x0403c780 0x00020c05 0xc0000782
|
||||
0x04001601 0xe4200780 0x861ffe03 0x00000000
|
||||
0x307cd1fd 0x6c2047c8 0x10085003 0x00000280
|
||||
0x307cd1fd 0x6c2047c8 0x10081003 0x00000280
|
||||
0x300209fd 0x6c00c7e8 0x30040dfd 0x6c0187f8
|
||||
0x308105fd 0x6c40c7c8 0x00000019 0x20000780
|
||||
0x2101f011 0x00000003 0x1000f815 0x0403c780
|
||||
0x308205fd 0x6c40c7c8 0x0000001d 0x20000780
|
||||
0x308305fd 0x6c40c7c8 0x00000021 0x20000780
|
||||
0x307c05fd 0x6c0087d8 0x20000a25 0x04008780
|
||||
0x20009201 0x00000013 0x00020009 0xc0000780
|
||||
0x1800d601 0x0423c780 0x0002120d 0xc0000780
|
||||
0xc400d625 0x00200780 0x00000609 0xc0000780
|
||||
0x1c00d601 0x0423c780 0x1000f825 0x0403f280
|
||||
0xe800d601 0x00224780 0x10001201 0x0403e280
|
||||
0x308305fd 0x6c40c7d8 0x20000a21 0x04008780
|
||||
0x20009001 0x00000013 0x00020009 0xc0000780
|
||||
0x1800d601 0x0423c780 0x0002100d 0xc0000780
|
||||
0xc400d621 0x00200780 0x00000609 0xc0000780
|
||||
0x1c00d601 0x0423c780 0x1000f821 0x0403f280
|
||||
0xe800d601 0x00220780 0x10001001 0x0403e280
|
||||
0x08041601 0xe4200780 0x861ffe03 0x00000000
|
||||
0x00000c01 0xa00007c0 0x00000609 0xc0000680
|
||||
0xd8145811 0x20000680 0xd810580d 0x20000680
|
||||
@@ -68,22 +67,21 @@ code {
|
||||
0xd8125811 0x20000680 0xd810580d 0x20000680
|
||||
0x1000c001 0x0423c684 0xbc00c001 0x00200680
|
||||
0x08041601 0xe4200680 0x861ffe03 0x00000000
|
||||
0x00001001 0xa00007c0 0x00000609 0xc0000680
|
||||
0xd8115811 0x20000680 0xd810580d 0x20000680
|
||||
0x1000c001 0x0423c684 0xbc00c001 0x00200680
|
||||
0x08041601 0xe4200680 0x861ffe03 0x00000000
|
||||
0x00000609 0xc0000780 0xd810580d 0x20000780
|
||||
0xa007c003 0x00000000 0x1007c003 0x00001100
|
||||
0x00000609 0xc0000780 0xd8115811 0x20000780
|
||||
0xd810580d 0x20000780 0x1000c001 0x0423c784
|
||||
0xbc00c001 0x00200780 0x08041601 0xe4200780
|
||||
0x1c00e001 0x0423c780 0xbc00c001 0x00200780
|
||||
0x08041601 0xe4200780 0xbc00d001 0x00200780
|
||||
0x08041601 0xe4200780 0xbc00c801 0x00200780
|
||||
0x08041601 0xe4200780 0xbc00c401 0x00200780
|
||||
0x08041601 0xe4200780 0xa0080003 0x00000000
|
||||
0x10080003 0x00001100 0xd010580d 0x20000780
|
||||
0x08041601 0xe4200780 0x1d00f000 0xbd006000
|
||||
0x08041601 0xe4200780 0x1d00e800 0xbd006000
|
||||
0x08041601 0xe4200780 0x1d00e400 0xbd006000
|
||||
0x08041601 0xe4200780 0x307c05fd 0x6c0147c8
|
||||
0x1007c003 0x00000280 0xd010580d 0x20000780
|
||||
0x1c00c201 0x0423c780 0x00020a09 0xc0000780
|
||||
0xbc00c001 0x00200780 0x08061601 0xe4200780
|
||||
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
||||
0x20018a15 0x00000003 0x30040bfd 0x6c0147c8
|
||||
0x10049003 0x00000280 0x3002d1fd 0x6c2047c8
|
||||
0x10047003 0x00000280 0x3002d1fd 0x6c2047c8
|
||||
0x30000003 0x00000280 0x10004e01 0x0023c780
|
||||
0x60004805 0x00204780 0x2101f001 0x00000003
|
||||
0x40030011 0x00000780 0x60020211 0x00010780
|
||||
@@ -342,32 +340,32 @@ code {
|
||||
0x0400dc01 0xe4200780 0x1800c001 0x0423c780
|
||||
0x04011c01 0xe4200780 0xd0026809 0x20000780
|
||||
0x04001a01 0xe43f0780 0x307ccffd 0x6c20c7c8
|
||||
0x30020411 0xc4100780 0x1800c001 0x0423c780
|
||||
0x3002040d 0xc4100780 0x1800c001 0x0423c780
|
||||
0x30000003 0x00000280 0x307c05fd 0x6c0087c8
|
||||
0x213fee15 0x0fffffff 0x1000f819 0x0403c780
|
||||
0x213fee11 0x0fffffff 0x1000f815 0x0403c780
|
||||
0xd0047005 0x20000780 0xb08601fd 0x605107d8
|
||||
0x10000005 0x0403c780 0xa400c00d 0xe4204780
|
||||
0xc08a060d 0x00401680 0xc08a0205 0x00401680
|
||||
0x90000204 0xc001061c 0x20400a05 0x04018780
|
||||
0xd0047005 0x20000780 0x300105fd 0x6c0187d8
|
||||
0xa0077003 0x00000000 0xe407c001 0x00200780
|
||||
0x10077003 0x00001280 0x00000805 0xc0000780
|
||||
0xd404780d 0x20000780 0xd4037009 0x20000780
|
||||
0x1d00e004 0x1900e00c 0xe807c005 0x00204780
|
||||
0xec07c00d 0x0020c780 0x04011c01 0xe4204780
|
||||
0x0400dc01 0xe420c780 0x20400c05 0x04008782
|
||||
0x00020205 0xc0000780 0x30020dfd 0x6c00c7d8
|
||||
0xc407d80d 0x00200780 0x1000f80d 0x0403d280
|
||||
0x30020dfd 0x6c0147d8 0xb0000605 0x0001c780
|
||||
0x10000605 0x0403d280 0x00000809 0xc0000780
|
||||
0x10000005 0x0403c780 0xa400c019 0xe4204780
|
||||
0xc08a0c19 0x00401680 0xc08a0205 0x00401680
|
||||
0x90000204 0xc0010c04 0xd0047005 0x20000780
|
||||
0xc401c019 0x0020c780 0xb0060000 0x20458818
|
||||
0x300605fd 0x6c0187d8 0xa0077003 0x00000000
|
||||
0x10077003 0x00001280 0x00000605 0xc0000780
|
||||
0xd403700d 0x20000780 0xd4047809 0x20000780
|
||||
0xcc01c019 0x0020c780 0xc801c01d 0x0020c780
|
||||
0xb9066018 0xbd07601c 0x04011c01 0xe4218780
|
||||
0x0400dc01 0xe421c780 0x20400a19 0x04008782
|
||||
0x00020c05 0xc0000780 0x30020bfd 0x6c00c7d8
|
||||
0xc401d819 0x0020c780 0x1000f819 0x0403d280
|
||||
0x30020bfd 0x6c0147d8 0xb0000c05 0x00004780
|
||||
0x10000c05 0x0403d280 0x00000609 0xc0000780
|
||||
0xb800da05 0x00204780 0x08001a01 0xe4204780
|
||||
0xa800da05 0xc4304780 0xc0000205 0x04700003
|
||||
0xa0000205 0x8c0047d0 0x2000d60d 0x04218780
|
||||
0xa0000205 0x8c0047d0 0x2000d619 0x04214780
|
||||
0xa0000205 0x44065680 0x30170205 0xec101680
|
||||
0x31000205 0x0442d680 0x10000a05 0x2440d100
|
||||
0x30020c1d 0x6c0187d0 0x30148205 0x00000003
|
||||
0x30020a1d 0x6c0187d0 0x30148205 0x00000003
|
||||
0xd0840e1d 0x04400780 0x40070205 0x00018780
|
||||
0x00000809 0xc0000780 0x08005a01 0xe4204780
|
||||
0x00000609 0xc0000780 0x08005a01 0xe4204780
|
||||
0xd801680d 0x20000780 0x1c00e005 0x0423c780
|
||||
0x3c01c005 0x8c200780 0x08005a01 0xe4204780
|
||||
0x1c00d005 0x0423c780 0x3c01c005 0x8c200780
|
||||
@@ -382,11 +380,11 @@ code {
|
||||
0xa0000e1d 0x44014780 0xc407da1d 0x00200780
|
||||
0xa0000e1d 0xac004780 0x30880e1d 0xac400780
|
||||
0xa00b3003 0x00000000 0x30890e1d 0x8c400780
|
||||
0x100b3003 0x00001100 0x30070621 0xc4100780
|
||||
0x30060625 0xc4100780 0x20099020 0x2108e820
|
||||
0x20000821 0x04020780 0x20009021 0x00000007
|
||||
0x100b3003 0x00001100 0x30070c21 0xc4100780
|
||||
0x30060c25 0xc4100780 0x20099020 0x2108e820
|
||||
0x20000621 0x04020780 0x20009021 0x00000007
|
||||
0xd00e101d 0xa0c00780 0xf0000001 0xe0000002
|
||||
0x30070621 0xc4100680 0x30060625 0xc4100680
|
||||
0x30070c21 0xc4100680 0x30060c25 0xc4100680
|
||||
0x20001021 0x04024680 0x2000c821 0x04220680
|
||||
0x21001021 0x04430680 0xd00e1005 0xa0c00680
|
||||
0x307c0ffd 0x6c0087d8 0xa0000e05 0x44065500
|
||||
@@ -395,7 +393,7 @@ code {
|
||||
0x307c0ffd 0x6c0087d8 0xa0000e1d 0x44065500
|
||||
0x30170e1d 0xec101500 0x31000e1d 0x0442d500
|
||||
0x10000a1d 0x2440d280 0x30070205 0x8c000780
|
||||
0x00000805 0xc0000780 0x30218205 0x00000003
|
||||
0x00000605 0xc0000780 0x30218205 0x00000003
|
||||
0x04005a01 0xe4204780 0xd4016809 0x20000780
|
||||
0x1800e005 0x0423c780 0x3801c005 0x8c200780
|
||||
0x04005a01 0xe4204780 0x1800d005 0x0423c780
|
||||
@@ -405,12 +403,12 @@ code {
|
||||
0x3801c005 0x8c200780 0x04005a01 0xe4204780
|
||||
0x1800c205 0x0423c780 0x3801c005 0x8c200780
|
||||
0x04005a01 0xe4204780 0xa00e2003 0x00000000
|
||||
0x100e2003 0x00000100 0x30070605 0xc4100780
|
||||
0x3006060d 0xc4100780 0x20000205 0x0400c780
|
||||
0xd0016805 0x20000780 0x2101e80c 0x1500e004
|
||||
0x200c860d 0x00000003 0xd00e0605 0xa0c00780
|
||||
0xf0000001 0xe0000002 0x20018c19 0x00000003
|
||||
0x3006cffd 0x6c2147d8 0x10062003 0x00001280
|
||||
0x100e2003 0x00000100 0x30070c05 0xc4100780
|
||||
0x30060c19 0xc4100780 0x20000205 0x04018780
|
||||
0xd0016805 0x20000780 0x2101e818 0x1500e004
|
||||
0x200c8c19 0x00000003 0xd00e0c05 0xa0c00780
|
||||
0xf0000001 0xe0000002 0x20018a15 0x00000003
|
||||
0x3005cffd 0x6c2147d8 0x10062003 0x00001280
|
||||
0xf0000001 0xe0000001
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user