optimizations

This commit is contained in:
chudov
2009-09-14 08:39:28 +00:00
parent 117d1d5e3d
commit fa4b1ef15d
2 changed files with 89 additions and 95 deletions

View File

@@ -105,12 +105,12 @@ extern "C" __global__ void cudaComputeLPC(
{
__shared__ struct {
computeAutocorTaskStruct task;
float ldr[32];
int bits[32];
float autoc[33];
float gen0[32];
float gen1[32];
float reff[32];
volatile float ldr[32];
volatile int bits[32];
volatile float autoc[33];
volatile float gen0[32];
volatile float gen1[32];
//volatile float reff[32];
int cbits;
} shared;
const int tid = threadIdx.x;
@@ -123,37 +123,31 @@ extern "C" __global__ void cudaComputeLPC(
if (tid <= max_order)
shared.autoc[tid] = 0.0f;
__syncthreads();
// add up parts
for (int part = 0; part < partCount; part++)
if (tid <= max_order)
shared.autoc[tid] += autoc[(blockIdx.y * partCount + part) * (max_order + 1) + tid];
__syncthreads();
if (tid < 32)
{
shared.gen0[tid] = shared.autoc[tid+1];
shared.gen1[tid] = shared.autoc[tid+1];
shared.ldr[tid] = 0.0f;
__syncthreads();
float error = shared.autoc[0];
for (int order = 0; order < max_order; order++)
{
// Schur recursion
float reff = -shared.gen1[0] / error;
if (tid == 0) shared.reff[order] = reff;
//if (tid == 0) shared.reff[order] = reff;
error += shared.gen1[0] * reff;
if (tid < max_order - order - 1)
if (tid < max_order - 1 - order)
{
float g1 = shared.gen1[tid + 1] + shared.reff[order] * shared.gen0[tid];
float g0 = shared.gen1[tid + 1] * shared.reff[order] + shared.gen0[tid];
float g1 = shared.gen1[tid + 1] + reff * shared.gen0[tid];
float g0 = shared.gen1[tid + 1] * reff + shared.gen0[tid];
shared.gen1[tid] = g1;
shared.gen0[tid] = g0;
}
__syncthreads();
// Levinson-Durbin recursion
shared.ldr[tid] += (tid < order) * reff * shared.ldr[order - 1 - tid] + (tid == order) * reff;

View File

@@ -278,9 +278,9 @@ code {
code {
name = cudaComputeLPC
lmem = 0
smem = 828
smem = 700
reg = 10
bar = 1
bar = 0
const {
segname = const
segnum = 1
@@ -300,95 +300,95 @@ code {
0xd00e0001 0x80c00780 0x00020605 0xc0000780
0x04001201 0xe4200780 0x3003ce01 0x6c2187d2
0xa00001fd 0x0c0147c8 0x00020605 0xc0001680
0x04009a01 0xe43f1680 0x861ffe03 0x00000000
0x307cd1fd 0x6c20c7d8 0x1002b003 0x00001280
0x1000f805 0x0403c780 0xa0027003 0x00000000
0x10027003 0x00000100 0x1000d001 0x0423c780
0x40014e09 0x00200780 0x30100409 0xc4100780
0x60004e09 0x00208780 0x2101ee01 0x00000003
0x20000409 0x04004780 0x40010811 0x00000780
0x60000a11 0x00010780 0x30100811 0xc4100780
0x60000801 0x00010780 0x20000601 0x04000780
0x30020001 0xc4100780 0x00020605 0xc0000780
0x2000ca01 0x04200780 0xd00e0001 0x80c00780
0xd4026809 0x20000780 0xb800c001 0x00200780
0x04009a01 0xe4200780 0xf0000001 0xe0000002
0x20018205 0x00000003 0x3001d1fd 0x6c2147d8
0x10013003 0x00001280 0x861ffe03 0x00000000
0x04009a01 0xe43f1680 0x307cd1fd 0x6c20c7d8
0x1002a003 0x00001280 0x1000f805 0x0403c780
0xa0026003 0x00000000 0x10026003 0x00000100
0x1000d001 0x0423c780 0x40014e09 0x00200780
0x30100409 0xc4100780 0x60004e09 0x00208780
0x2101ee01 0x00000003 0x20000409 0x04004780
0x40010811 0x00000780 0x60000a11 0x00010780
0x30100811 0xc4100780 0x60000801 0x00010780
0x20000601 0x04000780 0x30020001 0xc4100780
0x00020605 0xc0000780 0x2000ca01 0x04200780
0xd00e0001 0x80c00780 0xd4026809 0x20000780
0xb800c001 0x00200780 0x04009a01 0xe4200780
0xf0000001 0xe0000002 0x20018205 0x00000003
0x3001d1fd 0x6c2147d8 0x10012003 0x00001280
0x308107fd 0x6c4107c8 0x30000003 0x00000280
0x00020605 0xc0000780 0xd4027009 0x20000780
0x1900e000 0x1900e004 0x0400dc01 0xe4200780
0x04011c01 0xe4204780 0x30020611 0xc4100780
0x04001a01 0xe43f0780 0x861ffe03 0x00000000
0xd0026805 0x20000780 0x307ccffd 0x6c20c7c8
0x1400c001 0x0423c780 0x30000003 0x00000280
0x1800c001 0x0423c780 0x0400dc01 0xe4200780
0x1800c001 0x0423c780 0x04011c01 0xe4200780
0xd0026809 0x20000780 0x04001a01 0xe43f0780
0x307ccffd 0x6c20c7c8 0x30020611 0xc4100780
0x1800c001 0x0423c780 0x30000003 0x00000280
0x307c07fd 0x6c0087c8 0x213fee15 0x0fffffff
0x1000f819 0x0403c780 0xd0047005 0x20000780
0xb08201fd 0x605107d8 0x10008008 0x1500e004
0xa400c01d 0xe4204780 0xc0870e1d 0x00401680
0xc0870409 0x00401680 0x90000408 0xc0020e1c
0x00020c05 0xc0000680 0xd0047009 0x20000780
0x04015c01 0xe421c680 0x1800c005 0x0423c680
0xe0010e01 0x00000780 0x20400a05 0x04018780
0x300107fd 0x6c0187d8 0xa0058003 0x00000000
0x10058003 0x00001280 0x00000805 0xc0000780
0x00020c11 0xc0000780 0xd404780d 0x20000780
0xd4037009 0x20000780 0xd0057011 0x20000784
0x1d00e004 0x1900e008 0xe001c021 0x00208784
0x1900e004 0x1d00e008 0xe001c005 0x00208784
0x04011c01 0xe4204780 0x0400dc01 0xe4220780
0xf0000001 0xe0000002 0x861ffe03 0x00000000
0x20400c05 0x0400c780 0x00020205 0xc0000780
0xb08201fd 0x605107d8 0x10000005 0x0403c780
0xa400c009 0xe4204780 0xc0870409 0x00401680
0xc0870205 0x00401680 0x90000204 0xc001041c
0x20400a05 0x04018780 0xd0047005 0x20000780
0x300107fd 0x6c0187d8 0xa0050003 0x00000000
0xe407c001 0x00200780 0x10050003 0x00001280
0x00000805 0xc0000780 0xd404780d 0x20000780
0xd4037009 0x20000780 0x1d00e004 0x1900e008
0xe807c005 0x00204780 0xec07c009 0x00208780
0x04011c01 0xe4204780 0x0400dc01 0xe4208780
0x20400c05 0x0400c782 0x00020205 0xc0000780
0x30030dfd 0x6c00c7d8 0xc407d809 0x00200780
0x1000f809 0x0403d280 0x30030dfd 0x6c0147d8
0xb0000405 0x0001c780 0x10000405 0x0403d280
0x00000809 0xc0000780 0xb800da05 0x00204780
0xa0000209 0xc4104780 0xc0000409 0x04700003
0x08001a01 0xe4204780 0xa0000409 0x8c0047d0
0x2000d605 0x04218780 0xa0000409 0x44065680
0x30170409 0xec101680 0x31000409 0x04425680
0x10001009 0x2440d100 0x30030c1d 0x6c0187d0
0x30148409 0x00000003 0xd0830e1d 0x04400780
0x40070409 0x00018780 0x00000809 0xc0000780
0xd801700d 0x20000780 0x08005a01 0xe4208780
0x3c02de09 0x8c200780 0x08005a01 0xe4208780
0x3c02ce09 0x8c200780 0x08005a01 0xe4208780
0x3c02c609 0x8c200780 0x08005a01 0xe4208780
0x3c02c209 0x8c200780 0x08005a01 0xe4208780
0x3c02c009 0x8c200780 0x08005a01 0xe4208780
0xd0016809 0x20000780 0x390fe009 0x00000003
0x30840409 0xac400780 0x1001801d 0x00000003
0x307c0409 0x8c000780 0x30020e1d 0xc4000780
0xa0000e1d 0x44014780 0xc407da1d 0x00200780
0xa0000e1d 0xac004780 0x30850e1d 0xac400780
0xa0091003 0x00000000 0x30860e1d 0x8c400780
0x10091003 0x00001100 0x30070221 0xc4100780
0x30060225 0xc4100780 0x20099020 0x2108e820
0x20000821 0x04020780 0x20009021 0x00000007
0xd00e101d 0xa0c00780 0xf0000001 0xe0000002
0x30070221 0xc4100680 0x30060225 0xc4100680
0x20001021 0x04024680 0x2000c821 0x04220680
0x21001021 0x04428680 0xd00e1009 0xa0c00680
0x307c0ffd 0x6c0087d8 0xa0000e09 0x44065500
0x30170409 0xec101500 0x31000409 0x04425500
0x10001009 0x2440d280 0xd007001d 0x0402c780
0x307c0ffd 0x6c0087d8 0xa0000e1d 0x44065500
0x30170e1d 0xec101500 0x31000e1d 0x04425500
0x1000101d 0x2440d280 0x30070409 0x8c000780
0x00000805 0xc0000780 0x30218409 0x00000003
0xd4017009 0x20000780 0x04005a01 0xe4208780
0x3802de09 0x8c200780 0x04005a01 0xe4208780
0x3802ce09 0x8c200780 0x04005a01 0xe4208780
0x3802c609 0x8c200780 0x04005a01 0xe4208780
0x3802c209 0x8c200780 0x04005a01 0xe4208780
0x3802c009 0x8c200780 0x04005a01 0xe4208780
0x08001a01 0xe4204780 0xa800da05 0xc4304780
0xc0000205 0x04700003 0xa0000205 0x8c0047d0
0x2000d609 0x04218780 0xa0000205 0x44065680
0x30170205 0xec101680 0x31000205 0x04425680
0x10001005 0x2440d100 0x30030c1d 0x6c0187d0
0x30148205 0x00000003 0xd0830e1d 0x04400780
0x40070205 0x00018780 0x00000809 0xc0000780
0x08005a01 0xe4204780 0xd801680d 0x20000780
0x1c00e005 0x0423c780 0x3c01c005 0x8c200780
0x08005a01 0xe4204780 0x1c00d005 0x0423c780
0x3c01c005 0x8c200780 0x08005a01 0xe4204780
0x1c00c805 0x0423c780 0x3c01c005 0x8c200780
0x08005a01 0xe4204780 0x1c00c405 0x0423c780
0x3c01c005 0x8c200780 0x08005a01 0xe4204780
0x1c00c205 0x0423c780 0x3c01c005 0x8c200780
0x08005a01 0xe4204780 0xd0016809 0x20000780
0x390fe005 0x00000003 0x30840205 0xac400780
0x1001801d 0x00000003 0x307c0205 0x8c000780
0x30010e1d 0xc4000780 0xa0000e1d 0x44014780
0xc407da1d 0x00200780 0xa0000e1d 0xac004780
0x30850e1d 0xac400780 0xa008c003 0x00000000
0x30860e1d 0x8c400780 0x1008c003 0x00001100
0x30070421 0xc4100780 0x30060425 0xc4100780
0x20099020 0x2108e820 0x20000821 0x04020780
0x20009021 0x00000007 0xd00e101d 0xa0c00780
0xf0000001 0xe0000002 0x30070421 0xc4100680
0x30060425 0xc4100680 0x20001021 0x04024680
0x2000c821 0x04220680 0x21001021 0x04428680
0xd00e1005 0xa0c00680 0x307c0ffd 0x6c0087d8
0xa0000e05 0x44065500 0x30170205 0xec101500
0x31000205 0x04425500 0x10001005 0x2440d280
0xd007001d 0x0402c780 0x307c0ffd 0x6c0087d8
0xa0000e1d 0x44065500 0x30170e1d 0xec101500
0x31000e1d 0x04425500 0x1000101d 0x2440d280
0x30070205 0x8c000780 0x00000805 0xc0000780
0x30218205 0x00000003 0x04005a01 0xe4204780
0xd4016809 0x20000780 0x1800e005 0x0423c780
0x3801c005 0x8c200780 0x04005a01 0xe4204780
0x1800d005 0x0423c780 0x3801c005 0x8c200780
0x04005a01 0xe4204780 0x1800c805 0x0423c780
0x3801c005 0x8c200780 0x04005a01 0xe4204780
0x1800c405 0x0423c780 0x3801c005 0x8c200780
0x04005a01 0xe4204780 0x1800c205 0x0423c780
0x3801c005 0x8c200780 0x04005a01 0xe4204780
0xa00bb003 0x00000000 0x100bb003 0x00000100
0x30070209 0xc4100780 0x30060205 0xc4100780
0x20000405 0x04004780 0xd0016805 0x20000780
0x30070405 0xc4100780 0x30060409 0xc4100780
0x20000205 0x04008780 0xd0016805 0x20000780
0x2101e808 0x1500e004 0x200c8409 0x00000003
0xd00e0405 0xa0c00780 0xf0000001 0xe0000002
0x20018c19 0x00000003 0x3006cffd 0x6c2147d8
0x1003d003 0x00001280 0xf0000001 0xe0000001
0x1003b003 0x00001280 0xf0000001 0xe0000001
}
}
code {