tidying up

This commit is contained in:
chudov
2009-09-08 04:56:34 +00:00
parent 25671f8168
commit ef6617b4b6
3 changed files with 34 additions and 42 deletions

View File

@@ -67,7 +67,7 @@
</Target> </Target>
--> -->
<PropertyGroup> <PropertyGroup>
<PostBuildEvent>nvcc flacuda.cu --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include" <PostBuildEvent>nvcc flacuda.cu --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"
</PostBuildEvent> </PostBuildEvent>
</PropertyGroup> </PropertyGroup>
</Project> </Project>

View File

@@ -1298,9 +1298,9 @@ namespace CUETools.Codecs.FlaCuda
int blocks_y = ((threads_x - 1) * (threads_y - 1)) / threads_y; int blocks_y = ((threads_x - 1) * (threads_y - 1)) / threads_y;
int blocks = (frame.blocksize + blocks_y * threads_y - 1) / (blocks_y * threads_y); int blocks = (frame.blocksize + blocks_y * threads_y - 1) / (blocks_y * threads_y);
cuda.CopyHostToDevice(cudaSamples, (IntPtr)s, (uint)FlaCudaWriter.MAX_BLOCKSIZE * 4 * sizeof(int)); cuda.CopyHostToDevice(cudaSamples, (IntPtr)s, (uint)FlaCudaWriter.MAX_BLOCKSIZE * 4 * sizeof(int));
cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaSamples.Pointer); cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocor.Pointer);
cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaWindow.Pointer); cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaSamples.Pointer);
cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaAutocor.Pointer); cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaWindow.Pointer);
cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3, (uint)frame.blocksize); cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3, (uint)frame.blocksize);
cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint), (uint)FlaCudaWriter.MAX_BLOCKSIZE); cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint), (uint)FlaCudaWriter.MAX_BLOCKSIZE);
cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint) * 2, (uint)(blocks_y*threads_y)); cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint) * 2, (uint)(blocks_y*threads_y));

View File

@@ -36,46 +36,48 @@
#ifndef _FLACUDA_KERNEL_H_ #ifndef _FLACUDA_KERNEL_H_
#define _FLACUDA_KERNEL_H_ #define _FLACUDA_KERNEL_H_
extern "C" __global__ void cudaComputeAutocor(const int * samples, const float * window, float* output, int frameSize, int frameOffset, int blocks) extern "C" __global__ void cudaComputeAutocor(
float *output,
const int *samples,
const float *window,
int frameSize,
int frameOffset,
int blocks)
{ {
extern __shared__ float fshared[]; __shared__ struct {
float * const matrix = fshared + 513;//257; float data[512];
float matrix[512];
} shared;
const int iWin = blockIdx.y >> 2; const int iWin = blockIdx.y >> 2;
const int iCh = blockIdx.y & 3; const int iCh = blockIdx.y & 3;
const int smpBase = iCh * frameOffset; const int smpBase = iCh * frameOffset;
const int winBase = iWin * 2 * frameOffset; const int winBase = iWin * 2 * frameOffset;
const int pos = blockIdx.x * blocks; const int pos = blockIdx.x * blocks;
const int tid = threadIdx.x + threadIdx.y * blockDim.x;
// fetch blockDim.x*blockDim.y samples // fetch blockDim.x*blockDim.y samples
int tid = threadIdx.x + threadIdx.y * blockDim.x; shared.data[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] * window[winBase + pos + tid] : 0.0f;
fshared[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] * window[winBase + pos + tid] : 0.0f;
__syncthreads(); __syncthreads();
float s = 0.0f; float s = 0.0f;
for (int i = 0; i < blocks; i += blockDim.y) for (int i = 0; i < blocks; i += blockDim.y)
s += fshared[i + threadIdx.y] * fshared[i + threadIdx.y + threadIdx.x]; s += shared.data[i + threadIdx.y] * shared.data[i + threadIdx.y + threadIdx.x];
matrix[tid + threadIdx.y] = s; shared.matrix[tid] = s;
__syncthreads(); __syncthreads();
// reduction in shared mem // reduction in shared mem
for(unsigned int s=blockDim.y/2; s>1; s>>=1) for(unsigned int s=blockDim.y/2; s>1; s>>=1)
{ {
if (threadIdx.y < s) if (threadIdx.y < s)
matrix[tid + threadIdx.y] += matrix[threadIdx.x + (s + threadIdx.y) * (1 + blockDim.x)]; shared.matrix[tid] += shared.matrix[tid + s * blockDim.x];
__syncthreads(); __syncthreads();
} }
// return results // return results
if (threadIdx.y == 0) if (threadIdx.y == 0)
output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x] = matrix[threadIdx.x] + matrix[threadIdx.x + 1 + blockDim.x]; output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x] = shared.matrix[threadIdx.x] + shared.matrix[threadIdx.x + blockDim.x];
} }
#ifdef __DEVICE_EMULATION__
#define EMUSYNC __syncthreads()
#else
#define EMUSYNC
#endif
extern "C" __global__ void cudaEncodeResidual( extern "C" __global__ void cudaEncodeResidual(
int*output, int*output,
int*samples, int*samples,
@@ -88,6 +90,7 @@ extern "C" __global__ void cudaEncodeResidual(
int data[256]; int data[256];
int residual[256]; int residual[256];
int coefs[32]; int coefs[32];
int rice[16];
int shift; int shift;
} shared; } shared;
const int smpBase = (blockIdx.y & 3) * frameOffset; const int smpBase = (blockIdx.y & 3) * frameOffset;
@@ -96,20 +99,20 @@ extern "C" __global__ void cudaEncodeResidual(
const int step = __mul24(blockDim.x, blockDim.y - 1); const int step = __mul24(blockDim.x, blockDim.y - 1);
int total = 0; int total = 0;
shared.residual[tid] = 0;
if (threadIdx.y == 0) shared.coefs[threadIdx.x] = threadIdx.x <= residualOrder ? allcoefs[threadIdx.x + residualOrder * 32 + blockIdx.y * 32 * 32] : 0; if (threadIdx.y == 0) shared.coefs[threadIdx.x] = threadIdx.x <= residualOrder ? allcoefs[threadIdx.x + residualOrder * 32 + blockIdx.y * 32 * 32] : 0;
if (tid == 0) shared.shift = shifts[32 * blockIdx.y + residualOrder]; if (tid == 0) shared.shift = shifts[32 * blockIdx.y + residualOrder];
__syncthreads(); if (tid == 0) shared.rice[15] = 0x1fffffff;
for(int pos = 0; pos < frameSize - residualOrder - 1; pos += step) for(int pos = 0; pos < frameSize - residualOrder - 1; pos += step)
{ {
// fetch blockDim.x * blockDim.y samples // fetch blockDim.x * blockDim.y samples
shared.data[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] : 0; shared.data[tid] = (threadIdx.y == 0 && pos != 0) ? shared.data[tid + step]
: (pos + tid < frameSize ? samples[smpBase + pos + tid] : 0);
__syncthreads(); __syncthreads();
long sum = 0; long sum = 0;
for (int c = 0; c <= residualOrder; c++) for (int c = 0; c <= residualOrder; c++)
sum += shared.data[tid + c] * shared.coefs[residualOrder - c]; sum += __mul24(shared.data[tid + c], shared.coefs[residualOrder - c]);
int res = shared.data[tid + residualOrder + 1] - (sum >> shared.shift); int res = shared.data[tid + residualOrder + 1] - (sum >> shared.shift);
__syncthreads(); __syncthreads();
@@ -134,33 +137,22 @@ extern "C" __global__ void cudaEncodeResidual(
{ {
if (threadIdx.x < s && threadIdx.y < s) if (threadIdx.x < s && threadIdx.y < s)
shared.residual[tid] += shared.residual[tid + s] + shared.residual[tid + s * blockDim.x] + shared.residual[tid + s + s * blockDim.x]; shared.residual[tid] += shared.residual[tid + s] + shared.residual[tid + s * blockDim.x] + shared.residual[tid + s + s * blockDim.x];
EMUSYNC; __syncthreads();
} }
#ifndef __DEVICE_EMULATION__ if (tid < 15) shared.rice[tid] = limit * (tid + 1) + ((shared.residual[0] - (limit >> 1)) >> tid); __syncthreads();
if (tid < 16) // Max rice param is really 15 if (tid < 8) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]); __syncthreads();
#endif if (tid < 4) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]); __syncthreads();
{ if (tid < 2) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]); __syncthreads();
shared.data[tid] = limit * (tid + 1) + ((shared.residual[0] - (limit >> 1)) >> tid); EMUSYNC; if (tid < 1) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]); __syncthreads();
//if (tid == 16) shared.rice[15] = 0x7fffffff; total += shared.rice[0];
#ifndef __DEVICE_EMULATION__
if (threadIdx.x < 8)
#endif
{
shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 8]); EMUSYNC;
shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 4]); EMUSYNC;
shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 2]); EMUSYNC;
}
}
total += min(shared.data[0], shared.data[1]);
//total += shared.residual[0];
} }
if (tid == 0) if (tid == 0)
output[blockIdx.x + blockIdx.y * gridDim.x] = total; output[blockIdx.x + blockIdx.y * gridDim.x] = total;
#ifdef __DEVICE_EMULATION__ #ifdef __DEVICE_EMULATION__
if (tid == 0) if (tid == 0)
printf("%d,%d:2:%d\n", blockIdx.x, blockIdx.y, total); printf("%d,%d:%d\n", blockIdx.x, blockIdx.y, total);
#endif #endif
} }