diff --git a/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj b/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj
index db38a62..515398c 100644
--- a/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj
+++ b/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj
@@ -67,7 +67,7 @@
   -->
-  nvcc flacuda.cu --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"
+  nvcc flacuda.cu --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"
\ No newline at end of file
diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs
index b6e5a14..8329ba7 100644
--- a/CUETools.FlaCuda/FlaCudaWriter.cs
+++ b/CUETools.FlaCuda/FlaCudaWriter.cs
@@ -1298,9 +1298,9 @@ namespace CUETools.Codecs.FlaCuda
 			int blocks_y = ((threads_x - 1) * (threads_y - 1)) / threads_y;
 			int blocks = (frame.blocksize + blocks_y * threads_y - 1) / (blocks_y * threads_y);
 			cuda.CopyHostToDevice(cudaSamples, (IntPtr)s, (uint)FlaCudaWriter.MAX_BLOCKSIZE * 4 * sizeof(int));
-			cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaSamples.Pointer);
-			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaWindow.Pointer);
-			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaAutocor.Pointer);
+			cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocor.Pointer);
+			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaSamples.Pointer);
+			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaWindow.Pointer);
 			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3, (uint)frame.blocksize);
 			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint), (uint)FlaCudaWriter.MAX_BLOCKSIZE);
 			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint) * 2, (uint)(blocks_y*threads_y));
diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu
index 6cc5d87..2de4128 100644
--- a/CUETools.FlaCuda/flacuda.cu
+++ b/CUETools.FlaCuda/flacuda.cu
@@ -36,46 +36,48 @@
 #ifndef _FLACUDA_KERNEL_H_
 #define _FLACUDA_KERNEL_H_
 
-extern "C" __global__ void cudaComputeAutocor(const int * samples, const float * window, float* output, int frameSize, int frameOffset, int blocks)
+extern "C" __global__ void cudaComputeAutocor(
+    float *output,
+    const int *samples,
+    const float *window,
+    int frameSize,
+    int frameOffset,
+    int blocks)
 {
-    extern __shared__ float fshared[];
-    float * const matrix = fshared + 513;//257;
+    __shared__ struct {
+	float data[512];
+	float matrix[512];
+    } shared;
     const int iWin = blockIdx.y >> 2;
     const int iCh = blockIdx.y & 3;
     const int smpBase = iCh * frameOffset;
     const int winBase = iWin * 2 * frameOffset;
     const int pos = blockIdx.x * blocks;
+    const int tid = threadIdx.x + threadIdx.y * blockDim.x;
 
     // fetch blockDim.x*blockDim.y samples
-    int tid = threadIdx.x + threadIdx.y * blockDim.x;
-    fshared[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] * window[winBase + pos + tid] : 0.0f;
+    shared.data[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] * window[winBase + pos + tid] : 0.0f;
     __syncthreads();
 
     float s = 0.0f;
     for (int i = 0; i < blocks; i += blockDim.y)
-	s += fshared[i + threadIdx.y] * fshared[i + threadIdx.y + threadIdx.x];
-    matrix[tid + threadIdx.y] = s;
+	s += shared.data[i + threadIdx.y] * shared.data[i + threadIdx.y + threadIdx.x];
+    shared.matrix[tid] = s;
     __syncthreads();
 
     // reduction in shared mem
     for(unsigned int s=blockDim.y/2; s>1; s>>=1)
     {
	if (threadIdx.y < s)
-	    matrix[tid + threadIdx.y] += matrix[threadIdx.x + (s + threadIdx.y) * (1 + blockDim.x)];
+	    shared.matrix[tid] += shared.matrix[tid + s * blockDim.x];
	__syncthreads();
     }
 
     // return results
     if (threadIdx.y == 0)
-	output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x] = matrix[threadIdx.x] + matrix[threadIdx.x + 1 + blockDim.x];
+	output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x] = shared.matrix[threadIdx.x] + shared.matrix[threadIdx.x + blockDim.x];
 }
 
-#ifdef __DEVICE_EMULATION__
-#define EMUSYNC __syncthreads()
-#else
-#define EMUSYNC
-#endif
-
 extern "C" __global__ void cudaEncodeResidual(
	int*output,
	int*samples,
@@ -88,6 +90,7 @@ extern "C" __global__ void cudaEncodeResidual(
	int data[256];
	int residual[256];
	int coefs[32];
+	int rice[16];
	int shift;
     } shared;
     const int smpBase = (blockIdx.y & 3) * frameOffset;
@@ -96,20 +99,20 @@ extern "C" __global__ void cudaEncodeResidual(
     const int step = __mul24(blockDim.x, blockDim.y - 1);
 
     int total = 0;
-    shared.residual[tid] = 0;
     if (threadIdx.y == 0)
	shared.coefs[threadIdx.x] = threadIdx.x <= residualOrder ? allcoefs[threadIdx.x + residualOrder * 32 + blockIdx.y * 32 * 32] : 0;
     if (tid == 0)
	shared.shift = shifts[32 * blockIdx.y + residualOrder];
-    __syncthreads();
+    if (tid == 0) shared.rice[15] = 0x1fffffff;
     for(int pos = 0; pos < frameSize - residualOrder - 1; pos += step)
     {
	// fetch blockDim.x * blockDim.y samples
-	shared.data[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] : 0;
+	shared.data[tid] = (threadIdx.y == 0 && pos != 0) ? shared.data[tid + step]
+	    : (pos + tid < frameSize ? samples[smpBase + pos + tid] : 0);
	__syncthreads();
 
	long sum = 0;
	for (int c = 0; c <= residualOrder; c++)
-	    sum += shared.data[tid + c] * shared.coefs[residualOrder - c];
+	    sum += __mul24(shared.data[tid + c], shared.coefs[residualOrder - c]);
	int res = shared.data[tid + residualOrder + 1] - (sum >> shared.shift);
	__syncthreads();
@@ -134,33 +137,22 @@ extern "C" __global__ void cudaEncodeResidual(
	{
	    if (threadIdx.x < s && threadIdx.y < s)
		shared.residual[tid] += shared.residual[tid + s] + shared.residual[tid + s * blockDim.x] + shared.residual[tid + s + s * blockDim.x];
-	    EMUSYNC;
+	    __syncthreads();
	}
 
-#ifndef __DEVICE_EMULATION__
-	if (tid < 16) // Max rice param is really 15
-#endif
-	{
-	    shared.data[tid] = limit * (tid + 1) + ((shared.residual[0] - (limit >> 1)) >> tid); EMUSYNC;
-	    //if (tid == 16) shared.rice[15] = 0x7fffffff;
-#ifndef __DEVICE_EMULATION__
-	    if (threadIdx.x < 8)
-#endif
-	    {
-		shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 8]); EMUSYNC;
-		shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 4]); EMUSYNC;
-		shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 2]); EMUSYNC;
-	    }
-	}
-	total += min(shared.data[0], shared.data[1]);
-	//total += shared.residual[0];
+	if (tid < 15) shared.rice[tid] = limit * (tid + 1) + ((shared.residual[0] - (limit >> 1)) >> tid); __syncthreads();
+	if (tid < 8) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]); __syncthreads();
+	if (tid < 4) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]); __syncthreads();
+	if (tid < 2) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]); __syncthreads();
+	if (tid < 1) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]); __syncthreads();
+	total += shared.rice[0];
     }
 
     if (tid == 0)
	output[blockIdx.x + blockIdx.y * gridDim.x] = total;
 
 #ifdef __DEVICE_EMULATION__
     if (tid == 0)
-	printf("%d,%d:2:%d\n", blockIdx.x, blockIdx.y, total);
+	printf("%d,%d:%d\n", blockIdx.x, blockIdx.y, total);
 #endif
 }