diff --git a/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj b/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj
index 515398c..e94423f 100644
--- a/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj
+++ b/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj
@@ -54,9 +54,9 @@
-    PreserveNewest
-    "%24%28CUDA_BIN_PATH%29\nvcc.exe" flacuda.cu --cubin -cbin "%24%28VCInstallDir%29bin"
+
+
-    nvcc flacuda.cu --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"
-
+
+
+    nvcc $(ProjectDir)flacuda.cu -o $(ProjectDir)\flacuda.cubin --machine 32 --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"
\ No newline at end of file
diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs
index 4b2e066..31be0de 100644
--- a/CUETools.FlaCuda/FlaCudaWriter.cs
+++ b/CUETools.FlaCuda/FlaCudaWriter.cs
@@ -75,7 +75,7 @@ namespace CUETools.Codecs.FlaCuda
         float[] windowBuffer;
         int samplesInBuffer = 0;
-        int _compressionLevel = 7;
+        int _compressionLevel = 5;
         int _blocksize = 0;
         int _totalSize = 0;
         int _windowsize = 0, _windowcount = 0;
@@ -96,6 +96,7 @@ namespace CUETools.Codecs.FlaCuda
         CUfunction cudaComputeAutocor;
         CUfunction cudaComputeLPC;
         CUfunction cudaEstimateResidual;
+        CUfunction cudaSumResidualChunks;
         CUfunction cudaSumResidual;
         CUfunction cudaEncodeResidual;
         CUdeviceptr cudaSamples;
@@ -104,6 +105,7 @@ namespace CUETools.Codecs.FlaCuda
         CUdeviceptr cudaAutocorOutput;
         CUdeviceptr cudaResidualTasks;
         CUdeviceptr cudaResidualOutput;
+        CUdeviceptr cudaResidualSums;
         IntPtr samplesBufferPtr = IntPtr.Zero;
         IntPtr autocorTasksPtr = IntPtr.Zero;
         IntPtr residualTasksPtr = IntPtr.Zero;
@@ -114,7 +116,7 @@ namespace CUETools.Codecs.FlaCuda
         int nAutocorTasks = 0;
         const int MAX_BLOCKSIZE = 8192;
-        const int maxResidualParts = MAX_BLOCKSIZE / (256 - 32);
+        const int maxResidualParts = 64;
         const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32);

         public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
@@ -218,6 +220,7 @@ namespace CUETools.Codecs.FlaCuda
             cuda.Free(cudaAutocorOutput);
             cuda.Free(cudaResidualTasks);
             cuda.Free(cudaResidualOutput);
+            cuda.Free(cudaResidualSums);
             CUDADriver.cuMemFreeHost(samplesBufferPtr);
             CUDADriver.cuMemFreeHost(residualTasksPtr);
             CUDADriver.cuMemFreeHost(autocorTasksPtr);
@@ -250,6 +253,7 @@ namespace CUETools.Codecs.FlaCuda
             cuda.Free(cudaAutocorOutput);
             cuda.Free(cudaResidualTasks);
             cuda.Free(cudaResidualOutput);
+            cuda.Free(cudaResidualSums);
             CUDADriver.cuMemFreeHost(samplesBufferPtr);
             CUDADriver.cuMemFreeHost(residualTasksPtr);
             CUDADriver.cuMemFreeHost(autocorTasksPtr);
@@ -278,7 +282,11 @@ namespace CUETools.Codecs.FlaCuda
         public long BlockSize
         {
-            set { _blocksize = (int)value; }
+            set {
+                if (value < 256 || value > MAX_BLOCKSIZE)
+                    throw new Exception("unsupported BlockSize value");
+                _blocksize = (int)value;
+            }
             get { return _blocksize == 0 ? eparams.block_size : _blocksize; }
         }
@@ -911,9 +919,9 @@ namespace CUETools.Codecs.FlaCuda
                 autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE;
                 nAutocorTasks++;
                 // LPC tasks
-                for (int order = 1; order <= max_order; order++)
+                for (int order = 1; order <= ((max_order + 7) & ~7); order++)
                 {
-                    residualTasks[nResidualTasks].residualOrder = order - 1;
+                    residualTasks[nResidualTasks].residualOrder = order <= max_order ? order : 0;
                     residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
                     nResidualTasks++;
                 }
@@ -921,9 +929,9 @@ namespace CUETools.Codecs.FlaCuda
             // Fixed prediction
             for (int ch = 0; ch < channelsCount; ch++)
             {
-                for (int order = 1; order <= 4; order++)
+                for (int order = 1; order <= 8; order++)
                 {
-                    residualTasks[nResidualTasks].residualOrder = order - 1;
+                    residualTasks[nResidualTasks].residualOrder = order <= 4 ? order : 0;
                     residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
                     residualTasks[nResidualTasks].shift = 0;
                     switch (order)
@@ -1025,9 +1033,11 @@ namespace CUETools.Codecs.FlaCuda
                 {
                     for (int order = 1; order <= max_order && order < frame.blocksize; order++)
                     {
-                        int index = (order - 1) + max_order * (iWindow + _windowcount * ch);
+                        int index = (order - 1) + ((max_order + 7) & ~7) * (iWindow + _windowcount * ch);
                         int cbits = residualTasks[index].cbits;
                         int nbits = order * (int)frame.subframes[ch].obits + 4 + 5 + order * cbits + 6 + residualTasks[index].size;
+                        if (residualTasks[index].residualOrder != order)
+                            throw new Exception("oops");
                         if (frame.subframes[ch].best.size > nbits)
                         {
                             frame.subframes[ch].best.type = SubframeType.LPC;
@@ -1048,8 +1058,10 @@ namespace CUETools.Codecs.FlaCuda
                 {
                     for (int order = 1; order <= 4 && order < frame.blocksize; order++)
                     {
-                        int index = (order - 1) + 4 * ch;
-                        int nbits = order * (int)frame.subframes[ch].obits + 6 + residualTasks[index + max_order * _windowcount * channelsCount].size;
+                        int index = (order - 1) + 8 * ch + ((max_order + 7) & ~7) * _windowcount * channelsCount;
+                        int nbits = order * (int)frame.subframes[ch].obits + 6 + residualTasks[index].size;
+                        if (residualTasks[index].residualOrder != order)
+                            throw new Exception("oops");
                         if (frame.subframes[ch].best.size > nbits)
                         {
                             frame.subframes[ch].best.type = SubframeType.Fixed;
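Note: the indexing that both the task-building loops and the best-subframe lookups above rely on can be restated host-side. The following is an illustrative sketch using the diff's own names, not code from the project:

    // Sketch of the task-table layout created above. LPC orders are padded
    // to a multiple of 8 per (window, channel) group so the GPU kernels get
    // a uniform stride; padding slots carry residualOrder == 0 and are never
    // read back. Fixed-prediction tasks follow all LPC tasks, 8 slots per
    // channel (orders 5..8 being padding).
    static class TaskLayout
    {
        public static int PaddedOrders(int maxOrder) { return (maxOrder + 7) & ~7; }

        public static int LpcTask(int order, int iWindow, int ch, int maxOrder, int windowCount)
        {
            return (order - 1) + PaddedOrders(maxOrder) * (iWindow + windowCount * ch);
        }

        public static int FixedTask(int order, int ch, int maxOrder, int windowCount, int channelsCount)
        {
            return (order - 1) + 8 * ch + PaddedOrders(maxOrder) * windowCount * channelsCount;
        }
    }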
@@ -1062,34 +1074,47 @@ namespace CUETools.Codecs.FlaCuda
         unsafe void estimate_residual(FlacFrame frame, int channelsCount, int max_order, int autocorPartCount, out int partCount)
         {
-            uint cbits = get_precision(frame.blocksize) + 1;
-            int residualThreads = 256;
-            int partSize = residualThreads - max_order;
-            partSize &= 0xffffff0;
-
-            partCount = (frame.blocksize + partSize - 1) / partSize;
-            if (partCount > maxResidualParts)
-                throw new Exception("internal error");
-
             if (frame.blocksize <= 4)
+            {
+                partCount = 0;
                 return;
+            }

-            cuda.SetParameter(cudaEstimateResidual, 0, (uint)cudaResidualOutput.Pointer);
-            cuda.SetParameter(cudaEstimateResidual, IntPtr.Size, (uint)cudaSamples.Pointer);
-            cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 2, (uint)cudaResidualTasks.Pointer);
-            cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 3, (uint)frame.blocksize);
-            cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 3 + sizeof(uint), (uint)partSize);
-            cuda.SetParameterSize(cudaEstimateResidual, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2U);
-            cuda.SetFunctionBlockShape(cudaEstimateResidual, residualThreads, 1, 1);
+            uint cbits = get_precision(frame.blocksize) + 1;
+            int partSize = 256 - 32;
+
+            partCount = (frame.blocksize + partSize - 1) / partSize;
+
+            if (partCount > maxResidualParts)
+                throw new Exception("invalid combination of block size and LPC order");
+
+            cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 0, (uint)cudaResidualOutput.Pointer);
+            cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 1, (uint)cudaSamples.Pointer);
+            cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 2, (uint)cudaResidualTasks.Pointer);
+            cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 3, (uint)max_order);
+            cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 4, (uint)frame.blocksize);
+            cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 5, (uint)partSize);
+            cuda.SetParameterSize(cudaEstimateResidual, sizeof(uint) * 6);
+            cuda.SetFunctionBlockShape(cudaEstimateResidual, 64, 4, 1);
+
+            //cuda.SetParameter(cudaSumResidualChunks, 0, (uint)cudaResidualSums.Pointer);
+            //cuda.SetParameter(cudaSumResidualChunks, sizeof(uint), (uint)cudaResidualTasks.Pointer);
+            //cuda.SetParameter(cudaSumResidualChunks, sizeof(uint) * 2, (uint)cudaResidualOutput.Pointer);
+            //cuda.SetParameter(cudaSumResidualChunks, sizeof(uint) * 3, (uint)frame.blocksize);
+            //cuda.SetParameter(cudaSumResidualChunks, sizeof(uint) * 4, (uint)partSize);
+            //cuda.SetParameterSize(cudaSumResidualChunks, sizeof(uint) * 5U);
+            //cuda.SetFunctionBlockShape(cudaSumResidualChunks, residualThreads, 1, 1);

             cuda.SetParameter(cudaSumResidual, 0, (uint)cudaResidualTasks.Pointer);
-            cuda.SetParameter(cudaSumResidual, IntPtr.Size, (uint)cudaResidualOutput.Pointer);
-            cuda.SetParameter(cudaSumResidual, IntPtr.Size * 2, (uint)partCount);
-            cuda.SetParameterSize(cudaSumResidual, (uint)(IntPtr.Size * 2) + sizeof(uint) * 1U);
+            cuda.SetParameter(cudaSumResidual, sizeof(uint), (uint)cudaResidualOutput.Pointer);
+            cuda.SetParameter(cudaSumResidual, sizeof(uint) * 2, (uint)partSize);
+            cuda.SetParameter(cudaSumResidual, sizeof(uint) * 3, (uint)partCount);
+            cuda.SetParameterSize(cudaSumResidual, sizeof(uint) * 4U);
             cuda.SetFunctionBlockShape(cudaSumResidual, 64, 1, 1);

             // issue work to the GPU
-            cuda.LaunchAsync(cudaEstimateResidual, partCount, nResidualTasks, cudaStream);
+            cuda.LaunchAsync(cudaEstimateResidual, partCount, nResidualTasks / 4, cudaStream);
+            //cuda.LaunchAsync(cudaSumResidualChunks, partCount, nResidualTasks, cudaStream);
             cuda.LaunchAsync(cudaSumResidual, 1, nResidualTasks, cudaStream);
             cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
             cuda.SynchronizeStream(cudaStream);
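Note: a quick sanity check on the geometry chosen above (a sketch built from the constants in this hunk, not project code): the partition size is fixed at 224 so that one partition plus up to 32 warm-up samples fits the kernel's 256-entry shared sample buffer, and a 64x4 block scores four tasks per partition.

    // Launch geometry for cudaEstimateResidual as configured above.
    static class EstimateGeometry
    {
        public const int PartSize = 256 - 32;   // 224: partition + 32 history samples fit data[256]
        public const int TasksPerBlock = 4;     // blockDim.y; grid y = nResidualTasks / 4

        public static int PartCount(int blocksize)
        {
            // grid x; ceil(8192 / 224) == 37, safely under maxResidualParts == 64
            return (blocksize + PartSize - 1) / PartSize;
        }
    }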
@@ -1109,21 +1134,21 @@ namespace CUETools.Codecs.FlaCuda
                 return;

             cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocorOutput.Pointer);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaSamples.Pointer);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaWindow.Pointer);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3, (uint)cudaAutocorTasks.Pointer);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 4, (uint)max_order);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 4 + sizeof(uint), (uint)frame.blocksize);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 4 + sizeof(uint) * 2, (uint)partSize);
-            cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 4) + sizeof(uint) * 3);
+            cuda.SetParameter(cudaComputeAutocor, sizeof(uint), (uint)cudaSamples.Pointer);
+            cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer);
+            cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 3, (uint)cudaAutocorTasks.Pointer);
+            cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order);
+            cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)frame.blocksize);
+            cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)partSize);
+            cuda.SetParameterSize(cudaComputeAutocor, (uint)(sizeof(uint) * 4) + sizeof(uint) * 3);
             cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1);

             cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaResidualTasks.Pointer);
-            cuda.SetParameter(cudaComputeLPC, IntPtr.Size, (uint)cudaAutocorOutput.Pointer);
-            cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 2, (uint)cudaAutocorTasks.Pointer);
-            cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3, (uint)max_order);
-            cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3 + sizeof(uint), (uint)partCount);
-            cuda.SetParameterSize(cudaComputeLPC, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2);
+            cuda.SetParameter(cudaComputeLPC, sizeof(uint), (uint)cudaAutocorOutput.Pointer);
+            cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 2, (uint)cudaAutocorTasks.Pointer);
+            cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 3, (uint)max_order);
+            cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)partCount);
+            cuda.SetParameterSize(cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2);
             cuda.SetFunctionBlockShape(cudaComputeLPC, 64, 1, 1);

             // issue work to the GPU
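Note: the offset change in this hunk follows from the --machine 32 build step above: every kernel argument, device pointer or int, now occupies 4 bytes in the parameter block, whereas IntPtr.Size is 8 on an x64 host and would misplace every argument after the first. A hypothetical helper makes the convention explicit (a sketch; PushArgs is not part of CUDA.NET or this project, but uses only the SetParameter/SetParameterSize calls already shown):

    // Sketch: pack uniform 4-byte kernel arguments for a --machine 32 cubin.
    static void PushArgs(CUDA cuda, CUfunction f, params uint[] args)
    {
        for (int i = 0; i < args.Length; i++)
            cuda.SetParameter(f, sizeof(uint) * i, args[i]);   // 4-byte slots, as above
        cuda.SetParameterSize(f, (uint)(sizeof(uint) * args.Length));
    }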
@@ -1268,24 +1293,30 @@ namespace CUETools.Codecs.FlaCuda
             if (!inited)
             {
                 cuda = new CUDA(true, InitializationFlags.None);
-                cuda.CreateContext(0, CUCtxFlags.SchedSpin);
-                cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
+                cuda.CreateContext(0, CUCtxFlags.BlockingSync);
+                using (Stream cubin = GetType().Assembly.GetManifestResourceStream(GetType(), "flacuda.cubin"))
+                using (StreamReader sr = new StreamReader(cubin))
+                    cuda.LoadModule(new ASCIIEncoding().GetBytes(sr.ReadToEnd()));
+                //cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
                 cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
                 cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
                 cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
                 cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
+                cudaSumResidualChunks = cuda.GetModuleFunction("cudaSumResidualChunks");
                 cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
                 cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
                 cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
                 cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
                 cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
                 cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4)));
-                cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_WINDOWS * maxResidualParts));
+                cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4)));
+                cudaResidualSums = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4) * maxResidualParts));
+                //cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4) * maxResidualParts));
                 CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
                 if (cuErr == CUResult.Success)
                     cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
                 if (cuErr == CUResult.Success)
-                    cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 4)));
+                    cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8)));
                 if (cuErr != CUResult.Success)
                 {
                     if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
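Note: the resource load above round-trips the cubin through a StreamReader and ASCIIEncoding, which works while the cubin is a plain-text image; copying the raw bytes is equivalent and drops the encoding assumption. A sketch using only calls already present in this hunk (plus System.IO.MemoryStream):

    // Sketch: load the embedded cubin byte-for-byte instead of as ASCII text.
    using (Stream cubin = GetType().Assembly.GetManifestResourceStream(GetType(), "flacuda.cubin"))
    using (MemoryStream copy = new MemoryStream())
    {
        byte[] buf = new byte[4096];
        int n;
        while ((n = cubin.Read(buf, 0, buf.Length)) > 0)
            copy.Write(buf, 0, n);
        cuda.LoadModule(copy.ToArray());    // same byte[] overload used above
    }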
@@ -1678,7 +1709,7 @@ namespace CUETools.Codecs.FlaCuda
                 case 0:
                     do_midside = false;
                     window_function = WindowFunction.Bartlett;
-                    max_prediction_order = 7;
+                    max_prediction_order = 8;
                     max_partition_order = 4;
                     break;
                 case 1:
@@ -1694,7 +1725,7 @@ namespace CUETools.Codecs.FlaCuda
                     break;
                 case 3:
                     window_function = WindowFunction.Bartlett;
-                    max_prediction_order = 7;
+                    max_prediction_order = 8;
                     break;
                 case 4:
                     window_function = WindowFunction.Bartlett;
@@ -1704,7 +1735,7 @@ namespace CUETools.Codecs.FlaCuda
                     window_function = WindowFunction.Bartlett;
                     break;
                 case 6:
-                    max_prediction_order = 10;
+                    //max_prediction_order = 10;
                     break;
                 case 7:
                     break;
diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu
index b208276..a9ed84a 100644
--- a/CUETools.FlaCuda/flacuda.cu
+++ b/CUETools.FlaCuda/flacuda.cu
@@ -155,6 +155,7 @@ extern "C" __global__ void cudaComputeLPC(
     if (tid < 32)
     {
 	int precision = 13;
+	int taskNo = (blockIdx.x + blockIdx.y * gridDim.x) * ((max_order + 7) & ~7) + order;
 	shared.bits[tid] = __mul24((33 - __clz(__float2int_rn(fabs(shared.tmp[tid]) * (1 << 15))) - precision), tid <= order);
 	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
 	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
@@ -164,9 +165,9 @@ extern "C" __global__ void cudaComputeLPC(
 	int sh = max(0,min(15, 15 - shared.bits[0]));
 	int coef = max(-(1 << precision),min((1 << precision)-1,__float2int_rn(-shared.tmp[tid] * (1 << sh))));
 	if (tid <= order)
-	    output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].coefs[tid] = coef;
+	    output[taskNo].coefs[tid] = coef;
 	if (tid == 0)
-	    output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].shift = sh;
+	    output[taskNo].shift = sh;
 	shared.bits[tid] = 33 - max(__clz(coef),__clz(-1 ^ coef));
 	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
 	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
@@ -175,51 +176,111 @@ extern "C" __global__ void cudaComputeLPC(
 	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
 	int cbits = shared.bits[0];
 	if (tid == 0)
-	    output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].cbits = cbits;
+	    output[taskNo].cbits = cbits;
     }
     __syncthreads();
     }
 }

+// blockDim.x == 64
+// blockDim.y == 4
 extern "C" __global__ void cudaEstimateResidual(
     int*output,
     int*samples,
     encodeResidualTaskStruct *tasks,
+    int max_order,
     int frameSize,
-    int partSize // should be <= blockDim - max_order
+    int partSize // should be 224
     )
 {
     __shared__ struct {
 	int data[256];
 	int residual[256];
-	int rice[32];
-	encodeResidualTaskStruct task;
+	int rice[256];
+	int sums[8];
+	encodeResidualTaskStruct task[8];
     } shared;
-    const int tid = threadIdx.x;
-    // fetch task data
-    if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
-	((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+    const int tid = threadIdx.x + threadIdx.y * blockDim.x;
+    // fetch task data (8 * 64 == 512 elements)
+    ((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y * blockDim.y))[tid];
+    ((int*)&shared.task)[tid + 256] = ((int*)(tasks + blockIdx.y * blockDim.y))[tid + 256];
     __syncthreads();
-    const int pos = blockIdx.x * partSize;
-    const int residualOrder = shared.task.residualOrder + 1;
-    const int residualLen = min(frameSize - pos - residualOrder, partSize);
-    const int dataLen = residualLen + residualOrder;
+    const int residualOrder = shared.task[threadIdx.y].residualOrder;
+    const int partNumber = blockIdx.x;
+    const int pos = partNumber * partSize;
+    const int dataLen = min(frameSize - pos, partSize + max_order) * (residualOrder != 0);

     // fetch samples
-    shared.data[tid] = (tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0);
+    shared.data[tid] = (tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] : 0);
+    if (tid < blockDim.y) shared.sums[tid] = 0;
+
+    // set upper residuals to zero, in case blockDim < 256
+    //shared.residual[255 - tid] = 0;
+
+    const int residualLen = min(frameSize - pos - residualOrder, partSize) * (residualOrder != 0);

     // reverse coefs
-    if (tid < residualOrder) shared.task.coefs[tid] = shared.task.coefs[residualOrder - 1 - tid];
-
-    // compute residual
-    __syncthreads();
-    long sum = 0;
-    for (int c = 0; c < residualOrder; c++)
-	sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
-    int res = shared.data[tid + residualOrder] - (sum >> shared.task.shift);
-    shared.residual[tid] = __mul24(tid < residualLen, (2 * res) ^ (res >> 31));
+    if (threadIdx.x < residualOrder) shared.task[threadIdx.y].coefs[threadIdx.x] = shared.task[threadIdx.y].coefs[residualOrder - 1 - threadIdx.x];
     __syncthreads();
+
+    for (int i = 0; i < residualLen; i += blockDim.x)
+    {
+	// compute residual
+	long sum = 0;
+	for (int c = 0; c < residualOrder; c++)
+	    sum += __mul24(shared.data[i + threadIdx.x + c], shared.task[threadIdx.y].coefs[c]);
+	int res = shared.data[i + threadIdx.x + residualOrder] - (sum >> shared.task[threadIdx.y].shift);
+	shared.residual[tid] = __mul24(i + threadIdx.x < residualLen, (2 * res) ^ (res >> 31));
+	__syncthreads();
+	if (threadIdx.x < 32) shared.residual[tid] += shared.residual[tid + 32];
+	__syncthreads();
+	shared.residual[tid] += shared.residual[tid + 16];
+	shared.residual[tid] += shared.residual[tid + 8];
+	shared.residual[tid] += shared.residual[tid + 4];
+	shared.residual[tid] += shared.residual[tid + 2];
+	if (threadIdx.x == 0) shared.sums[threadIdx.y] += shared.residual[tid] + shared.residual[tid + 1];
+    }
+
+    // rice parameter search
+    shared.rice[tid] = __mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.sums[threadIdx.y] - (residualLen >> 1)) >> threadIdx.x);
+    shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]);
+    shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]);
+    shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
+    if (threadIdx.x == 0 && residualOrder != 0)
+	output[(blockIdx.y * blockDim.y + threadIdx.y) * gridDim.x + blockIdx.x] = min(shared.rice[tid], shared.rice[tid + 1]);
+}
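Note: restated on the host, one (task, partition) pair above computes the following. This is a reference sketch mirroring the kernel's formulas, not project code; the kernel additionally uses __mul24 and shared-memory reductions:

    // Host-side reference for cudaEstimateResidual's per-partition estimate.
    static int EstimatePartition(int[] samples, int pos, int len, int[] coefs, int order, int shift)
    {
        int sum = 0;
        for (int i = 0; i < len; i++)
        {
            long acc = 0;
            for (int c = 0; c < order; c++)                   // kernel reverses coefs in place first
                acc += (long)samples[pos + i + c] * coefs[order - 1 - c];
            int res = samples[pos + i + order] - (int)(acc >> shift);
            sum += (2 * res) ^ (res >> 31);                   // zigzag-fold to unsigned
        }
        int best = int.MaxValue;                              // Rice size: len*(k+1) + sum >> k
        for (int k = 0; k <= 14; k++)                         // kernel scans k = 0..14
            best = Math.Min(best, len * (k + 1) + ((sum - (len >> 1)) >> k));
        return best;
    }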
+// blockDim.x == 256
+// gridDim.x == frameSize / chunkSize
+extern "C" __global__ void cudaSumResidualChunks(
+    int *output,
+    encodeResidualTaskStruct *tasks,
+    int *residual,
+    int frameSize,
+    int chunkSize // <= blockDim.x (256)
+    )
+{
+    __shared__ struct {
+	int residual[256];
+	int rice[32];
+    } shared;
+
+    // fetch parameters
+    const int tid = threadIdx.x;
+    const int residualOrder = tasks[blockIdx.y].residualOrder;
+    const int chunkNumber = blockIdx.x;
+    const int pos = chunkNumber * chunkSize;
+    const int residualLen = min(frameSize - pos - residualOrder, chunkSize);
+
+    // set upper residuals to zero, in case blockDim < 256
+    shared.residual[255 - tid] = 0;
+
+    // read residual
+    int res = (tid < residualLen) ? residual[blockIdx.y * 8192 + pos + tid] : 0;
+
+    // convert to unsigned
+    shared.residual[tid] = (2 * res) ^ (res >> 31);
+    __syncthreads();
+
     // residual sum: reduction in shared mem
     if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads();
     if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads();
@@ -229,7 +290,6 @@ extern "C" __global__ void cudaEstimateResidual(
     shared.residual[tid] += shared.residual[tid + 4];
     shared.residual[tid] += shared.residual[tid + 2];
     shared.residual[tid] += shared.residual[tid + 1];
-    __syncthreads();

     if (tid < 32)
     {
@@ -240,6 +300,8 @@ extern "C" __global__ void cudaEstimateResidual(
 	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
 	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]);
     }
+
+    // write output
     if (tid == 0)
 	output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
 }

@@ -247,37 +309,35 @@
 extern "C" __global__ void cudaSumResidual(
     encodeResidualTaskStruct *tasks,
     int *residual,
-    int partCount // <= blockDim.y (64)
+    int partSize,
+    int partCount // <= blockDim.y (256)
     )
 {
     __shared__ struct {
-	int partLen[64];
-	//encodeResidualTaskStruct task;
+	int partLen[256];
+	encodeResidualTaskStruct task;
     } shared;
     const int tid = threadIdx.x;
     // fetch task data
-    // if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
-    //((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
-    // __syncthreads();
+    if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
+	((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+    __syncthreads();
     shared.partLen[tid] = (tid < partCount) ? residual[tid + partCount * blockIdx.y] : 0;
-    __syncthreads();

     // length sum: reduction in shared mem
+    //if (tid < 128) shared.partLen[tid] += shared.partLen[tid + 128]; __syncthreads();
+    //if (tid < 64) shared.partLen[tid] += shared.partLen[tid + 64]; __syncthreads();
     if (tid < 32) shared.partLen[tid] += shared.partLen[tid + 32]; __syncthreads();
     shared.partLen[tid] += shared.partLen[tid + 16];
     shared.partLen[tid] += shared.partLen[tid + 8];
     shared.partLen[tid] += shared.partLen[tid + 4];
     shared.partLen[tid] += shared.partLen[tid + 2];
     shared.partLen[tid] += shared.partLen[tid + 1];
-    __syncthreads();
-
-    // FIXME: should process partition order here!!!
-
-    // return sum
     if (tid == 0)
-	tasks[blockIdx.y].size = shared.partLen[0];
+	tasks[blockIdx.y].size = shared.partLen[0];
 }
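Note: cudaSumResidual above then collapses the per-partition estimates into one size per task; the host-side equivalent is a plain sum over the same layout (a sketch, not project code):

    // Host-side reference for cudaSumResidual: a task's estimated residual
    // size is the sum of its per-partition Rice estimates, laid out as
    // residual[part + partCount * taskNo].
    static int TaskSize(int[] partitionEstimates, int taskNo, int partCount)
    {
        int size = 0;
        for (int p = 0; p < partCount; p++)
            size += partitionEstimates[p + partCount * taskNo];
        return size;
    }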
@@ -288,58 +348,6 @@ extern "C" __global__ void cudaEncodeResidual(
     int*output,
     int*samples,
     encodeResidualTaskStruct *tasks,
     int frameSize,
     int partSize // should be <= blockDim - max_order
     )
 {
-    __shared__ struct {
-	int data[256];
-	int residual[256];
-	int rice[32];
-	encodeResidualTaskStruct task;
-    } shared;
-    const int tid = threadIdx.x;
-    // fetch task data
-    if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
-	((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
-    __syncthreads();
-    const int pos = blockIdx.x * partSize;
-    const int residualOrder = shared.task.residualOrder + 1;
-    const int residualLen = min(frameSize - pos - residualOrder, partSize);
-    const int dataLen = residualLen + residualOrder;
-
-    // fetch samples
-    shared.data[tid] = (tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0);
-
-    // reverse coefs
-    if (tid < residualOrder) shared.task.coefs[tid] = shared.task.coefs[residualOrder - 1 - tid];
-
-    // compute residual
-    __syncthreads();
-    long sum = 0;
-    for (int c = 0; c < residualOrder; c++)
-	sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
-    int res = shared.data[tid + residualOrder] - (sum >> shared.task.shift);
-    shared.residual[tid] = __mul24(tid < residualLen, (2 * res) ^ (res >> 31));
-
-    __syncthreads();
-    // residual sum: reduction in shared mem
-    if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads();
-    if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads();
-    if (tid < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
-    shared.residual[tid] += shared.residual[tid + 16];
-    shared.residual[tid] += shared.residual[tid + 8];
-    shared.residual[tid] += shared.residual[tid + 4];
-    shared.residual[tid] += shared.residual[tid + 2];
-    shared.residual[tid] += shared.residual[tid + 1];
-    __syncthreads();
-
-    if (tid < 32)
-    {
-	// rice parameter search
-	shared.rice[tid] = __mul24(tid >= 15, 0x7fffff) + residualLen * (tid + 1) + ((shared.residual[0] - (residualLen >> 1)) >> tid);
-	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]);
-	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]);
-	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
-	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]);
-    }
-    if (tid == 0)
-	output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
 }

 #endif