diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs
index 2c592c4..4b2e066 100644
--- a/CUETools.FlaCuda/FlaCudaWriter.cs
+++ b/CUETools.FlaCuda/FlaCudaWriter.cs
@@ -95,20 +95,23 @@ namespace CUETools.Codecs.FlaCuda
 		CUDA cuda;
 		CUfunction cudaComputeAutocor;
 		CUfunction cudaComputeLPC;
+		CUfunction cudaEstimateResidual;
+		CUfunction cudaSumResidual;
 		CUfunction cudaEncodeResidual;
 		CUdeviceptr cudaSamples;
 		CUdeviceptr cudaWindow;
 		CUdeviceptr cudaAutocorTasks;
 		CUdeviceptr cudaAutocorOutput;
-		CUdeviceptr cudaCompLPCOutput;
 		CUdeviceptr cudaResidualTasks;
 		CUdeviceptr cudaResidualOutput;
 		IntPtr samplesBufferPtr = IntPtr.Zero;
 		IntPtr autocorTasksPtr = IntPtr.Zero;
-		IntPtr compLPCOutputPtr = IntPtr.Zero;
 		IntPtr residualTasksPtr = IntPtr.Zero;
-		IntPtr residualOutputPtr = IntPtr.Zero;
 		CUstream cudaStream;
+		CUstream cudaStream1;
+
+		int nResidualTasks = 0;
+		int nAutocorTasks = 0;
 
 		const int MAX_BLOCKSIZE = 8192;
 		const int maxResidualParts = MAX_BLOCKSIZE / (256 - 32);
@@ -213,15 +216,13 @@ namespace CUETools.Codecs.FlaCuda
 				cuda.Free(cudaSamples);
 				cuda.Free(cudaAutocorTasks);
 				cuda.Free(cudaAutocorOutput);
-				cuda.Free(cudaCompLPCOutput);
 				cuda.Free(cudaResidualTasks);
 				cuda.Free(cudaResidualOutput);
-				CUDADriver.cuMemFreeHost(compLPCOutputPtr);
-				CUDADriver.cuMemFreeHost(residualOutputPtr);
 				CUDADriver.cuMemFreeHost(samplesBufferPtr);
 				CUDADriver.cuMemFreeHost(residualTasksPtr);
 				CUDADriver.cuMemFreeHost(autocorTasksPtr);
 				cuda.DestroyStream(cudaStream);
+				cuda.DestroyStream(cudaStream1);
 				cuda.Dispose();
 				inited = false;
 			}
@@ -247,15 +248,13 @@ namespace CUETools.Codecs.FlaCuda
 				cuda.Free(cudaSamples);
 				cuda.Free(cudaAutocorTasks);
 				cuda.Free(cudaAutocorOutput);
-				cuda.Free(cudaCompLPCOutput);
 				cuda.Free(cudaResidualTasks);
 				cuda.Free(cudaResidualOutput);
-				CUDADriver.cuMemFreeHost(compLPCOutputPtr);
-				CUDADriver.cuMemFreeHost(residualOutputPtr);
 				CUDADriver.cuMemFreeHost(samplesBufferPtr);
 				CUDADriver.cuMemFreeHost(residualTasksPtr);
 				CUDADriver.cuMemFreeHost(autocorTasksPtr);
 				cuda.DestroyStream(cudaStream);
+				cuda.DestroyStream(cudaStream1);
 				cuda.Dispose();
 				inited = false;
 			}
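[review note] All of the host buffers freed above come from cuMemAllocHost, and that is load-bearing: the CopyHostToDeviceAsync/CopyDeviceToHostAsync entry points used throughout this patch require page-locked host memory. A minimal sketch of the pattern, using only calls that already appear in this file (the buffer and size are placeholders, not names from this patch):

```csharp
// Sketch: allocate pinned host memory, copy asynchronously on a stream,
// and only touch the data after the stream has been synchronized.
uint byteCount = 4096;                            // example transfer size
CUdeviceptr devicePtr = cuda.Allocate(byteCount); // any device buffer
IntPtr pinned = IntPtr.Zero;
CUResult cuErr = CUDADriver.cuMemAllocHost(ref pinned, byteCount);
if (cuErr != CUResult.Success)
    throw new CUDAException(cuErr);
cuda.CopyHostToDeviceAsync(devicePtr, pinned, byteCount, cudaStream);
cuda.SynchronizeStream(cudaStream); // the copy has completed only after this
CUDADriver.cuMemFreeHost(pinned);   // pinned memory goes back via cuMemFreeHost
cuda.Free(devicePtr);
```

The second stream (cudaStream1) is created and destroyed in this revision but not yet used for real work; its only consumer is a commented-out copy-back in compute_autocor below, presumably groundwork for overlapping transfers with kernels.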
@@ -898,6 +897,65 @@ namespace CUETools.Codecs.FlaCuda
 			_windowcount++;
 		}
 
+		unsafe void initialize_autocorTasks(int channelsCount, int max_order)
+		{
+			computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)autocorTasksPtr;
+			encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
+			nAutocorTasks = 0;
+			nResidualTasks = 0;
+			for (int ch = 0; ch < channelsCount; ch++)
+				for (int iWindow = 0; iWindow < _windowcount; iWindow++)
+				{
+					// Autocorrelation task
+					autocorTasks[nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
+					autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE;
+					nAutocorTasks++;
+					// LPC tasks
+					for (int order = 1; order <= max_order; order++)
+					{
+						residualTasks[nResidualTasks].residualOrder = order - 1;
+						residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
+						nResidualTasks++;
+					}
+				}
+			// Fixed prediction
+			for (int ch = 0; ch < channelsCount; ch++)
+			{
+				for (int order = 1; order <= 4; order++)
+				{
+					residualTasks[nResidualTasks].residualOrder = order - 1;
+					residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
+					residualTasks[nResidualTasks].shift = 0;
+					switch (order)
+					{
+						case 1:
+							residualTasks[nResidualTasks].coefs[0] = 1;
+							break;
+						case 2:
+							residualTasks[nResidualTasks].coefs[0] = 2;
+							residualTasks[nResidualTasks].coefs[1] = -1;
+							break;
+						case 3:
+							residualTasks[nResidualTasks].coefs[0] = 3;
+							residualTasks[nResidualTasks].coefs[1] = -3;
+							residualTasks[nResidualTasks].coefs[2] = 1;
+							break;
+						case 4:
+							residualTasks[nResidualTasks].coefs[0] = 4;
+							residualTasks[nResidualTasks].coefs[1] = -6;
+							residualTasks[nResidualTasks].coefs[2] = 4;
+							residualTasks[nResidualTasks].coefs[3] = -1;
+							break;
+					}
+					nResidualTasks++;
+				}
+			}
+
+			cuda.CopyHostToDeviceAsync(cudaAutocorTasks, autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), cudaStream);
+			cuda.CopyHostToDeviceAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
+			cuda.SynchronizeStream(cudaStream);
+		}
+
 		unsafe void encode_residual(FlacFrame frame)
 		{
 			for (int ch = 0; ch < channels; ch++)
@@ -968,19 +1026,8 @@ namespace CUETools.Codecs.FlaCuda
 					for (int order = 1; order <= max_order && order < frame.blocksize; order++)
 					{
 						int index = (order - 1) + max_order * (iWindow + _windowcount * ch);
-						int nbits = 0;
-						for (int p = 0; p < partCount; p++)
-							nbits += ((int*)residualOutputPtr)[p + partCount * index];
-
-						int cbits = 1;
-						for (int i = order; i > 0; i--)
-						{
-							int c = residualTasks[index].coefs[i - 1];
-							while (cbits < 16 && c != (c << (32 - cbits)) >> (32 - cbits))
-								cbits++;
-						}
-
-						nbits += order * (int)frame.subframes[ch].obits + 4 + 5 + order * cbits + 6;
+						int cbits = residualTasks[index].cbits;
+						int nbits = order * (int)frame.subframes[ch].obits + 4 + 5 + order * cbits + 6 + residualTasks[index].size;
 						if (frame.subframes[ch].best.size > nbits)
 						{
 							frame.subframes[ch].best.type = SubframeType.LPC;
@@ -989,8 +1036,8 @@ namespace CUETools.Codecs.FlaCuda
 							frame.subframes[ch].best.window = iWindow;
 							frame.subframes[ch].best.cbits = cbits;
 							frame.subframes[ch].best.shift = residualTasks[index].shift;
-							fixed (int* fcoefs = frame.subframes[ch].best.coefs)
-								AudioSamples.MemCpy(fcoefs, residualTasks[index].coefs, order);
+							for (int i = 0; i < order; i++)
+								frame.subframes[ch].best.coefs[i] = residualTasks[index].coefs[i];//order - 1 - i];
 						}
 					}
 			}
@@ -1002,10 +1049,7 @@ namespace CUETools.Codecs.FlaCuda
 				for (int order = 1; order <= 4 && order < frame.blocksize; order++)
 				{
 					int index = (order - 1) + 4 * ch;
-					int nbits = 0;
-					for (int p = 0; p < partCount; p++)
-						nbits += ((int*)residualOutputPtr)[p + partCount * (index + max_order * _windowcount * channelsCount)];
-					nbits += order * (int)frame.subframes[ch].obits + 6;
+					int nbits = order * (int)frame.subframes[ch].obits + 6 + residualTasks[index + max_order * _windowcount * channelsCount].size;
 					if (frame.subframes[ch].best.size > nbits)
 					{
 						frame.subframes[ch].best.type = SubframeType.Fixed;
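[review note] The nbits formulas above now read the per-task cbits and size values that the GPU fills in, instead of re-deriving them from raw residual partitions on the host. The constant terms are the FLAC subframe header costs; a host-side mirror of the estimate (sketch; the helper name is ours):

```csharp
// Estimated size in bits of an LPC subframe, as computed above:
// warm-up samples, the 4-bit precision and 5-bit shift fields, the
// quantized coefficients, and 6 bits of residual-coding header
// (2-bit coding method + 4-bit partition order).
static int EstimateLpcSubframeBits(int order, int obits, int cbits, int residualBits)
{
    return order * obits   // verbatim warm-up samples
         + 4 + 5           // coefficient precision + quantization shift
         + order * cbits   // quantized predictor coefficients
         + 6               // residual coding method + partition order
         + residualBits;   // Rice-coded residual, summed by cudaSumResidual
}
```

For fixed predictors the coefficients are implicit, so the same estimate collapses to order * obits + 6 + size, exactly as in the second loop.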
@@ -1018,11 +1062,10 @@ namespace CUETools.Codecs.FlaCuda
 		unsafe void estimate_residual(FlacFrame frame, int channelsCount, int max_order, int autocorPartCount, out int partCount)
 		{
-			encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
 			uint cbits = get_precision(frame.blocksize) + 1;
-			int nResidualTasks = 0;
 			int residualThreads = 256;
 			int partSize = residualThreads - max_order;
+			partSize &= 0xffffff0;
 			partCount = (frame.blocksize + partSize - 1) / partSize;
 
 			if (partCount > maxResidualParts)
@@ -1031,92 +1074,24 @@ namespace CUETools.Codecs.FlaCuda
 			if (frame.blocksize <= 4)
 				return;
 
-			// LPC
-			for (int ch = 0; ch < channelsCount; ch++)
-				for (int iWindow = 0; iWindow < _windowcount; iWindow++)
-				{
-					//int* lpcs = ((int*)compLPCOutputPtr) + (max_order + 1) * max_order * (iWindow + _windowcount * ch);
-					//for (int order = 1; order <= max_order; order++)
-					//{
-					//    residualTasks[nResidualTasks].residualOrder = order - 1;
-					//    residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
-					//    residualTasks[nResidualTasks].shift = lpcs[order + (order - 1) * (max_order + 1)];
-					//    AudioSamples.MemCpy(residualTasks[nResidualTasks].coefs, lpcs + (order - 1) * (max_order + 1), order);
-					//    nResidualTasks++;
-					//}
-					float* lpcs = ((float*)compLPCOutputPtr) + max_order * max_order * (iWindow + _windowcount * ch);
-					for (int order = 1; order <= max_order; order++)
-					{
-						residualTasks[nResidualTasks].residualOrder = order - 1;
-						residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
+			cuda.SetParameter(cudaEstimateResidual, 0, (uint)cudaResidualOutput.Pointer);
+			cuda.SetParameter(cudaEstimateResidual, IntPtr.Size, (uint)cudaSamples.Pointer);
+			cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 2, (uint)cudaResidualTasks.Pointer);
+			cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 3, (uint)frame.blocksize);
+			cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 3 + sizeof(uint), (uint)partSize);
+			cuda.SetParameterSize(cudaEstimateResidual, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2U);
+			cuda.SetFunctionBlockShape(cudaEstimateResidual, residualThreads, 1, 1);
 
-						lpc.quantize_lpc_coefs(lpcs + (order - 1) * max_order,
-							order, cbits, residualTasks[nResidualTasks].coefs,
-							out residualTasks[nResidualTasks].shift, 15, 0);
-
-						nResidualTasks++;
-					}
-				}
-			// FIXED
-			for (int ch = 0; ch < channelsCount; ch++)
-			{
-				for (int order = 1; order <= 4; order++)
-				{
-					residualTasks[nResidualTasks].residualOrder = order - 1;
-					residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
-					residualTasks[nResidualTasks].shift = 0;
-					switch (order)
-					{
-						case 1:
-							residualTasks[nResidualTasks].coefs[0] = 1;
-							break;
-						case 2:
-							residualTasks[nResidualTasks].coefs[0] = 2;
-							residualTasks[nResidualTasks].coefs[1] = -1;
-							break;
-						case 3:
-							residualTasks[nResidualTasks].coefs[0] = 3;
-							residualTasks[nResidualTasks].coefs[1] = -3;
-							residualTasks[nResidualTasks].coefs[2] = 1;
-							break;
-						case 4:
-							residualTasks[nResidualTasks].coefs[0] = 4;
-							residualTasks[nResidualTasks].coefs[1] = -6;
-							residualTasks[nResidualTasks].coefs[2] = 4;
-							residualTasks[nResidualTasks].coefs[3] = -1;
-							break;
-					}
-					nResidualTasks++;
-				}
-			}
-
-			cuda.SetParameter(cudaEncodeResidual, 0, (uint)cudaResidualOutput.Pointer);
-			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size, (uint)cudaSamples.Pointer);
-			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 2, (uint)cudaResidualTasks.Pointer);
-			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 3, (uint)frame.blocksize);
-			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 3 + sizeof(uint), (uint)partSize);
-			cuda.SetParameterSize(cudaEncodeResidual, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2U);
-			cuda.SetFunctionBlockShape(cudaEncodeResidual, residualThreads, 1, 1);
+			cuda.SetParameter(cudaSumResidual, 0, (uint)cudaResidualTasks.Pointer);
+			cuda.SetParameter(cudaSumResidual, IntPtr.Size, (uint)cudaResidualOutput.Pointer);
+			cuda.SetParameter(cudaSumResidual, IntPtr.Size * 2, (uint)partCount);
+			cuda.SetParameterSize(cudaSumResidual, (uint)(IntPtr.Size * 2) + sizeof(uint) * 1U);
+			cuda.SetFunctionBlockShape(cudaSumResidual, 64, 1, 1);
 
 			// issue work to the GPU
-			cuda.CopyHostToDeviceAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
-			cuda.LaunchAsync(cudaEncodeResidual, partCount, nResidualTasks, cudaStream);
-			cuda.CopyDeviceToHostAsync(cudaResidualOutput, residualOutputPtr, (uint)(sizeof(int) * partCount * nResidualTasks), cudaStream);
-			cuda.SynchronizeStream(cudaStream);
-		}
-
-		unsafe void initialize_autocorTasks()
-		{
-			computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)autocorTasksPtr;
-			int nAutocorTasks = 0;
-			for (int ch = 0; ch < (channels == 2 ? 4 : channels); ch++)
-				for (int iWindow = 0; iWindow < _windowcount; iWindow++)
-				{
-					autocorTasks[nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
-					autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE;
-					nAutocorTasks++;
-				}
-			cuda.CopyHostToDeviceAsync(cudaAutocorTasks, autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), cudaStream);
+			cuda.LaunchAsync(cudaEstimateResidual, partCount, nResidualTasks, cudaStream);
+			cuda.LaunchAsync(cudaSumResidual, 1, nResidualTasks, cudaStream);
+			cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
 			cuda.SynchronizeStream(cudaStream);
 		}
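[review note] estimate_residual no longer builds the task table or copies residual partitions back; it issues two kernels and retrieves only the task array. cudaEstimateResidual writes one Rice-cost estimate per (partition, task) pair at index part + partCount * task, and cudaSumResidual collapses each task's row into tasks[task].size. (The new partSize &= 0xffffff0 rounds the partition length down to a multiple of 16, keeping the per-partition sample windows aligned.) A serial mirror of the second stage (sketch):

```csharp
// What cudaSumResidual computes, written out on the host for reference:
// costs is the partCount x nTasks matrix produced by cudaEstimateResidual.
static void SumResidualReference(int[] costs, int partCount, int nTasks, int[] taskSize)
{
    for (int task = 0; task < nTasks; task++)
    {
        int sum = 0;
        for (int part = 0; part < partCount; part++)
            sum += costs[part + partCount * task];
        taskSize[task] = sum; // lands in encodeResidualTaskStruct.size
    }
}
```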
@@ -1124,7 +1099,7 @@ namespace CUETools.Codecs.FlaCuda
 		{
 			int autocorThreads = 256;
 			int partSize = 2 * autocorThreads - max_order;
-			int nAutocorTasks = _windowcount * channelsCount;
+			partSize &= 0xffffff0;
 			partCount = (frame.blocksize + partSize - 1) / partSize;
 
 			if (partCount > maxAutocorParts)
@@ -1143,22 +1118,22 @@ namespace CUETools.Codecs.FlaCuda
 			cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 4) + sizeof(uint) * 3);
 			cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1);
 
-			cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaCompLPCOutput.Pointer);
+			cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaResidualTasks.Pointer);
 			cuda.SetParameter(cudaComputeLPC, IntPtr.Size, (uint)cudaAutocorOutput.Pointer);
 			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 2, (uint)cudaAutocorTasks.Pointer);
 			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3, (uint)max_order);
 			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3 + sizeof(uint), (uint)partCount);
 			cuda.SetParameterSize(cudaComputeLPC, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2);
-			cuda.SetFunctionBlockShape(cudaComputeLPC, 32, 1, 1);
+			cuda.SetFunctionBlockShape(cudaComputeLPC, 64, 1, 1);
 
 			// issue work to the GPU
 			cuda.CopyHostToDeviceAsync(cudaSamples, samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), cudaStream);
 			cuda.LaunchAsync(cudaComputeAutocor, partCount, nAutocorTasks, cudaStream);
 			cuda.LaunchAsync(cudaComputeLPC, 1, nAutocorTasks, cudaStream);
-			cuda.CopyDeviceToHostAsync(cudaCompLPCOutput, compLPCOutputPtr, (uint)(sizeof(float) * (max_order + 1) * max_order * nAutocorTasks), cudaStream);
 			cuda.SynchronizeStream(cudaStream);
+			//cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream1);
 		}
-	
+
 		unsafe int encode_frame(out int size)
 		{
 			int* s = (int*)samplesBufferPtr;
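[review note] With cudaComputeLPC now writing quantized coefficients straight into cudaResidualTasks (its first parameter above), the float LPC output buffer and its copy-back disappear entirely. For orientation, this is what one (task, partition) block of cudaComputeAutocor accumulates, written out serially (sketch, inferred from the reduction code in flacuda.cu below):

```csharp
// Partial autocorrelation of the windowed signal over one partition,
// for lags 0..maxOrder; cudaComputeLPC sums these partials across
// partitions before running the Levinson-Durbin recursion.
static void PartialAutocorr(int[] samples, float[] window, int pos, int len, int maxOrder, float[] autoc)
{
    for (int lag = 0; lag <= maxOrder; lag++)
    {
        float sum = 0f;
        for (int i = pos; i + lag < pos + len; i++)
            sum += samples[i] * window[i] * samples[i + lag] * window[i + lag];
        autoc[lag] += sum; // accumulated per (task, lag)
    }
}
```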
@@ -1167,6 +1142,9 @@ namespace CUETools.Codecs.FlaCuda
 			{
 				frame.InitSize(eparams.block_size, eparams.variable_block_size != 0);
 
+				bool doMidside = channels == 2 && eparams.do_midside;
+				int channelCount = doMidside ? 2 * channels : channels;
+
 				if (frame.blocksize != _windowsize && frame.blocksize > 4)
 				{
 					_windowsize = frame.blocksize;
@@ -1179,11 +1157,9 @@ namespace CUETools.Codecs.FlaCuda
 					if (_windowcount == 0)
 						throw new Exception("invalid windowfunction");
 					cuda.CopyHostToDevice(cudaWindow, windowBuffer);
-					initialize_autocorTasks();
+					initialize_autocorTasks(channelCount, eparams.max_prediction_order);
 				}
 
-				bool doMidside = channels == 2 && eparams.do_midside;
-				int channelCount = doMidside ? 2 * channels : channels;
 				if (doMidside)
 					channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE, s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize);
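[review note] channel_decorrelation itself is outside this patch; judging by the call above, it fills the four MAX_BLOCKSIZE-spaced buffers with left, right, mid and side so that all four stereo modes can be estimated from a single device upload. Presumed shape (sketch):

```csharp
// Standard FLAC mid/side decorrelation (assumption: matches the
// existing channel_decorrelation, whose body is not in this diff).
static unsafe void ChannelDecorrelation(int* left, int* right, int* mid, int* side, int n)
{
    for (int i = 0; i < n; i++)
    {
        mid[i] = (left[i] + right[i]) >> 1;
        side[i] = left[i] - right[i];
    }
}
```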
@@ -1296,33 +1272,29 @@ namespace CUETools.Codecs.FlaCuda
 			cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
 			cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
 			cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
+			cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
+			cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
 			cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
 			cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
 			cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
 			cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 			cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
-			cudaCompLPCOutput = cuda.Allocate((uint)(sizeof(float) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
-			cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS));
+			cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4)));
 			cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_WINDOWS * maxResidualParts));
 			CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
 			if (cuErr == CUResult.Success)
 				cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 			if (cuErr == CUResult.Success)
-				cuErr = CUDADriver.cuMemAllocHost(ref compLPCOutputPtr, (uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
-			if (cuErr == CUResult.Success)
-				cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER));
-			if (cuErr == CUResult.Success)
-				cuErr = CUDADriver.cuMemAllocHost(ref residualOutputPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER * maxResidualParts));
+				cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 4)));
 			if (cuErr != CUResult.Success)
 			{
 				if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
 				if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
-				if (compLPCOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(compLPCOutputPtr); compLPCOutputPtr = IntPtr.Zero;
 				if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
-				if (residualOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualOutputPtr); residualOutputPtr = IntPtr.Zero;
 				throw new CUDAException(cuErr);
 			}
 			cudaStream = cuda.CreateStream();
+			cudaStream1 = cuda.CreateStream();
 			if (_IO == null)
 				_IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
 			int header_size = flake_encode_init();
@@ -1769,7 +1741,9 @@ namespace CUETools.Codecs.FlaCuda
 		public int residualOrder;
 		public int samplesOffs;
 		public int shift;
-		public int reserved;
+		public int cbits;
+		public int size;
+		public fixed int reserved[11];
 		public fixed int coefs[32];
 	};
 }
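[review note] The managed encodeResidualTaskStruct above is copied to the device bit-for-bit (and now back again), so it must stay layout-identical to its CUDA twin declared at the top of flacuda.cu below: 5 named ints + 11 reserved + 32 coefs = 48 ints = 192 bytes. A cheap guard against the two drifting apart (sketch, not in this patch):

```csharp
// Fails fast if someone adds a field on one side only.
unsafe static void CheckTaskLayout()
{
    System.Diagnostics.Debug.Assert(
        sizeof(encodeResidualTaskStruct) == 48 * sizeof(int),
        "host/device encodeResidualTaskStruct layouts out of sync");
}
```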
diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu
index 9d224df..b208276 100644
--- a/CUETools.FlaCuda/flacuda.cu
+++ b/CUETools.FlaCuda/flacuda.cu
@@ -26,6 +26,17 @@ typedef struct
     int windowOffs;
 } computeAutocorTaskStruct;
 
+typedef struct
+{
+    int residualOrder; // <= 32
+    int samplesOffs;
+    int shift;
+    int cbits;
+    int size;
+    int reserved[11];
+    int coefs[32];
+} encodeResidualTaskStruct;
+
 extern "C" __global__ void cudaComputeAutocor(
     float *output,
     const int *samples,
@@ -73,7 +84,7 @@ extern "C" __global__ void cudaComputeAutocor(
 	    shared.product[tid] += shared.product[tid + 8];
 	    shared.product[tid] += shared.product[tid + 4];
 	    shared.product[tid] += shared.product[tid + 2];
-	    if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1]; 
+	    if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1];
 	}
 	__syncthreads();
 
@@ -83,11 +94,11 @@ extern "C" __global__ void cudaComputeAutocor(
 }
 
 extern "C" __global__ void cudaComputeLPC(
-    float*output,
+    encodeResidualTaskStruct *output,
     float*autoc,
     computeAutocorTaskStruct *tasks,
     int max_order, // should be <= 32
-    int partCount // should be <= blockDim
+    int partCount // should be <= blockDim?
 )
 {
     __shared__ struct {
@@ -96,6 +107,7 @@ extern "C" __global__ void cudaComputeLPC(
 	float buf[32];
 	int bits[32];
 	float autoc[33];
+	int cbits;
     } shared;
     const int tid = threadIdx.x;
 
@@ -111,21 +123,21 @@ extern "C" __global__ void cudaComputeLPC(
 
     // add up parts
     for (int part = 0; part < partCount; part++)
-	if (tid <= max_order) 
+	if (tid <= max_order)
 	    shared.autoc[tid] += autoc[(blockIdx.y * partCount + part) * (max_order + 1) + tid];
 
     __syncthreads();
-    if (tid <= 32) 
+    if (tid < 32)
 	shared.tmp[tid] = 0.0f;
 
     float err = shared.autoc[0];
 
     for(int order = 0; order < max_order; order++)
     {
-	if (tid < 32) 
+	if (tid < 32)
 	{
-	    shared.buf[tid] = tid < order ? shared.tmp[tid] * shared.autoc[order - tid] : 0;
+	    shared.buf[tid] = (tid < order) * shared.tmp[tid] * shared.autoc[order - tid];
 	    shared.buf[tid] += shared.buf[tid + 16];
 	    shared.buf[tid] += shared.buf[tid + 8];
 	    shared.buf[tid] += shared.buf[tid + 4];
@@ -138,38 +150,135 @@ extern "C" __global__ void cudaComputeLPC(
 
 	err *= 1.0f - (r * r);
 
-	if (tid == 0)
-	    shared.tmp[order] = r; // we could also set shared.tmp[-1] to 1.0f
-	if (tid < order)
-	    shared.tmp[tid] += r * shared.tmp[order - 1 - tid];
-	if (tid <= order)
-	    output[((blockIdx.x + blockIdx.y * gridDim.x) * max_order + order) * max_order + tid] = -shared.tmp[tid];
-	//{
-	//    int precision = 13;
-	//    shared.bits[tid] = 32 - __clz(__float2int_rn(fabs(shared.tmp[tid]) * (1 << 15))) - precision;
-	//    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
-	//    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
-	//    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 4]);
-	//    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 2]);
-	//    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
-	//    int sh = max(0,min(15, 15 - shared.bits[0]));
-	//    shared.bits[tid] = max(-(1 << precision),min((1 << precision)-1,__float2int_rn(-shared.tmp[tid] * (1 << sh))));
-	//    if (tid == 0)
-	//        output[((blockIdx.x + blockIdx.y * gridDim.x) * max_order + order) * (1 + max_order) + order + 1] = sh;
-	//    output[((blockIdx.x + blockIdx.y * gridDim.x) * max_order + order) * (1 + max_order) + tid] = shared.bits[tid];
-	//}
+	shared.tmp[tid] += (tid < order) * r * shared.tmp[order - 1 - tid] + (tid == order) * r;
+
+	if (tid < 32)
+	{
+	    int precision = 13;
+	    shared.bits[tid] = __mul24((33 - __clz(__float2int_rn(fabs(shared.tmp[tid]) * (1 << 15))) - precision), tid <= order);
+	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
+	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
+	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 4]);
+	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 2]);
+	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
+	    int sh = max(0,min(15, 15 - shared.bits[0]));
+	    int coef = max(-(1 << precision),min((1 << precision)-1,__float2int_rn(-shared.tmp[tid] * (1 << sh))));
+	    if (tid <= order)
+		output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].coefs[tid] = coef;
+	    if (tid == 0)
+		output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].shift = sh;
+	    shared.bits[tid] = 33 - max(__clz(coef),__clz(-1 ^ coef));
+	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
+	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
+	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 4]);
+	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 2]);
+	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
+	    int cbits = shared.bits[0];
+	    if (tid == 0)
+		output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].cbits = cbits;
+	}
 	__syncthreads();
     }
 }
 
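[review note] Coefficient quantization now happens inside cudaComputeLPC: after each Levinson-Durbin step the warp finds the largest coefficient magnitude via __clz max-reductions, derives the shift for a 13-bit precision budget, rounds and clamps, and records both the coefficients and the resulting cbits in the task. A CPU mirror of that step (sketch; Math.Round stands in for __float2int_rn):

```csharp
// Quantize order float LPC coefficients to integers, returning the
// width (cbits) actually needed by the clamped values.
static int QuantizeCoefs(float[] lpc, int order, int[] coefs, out int shift)
{
    const int precision = 13;
    int maxBits = int.MinValue;
    for (int i = 0; i < order; i++)
        maxBits = Math.Max(maxBits, BitsNeeded((int)Math.Round(Math.Abs(lpc[i]) * (1 << 15))) - precision);
    shift = Math.Max(0, Math.Min(15, 15 - maxBits));
    int cbits = 1;
    for (int i = 0; i < order; i++)
    {
        coefs[i] = Math.Max(-(1 << precision), Math.Min((1 << precision) - 1,
            (int)Math.Round(-lpc[i] * (1 << shift))));
        cbits = Math.Max(cbits, BitsNeeded(coefs[i]));
    }
    return cbits;
}

// Signed bit width, equivalent to the kernel's 33 - max(__clz(v), __clz(~v)).
static int BitsNeeded(int v)
{
    int c = v < 0 ? ~v : v;
    int bits = 1; // sign bit
    while (c != 0) { bits++; c >>= 1; }
    return bits;
}
```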
-typedef struct
+extern "C" __global__ void cudaEstimateResidual(
+    int*output,
+    int*samples,
+    encodeResidualTaskStruct *tasks,
+    int frameSize,
+    int partSize // should be <= blockDim - max_order
+    )
 {
-    int residualOrder; // <= 32
-    int samplesOffs;
-    int shift;
-    int reserved;
-    int coefs[32];
-} encodeResidualTaskStruct;
+    __shared__ struct {
+	int data[256];
+	int residual[256];
+	int rice[32];
+	encodeResidualTaskStruct task;
+    } shared;
+    const int tid = threadIdx.x;
+    // fetch task data
+    if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
+	((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+    __syncthreads();
+    const int pos = blockIdx.x * partSize;
+    const int residualOrder = shared.task.residualOrder + 1;
+    const int residualLen = min(frameSize - pos - residualOrder, partSize);
+    const int dataLen = residualLen + residualOrder;
+
+    // fetch samples
+    shared.data[tid] = (tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0);
+
+    // reverse coefs
+    if (tid < residualOrder) shared.task.coefs[tid] = shared.task.coefs[residualOrder - 1 - tid];
+
+    // compute residual
+    __syncthreads();
+    long sum = 0;
+    for (int c = 0; c < residualOrder; c++)
+	sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
+    int res = shared.data[tid + residualOrder] - (sum >> shared.task.shift);
+    shared.residual[tid] = __mul24(tid < residualLen, (2 * res) ^ (res >> 31));
+
+    __syncthreads();
+    // residual sum: reduction in shared mem
+    if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads();
+    if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads();
+    if (tid < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
+    shared.residual[tid] += shared.residual[tid + 16];
+    shared.residual[tid] += shared.residual[tid + 8];
+    shared.residual[tid] += shared.residual[tid + 4];
+    shared.residual[tid] += shared.residual[tid + 2];
+    shared.residual[tid] += shared.residual[tid + 1];
+    __syncthreads();
+
+    if (tid < 32)
+    {
+	// rice parameter search
+	shared.rice[tid] = __mul24(tid >= 15, 0x7fffff) + residualLen * (tid + 1) + ((shared.residual[0] - (residualLen >> 1)) >> tid);
+	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]);
+	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]);
+	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
+	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]);
+    }
+    if (tid == 0)
+	output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
+}
+
+extern "C" __global__ void cudaSumResidual(
+    encodeResidualTaskStruct *tasks,
+    int *residual,
+    int partCount // <= blockDim.y (64)
+    )
+{
+    __shared__ struct {
+	int partLen[64];
+	//encodeResidualTaskStruct task;
+    } shared;
+
+    const int tid = threadIdx.x;
+    // fetch task data
+    // if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
+    //((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+    // __syncthreads();
+
+    shared.partLen[tid] = (tid < partCount) ? residual[tid + partCount * blockIdx.y] : 0;
+
+    __syncthreads();
+    // length sum: reduction in shared mem
+    if (tid < 32) shared.partLen[tid] += shared.partLen[tid + 32]; __syncthreads();
+    shared.partLen[tid] += shared.partLen[tid + 16];
+    shared.partLen[tid] += shared.partLen[tid + 8];
+    shared.partLen[tid] += shared.partLen[tid + 4];
+    shared.partLen[tid] += shared.partLen[tid + 2];
+    shared.partLen[tid] += shared.partLen[tid + 1];
+    __syncthreads();
+
+    // FIXME: should process partition order here!!!
+
+    // return sum
+    if (tid == 0)
+	tasks[blockIdx.y].size = shared.partLen[0];
+}
 
 extern "C" __global__ void cudaEncodeResidual(
     int*output,
@@ -233,5 +342,4 @@
     if (tid == 0)
 	output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
 }
-
 #endif
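[review note] Nothing in these two kernels encodes actual residual bits; they only price them. cudaEstimateResidual folds each residual to an unsigned value with (2*res) ^ (res >> 31), reduces the partition to a single sum, and then prices every Rice parameter k from that sum alone: the exact length sum((u_i >> k) + k + 1) is approximated by n*(k+1) + ((sum(u) - n/2) >> k). Host-side sketch of the same estimate:

```csharp
// Cheapest Rice parameter cost for n folded residuals with sum foldedSum,
// mirroring the warp-wide min-reduction in cudaEstimateResidual.
static int EstimateRiceBits(long foldedSum, int n)
{
    int best = int.MaxValue;
    for (int k = 0; k < 15; k++) // k >= 15 is priced out in the kernel
        best = Math.Min(best, n * (k + 1) + (int)((foldedSum - (n >> 1)) >> k));
    return best;
}
```

The FIXME in cudaSumResidual marks where a real partition-order search would go; for now the per-partition estimates are simply added up.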