diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs
index 73afe81..2c592c4 100644
--- a/CUETools.FlaCuda/FlaCudaWriter.cs
+++ b/CUETools.FlaCuda/FlaCudaWriter.cs
@@ -94,16 +94,18 @@ namespace CUETools.Codecs.FlaCuda
 		CUDA cuda;
 		CUfunction cudaComputeAutocor;
+		CUfunction cudaComputeLPC;
 		CUfunction cudaEncodeResidual;
 		CUdeviceptr cudaSamples;
 		CUdeviceptr cudaWindow;
 		CUdeviceptr cudaAutocorTasks;
 		CUdeviceptr cudaAutocorOutput;
+		CUdeviceptr cudaCompLPCOutput;
 		CUdeviceptr cudaResidualTasks;
 		CUdeviceptr cudaResidualOutput;
 		IntPtr samplesBufferPtr = IntPtr.Zero;
 		IntPtr autocorTasksPtr = IntPtr.Zero;
-		IntPtr autocorOutputPtr = IntPtr.Zero;
+		IntPtr compLPCOutputPtr = IntPtr.Zero;
 		IntPtr residualTasksPtr = IntPtr.Zero;
 		IntPtr residualOutputPtr = IntPtr.Zero;
 		CUstream cudaStream;
@@ -211,9 +213,10 @@ namespace CUETools.Codecs.FlaCuda
 				cuda.Free(cudaSamples);
 				cuda.Free(cudaAutocorTasks);
 				cuda.Free(cudaAutocorOutput);
+				cuda.Free(cudaCompLPCOutput);
 				cuda.Free(cudaResidualTasks);
 				cuda.Free(cudaResidualOutput);
-				CUDADriver.cuMemFreeHost(autocorOutputPtr);
+				CUDADriver.cuMemFreeHost(compLPCOutputPtr);
 				CUDADriver.cuMemFreeHost(residualOutputPtr);
 				CUDADriver.cuMemFreeHost(samplesBufferPtr);
 				CUDADriver.cuMemFreeHost(residualTasksPtr);
@@ -244,9 +247,10 @@ namespace CUETools.Codecs.FlaCuda
 				cuda.Free(cudaSamples);
 				cuda.Free(cudaAutocorTasks);
 				cuda.Free(cudaAutocorOutput);
+				cuda.Free(cudaCompLPCOutput);
 				cuda.Free(cudaResidualTasks);
 				cuda.Free(cudaResidualOutput);
-				CUDADriver.cuMemFreeHost(autocorOutputPtr);
+				CUDADriver.cuMemFreeHost(compLPCOutputPtr);
 				CUDADriver.cuMemFreeHost(residualOutputPtr);
 				CUDADriver.cuMemFreeHost(samplesBufferPtr);
 				CUDADriver.cuMemFreeHost(residualTasksPtr);
@@ -1031,22 +1035,22 @@ namespace CUETools.Codecs.FlaCuda
 			for (int ch = 0; ch < channelsCount; ch++)
 				for (int iWindow = 0; iWindow < _windowcount; iWindow++)
 				{
-					double* ac = stackalloc double[lpc.MAX_LPC_ORDER + 1];
-					for (int order = 0; order <= max_order; order++)
-					{
-						ac[order] = 0;
-						for (int i_block = 0; i_block < autocorPartCount; i_block++)
-							ac[order] += ((float*)autocorOutputPtr)[order + (max_order + 1) * (i_block + autocorPartCount * (iWindow + _windowcount * ch))];
-					}
-					frame.subframes[ch].lpc_ctx[iWindow].ComputeReflection(max_order, ac);
-					float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER];
-					frame.subframes[ch].lpc_ctx[iWindow].ComputeLPC(lpcs);
+					//int* lpcs = ((int*)compLPCOutputPtr) + (max_order + 1) * max_order * (iWindow + _windowcount * ch);
+					//for (int order = 1; order <= max_order; order++)
+					//{
+					//    residualTasks[nResidualTasks].residualOrder = order - 1;
+					//    residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
+					//    residualTasks[nResidualTasks].shift = lpcs[order + (order - 1) * (max_order + 1)];
+					//    AudioSamples.MemCpy(residualTasks[nResidualTasks].coefs, lpcs + (order - 1) * (max_order + 1), order);
+					//    nResidualTasks++;
+					//}
+					float* lpcs = ((float*)compLPCOutputPtr) + max_order * max_order * (iWindow + _windowcount * ch);
 					for (int order = 1; order <= max_order; order++)
 					{
 						residualTasks[nResidualTasks].residualOrder = order - 1;
 						residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
-						lpc.quantize_lpc_coefs(lpcs + (order - 1) * lpc.MAX_LPC_ORDER,
+						lpc.quantize_lpc_coefs(lpcs + (order - 1) * max_order,
 							order, cbits, residualTasks[nResidualTasks].coefs, out residualTasks[nResidualTasks].shift, 15, 0);
@@ -1119,10 +1123,10 @@ namespace CUETools.Codecs.FlaCuda
 		unsafe void compute_autocorellation(FlacFrame frame, int channelsCount, int max_order, out int partCount)
 		{
 			int autocorThreads = 256;
-			int partSize = autocorThreads - max_order;
+			int partSize = 2 * autocorThreads - max_order;
 			int nAutocorTasks = _windowcount * channelsCount;
-			partCount = (frame.blocksize + partSize - 1) / partSize;
+			partCount = (frame.blocksize + partSize - 1) / partSize;
 			if (partCount > maxAutocorParts)
 				throw new Exception("internal error");
@@ -1139,10 +1143,19 @@ namespace CUETools.Codecs.FlaCuda
 			cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 4) + sizeof(uint) * 3);
 			cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1);
 
+			cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaCompLPCOutput.Pointer);
+			cuda.SetParameter(cudaComputeLPC, IntPtr.Size, (uint)cudaAutocorOutput.Pointer);
+			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 2, (uint)cudaAutocorTasks.Pointer);
+			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3, (uint)max_order);
+			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3 + sizeof(uint), (uint)partCount);
+			cuda.SetParameterSize(cudaComputeLPC, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2);
+			cuda.SetFunctionBlockShape(cudaComputeLPC, 32, 1, 1);
+
 			// issue work to the GPU
 			cuda.CopyHostToDeviceAsync(cudaSamples, samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), cudaStream);
 			cuda.LaunchAsync(cudaComputeAutocor, partCount, nAutocorTasks, cudaStream);
-			cuda.CopyDeviceToHostAsync(cudaAutocorOutput, autocorOutputPtr, (uint)(sizeof(float) * partCount * (max_order + 1) * nAutocorTasks), cudaStream);
+			cuda.LaunchAsync(cudaComputeLPC, 1, nAutocorTasks, cudaStream);
+			cuda.CopyDeviceToHostAsync(cudaCompLPCOutput, compLPCOutputPtr, (uint)(sizeof(float) * (max_order + 1) * max_order * nAutocorTasks), cudaStream);
 			cuda.SynchronizeStream(cudaStream);
 		}
@@ -1282,18 +1295,20 @@ namespace CUETools.Codecs.FlaCuda
 			cuda.CreateContext(0, CUCtxFlags.SchedSpin);
 			cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
 			cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
+			cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
 			cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
 			cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
 			cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
 			cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 			cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
+			cudaCompLPCOutput = cuda.Allocate((uint)(sizeof(float) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
 			cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS));
 			cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_WINDOWS * maxResidualParts));
 			CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
 			if (cuErr == CUResult.Success)
 				cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 			if (cuErr == CUResult.Success)
-				cuErr = CUDADriver.cuMemAllocHost(ref autocorOutputPtr, (uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * maxAutocorParts));
+				cuErr = CUDADriver.cuMemAllocHost(ref compLPCOutputPtr, (uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 			if (cuErr == CUResult.Success)
 				cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER));
 			if (cuErr == CUResult.Success)
@@ -1302,7 +1317,7 @@ namespace CUETools.Codecs.FlaCuda
 			{
 				if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
 				if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
-				if (autocorOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorOutputPtr); autocorOutputPtr = IntPtr.Zero;
+				if (compLPCOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(compLPCOutputPtr); compLPCOutputPtr = IntPtr.Zero;
 				if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
 				if (residualOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualOutputPtr); residualOutputPtr = IntPtr.Zero;
 				throw new CUDAException(cuErr);
diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu
index 22c3134..9d224df 100644
--- a/CUETools.FlaCuda/flacuda.cu
+++ b/CUETools.FlaCuda/flacuda.cu
@@ -33,17 +33,17 @@ extern "C" __global__ void cudaComputeAutocor(
     computeAutocorTaskStruct *tasks,
     int max_order, // should be <= 32
     int frameSize,
-    int partSize // should be <= blockDim - max_order
+    int partSize // should be <= 2*blockDim - max_order
 )
 {
     __shared__ struct {
-	float data[256];
+	float data[512];
 	float product[256];
-	float product2[256];
 	float sum[33];
 	computeAutocorTaskStruct task;
     } shared;
     const int tid = threadIdx.x;
+    const int tid2 = threadIdx.x + 256;
     // fetch task data
     if (tid < sizeof(shared.task) / sizeof(int))
 	((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
@@ -55,11 +55,13 @@ extern "C" __global__ void cudaComputeAutocor(
 
     // fetch samples
     shared.data[tid] = tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] * window[shared.task.windowOffs + pos + tid]: 0.0f;
+    shared.data[tid2] = tid2 < dataLen ? samples[shared.task.samplesOffs + pos + tid2] * window[shared.task.windowOffs + pos + tid2]: 0.0f;
     __syncthreads();
 
     for (int lag = 0; lag <= max_order; lag++)
     {
-	shared.product[tid] = tid < productLen ? shared.data[tid] * shared.data[tid + lag] : 0.0f;
+	shared.product[tid] = (tid < productLen) * shared.data[tid] * shared.data[tid + lag] +
+	    + (tid2 < productLen) * shared.data[tid2] * shared.data[tid2 + lag];
 	__syncthreads();
 
 	// product sum: reduction in shared mem
@@ -72,6 +74,7 @@ extern "C" __global__ void cudaComputeAutocor(
 	shared.product[tid] += shared.product[tid + 4];
 	shared.product[tid] += shared.product[tid + 2];
 	if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1];
+	__syncthreads();
     }
 
     // return results
@@ -79,6 +82,86 @@ extern "C" __global__ void cudaComputeAutocor(
 	output[(blockIdx.x + blockIdx.y * gridDim.x) * (max_order + 1) + tid] = shared.sum[tid];
 }
 
+extern "C" __global__ void cudaComputeLPC(
+    float*output,
+    float*autoc,
+    computeAutocorTaskStruct *tasks,
+    int max_order, // should be <= 32
+    int partCount // should be <= blockDim
+)
+{
+    __shared__ struct {
+	computeAutocorTaskStruct task;
+	float tmp[32];
+	float buf[32];
+	int bits[32];
+	float autoc[33];
+    } shared;
+    const int tid = threadIdx.x;
+
+    // fetch task data
+    if (tid < sizeof(shared.task) / sizeof(int))
+	((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+
+    // initialize autoc sums
+    if (tid <= max_order)
+	shared.autoc[tid] = 0.0f;
+
+    __syncthreads();
+
+    // add up parts
+    for (int part = 0; part < partCount; part++)
+	if (tid <= max_order)
+	    shared.autoc[tid] += autoc[(blockIdx.y * partCount + part) * (max_order + 1) + tid];
+
+    __syncthreads();
+
+    if (tid <= 32)
+	shared.tmp[tid] = 0.0f;
+
+    float err = shared.autoc[0];
+
+    for(int order = 0; order < max_order; order++)
+    {
+	if (tid < 32)
+	{
+	    shared.buf[tid] = tid < order ? shared.tmp[tid] * shared.autoc[order - tid] : 0;
+	    shared.buf[tid] += shared.buf[tid + 16];
+	    shared.buf[tid] += shared.buf[tid + 8];
+	    shared.buf[tid] += shared.buf[tid + 4];
+	    shared.buf[tid] += shared.buf[tid + 2];
+	    shared.buf[tid] += shared.buf[tid + 1];
+	}
+	__syncthreads();
+
+	float r = (- shared.autoc[order+1] - shared.buf[0]) / err;
+
+	err *= 1.0f - (r * r);
+
+	if (tid == 0)
+	    shared.tmp[order] = r; // we could also set shared.tmp[-1] to 1.0f
+	if (tid < order)
+	    shared.tmp[tid] += r * shared.tmp[order - 1 - tid];
+	if (tid <= order)
+	    output[((blockIdx.x + blockIdx.y * gridDim.x) * max_order + order) * max_order + tid] = -shared.tmp[tid];
+	//{
+	//    int precision = 13;
+	//    shared.bits[tid] = 32 - __clz(__float2int_rn(fabs(shared.tmp[tid]) * (1 << 15))) - precision;
+	//    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
+	//    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
+	//    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 4]);
+	//    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 2]);
+	//    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
+	//    int sh = max(0,min(15, 15 - shared.bits[0]));
+	//    shared.bits[tid] = max(-(1 << precision),min((1 << precision)-1,__float2int_rn(-shared.tmp[tid] * (1 << sh))));
+	//    if (tid == 0)
+	//	output[((blockIdx.x + blockIdx.y * gridDim.x) * max_order + order) * (1 + max_order) + order + 1] = sh;
+	//    output[((blockIdx.x + blockIdx.y * gridDim.x) * max_order + order) * (1 + max_order) + tid] = shared.bits[tid];
+	//}
+	__syncthreads();
+    }
+}
+
 typedef struct
 {
     int residualOrder; // <= 32
@@ -108,19 +191,22 @@ extern "C" __global__ void cudaEncodeResidual(
 	((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
     __syncthreads();
     const int pos = blockIdx.x * partSize;
-    const int residualOrder = shared.task.residualOrder;
-    const int residualLen = min(frameSize - pos - residualOrder - 1, partSize);
-    const int dataLen = residualLen + residualOrder + 1;
+    const int residualOrder = shared.task.residualOrder + 1;
+    const int residualLen = min(frameSize - pos - residualOrder, partSize);
+    const int dataLen = residualLen + residualOrder;
 
     // fetch samples
     shared.data[tid] = (tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0);
+
+    // reverse coefs
+    if (tid < residualOrder) shared.task.coefs[tid] = shared.task.coefs[residualOrder - 1 - tid];
 
     // compute residual
    __syncthreads();
     long sum = 0;
-    for (int c = 0; c <= residualOrder; c++)
-	sum += __mul24(shared.data[tid + c], shared.task.coefs[residualOrder - c]);
-    int res = shared.data[tid + residualOrder + 1] - (sum >> shared.task.shift);
+    for (int c = 0; c < residualOrder; c++)
+	sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
+    int res = shared.data[tid + residualOrder] - (sum >> shared.task.shift);
     shared.residual[tid] = __mul24(tid < residualLen, (2 * res) ^ (res >> 31));
     __syncthreads();
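
The new cudaComputeLPC kernel folds the per-part autocorrelation sums for each (window, channel) task and then runs a Levinson-Durbin recursion across one warp, writing -tmp[i] for every intermediate order into the lpcs layout that the quantize_lpc_coefs loop in FlaCudaWriter.cs reads at (order - 1) * max_order. A minimal scalar sketch of the same recursion follows, purely as a reference point; the helper name and the use of host-side double precision are illustrative, not part of this patch.

    // Scalar reference for the recursion cudaComputeLPC parallelizes (hypothetical helper).
    // lpcs[(order - 1) * maxOrder + i] matches the kernel's output layout for 1-based orders.
    static void ComputeLpcReference(double[] autoc, int maxOrder, float[] lpcs)
    {
        double err = autoc[0];                  // zero-order prediction error
        double[] tmp = new double[maxOrder];    // running predictor coefficients
        for (int order = 0; order < maxOrder; order++)
        {
            // reflection coefficient; same expression as "r" in the kernel
            double r = -autoc[order + 1];
            for (int i = 0; i < order; i++)
                r -= tmp[i] * autoc[order - i];
            r /= err;
            err *= 1.0 - r * r;

            // extend the predictor by one order; the kernel performs this update
            // warp-synchronously via shared.tmp[tid] += r * shared.tmp[order - 1 - tid]
            tmp[order] = r;
            for (int i = 0; i < order / 2; i++)
            {
                double t = tmp[i];
                tmp[i] += r * tmp[order - 1 - i];
                tmp[order - 1 - i] += r * t;
            }
            if ((order & 1) != 0)
                tmp[order / 2] += r * tmp[order / 2];

            // emit the (order + 1)-tap predictor, negated as in the kernel
            for (int i = 0; i <= order; i++)
                lpcs[order * maxOrder + i] = (float)(-tmp[i]);
        }
    }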
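In cudaEncodeResidual, residualOrder now counts the coefficients themselves (task.residualOrder + 1), and the coefficients are reversed once in shared memory so the inner loop walks them in the same direction as the samples. The prediction convention the kernel ends up computing, written as a scalar sketch with hypothetical names and a plain 64-bit multiply in place of __mul24:

    // One residual sample under the new convention: coefs[] already reversed,
    // so coefs[0] multiplies the oldest of the `order` preceding samples.
    static int ResidualSample(int[] data, int pos, int[] coefs, int order, int shift)
    {
        long sum = 0;
        for (int c = 0; c < order; c++)
            sum += (long)data[pos + c] * coefs[c];      // predict from the `order` samples before pos + order
        return data[pos + order] - (int)(sum >> shift); // actual minus predicted
    }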