Mirror of https://github.com/claunia/cuetools.net.git (synced 2025-12-16 18:14:25 +00:00)

Commit: optimizations
@@ -94,16 +94,18 @@ namespace CUETools.Codecs.FlaCuda
 
 		CUDA cuda;
 		CUfunction cudaComputeAutocor;
+		CUfunction cudaComputeLPC;
 		CUfunction cudaEncodeResidual;
 		CUdeviceptr cudaSamples;
 		CUdeviceptr cudaWindow;
 		CUdeviceptr cudaAutocorTasks;
 		CUdeviceptr cudaAutocorOutput;
+		CUdeviceptr cudaCompLPCOutput;
 		CUdeviceptr cudaResidualTasks;
 		CUdeviceptr cudaResidualOutput;
 		IntPtr samplesBufferPtr = IntPtr.Zero;
 		IntPtr autocorTasksPtr = IntPtr.Zero;
-		IntPtr autocorOutputPtr = IntPtr.Zero;
+		IntPtr compLPCOutputPtr = IntPtr.Zero;
 		IntPtr residualTasksPtr = IntPtr.Zero;
 		IntPtr residualOutputPtr = IntPtr.Zero;
 		CUstream cudaStream;
@@ -211,9 +213,10 @@ namespace CUETools.Codecs.FlaCuda
 			cuda.Free(cudaSamples);
 			cuda.Free(cudaAutocorTasks);
 			cuda.Free(cudaAutocorOutput);
+			cuda.Free(cudaCompLPCOutput);
 			cuda.Free(cudaResidualTasks);
 			cuda.Free(cudaResidualOutput);
-			CUDADriver.cuMemFreeHost(autocorOutputPtr);
+			CUDADriver.cuMemFreeHost(compLPCOutputPtr);
 			CUDADriver.cuMemFreeHost(residualOutputPtr);
 			CUDADriver.cuMemFreeHost(samplesBufferPtr);
 			CUDADriver.cuMemFreeHost(residualTasksPtr);
@@ -244,9 +247,10 @@ namespace CUETools.Codecs.FlaCuda
 			cuda.Free(cudaSamples);
 			cuda.Free(cudaAutocorTasks);
 			cuda.Free(cudaAutocorOutput);
+			cuda.Free(cudaCompLPCOutput);
 			cuda.Free(cudaResidualTasks);
 			cuda.Free(cudaResidualOutput);
-			CUDADriver.cuMemFreeHost(autocorOutputPtr);
+			CUDADriver.cuMemFreeHost(compLPCOutputPtr);
 			CUDADriver.cuMemFreeHost(residualOutputPtr);
 			CUDADriver.cuMemFreeHost(samplesBufferPtr);
 			CUDADriver.cuMemFreeHost(residualTasksPtr);
@@ -1031,22 +1035,22 @@ namespace CUETools.Codecs.FlaCuda
 			for (int ch = 0; ch < channelsCount; ch++)
 				for (int iWindow = 0; iWindow < _windowcount; iWindow++)
 				{
-					double* ac = stackalloc double[lpc.MAX_LPC_ORDER + 1];
-					for (int order = 0; order <= max_order; order++)
-					{
-						ac[order] = 0;
-						for (int i_block = 0; i_block < autocorPartCount; i_block++)
-							ac[order] += ((float*)autocorOutputPtr)[order + (max_order + 1) * (i_block + autocorPartCount * (iWindow + _windowcount * ch))];
-					}
-					frame.subframes[ch].lpc_ctx[iWindow].ComputeReflection(max_order, ac);
-					float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER];
-					frame.subframes[ch].lpc_ctx[iWindow].ComputeLPC(lpcs);
+					//int* lpcs = ((int*)compLPCOutputPtr) + (max_order + 1) * max_order * (iWindow + _windowcount * ch);
+					//for (int order = 1; order <= max_order; order++)
+					//{
+					//	residualTasks[nResidualTasks].residualOrder = order - 1;
+					//	residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
+					//	residualTasks[nResidualTasks].shift = lpcs[order + (order - 1) * (max_order + 1)];
+					//	AudioSamples.MemCpy(residualTasks[nResidualTasks].coefs, lpcs + (order - 1) * (max_order + 1), order);
+					//	nResidualTasks++;
+					//}
+					float* lpcs = ((float*)compLPCOutputPtr) + max_order * max_order * (iWindow + _windowcount * ch);
 					for (int order = 1; order <= max_order; order++)
 					{
 						residualTasks[nResidualTasks].residualOrder = order - 1;
 						residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
 
-						lpc.quantize_lpc_coefs(lpcs + (order - 1) * lpc.MAX_LPC_ORDER,
+						lpc.quantize_lpc_coefs(lpcs + (order - 1) * max_order,
 							order, cbits, residualTasks[nResidualTasks].coefs,
 							out residualTasks[nResidualTasks].shift, 15, 0);
 
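The CPU-side ComputeReflection/ComputeLPC pass is gone: the host now reads the float LPC coefficients directly out of the page-locked compLPCOutputPtr buffer that the new GPU kernel fills. A minimal sketch of the buffer layout as the code above indexes it (helper name is illustrative, not from the commit):

    /* Sketch of the cudaComputeLPC output layout. Each (channel, window)
     * task owns max_order rows of max_order floats; row (order - 1) holds
     * the coefficients of the order-`order` predictor. */
    float *coefs_for(float *compLPCOutput, int max_order, int windowCount,
                     int ch, int iWindow, int order /* 1-based */)
    {
        float *lpcs = compLPCOutput
            + max_order * max_order * (iWindow + windowCount * ch); /* task base */
        return lpcs + (order - 1) * max_order;                      /* row for this order */
    }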
@@ -1119,7 +1123,7 @@ namespace CUETools.Codecs.FlaCuda
 		unsafe void compute_autocorellation(FlacFrame frame, int channelsCount, int max_order, out int partCount)
 		{
 			int autocorThreads = 256;
-			int partSize = autocorThreads - max_order;
+			int partSize = 2 * autocorThreads - max_order;
 			int nAutocorTasks = _windowcount * channelsCount;
 
 			partCount = (frame.blocksize + partSize - 1) / partSize;
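Each 256-thread autocorrelation block now processes a 512-sample tile (two samples per thread), of which the last max_order positions only serve as overlap for the lagged products, so the usable part per block roughly doubles and the grid shrinks accordingly. A worked sketch of the arithmetic with illustrative values:

    /* Part sizing for the autocorrelation grid; numbers are illustrative. */
    int autocorThreads = 256;
    int max_order = 32;
    int partSize = 2 * autocorThreads - max_order;          /* 480; was 256 - 32 = 224 */
    int blocksize = 4608;                                   /* a typical FLAC frame */
    int partCount = (blocksize + partSize - 1) / partSize;  /* ceil: 10 blocks, was 21 */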
@@ -1139,10 +1143,19 @@ namespace CUETools.Codecs.FlaCuda
 			cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 4) + sizeof(uint) * 3);
 			cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1);
 
+			cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaCompLPCOutput.Pointer);
+			cuda.SetParameter(cudaComputeLPC, IntPtr.Size, (uint)cudaAutocorOutput.Pointer);
+			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 2, (uint)cudaAutocorTasks.Pointer);
+			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3, (uint)max_order);
+			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3 + sizeof(uint), (uint)partCount);
+			cuda.SetParameterSize(cudaComputeLPC, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2);
+			cuda.SetFunctionBlockShape(cudaComputeLPC, 32, 1, 1);
+
 			// issue work to the GPU
 			cuda.CopyHostToDeviceAsync(cudaSamples, samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), cudaStream);
 			cuda.LaunchAsync(cudaComputeAutocor, partCount, nAutocorTasks, cudaStream);
-			cuda.CopyDeviceToHostAsync(cudaAutocorOutput, autocorOutputPtr, (uint)(sizeof(float) * partCount * (max_order + 1) * nAutocorTasks), cudaStream);
+			cuda.LaunchAsync(cudaComputeLPC, 1, nAutocorTasks, cudaStream);
+			cuda.CopyDeviceToHostAsync(cudaCompLPCOutput, compLPCOutputPtr, (uint)(sizeof(float) * (max_order + 1) * max_order * nAutocorTasks), cudaStream);
 			cuda.SynchronizeStream(cudaStream);
 		}
 
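The SetParameter/SetParameterSize calls pack the kernel's argument block by hand: each value is written at a byte offset, three 32-bit device pointers first, then the two uint scalars. A rough driver-API equivalent of the cudaComputeLPC setup (a sketch, assuming the legacy cuParamSet* interface that this wrapper exposes, and a 32-bit process as the (uint) pointer casts imply):

    /* Approximate driver-API equivalent of the cudaComputeLPC launch setup. */
    unsigned offs = 0;
    cuParamSeti(cudaComputeLPC, offs, (unsigned)cudaCompLPCOutput); offs += 4;
    cuParamSeti(cudaComputeLPC, offs, (unsigned)cudaAutocorOutput); offs += 4;
    cuParamSeti(cudaComputeLPC, offs, (unsigned)cudaAutocorTasks);  offs += 4;
    cuParamSeti(cudaComputeLPC, offs, max_order);                   offs += 4;
    cuParamSeti(cudaComputeLPC, offs, partCount);                   offs += 4;
    cuParamSetSize(cudaComputeLPC, offs);
    cuFuncSetBlockShape(cudaComputeLPC, 32, 1, 1);    /* one thread per coefficient */
    cuLaunchGridAsync(cudaComputeLPC, 1, nAutocorTasks, cudaStream); /* one block per task */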
@@ -1282,18 +1295,20 @@ namespace CUETools.Codecs.FlaCuda
 			cuda.CreateContext(0, CUCtxFlags.SchedSpin);
 			cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
 			cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
+			cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
 			cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
 			cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
 			cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
 			cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 			cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
+			cudaCompLPCOutput = cuda.Allocate((uint)(sizeof(float) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
 			cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS));
 			cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_WINDOWS * maxResidualParts));
 			CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
 			if (cuErr == CUResult.Success)
 				cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 			if (cuErr == CUResult.Success)
-				cuErr = CUDADriver.cuMemAllocHost(ref autocorOutputPtr, (uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * maxAutocorParts));
+				cuErr = CUDADriver.cuMemAllocHost(ref compLPCOutputPtr, (uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 			if (cuErr == CUResult.Success)
 				cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER));
 			if (cuErr == CUResult.Success)
@@ -1302,7 +1317,7 @@ namespace CUETools.Codecs.FlaCuda
 			{
 				if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
 				if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
-				if (autocorOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorOutputPtr); autocorOutputPtr = IntPtr.Zero;
+				if (compLPCOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(compLPCOutputPtr); compLPCOutputPtr = IntPtr.Zero;
 				if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
 				if (residualOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualOutputPtr); residualOutputPtr = IntPtr.Zero;
 				throw new CUDAException(cuErr);

@@ -33,17 +33,17 @@ extern "C" __global__ void cudaComputeAutocor(
 	computeAutocorTaskStruct *tasks,
 	int max_order, // should be <= 32
 	int frameSize,
-	int partSize // should be <= blockDim - max_order
+	int partSize // should be <= 2*blockDim - max_order
 )
 {
 	__shared__ struct {
-		float data[256];
+		float data[512];
 		float product[256];
-		float product2[256];
 		float sum[33];
 		computeAutocorTaskStruct task;
 	} shared;
 	const int tid = threadIdx.x;
+	const int tid2 = threadIdx.x + 256;
 	// fetch task data
 	if (tid < sizeof(shared.task) / sizeof(int))
 		((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
@@ -55,11 +55,13 @@ extern "C" __global__ void cudaComputeAutocor(
 
 	// fetch samples
 	shared.data[tid] = tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] * window[shared.task.windowOffs + pos + tid]: 0.0f;
+	shared.data[tid2] = tid2 < dataLen ? samples[shared.task.samplesOffs + pos + tid2] * window[shared.task.windowOffs + pos + tid2]: 0.0f;
 	__syncthreads();
 
 	for (int lag = 0; lag <= max_order; lag++)
 	{
-		shared.product[tid] = tid < productLen ? shared.data[tid] * shared.data[tid + lag] : 0.0f;
+		shared.product[tid] = (tid < productLen) * shared.data[tid] * shared.data[tid + lag] +
+			+ (tid2 < productLen) * shared.data[tid2] * shared.data[tid2 + lag];
 		__syncthreads();
 
 		// product sum: reduction in shared mem
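Each thread now loads and multiplies two sample slots (tid and tid2 = tid + 256), which is how one 256-thread block covers the 512-sample tile; the range check also moved from a ternary into a branchless predicate multiply, where the comparison evaluates to 0 or 1 and zeroes out-of-range terms without divergent branches. A small sketch of the idiom (names illustrative):

    /* Branchless range mask as used in the updated product line (sketch).
     * (cond) is 0 or 1, so out-of-range terms contribute nothing. data[]
     * must still be safely readable past productLen — here the tile is
     * zero-padded. */
    __device__ float lag_product(const float *data, int tid, int productLen, int lag)
    {
        int tid2 = tid + 256;                      /* second slot owned by this thread */
        return (tid  < productLen) * data[tid]  * data[tid  + lag]
             + (tid2 < productLen) * data[tid2] * data[tid2 + lag];
    }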
@@ -72,6 +74,7 @@ extern "C" __global__ void cudaComputeAutocor(
 		shared.product[tid] += shared.product[tid + 4];
 		shared.product[tid] += shared.product[tid + 2];
 		if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1];
+		__syncthreads();
 	}
 
 	// return results
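The added barrier closes a race: without it, a fast thread could start writing shared.product for the next lag iteration while thread 0 is still summing product[0] + product[1] for the current one. A clean sketch of the general pattern (not the kernel itself, which unrolls the last reduction steps):

    /* Per-lag shared-memory reduction with the trailing barrier (sketch).
     * Assumes 256 threads per block and data[] readable up to tid + max_order. */
    __device__ void autocor_sums(float *product, float *sum, const float *data,
                                 int max_order, int tid)
    {
        for (int lag = 0; lag <= max_order; lag++)
        {
            product[tid] = data[tid] * data[tid + lag];   /* per-thread term */
            __syncthreads();                  /* all writes visible before reducing */
            for (int s = 128; s >= 1; s >>= 1)            /* tree reduction */
            {
                if (tid < s) product[tid] += product[tid + s];
                __syncthreads();
            }
            if (tid == 0) sum[lag] = product[0];
            __syncthreads();   /* the added barrier: nobody may overwrite
                                  product[] for the next lag while it is read */
        }
    }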
@@ -79,6 +82,86 @@ extern "C" __global__ void cudaComputeAutocor(
 		output[(blockIdx.x + blockIdx.y * gridDim.x) * (max_order + 1) + tid] = shared.sum[tid];
 }
 
+extern "C" __global__ void cudaComputeLPC(
+	float*output,
+	float*autoc,
+	computeAutocorTaskStruct *tasks,
+	int max_order, // should be <= 32
+	int partCount // should be <= blockDim
+)
+{
+	__shared__ struct {
+		computeAutocorTaskStruct task;
+		float tmp[32];
+		float buf[32];
+		int bits[32];
+		float autoc[33];
+	} shared;
+	const int tid = threadIdx.x;
+
+	// fetch task data
+	if (tid < sizeof(shared.task) / sizeof(int))
+		((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+
+	// initialize autoc sums
+	if (tid <= max_order)
+		shared.autoc[tid] = 0.0f;
+
+	__syncthreads();
+
+	// add up parts
+	for (int part = 0; part < partCount; part++)
+		if (tid <= max_order)
+			shared.autoc[tid] += autoc[(blockIdx.y * partCount + part) * (max_order + 1) + tid];
+
+	__syncthreads();
+
+	if (tid <= 32)
+		shared.tmp[tid] = 0.0f;
+
+	float err = shared.autoc[0];
+
+	for(int order = 0; order < max_order; order++)
+	{
+		if (tid < 32)
+		{
+			shared.buf[tid] = tid < order ? shared.tmp[tid] * shared.autoc[order - tid] : 0;
+			shared.buf[tid] += shared.buf[tid + 16];
+			shared.buf[tid] += shared.buf[tid + 8];
+			shared.buf[tid] += shared.buf[tid + 4];
+			shared.buf[tid] += shared.buf[tid + 2];
+			shared.buf[tid] += shared.buf[tid + 1];
+		}
+		__syncthreads();
+
+		float r = (- shared.autoc[order+1] - shared.buf[0]) / err;
+
+		err *= 1.0f - (r * r);
+
+		if (tid == 0)
+			shared.tmp[order] = r; // we could also set shared.tmp[-1] to 1.0f
+		if (tid < order)
+			shared.tmp[tid] += r * shared.tmp[order - 1 - tid];
+		if (tid <= order)
+			output[((blockIdx.x + blockIdx.y * gridDim.x) * max_order + order) * max_order + tid] = -shared.tmp[tid];
+		//{
+		//	int precision = 13;
+		//	shared.bits[tid] = 32 - __clz(__float2int_rn(fabs(shared.tmp[tid]) * (1 << 15))) - precision;
+		//	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
+		//	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
+		//	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 4]);
+		//	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 2]);
+		//	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
+		//	int sh = max(0,min(15, 15 - shared.bits[0]));
+		//	shared.bits[tid] = max(-(1 << precision),min((1 << precision)-1,__float2int_rn(-shared.tmp[tid] * (1 << sh))));
+		//	if (tid == 0)
+		//		output[((blockIdx.x + blockIdx.y * gridDim.x) * max_order + order) * (1 + max_order) + order + 1] = sh;
+		//	output[((blockIdx.x + blockIdx.y * gridDim.x) * max_order + order) * (1 + max_order) + tid] = shared.bits[tid];
+		//}
+		__syncthreads();
+	}
+}
+
 typedef struct
 {
 	int residualOrder; // <= 32
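The new kernel is a 32-thread Levinson-Durbin recursion: one block per (channel, window) task sums the per-part autocorrelation estimates, then derives predictor coefficients for every model order up to max_order, writing out one row per order as the recursion unrolls. For comparison, a sequential reference of the same math (a sketch, not from the commit; the kernel performs the tid-indexed coefficient update in lock-step across threads, which the old[] copy makes explicit here):

    /* Sequential Levinson-Durbin matching cudaComputeLPC's arithmetic (sketch).
     * autoc has max_order + 1 entries; output gets max_order rows of
     * max_order coefficients, negated as in the kernel. */
    void levinson_durbin(const float *autoc, int max_order, float *output)
    {
        float tmp[32] = { 0.0f }, old[32];
        float err = autoc[0];
        for (int order = 0; order < max_order; order++)
        {
            float acc = 0.0f;                       /* the shared.buf reduction */
            for (int i = 0; i < order; i++)
                acc += tmp[i] * autoc[order - i];
            float r = (-autoc[order + 1] - acc) / err;  /* reflection coefficient */
            err *= 1.0f - r * r;
            for (int i = 0; i < order; i++)         /* kernel: all tid < order at once */
                old[i] = tmp[i];
            for (int i = 0; i < order; i++)
                tmp[i] = old[i] + r * old[order - 1 - i];
            tmp[order] = r;
            for (int i = 0; i <= order; i++)        /* one row per model order */
                output[order * max_order + i] = -tmp[i];
        }
    }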
@@ -108,19 +191,22 @@ extern "C" __global__ void cudaEncodeResidual(
 		((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
 	__syncthreads();
 	const int pos = blockIdx.x * partSize;
-	const int residualOrder = shared.task.residualOrder;
-	const int residualLen = min(frameSize - pos - residualOrder - 1, partSize);
-	const int dataLen = residualLen + residualOrder + 1;
+	const int residualOrder = shared.task.residualOrder + 1;
+	const int residualLen = min(frameSize - pos - residualOrder, partSize);
+	const int dataLen = residualLen + residualOrder;
 
 	// fetch samples
 	shared.data[tid] = (tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0);
 
+	// reverse coefs
+	if (tid < residualOrder) shared.task.coefs[tid] = shared.task.coefs[residualOrder - 1 - tid];
+
 	// compute residual
 	__syncthreads();
 	long sum = 0;
-	for (int c = 0; c <= residualOrder; c++)
-		sum += __mul24(shared.data[tid + c], shared.task.coefs[residualOrder - c]);
-	int res = shared.data[tid + residualOrder + 1] - (sum >> shared.task.shift);
+	for (int c = 0; c < residualOrder; c++)
+		sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
+	int res = shared.data[tid + residualOrder] - (sum >> shared.task.shift);
 	shared.residual[tid] = __mul24(tid < residualLen, (2 * res) ^ (res >> 31));
 
 	__syncthreads();
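Here residualOrder now counts coefficients (stored order + 1) rather than the top index, the coefficients are reversed once in shared memory so the prediction loop walks both data and coefs forward, and the signed residual is zigzag-mapped to an unsigned value for Rice coding. A scalar view of the updated computation (illustrative, not from the commit):

    /* Scalar equivalent of the updated per-thread residual step (sketch). */
    unsigned encode_one(const int *data, const int *coefs_reversed,
                        int residualOrder, int shift, int t)
    {
        long sum = 0;
        for (int c = 0; c < residualOrder; c++)           /* coefs pre-reversed, so */
            sum += (long)data[t + c] * coefs_reversed[c]; /* both arrays walk forward */
        int res = data[t + residualOrder] - (int)(sum >> shift);
        return (2 * res) ^ (res >> 31);   /* zigzag: 0,-1,1,-2,... -> 0,1,2,3,... */
    }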