mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
optimizations
@@ -95,20 +95,23 @@ namespace CUETools.Codecs.FlaCuda
CUDA cuda;
CUfunction cudaComputeAutocor;
CUfunction cudaComputeLPC;
CUfunction cudaEstimateResidual;
CUfunction cudaSumResidual;
CUfunction cudaEncodeResidual;
CUdeviceptr cudaSamples;
CUdeviceptr cudaWindow;
CUdeviceptr cudaAutocorTasks;
CUdeviceptr cudaAutocorOutput;
CUdeviceptr cudaCompLPCOutput;
CUdeviceptr cudaResidualTasks;
CUdeviceptr cudaResidualOutput;
IntPtr samplesBufferPtr = IntPtr.Zero;
IntPtr autocorTasksPtr = IntPtr.Zero;
IntPtr compLPCOutputPtr = IntPtr.Zero;
IntPtr residualTasksPtr = IntPtr.Zero;
IntPtr residualOutputPtr = IntPtr.Zero;
CUstream cudaStream;
CUstream cudaStream1;

int nResidualTasks = 0;
int nAutocorTasks = 0;

const int MAX_BLOCKSIZE = 8192;
const int maxResidualParts = MAX_BLOCKSIZE / (256 - 32);
@@ -213,15 +216,13 @@ namespace CUETools.Codecs.FlaCuda
cuda.Free(cudaSamples);
cuda.Free(cudaAutocorTasks);
cuda.Free(cudaAutocorOutput);
cuda.Free(cudaCompLPCOutput);
cuda.Free(cudaResidualTasks);
cuda.Free(cudaResidualOutput);
CUDADriver.cuMemFreeHost(compLPCOutputPtr);
CUDADriver.cuMemFreeHost(residualOutputPtr);
CUDADriver.cuMemFreeHost(samplesBufferPtr);
CUDADriver.cuMemFreeHost(residualTasksPtr);
CUDADriver.cuMemFreeHost(autocorTasksPtr);
cuda.DestroyStream(cudaStream);
cuda.DestroyStream(cudaStream1);
cuda.Dispose();
inited = false;
}
@@ -247,15 +248,13 @@ namespace CUETools.Codecs.FlaCuda
cuda.Free(cudaSamples);
cuda.Free(cudaAutocorTasks);
cuda.Free(cudaAutocorOutput);
cuda.Free(cudaCompLPCOutput);
cuda.Free(cudaResidualTasks);
cuda.Free(cudaResidualOutput);
CUDADriver.cuMemFreeHost(compLPCOutputPtr);
CUDADriver.cuMemFreeHost(residualOutputPtr);
CUDADriver.cuMemFreeHost(samplesBufferPtr);
CUDADriver.cuMemFreeHost(residualTasksPtr);
CUDADriver.cuMemFreeHost(autocorTasksPtr);
cuda.DestroyStream(cudaStream);
cuda.DestroyStream(cudaStream1);
cuda.Dispose();
inited = false;
}
@@ -898,6 +897,65 @@ namespace CUETools.Codecs.FlaCuda
_windowcount++;
}

unsafe void initialize_autocorTasks(int channelsCount, int max_order)
{
computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)autocorTasksPtr;
encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
nAutocorTasks = 0;
nResidualTasks = 0;
for (int ch = 0; ch < channelsCount; ch++)
for (int iWindow = 0; iWindow < _windowcount; iWindow++)
{
// Autocorrelation task
autocorTasks[nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE;
nAutocorTasks++;
// LPC tasks
for (int order = 1; order <= max_order; order++)
{
residualTasks[nResidualTasks].residualOrder = order - 1;
residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
nResidualTasks++;
}
}
// Fixed prediction
for (int ch = 0; ch < channelsCount; ch++)
{
for (int order = 1; order <= 4; order++)
{
residualTasks[nResidualTasks].residualOrder = order - 1;
residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
residualTasks[nResidualTasks].shift = 0;
switch (order)
{
case 1:
residualTasks[nResidualTasks].coefs[0] = 1;
break;
case 2:
residualTasks[nResidualTasks].coefs[0] = 2;
residualTasks[nResidualTasks].coefs[1] = -1;
break;
case 3:
residualTasks[nResidualTasks].coefs[0] = 3;
residualTasks[nResidualTasks].coefs[1] = -3;
residualTasks[nResidualTasks].coefs[2] = 1;
break;
case 4:
residualTasks[nResidualTasks].coefs[0] = 4;
residualTasks[nResidualTasks].coefs[1] = -6;
residualTasks[nResidualTasks].coefs[2] = 4;
residualTasks[nResidualTasks].coefs[3] = -1;
break;
}
nResidualTasks++;
}
}

cuda.CopyHostToDeviceAsync(cudaAutocorTasks, autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), cudaStream);
cuda.CopyHostToDeviceAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
cuda.SynchronizeStream(cudaStream);
}
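
// The fixed-prediction coefficients filled in above (1; 2,-1; 3,-3,1; 4,-6,4,-1) are
// the signed binomial coefficients of (1 - z^-1)^order, i.e. the standard FLAC fixed
// predictors. A minimal sketch of a hypothetical helper that reproduces the same table
// for any order (illustration only, not part of FlaCudaWriter):
static int[] FixedPredictorCoefs(int order)
{
    int[] coefs = new int[order];
    int binom = 1;                                     // C(order, 0)
    for (int k = 0; k < order; k++)
    {
        binom = binom * (order - k) / (k + 1);         // C(order, k + 1)
        coefs[k] = ((k & 1) == 0) ? binom : -binom;    // alternating sign
    }
    return coefs;                                      // order 4 -> { 4, -6, 4, -1 }
}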

unsafe void encode_residual(FlacFrame frame)
{
for (int ch = 0; ch < channels; ch++)
@@ -968,19 +1026,8 @@ namespace CUETools.Codecs.FlaCuda
for (int order = 1; order <= max_order && order < frame.blocksize; order++)
{
int index = (order - 1) + max_order * (iWindow + _windowcount * ch);
int nbits = 0;
for (int p = 0; p < partCount; p++)
nbits += ((int*)residualOutputPtr)[p + partCount * index];

int cbits = 1;
for (int i = order; i > 0; i--)
{
int c = residualTasks[index].coefs[i - 1];
while (cbits < 16 && c != (c << (32 - cbits)) >> (32 - cbits))
cbits++;
}

nbits += order * (int)frame.subframes[ch].obits + 4 + 5 + order * cbits + 6;
int cbits = residualTasks[index].cbits;
int nbits = order * (int)frame.subframes[ch].obits + 4 + 5 + order * cbits + 6 + residualTasks[index].size;
if (frame.subframes[ch].best.size > nbits)
{
frame.subframes[ch].best.type = SubframeType.LPC;
@@ -989,8 +1036,8 @@ namespace CUETools.Codecs.FlaCuda
frame.subframes[ch].best.window = iWindow;
frame.subframes[ch].best.cbits = cbits;
frame.subframes[ch].best.shift = residualTasks[index].shift;
fixed (int* fcoefs = frame.subframes[ch].best.coefs)
AudioSamples.MemCpy(fcoefs, residualTasks[index].coefs, order);
for (int i = 0; i < order; i++)
frame.subframes[ch].best.coefs[i] = residualTasks[index].coefs[i];//order - 1 - i];
}
}
}
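
// The estimate used above lines up with the pieces of a FLAC LPC subframe: order warm-up
// samples at obits each, a 4-bit coefficient precision field, a 5-bit quantization shift,
// order quantized coefficients at cbits each, 6 bits of residual-coding header (2-bit
// method + 4-bit partition order), plus the Rice-coded residual size summed on the GPU.
// A hypothetical helper with the same arithmetic (illustration only):
static int EstimateLpcSubframeBits(int order, int obits, int cbits, int residualBits)
{
    return order * obits + 4 + 5 + order * cbits + 6 + residualBits;
}
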
@@ -1002,10 +1049,7 @@ namespace CUETools.Codecs.FlaCuda
for (int order = 1; order <= 4 && order < frame.blocksize; order++)
{
int index = (order - 1) + 4 * ch;
int nbits = 0;
for (int p = 0; p < partCount; p++)
nbits += ((int*)residualOutputPtr)[p + partCount * (index + max_order * _windowcount * channelsCount)];
nbits += order * (int)frame.subframes[ch].obits + 6;
int nbits = order * (int)frame.subframes[ch].obits + 6 + residualTasks[index + max_order * _windowcount * channelsCount].size;
if (frame.subframes[ch].best.size > nbits)
{
frame.subframes[ch].best.type = SubframeType.Fixed;
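
// For a fixed-prediction subframe the coefficients are implicit, so the estimate above
// keeps only the warm-up samples, the 6 bits of residual-coding header, and the
// precomputed residual size. Hypothetical helper, same arithmetic (illustration only):
static int EstimateFixedSubframeBits(int order, int obits, int residualBits)
{
    return order * obits + 6 + residualBits;
}
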
@@ -1018,11 +1062,10 @@ namespace CUETools.Codecs.FlaCuda

unsafe void estimate_residual(FlacFrame frame, int channelsCount, int max_order, int autocorPartCount, out int partCount)
{
encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
uint cbits = get_precision(frame.blocksize) + 1;
int nResidualTasks = 0;
int residualThreads = 256;
int partSize = residualThreads - max_order;
partSize &= 0xffffff0;

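// partSize is what remains of a 256-thread block after reserving max_order samples of
// history, rounded down to a multiple of 16 by the 0xffffff0 mask (it clears the low four
// bits), e.g. (256 - 32) & ~0xf = 224; partCount below is then ceil(blocksize / partSize),
// so a 4608-sample frame with max_order 32 splits into (4608 + 223) / 224 = 21 partitions.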
partCount = (frame.blocksize + partSize - 1) / partSize;
if (partCount > maxResidualParts)
@@ -1031,92 +1074,24 @@ namespace CUETools.Codecs.FlaCuda
if (frame.blocksize <= 4)
return;

// LPC
for (int ch = 0; ch < channelsCount; ch++)
for (int iWindow = 0; iWindow < _windowcount; iWindow++)
{
//int* lpcs = ((int*)compLPCOutputPtr) + (max_order + 1) * max_order * (iWindow + _windowcount * ch);
//for (int order = 1; order <= max_order; order++)
//{
// residualTasks[nResidualTasks].residualOrder = order - 1;
// residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
// residualTasks[nResidualTasks].shift = lpcs[order + (order - 1) * (max_order + 1)];
// AudioSamples.MemCpy(residualTasks[nResidualTasks].coefs, lpcs + (order - 1) * (max_order + 1), order);
// nResidualTasks++;
//}
float* lpcs = ((float*)compLPCOutputPtr) + max_order * max_order * (iWindow + _windowcount * ch);
for (int order = 1; order <= max_order; order++)
{
residualTasks[nResidualTasks].residualOrder = order - 1;
residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
cuda.SetParameter(cudaEstimateResidual, 0, (uint)cudaResidualOutput.Pointer);
cuda.SetParameter(cudaEstimateResidual, IntPtr.Size, (uint)cudaSamples.Pointer);
cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 2, (uint)cudaResidualTasks.Pointer);
cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 3, (uint)frame.blocksize);
cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 3 + sizeof(uint), (uint)partSize);
cuda.SetParameterSize(cudaEstimateResidual, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2U);
cuda.SetFunctionBlockShape(cudaEstimateResidual, residualThreads, 1, 1);

lpc.quantize_lpc_coefs(lpcs + (order - 1) * max_order,
order, cbits, residualTasks[nResidualTasks].coefs,
out residualTasks[nResidualTasks].shift, 15, 0);

nResidualTasks++;
}
}
// FIXED
for (int ch = 0; ch < channelsCount; ch++)
{
for (int order = 1; order <= 4; order++)
{
residualTasks[nResidualTasks].residualOrder = order - 1;
residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
residualTasks[nResidualTasks].shift = 0;
switch (order)
{
case 1:
residualTasks[nResidualTasks].coefs[0] = 1;
break;
case 2:
residualTasks[nResidualTasks].coefs[0] = 2;
residualTasks[nResidualTasks].coefs[1] = -1;
break;
case 3:
residualTasks[nResidualTasks].coefs[0] = 3;
residualTasks[nResidualTasks].coefs[1] = -3;
residualTasks[nResidualTasks].coefs[2] = 1;
break;
case 4:
residualTasks[nResidualTasks].coefs[0] = 4;
residualTasks[nResidualTasks].coefs[1] = -6;
residualTasks[nResidualTasks].coefs[2] = 4;
residualTasks[nResidualTasks].coefs[3] = -1;
break;
}
nResidualTasks++;
}
}

cuda.SetParameter(cudaEncodeResidual, 0, (uint)cudaResidualOutput.Pointer);
cuda.SetParameter(cudaEncodeResidual, IntPtr.Size, (uint)cudaSamples.Pointer);
cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 2, (uint)cudaResidualTasks.Pointer);
cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 3, (uint)frame.blocksize);
cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 3 + sizeof(uint), (uint)partSize);
cuda.SetParameterSize(cudaEncodeResidual, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2U);
cuda.SetFunctionBlockShape(cudaEncodeResidual, residualThreads, 1, 1);
cuda.SetParameter(cudaSumResidual, 0, (uint)cudaResidualTasks.Pointer);
cuda.SetParameter(cudaSumResidual, IntPtr.Size, (uint)cudaResidualOutput.Pointer);
cuda.SetParameter(cudaSumResidual, IntPtr.Size * 2, (uint)partCount);
cuda.SetParameterSize(cudaSumResidual, (uint)(IntPtr.Size * 2) + sizeof(uint) * 1U);
cuda.SetFunctionBlockShape(cudaSumResidual, 64, 1, 1);

// issue work to the GPU
cuda.CopyHostToDeviceAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
cuda.LaunchAsync(cudaEncodeResidual, partCount, nResidualTasks, cudaStream);
cuda.CopyDeviceToHostAsync(cudaResidualOutput, residualOutputPtr, (uint)(sizeof(int) * partCount * nResidualTasks), cudaStream);
cuda.SynchronizeStream(cudaStream);
}

unsafe void initialize_autocorTasks()
{
computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)autocorTasksPtr;
int nAutocorTasks = 0;
for (int ch = 0; ch < (channels == 2 ? 4 : channels); ch++)
for (int iWindow = 0; iWindow < _windowcount; iWindow++)
{
autocorTasks[nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE;
nAutocorTasks++;
}
cuda.CopyHostToDeviceAsync(cudaAutocorTasks, autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), cudaStream);
cuda.LaunchAsync(cudaEstimateResidual, partCount, nResidualTasks, cudaStream);
cuda.LaunchAsync(cudaSumResidual, 1, nResidualTasks, cudaStream);
cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
cuda.SynchronizeStream(cudaStream);
}

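// The SetParameter / SetParameterSize / SetFunctionBlockShape / LaunchAsync sequences above
// follow the driver-API convention of this CUDA.NET wrapper: each kernel argument is written
// at an explicit byte offset (device pointers first, then scalars), the total argument size
// is declared once, the thread-block shape is fixed per function, and the grid dimensions
// are supplied at launch. A condensed sketch of the same pattern for a hypothetical kernel
// "myKernel(out, in, n)" -- illustration only, not part of this commit:
CUfunction myKernel = cuda.GetModuleFunction("myKernel");
cuda.SetParameter(myKernel, 0, (uint)cudaResidualOutput.Pointer);            // arg 0: output buffer
cuda.SetParameter(myKernel, IntPtr.Size, (uint)cudaSamples.Pointer);         // arg 1: input buffer
cuda.SetParameter(myKernel, IntPtr.Size * 2, (uint)frame.blocksize);         // arg 2: element count
cuda.SetParameterSize(myKernel, (uint)(IntPtr.Size * 2) + sizeof(uint));     // total argument bytes
cuda.SetFunctionBlockShape(myKernel, 256, 1, 1);                             // threads per block
cuda.LaunchAsync(myKernel, partCount, nResidualTasks, cudaStream);           // grid = parts x tasks
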
@@ -1124,7 +1099,7 @@ namespace CUETools.Codecs.FlaCuda
{
int autocorThreads = 256;
int partSize = 2 * autocorThreads - max_order;
int nAutocorTasks = _windowcount * channelsCount;
partSize &= 0xffffff0;

partCount = (frame.blocksize + partSize - 1) / partSize;
if (partCount > maxAutocorParts)
@@ -1143,22 +1118,22 @@ namespace CUETools.Codecs.FlaCuda
cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 4) + sizeof(uint) * 3);
cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1);

cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaCompLPCOutput.Pointer);
cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaResidualTasks.Pointer);
cuda.SetParameter(cudaComputeLPC, IntPtr.Size, (uint)cudaAutocorOutput.Pointer);
cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 2, (uint)cudaAutocorTasks.Pointer);
cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3, (uint)max_order);
cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3 + sizeof(uint), (uint)partCount);
cuda.SetParameterSize(cudaComputeLPC, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2);
cuda.SetFunctionBlockShape(cudaComputeLPC, 32, 1, 1);
cuda.SetFunctionBlockShape(cudaComputeLPC, 64, 1, 1);

// issue work to the GPU
cuda.CopyHostToDeviceAsync(cudaSamples, samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), cudaStream);
cuda.LaunchAsync(cudaComputeAutocor, partCount, nAutocorTasks, cudaStream);
cuda.LaunchAsync(cudaComputeLPC, 1, nAutocorTasks, cudaStream);
cuda.CopyDeviceToHostAsync(cudaCompLPCOutput, compLPCOutputPtr, (uint)(sizeof(float) * (max_order + 1) * max_order * nAutocorTasks), cudaStream);
cuda.SynchronizeStream(cudaStream);
//cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream1);
}


unsafe int encode_frame(out int size)
{
int* s = (int*)samplesBufferPtr;
@@ -1167,6 +1142,9 @@ namespace CUETools.Codecs.FlaCuda
{
frame.InitSize(eparams.block_size, eparams.variable_block_size != 0);

bool doMidside = channels == 2 && eparams.do_midside;
int channelCount = doMidside ? 2 * channels : channels;

if (frame.blocksize != _windowsize && frame.blocksize > 4)
{
_windowsize = frame.blocksize;
@@ -1179,11 +1157,9 @@ namespace CUETools.Codecs.FlaCuda
if (_windowcount == 0)
throw new Exception("invalid windowfunction");
cuda.CopyHostToDevice<float>(cudaWindow, windowBuffer);
initialize_autocorTasks();
initialize_autocorTasks(channelCount, eparams.max_prediction_order);
}

bool doMidside = channels == 2 && eparams.do_midside;
int channelCount = doMidside ? 2 * channels : channels;
if (doMidside)
channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE, s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize);

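// channel_decorrelation itself is not shown in this diff; the four MAX_BLOCKSIZE-spaced
// buffers (left, right and two extra slots) suggest the usual FLAC mid/side derivation,
// sketched below under that assumption (hypothetical helper, illustration only):
static unsafe void MidSide(int* left, int* right, int* mid, int* side, int n)
{
    for (int i = 0; i < n; i++)
    {
        mid[i] = (left[i] + right[i]) >> 1;    // average channel (floor of the mean)
        side[i] = left[i] - right[i];          // difference channel (needs one extra bit)
    }
}
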
@@ -1296,33 +1272,29 @@ namespace CUETools.Codecs.FlaCuda
cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
cudaCompLPCOutput = cuda.Allocate((uint)(sizeof(float) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS));
cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4)));
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_WINDOWS * maxResidualParts));
CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref compLPCOutputPtr, (uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER));
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref residualOutputPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER * maxResidualParts));
cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 4)));
if (cuErr != CUResult.Success)
{
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
if (compLPCOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(compLPCOutputPtr); compLPCOutputPtr = IntPtr.Zero;
if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
if (residualOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualOutputPtr); residualOutputPtr = IntPtr.Zero;
throw new CUDAException(cuErr);
}
cudaStream = cuda.CreateStream();
cudaStream1 = cuda.CreateStream();
if (_IO == null)
_IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
int header_size = flake_encode_init();
@@ -1769,7 +1741,9 @@ namespace CUETools.Codecs.FlaCuda
public int residualOrder;
public int samplesOffs;
public int shift;
public int reserved;
public int cbits;
public int size;
public fixed int reserved[11];
public fixed int coefs[32];
};
}
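
// With the new cbits/size fields the task struct holds 5 ints of header, 11 reserved ints
// and 32 coefficient ints = 48 ints, i.e. 192 bytes per task; the reserved[11] padding
// presumably keeps each task a multiple of 64 bytes for the GPU kernels that now read and
// write these structs directly. A hypothetical sanity check (illustration only):
unsafe static void CheckTaskStructSize()
{
    System.Diagnostics.Debug.Assert(sizeof(encodeResidualTaskStruct) == 4 * (5 + 11 + 32));
}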