optimizations

Author: chudov
Date:   2009-09-10 00:00:46 +00:00
Parent: 435a6acdf8
Commit: 5fd6108c9d
2 changed files with 248 additions and 166 deletions


@@ -95,20 +95,23 @@ namespace CUETools.Codecs.FlaCuda
 		CUDA cuda;
 		CUfunction cudaComputeAutocor;
 		CUfunction cudaComputeLPC;
+		CUfunction cudaEstimateResidual;
+		CUfunction cudaSumResidual;
 		CUfunction cudaEncodeResidual;
 		CUdeviceptr cudaSamples;
 		CUdeviceptr cudaWindow;
 		CUdeviceptr cudaAutocorTasks;
 		CUdeviceptr cudaAutocorOutput;
-		CUdeviceptr cudaCompLPCOutput;
 		CUdeviceptr cudaResidualTasks;
 		CUdeviceptr cudaResidualOutput;
 		IntPtr samplesBufferPtr = IntPtr.Zero;
 		IntPtr autocorTasksPtr = IntPtr.Zero;
-		IntPtr compLPCOutputPtr = IntPtr.Zero;
 		IntPtr residualTasksPtr = IntPtr.Zero;
-		IntPtr residualOutputPtr = IntPtr.Zero;
 		CUstream cudaStream;
+		CUstream cudaStream1;
+		int nResidualTasks = 0;
+		int nAutocorTasks = 0;
 		const int MAX_BLOCKSIZE = 8192;
 		const int maxResidualParts = MAX_BLOCKSIZE / (256 - 32);
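For scale: the residual-estimation kernel runs 256-thread blocks, and each block covers partSize = 256 - max_order samples of payload, so at the worst-case order of 32 a full 8192-sample frame spans at most 8192 / 224 = 36 partitions, which is the maxResidualParts constant above. A hedged sketch of that arithmetic (names mirror the fields above, not part of the commit):

	// Sketch only: worst-case partition count behind
	// maxResidualParts = MAX_BLOCKSIZE / (256 - 32) above.
	const int MAX_BLOCKSIZE = 8192;
	const int residualThreads = 256;   // threads per block in the estimate kernel
	const int maxOrder = 32;           // lpc.MAX_LPC_ORDER
	int partSize = residualThreads - maxOrder;       // 224 payload samples per block
	int maxResidualParts = MAX_BLOCKSIZE / partSize; // = 36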
@@ -213,15 +216,13 @@ namespace CUETools.Codecs.FlaCuda
 				cuda.Free(cudaSamples);
 				cuda.Free(cudaAutocorTasks);
 				cuda.Free(cudaAutocorOutput);
-				cuda.Free(cudaCompLPCOutput);
 				cuda.Free(cudaResidualTasks);
 				cuda.Free(cudaResidualOutput);
-				CUDADriver.cuMemFreeHost(compLPCOutputPtr);
-				CUDADriver.cuMemFreeHost(residualOutputPtr);
 				CUDADriver.cuMemFreeHost(samplesBufferPtr);
 				CUDADriver.cuMemFreeHost(residualTasksPtr);
 				CUDADriver.cuMemFreeHost(autocorTasksPtr);
 				cuda.DestroyStream(cudaStream);
+				cuda.DestroyStream(cudaStream1);
 				cuda.Dispose();
 				inited = false;
 			}
@@ -247,15 +248,13 @@ namespace CUETools.Codecs.FlaCuda
 				cuda.Free(cudaSamples);
 				cuda.Free(cudaAutocorTasks);
 				cuda.Free(cudaAutocorOutput);
-				cuda.Free(cudaCompLPCOutput);
 				cuda.Free(cudaResidualTasks);
 				cuda.Free(cudaResidualOutput);
-				CUDADriver.cuMemFreeHost(compLPCOutputPtr);
-				CUDADriver.cuMemFreeHost(residualOutputPtr);
 				CUDADriver.cuMemFreeHost(samplesBufferPtr);
 				CUDADriver.cuMemFreeHost(residualTasksPtr);
 				CUDADriver.cuMemFreeHost(autocorTasksPtr);
 				cuda.DestroyStream(cudaStream);
+				cuda.DestroyStream(cudaStream1);
 				cuda.Dispose();
 				inited = false;
 			}
@@ -898,6 +897,65 @@ namespace CUETools.Codecs.FlaCuda
 			_windowcount++;
 		}

+		unsafe void initialize_autocorTasks(int channelsCount, int max_order)
+		{
+			computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)autocorTasksPtr;
+			encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
+			nAutocorTasks = 0;
+			nResidualTasks = 0;
+			for (int ch = 0; ch < channelsCount; ch++)
+				for (int iWindow = 0; iWindow < _windowcount; iWindow++)
+				{
+					// Autocorrelation task
+					autocorTasks[nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
+					autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE;
+					nAutocorTasks++;
+					// LPC tasks
+					for (int order = 1; order <= max_order; order++)
+					{
+						residualTasks[nResidualTasks].residualOrder = order - 1;
+						residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
+						nResidualTasks++;
+					}
+				}
+			// Fixed prediction
+			for (int ch = 0; ch < channelsCount; ch++)
+			{
+				for (int order = 1; order <= 4; order++)
+				{
+					residualTasks[nResidualTasks].residualOrder = order - 1;
+					residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
+					residualTasks[nResidualTasks].shift = 0;
+					switch (order)
+					{
+						case 1:
+							residualTasks[nResidualTasks].coefs[0] = 1;
+							break;
+						case 2:
+							residualTasks[nResidualTasks].coefs[0] = 2;
+							residualTasks[nResidualTasks].coefs[1] = -1;
+							break;
+						case 3:
+							residualTasks[nResidualTasks].coefs[0] = 3;
+							residualTasks[nResidualTasks].coefs[1] = -3;
+							residualTasks[nResidualTasks].coefs[2] = 1;
+							break;
+						case 4:
+							residualTasks[nResidualTasks].coefs[0] = 4;
+							residualTasks[nResidualTasks].coefs[1] = -6;
+							residualTasks[nResidualTasks].coefs[2] = 4;
+							residualTasks[nResidualTasks].coefs[3] = -1;
+							break;
+					}
+					nResidualTasks++;
+				}
+			}
+			cuda.CopyHostToDeviceAsync(cudaAutocorTasks, autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), cudaStream);
+			cuda.CopyHostToDeviceAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
+			cuda.SynchronizeStream(cudaStream);
+		}
+
 		unsafe void encode_residual(FlacFrame frame)
 		{
 			for (int ch = 0; ch < channels; ch++)
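For reference, the coefficient tables in the switch above are FLAC's standard fixed predictors of orders 1 through 4 (rows of binomial coefficients from expanding (1 - z^-1)^order); the GPU then treats them exactly like quantized LPC coefficients with shift = 0. A hedged sketch of the order-2 case, assuming a plain int[] of samples (illustrative, not from the commit):

	// Sketch only: residual of the order-2 fixed predictor, matching
	// coefs = { 2, -1 } above (prediction = 2*s[n-1] - s[n-2]).
	static int FixedOrder2Residual(int[] s, int n)
	{
		return s[n] - (2 * s[n - 1] - s[n - 2]);
	}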
@@ -968,19 +1026,8 @@ namespace CUETools.Codecs.FlaCuda
 					for (int order = 1; order <= max_order && order < frame.blocksize; order++)
 					{
 						int index = (order - 1) + max_order * (iWindow + _windowcount * ch);
-						int nbits = 0;
-						for (int p = 0; p < partCount; p++)
-							nbits += ((int*)residualOutputPtr)[p + partCount * index];
-						int cbits = 1;
-						for (int i = order; i > 0; i--)
-						{
-							int c = residualTasks[index].coefs[i - 1];
-							while (cbits < 16 && c != (c << (32 - cbits)) >> (32 - cbits))
-								cbits++;
-						}
-						nbits += order * (int)frame.subframes[ch].obits + 4 + 5 + order * cbits + 6;
+						int cbits = residualTasks[index].cbits;
+						int nbits = order * (int)frame.subframes[ch].obits + 4 + 5 + order * cbits + 6 + residualTasks[index].size;
 						if (frame.subframes[ch].best.size > nbits)
 						{
 							frame.subframes[ch].best.type = SubframeType.LPC;
@@ -989,8 +1036,8 @@ namespace CUETools.Codecs.FlaCuda
 							frame.subframes[ch].best.window = iWindow;
 							frame.subframes[ch].best.cbits = cbits;
 							frame.subframes[ch].best.shift = residualTasks[index].shift;
-							fixed (int* fcoefs = frame.subframes[ch].best.coefs)
-								AudioSamples.MemCpy(fcoefs, residualTasks[index].coefs, order);
+							for (int i = 0; i < order; i++)
+								frame.subframes[ch].best.coefs[i] = residualTasks[index].coefs[i];//order - 1 - i];
 						}
 					}
 				}
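The new size estimate above reads directly from the task struct filled in on the GPU: residualTasks[index].cbits is the quantized coefficient width computed by cudaComputeLPC, and residualTasks[index].size is the Rice-coded residual length summed by cudaSumResidual. The constants follow the FLAC LPC subframe layout; a hedged restatement as a helper (illustrative only, the field meanings in the comments are my reading of the FLAC format):

	// Sketch only: the bit-cost expression above, spelled out.
	static int LpcSubframeBits(int order, int obits, int cbits, int residualBits)
	{
		return order * obits    // verbatim warm-up samples
		     + 4                // coefficient precision field
		     + 5                // quantization shift
		     + order * cbits    // quantized LPC coefficients
		     + 6                // residual header: 2-bit method + 4-bit partition order
		     + residualBits;    // Rice-coded residual, residualTasks[index].size
	}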
@@ -1002,10 +1049,7 @@ namespace CUETools.Codecs.FlaCuda
 				for (int order = 1; order <= 4 && order < frame.blocksize; order++)
 				{
 					int index = (order - 1) + 4 * ch;
-					int nbits = 0;
-					for (int p = 0; p < partCount; p++)
-						nbits += ((int*)residualOutputPtr)[p + partCount * (index + max_order * _windowcount * channelsCount)];
-					nbits += order * (int)frame.subframes[ch].obits + 6;
+					int nbits = order * (int)frame.subframes[ch].obits + 6 + residualTasks[index + max_order * _windowcount * channelsCount].size;
 					if (frame.subframes[ch].best.size > nbits)
 					{
 						frame.subframes[ch].best.type = SubframeType.Fixed;
@@ -1018,11 +1062,10 @@ namespace CUETools.Codecs.FlaCuda
 		unsafe void estimate_residual(FlacFrame frame, int channelsCount, int max_order, int autocorPartCount, out int partCount)
 		{
-			encodeResidualTaskStruct* residualTasks = (encodeResidualTaskStruct*)residualTasksPtr;
 			uint cbits = get_precision(frame.blocksize) + 1;
-			int nResidualTasks = 0;
 			int residualThreads = 256;
 			int partSize = residualThreads - max_order;
-			partSize &= 0xffffff0;
 			partCount = (frame.blocksize + partSize - 1) / partSize;
 			if (partCount > maxResidualParts)
@@ -1031,92 +1074,24 @@ namespace CUETools.Codecs.FlaCuda
 			if (frame.blocksize <= 4)
 				return;

-			// LPC
-			for (int ch = 0; ch < channelsCount; ch++)
-				for (int iWindow = 0; iWindow < _windowcount; iWindow++)
-				{
-					//int* lpcs = ((int*)compLPCOutputPtr) + (max_order + 1) * max_order * (iWindow + _windowcount * ch);
-					//for (int order = 1; order <= max_order; order++)
-					//{
-					//	residualTasks[nResidualTasks].residualOrder = order - 1;
-					//	residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
-					//	residualTasks[nResidualTasks].shift = lpcs[order + (order - 1) * (max_order + 1)];
-					//	AudioSamples.MemCpy(residualTasks[nResidualTasks].coefs, lpcs + (order - 1) * (max_order + 1), order);
-					//	nResidualTasks++;
-					//}
-					float* lpcs = ((float*)compLPCOutputPtr) + max_order * max_order * (iWindow + _windowcount * ch);
-					for (int order = 1; order <= max_order; order++)
-					{
-						residualTasks[nResidualTasks].residualOrder = order - 1;
-						residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
-						lpc.quantize_lpc_coefs(lpcs + (order - 1) * max_order,
-							order, cbits, residualTasks[nResidualTasks].coefs,
-							out residualTasks[nResidualTasks].shift, 15, 0);
-						nResidualTasks++;
-					}
-				}
-			// FIXED
-			for (int ch = 0; ch < channelsCount; ch++)
-			{
-				for (int order = 1; order <= 4; order++)
-				{
-					residualTasks[nResidualTasks].residualOrder = order - 1;
-					residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
-					residualTasks[nResidualTasks].shift = 0;
-					switch (order)
-					{
-						case 1:
-							residualTasks[nResidualTasks].coefs[0] = 1;
-							break;
-						case 2:
-							residualTasks[nResidualTasks].coefs[0] = 2;
-							residualTasks[nResidualTasks].coefs[1] = -1;
-							break;
-						case 3:
-							residualTasks[nResidualTasks].coefs[0] = 3;
-							residualTasks[nResidualTasks].coefs[1] = -3;
-							residualTasks[nResidualTasks].coefs[2] = 1;
-							break;
-						case 4:
-							residualTasks[nResidualTasks].coefs[0] = 4;
-							residualTasks[nResidualTasks].coefs[1] = -6;
-							residualTasks[nResidualTasks].coefs[2] = 4;
-							residualTasks[nResidualTasks].coefs[3] = -1;
-							break;
-					}
-					nResidualTasks++;
-				}
-			}
-			cuda.SetParameter(cudaEncodeResidual, 0, (uint)cudaResidualOutput.Pointer);
-			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size, (uint)cudaSamples.Pointer);
-			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 2, (uint)cudaResidualTasks.Pointer);
-			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 3, (uint)frame.blocksize);
-			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 3 + sizeof(uint), (uint)partSize);
-			cuda.SetParameterSize(cudaEncodeResidual, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2U);
-			cuda.SetFunctionBlockShape(cudaEncodeResidual, residualThreads, 1, 1);
+			cuda.SetParameter(cudaEstimateResidual, 0, (uint)cudaResidualOutput.Pointer);
+			cuda.SetParameter(cudaEstimateResidual, IntPtr.Size, (uint)cudaSamples.Pointer);
+			cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 2, (uint)cudaResidualTasks.Pointer);
+			cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 3, (uint)frame.blocksize);
+			cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 3 + sizeof(uint), (uint)partSize);
+			cuda.SetParameterSize(cudaEstimateResidual, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2U);
+			cuda.SetFunctionBlockShape(cudaEstimateResidual, residualThreads, 1, 1);
+			cuda.SetParameter(cudaSumResidual, 0, (uint)cudaResidualTasks.Pointer);
+			cuda.SetParameter(cudaSumResidual, IntPtr.Size, (uint)cudaResidualOutput.Pointer);
+			cuda.SetParameter(cudaSumResidual, IntPtr.Size * 2, (uint)partCount);
+			cuda.SetParameterSize(cudaSumResidual, (uint)(IntPtr.Size * 2) + sizeof(uint) * 1U);
+			cuda.SetFunctionBlockShape(cudaSumResidual, 64, 1, 1);
 			// issue work to the GPU
-			cuda.CopyHostToDeviceAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
-			cuda.LaunchAsync(cudaEncodeResidual, partCount, nResidualTasks, cudaStream);
-			cuda.CopyDeviceToHostAsync(cudaResidualOutput, residualOutputPtr, (uint)(sizeof(int) * partCount * nResidualTasks), cudaStream);
-			cuda.SynchronizeStream(cudaStream);
-		}
-
-		unsafe void initialize_autocorTasks()
-		{
-			computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)autocorTasksPtr;
-			int nAutocorTasks = 0;
-			for (int ch = 0; ch < (channels == 2 ? 4 : channels); ch++)
-				for (int iWindow = 0; iWindow < _windowcount; iWindow++)
-				{
-					autocorTasks[nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
-					autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE;
-					nAutocorTasks++;
-				}
-			cuda.CopyHostToDeviceAsync(cudaAutocorTasks, autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * nAutocorTasks), cudaStream);
+			cuda.LaunchAsync(cudaEstimateResidual, partCount, nResidualTasks, cudaStream);
+			cuda.LaunchAsync(cudaSumResidual, 1, nResidualTasks, cudaStream);
+			cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
 			cuda.SynchronizeStream(cudaStream);
 		}
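The launch geometry above is the heart of the optimization: cudaEstimateResidual runs a partCount x nResidualTasks grid that writes one Rice length per (partition, task) pair, and cudaSumResidual then collapses each task's partitions into residualTasks[task].size on the device, so the only per-frame download is the compact task array instead of the old partCount-wide residual output. A hedged CPU equivalent of the summing step, assuming a row-major lengths array (illustrative only):

	// Sketch only: what cudaSumResidual computes, on the host for clarity.
	static void SumResidual(int[] taskSize, int[] riceLen, int taskCount, int partCount)
	{
		for (int task = 0; task < taskCount; task++)
		{
			int bits = 0;
			for (int part = 0; part < partCount; part++)
				bits += riceLen[part + partCount * task]; // same indexing as the kernel
			taskSize[task] = bits; // lands in residualTasks[task].size
		}
	}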
@@ -1124,7 +1099,7 @@ namespace CUETools.Codecs.FlaCuda
 		{
 			int autocorThreads = 256;
 			int partSize = 2 * autocorThreads - max_order;
-			int nAutocorTasks = _windowcount * channelsCount;
+			partSize &= 0xffffff0;
 			partCount = (frame.blocksize + partSize - 1) / partSize;
 			if (partCount > maxAutocorParts)
@@ -1143,22 +1118,22 @@ namespace CUETools.Codecs.FlaCuda
 			cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 4) + sizeof(uint) * 3);
 			cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1);

-			cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaCompLPCOutput.Pointer);
+			cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaResidualTasks.Pointer);
 			cuda.SetParameter(cudaComputeLPC, IntPtr.Size, (uint)cudaAutocorOutput.Pointer);
 			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 2, (uint)cudaAutocorTasks.Pointer);
 			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3, (uint)max_order);
 			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3 + sizeof(uint), (uint)partCount);
 			cuda.SetParameterSize(cudaComputeLPC, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2);
-			cuda.SetFunctionBlockShape(cudaComputeLPC, 32, 1, 1);
+			cuda.SetFunctionBlockShape(cudaComputeLPC, 64, 1, 1);
 			// issue work to the GPU
 			cuda.CopyHostToDeviceAsync(cudaSamples, samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), cudaStream);
 			cuda.LaunchAsync(cudaComputeAutocor, partCount, nAutocorTasks, cudaStream);
 			cuda.LaunchAsync(cudaComputeLPC, 1, nAutocorTasks, cudaStream);
-			cuda.CopyDeviceToHostAsync(cudaCompLPCOutput, compLPCOutputPtr, (uint)(sizeof(float) * (max_order + 1) * max_order * nAutocorTasks), cudaStream);
 			cuda.SynchronizeStream(cudaStream);
+			//cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream1);
 		}

 		unsafe int encode_frame(out int size)
 		{
 			int* s = (int*)samplesBufferPtr;
@@ -1167,6 +1142,9 @@ namespace CUETools.Codecs.FlaCuda
 			{
 				frame.InitSize(eparams.block_size, eparams.variable_block_size != 0);
+				bool doMidside = channels == 2 && eparams.do_midside;
+				int channelCount = doMidside ? 2 * channels : channels;
 				if (frame.blocksize != _windowsize && frame.blocksize > 4)
 				{
 					_windowsize = frame.blocksize;
@@ -1179,11 +1157,9 @@ namespace CUETools.Codecs.FlaCuda
 					if (_windowcount == 0)
 						throw new Exception("invalid windowfunction");
 					cuda.CopyHostToDevice<float>(cudaWindow, windowBuffer);
-					initialize_autocorTasks();
+					initialize_autocorTasks(channelCount, eparams.max_prediction_order);
 				}

-				bool doMidside = channels == 2 && eparams.do_midside;
-				int channelCount = doMidside ? 2 * channels : channels;
 				if (doMidside)
 					channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE, s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize);
@@ -1296,33 +1272,29 @@ namespace CUETools.Codecs.FlaCuda
 			cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
 			cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
 			cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
+			cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
+			cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
 			cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
 			cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
 			cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
 			cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 			cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
-			cudaCompLPCOutput = cuda.Allocate((uint)(sizeof(float) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
-			cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS));
+			cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4)));
 			cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_WINDOWS * maxResidualParts));
 			CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
 			if (cuErr == CUResult.Success)
 				cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 			if (cuErr == CUResult.Success)
-				cuErr = CUDADriver.cuMemAllocHost(ref compLPCOutputPtr, (uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
-			if (cuErr == CUResult.Success)
-				cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER));
-			if (cuErr == CUResult.Success)
-				cuErr = CUDADriver.cuMemAllocHost(ref residualOutputPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER * maxResidualParts));
+				cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 4)));
 			if (cuErr != CUResult.Success)
 			{
 				if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
 				if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
-				if (compLPCOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(compLPCOutputPtr); compLPCOutputPtr = IntPtr.Zero;
 				if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
-				if (residualOutputPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualOutputPtr); residualOutputPtr = IntPtr.Zero;
 				throw new CUDAException(cuErr);
 			}
 			cudaStream = cuda.CreateStream();
+			cudaStream1 = cuda.CreateStream();
 			if (_IO == null)
 				_IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
 			int header_size = flake_encode_init();
@@ -1769,7 +1741,9 @@ namespace CUETools.Codecs.FlaCuda
 		public int residualOrder;
 		public int samplesOffs;
 		public int shift;
-		public int reserved;
+		public int cbits;
+		public int size;
+		public fixed int reserved[11];
 		public fixed int coefs[32];
 	};
 }
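This C# struct is the host-side mirror of the encodeResidualTaskStruct typedef added in the CUDA file below; the two must agree field for field because the kernels copy each task as raw ints. With the new cbits and size fields the layout is 5 named ints + 11 reserved + 32 coefficients = 48 ints (192 bytes), so a block with at least 48 threads can fetch a whole task in one pass. A hedged sanity check, assuming an unsafe context (not part of the commit):

	// Sketch only: layout guard for the host/device struct mirror.
	// 5 named fields + reserved[11] + coefs[32] = 48 ints = 192 bytes.
	System.Diagnostics.Debug.Assert(sizeof(encodeResidualTaskStruct) == 48 * sizeof(int));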


@@ -26,6 +26,17 @@ typedef struct
 	int windowOffs;
 } computeAutocorTaskStruct;

+typedef struct
+{
+	int residualOrder; // <= 32
+	int samplesOffs;
+	int shift;
+	int cbits;
+	int size;
+	int reserved[11];
+	int coefs[32];
+} encodeResidualTaskStruct;
+
 extern "C" __global__ void cudaComputeAutocor(
 	float *output,
 	const int *samples,
@@ -73,7 +84,7 @@ extern "C" __global__ void cudaComputeAutocor(
 		shared.product[tid] += shared.product[tid + 8];
 		shared.product[tid] += shared.product[tid + 4];
 		shared.product[tid] += shared.product[tid + 2];
 		if (tid == 0) shared.sum[lag] = shared.product[0] + shared.product[1];
 		__syncthreads();
 	}
@@ -83,11 +94,11 @@ extern "C" __global__ void cudaComputeAutocor(
 }

 extern "C" __global__ void cudaComputeLPC(
-	float*output,
+	encodeResidualTaskStruct *output,
 	float*autoc,
 	computeAutocorTaskStruct *tasks,
 	int max_order, // should be <= 32
-	int partCount // should be <= blockDim
+	int partCount // should be <= blockDim?
 	)
 {
 	__shared__ struct {
@@ -96,6 +107,7 @@ extern "C" __global__ void cudaComputeLPC(
 		float buf[32];
 		int bits[32];
 		float autoc[33];
+		int cbits;
 	} shared;
 	const int tid = threadIdx.x;
@@ -111,21 +123,21 @@ extern "C" __global__ void cudaComputeLPC(
 	// add up parts
 	for (int part = 0; part < partCount; part++)
 		if (tid <= max_order)
 			shared.autoc[tid] += autoc[(blockIdx.y * partCount + part) * (max_order + 1) + tid];
 	__syncthreads();
-	if (tid <= 32)
+	if (tid < 32)
 		shared.tmp[tid] = 0.0f;
 	float err = shared.autoc[0];
 	for(int order = 0; order < max_order; order++)
 	{
 		if (tid < 32)
 		{
-			shared.buf[tid] = tid < order ? shared.tmp[tid] * shared.autoc[order - tid] : 0;
+			shared.buf[tid] = (tid < order) * shared.tmp[tid] * shared.autoc[order - tid];
 			shared.buf[tid] += shared.buf[tid + 16];
 			shared.buf[tid] += shared.buf[tid + 8];
 			shared.buf[tid] += shared.buf[tid + 4];
@@ -138,38 +150,135 @@ extern "C" __global__ void cudaComputeLPC(
 		err *= 1.0f - (r * r);
-		if (tid == 0)
-			shared.tmp[order] = r;
-		if (tid < order)
-			shared.tmp[tid] += r * shared.tmp[order - 1 - tid];
-		if (tid <= order)
-			output[((blockIdx.x + blockIdx.y * gridDim.x) * max_order + order) * max_order + tid] = -shared.tmp[tid];
-		//{
-		//	int precision = 13;
-		//	shared.bits[tid] = 32 - __clz(__float2int_rn(fabs(shared.tmp[tid]) * (1 << 15))) - precision;
-		//	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
-		//	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
-		//	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 4]);
-		//	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 2]);
-		//	shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
-		//	int sh = max(0,min(15, 15 - shared.bits[0]));
-		//	shared.bits[tid] = max(-(1 << precision),min((1 << precision)-1,__float2int_rn(-shared.tmp[tid] * (1 << sh))));
-		//	if (tid == 0)
-		//		output[((blockIdx.x + blockIdx.y * gridDim.x) * max_order + order) * (1 + max_order) + order + 1] = sh;
-		//	output[((blockIdx.x + blockIdx.y * gridDim.x) * max_order + order) * (1 + max_order) + tid] = shared.bits[tid];
-		//}
+		shared.tmp[tid] += (tid < order) * r * shared.tmp[order - 1 - tid] + (tid == order) * r;
+		// we could also set shared.tmp[-1] to 1.0f
+		if (tid < 32)
+		{
+			int precision = 13;
+			shared.bits[tid] = __mul24((33 - __clz(__float2int_rn(fabs(shared.tmp[tid]) * (1 << 15))) - precision), tid <= order);
+			shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
+			shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
+			shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 4]);
+			shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 2]);
+			shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
+			int sh = max(0,min(15, 15 - shared.bits[0]));
+			int coef = max(-(1 << precision),min((1 << precision)-1,__float2int_rn(-shared.tmp[tid] * (1 << sh))));
+			if (tid <= order)
+				output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].coefs[tid] = coef;
+			if (tid == 0)
+				output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].shift = sh;
+			shared.bits[tid] = 33 - max(__clz(coef),__clz(-1 ^ coef));
+			shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
+			shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
+			shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 4]);
+			shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 2]);
+			shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
+			int cbits = shared.bits[0];
+			if (tid == 0)
+				output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].cbits = cbits;
+		}
 		__syncthreads();
 	}
 }

-typedef struct
-{
-	int residualOrder; // <= 32
-	int samplesOffs;
-	int shift;
-	int reserved;
-	int coefs[32];
-} encodeResidualTaskStruct;
+extern "C" __global__ void cudaEstimateResidual(
+	int*output,
+	int*samples,
+	encodeResidualTaskStruct *tasks,
+	int frameSize,
+	int partSize // should be <= blockDim - max_order
+	)
+{
+	__shared__ struct {
+		int data[256];
+		int residual[256];
+		int rice[32];
+		encodeResidualTaskStruct task;
+	} shared;
+	const int tid = threadIdx.x;
+	// fetch task data
+	if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
+		((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+	__syncthreads();
+	const int pos = blockIdx.x * partSize;
+	const int residualOrder = shared.task.residualOrder + 1;
+	const int residualLen = min(frameSize - pos - residualOrder, partSize);
+	const int dataLen = residualLen + residualOrder;
+	// fetch samples
+	shared.data[tid] = (tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0);
+	// reverse coefs
+	if (tid < residualOrder) shared.task.coefs[tid] = shared.task.coefs[residualOrder - 1 - tid];
+	// compute residual
+	__syncthreads();
+	long sum = 0;
+	for (int c = 0; c < residualOrder; c++)
+		sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
+	int res = shared.data[tid + residualOrder] - (sum >> shared.task.shift);
+	shared.residual[tid] = __mul24(tid < residualLen, (2 * res) ^ (res >> 31));
+	__syncthreads();
+	// residual sum: reduction in shared mem
+	if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads();
+	if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads();
+	if (tid < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
+	shared.residual[tid] += shared.residual[tid + 16];
+	shared.residual[tid] += shared.residual[tid + 8];
+	shared.residual[tid] += shared.residual[tid + 4];
+	shared.residual[tid] += shared.residual[tid + 2];
+	shared.residual[tid] += shared.residual[tid + 1];
+	__syncthreads();
+	if (tid < 32)
+	{
+		// rice parameter search
+		shared.rice[tid] = __mul24(tid >= 15, 0x7fffff) + residualLen * (tid + 1) + ((shared.residual[0] - (residualLen >> 1)) >> tid);
+		shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]);
+		shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]);
+		shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
+		shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]);
+	}
+	if (tid == 0)
+		output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
+}
+
+extern "C" __global__ void cudaSumResidual(
+	encodeResidualTaskStruct *tasks,
+	int *residual,
+	int partCount // <= blockDim.y (64)
+	)
+{
+	__shared__ struct {
+		int partLen[64];
+		//encodeResidualTaskStruct task;
+	} shared;
+	const int tid = threadIdx.x;
+	// fetch task data
+	//if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
+	//	((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+	//__syncthreads();
+	shared.partLen[tid] = (tid < partCount) ? residual[tid + partCount * blockIdx.y] : 0;
+	__syncthreads();
+	// length sum: reduction in shared mem
+	if (tid < 32) shared.partLen[tid] += shared.partLen[tid + 32]; __syncthreads();
+	shared.partLen[tid] += shared.partLen[tid + 16];
+	shared.partLen[tid] += shared.partLen[tid + 8];
+	shared.partLen[tid] += shared.partLen[tid + 4];
+	shared.partLen[tid] += shared.partLen[tid + 2];
+	shared.partLen[tid] += shared.partLen[tid + 1];
+	__syncthreads();
+	// FIXME: should process partition order here!!!
+	// return sum
+	if (tid == 0)
+		tasks[blockIdx.y].size = shared.partLen[0];
+}

 extern "C" __global__ void cudaEncodeResidual(
 	int*output,
@@ -233,5 +342,4 @@ extern "C" __global__ void cudaEncodeResidual(
 	if (tid == 0)
 		output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
 }
 #endif
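A closing note on the Rice-parameter search inside cudaEstimateResidual: each of the first 16 threads scores one candidate parameter k with the standard estimate length(k) ~= N * (k + 1) + (S >> k), where N is the partition length and S the sum of zig-zag-folded residuals (the kernel also subtracts N/2 as a rounding correction and penalizes k >= 15), and the min() tree leaves the best score in shared.rice[0]. A hedged scalar sketch of the same search (illustrative, not from the commit):

	// Sketch only: host-side version of the kernel's Rice length estimate.
	static int BestRiceLength(int[] res, int offset, int count)
	{
		long sum = 0;
		for (int i = 0; i < count; i++)
			sum += (uint)((2 * res[offset + i]) ^ (res[offset + i] >> 31)); // zig-zag fold
		int best = int.MaxValue;
		for (int k = 0; k < 15; k++)
			best = Math.Min(best, count * (k + 1) + (int)((sum - (count >> 1)) >> k));
		return best;
	}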