mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
optimizations
This commit is contained in:
@@ -424,6 +424,9 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
int* s = ((int*)task.samplesBufferPtr) + samplesInBuffer;
|
int* s = ((int*)task.samplesBufferPtr) + samplesInBuffer;
|
||||||
fixed (int *src = &samples[pos, 0])
|
fixed (int *src = &samples[pos, 0])
|
||||||
{
|
{
|
||||||
|
short* dst = ((short*)task.samplesBytesPtr) + samplesInBuffer * channels;
|
||||||
|
for (int i = 0; i < block * channels; i++)
|
||||||
|
dst[i] = (short)src[i];
|
||||||
if (channels == 2 && eparams.do_midside)
|
if (channels == 2 && eparams.do_midside)
|
||||||
channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE,
|
channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE,
|
||||||
s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, src, block);
|
s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, src, block);
|
||||||
@@ -947,7 +950,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
csum += (ulong)Math.Abs(coefs[i - 1]);
|
csum += (ulong)Math.Abs(coefs[i - 1]);
|
||||||
if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32)
|
if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32)
|
||||||
lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
|
lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
|
||||||
else if (encode_on_cpu)
|
else// if (encode_on_cpu)
|
||||||
lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
|
lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
|
||||||
|
|
||||||
int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||||
@@ -1022,7 +1025,10 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
if (blocksize <= 4)
|
if (blocksize <= 4)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
compute_autocorellation(blocksize, channelsCount, max_order, nFrames, task);
|
int autocorPartSize = 256 + 128;// (2 * 256 - max_order) & ~31;
|
||||||
|
int autocorPartCount = (blocksize + autocorPartSize - 1) / autocorPartSize;
|
||||||
|
if (autocorPartCount > maxAutocorParts)
|
||||||
|
throw new Exception("internal error");
|
||||||
|
|
||||||
int threads_y;
|
int threads_y;
|
||||||
if (task.nResidualTasksPerChannel >= 4 && task.nResidualTasksPerChannel <= 8)
|
if (task.nResidualTasksPerChannel >= 4 && task.nResidualTasksPerChannel <= 8)
|
||||||
@@ -1039,25 +1045,58 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
threads_y = 4;
|
threads_y = 4;
|
||||||
else
|
else
|
||||||
throw new Exception("invalid LPC order");
|
throw new Exception("invalid LPC order");
|
||||||
int partSize = 32 * threads_y;
|
int residualPartSize = 32 * threads_y;
|
||||||
int partCount = (blocksize + partSize - 1) / partSize;
|
int residualPartCount = (blocksize + residualPartSize - 1) / residualPartSize;
|
||||||
|
|
||||||
if (partCount > maxResidualParts)
|
if (residualPartCount > maxResidualParts)
|
||||||
throw new Exception("invalid combination of block size and LPC order");
|
throw new Exception("invalid combination of block size and LPC order");
|
||||||
|
|
||||||
|
CUfunction cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : task.cudaChannelDecorr;
|
||||||
|
|
||||||
|
cuda.SetParameter(cudaChannelDecorr, 0, (uint)task.cudaSamples.Pointer);
|
||||||
|
cuda.SetParameter(cudaChannelDecorr, 1 * sizeof(uint), (uint)task.cudaSamplesBytes.Pointer);
|
||||||
|
cuda.SetParameter(cudaChannelDecorr, 2 * sizeof(uint), (uint)MAX_BLOCKSIZE);
|
||||||
|
cuda.SetParameterSize(cudaChannelDecorr, sizeof(uint) * 3U);
|
||||||
|
cuda.SetFunctionBlockShape(cudaChannelDecorr, 256, 1, 1);
|
||||||
|
|
||||||
|
cuda.SetParameter(task.cudaFindWastedBits, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
|
||||||
|
cuda.SetParameter(task.cudaFindWastedBits, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
|
||||||
|
cuda.SetParameter(task.cudaFindWastedBits, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
|
||||||
|
cuda.SetParameter(task.cudaFindWastedBits, 3 * sizeof(uint), (uint)blocksize);
|
||||||
|
cuda.SetParameterSize(task.cudaFindWastedBits, sizeof(uint) * 4U);
|
||||||
|
cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1);
|
||||||
|
|
||||||
|
cuda.SetParameter(task.cudaComputeAutocor, 0, (uint)task.cudaAutocorOutput.Pointer);
|
||||||
|
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint), (uint)task.cudaSamples.Pointer);
|
||||||
|
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer);
|
||||||
|
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 3, (uint)task.cudaAutocorTasks.Pointer);
|
||||||
|
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order);
|
||||||
|
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)blocksize);
|
||||||
|
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)autocorPartSize);
|
||||||
|
cuda.SetParameterSize(task.cudaComputeAutocor, sizeof(uint) * 7U);
|
||||||
|
cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1);
|
||||||
|
|
||||||
|
cuda.SetParameter(task.cudaComputeLPC, 0, (uint)task.cudaResidualTasks.Pointer);
|
||||||
|
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint), (uint)task.cudaAutocorOutput.Pointer);
|
||||||
|
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 2, (uint)task.cudaAutocorTasks.Pointer);
|
||||||
|
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3, (uint)max_order);
|
||||||
|
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)autocorPartCount);
|
||||||
|
cuda.SetParameterSize(task.cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2);
|
||||||
|
cuda.SetFunctionBlockShape(task.cudaComputeLPC, (autocorPartCount + 31) & ~31, 1, 1);
|
||||||
|
|
||||||
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 0, (uint)task.cudaResidualOutput.Pointer);
|
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 0, (uint)task.cudaResidualOutput.Pointer);
|
||||||
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 1, (uint)task.cudaSamples.Pointer);
|
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 1, (uint)task.cudaSamples.Pointer);
|
||||||
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 2, (uint)task.cudaResidualTasks.Pointer);
|
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 2, (uint)task.cudaResidualTasks.Pointer);
|
||||||
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 3, (uint)max_order);
|
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 3, (uint)max_order);
|
||||||
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 4, (uint)blocksize);
|
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 4, (uint)blocksize);
|
||||||
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 5, (uint)partSize);
|
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 5, (uint)residualPartSize);
|
||||||
cuda.SetParameterSize(task.cudaEstimateResidual, sizeof(uint) * 6);
|
cuda.SetParameterSize(task.cudaEstimateResidual, sizeof(uint) * 6);
|
||||||
cuda.SetFunctionBlockShape(task.cudaEstimateResidual, 32, threads_y, 1);
|
cuda.SetFunctionBlockShape(task.cudaEstimateResidual, 32, threads_y, 1);
|
||||||
|
|
||||||
int nBestTasks = task.nResidualTasks / task.nResidualTasksPerChannel;
|
int nBestTasks = task.nResidualTasks / task.nResidualTasksPerChannel;
|
||||||
cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
|
cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
|
||||||
cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
|
cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
|
||||||
cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)partCount);
|
cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)residualPartCount);
|
||||||
cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
|
cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
|
||||||
cuda.SetParameterSize(task.cudaChooseBestMethod, sizeof(uint) * 4U);
|
cuda.SetParameterSize(task.cudaChooseBestMethod, sizeof(uint) * 4U);
|
||||||
cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 16, 1);
|
cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 16, 1);
|
||||||
@@ -1078,10 +1117,14 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
cuda.SetParameter(task.cudaEncodeResidual, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
|
cuda.SetParameter(task.cudaEncodeResidual, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
|
||||||
cuda.SetParameter(task.cudaEncodeResidual, 2 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer);
|
cuda.SetParameter(task.cudaEncodeResidual, 2 * sizeof(uint), (uint)task.cudaBestResidualTasks.Pointer);
|
||||||
cuda.SetParameterSize(task.cudaEncodeResidual, sizeof(uint) * 3U);
|
cuda.SetParameterSize(task.cudaEncodeResidual, sizeof(uint) * 3U);
|
||||||
cuda.SetFunctionBlockShape(task.cudaEncodeResidual, partSize, 1, 1);
|
cuda.SetFunctionBlockShape(task.cudaEncodeResidual, residualPartSize, 1, 1);
|
||||||
|
|
||||||
// issue work to the GPU
|
// issue work to the GPU
|
||||||
cuda.LaunchAsync(task.cudaEstimateResidual, partCount, (task.nResidualTasks / threads_y * nFrames) / maxFrames, task.stream);
|
cuda.LaunchAsync(cudaChannelDecorr, (nFrames * blocksize + 255) / 256, channels == 2 ? 1 : channels, task.stream);
|
||||||
|
cuda.LaunchAsync(task.cudaFindWastedBits, (task.nResidualTasks / task.nResidualTasksPerChannel * nFrames) / maxFrames, 1, task.stream);
|
||||||
|
cuda.LaunchAsync(task.cudaComputeAutocor, autocorPartCount, (task.nAutocorTasks * nFrames) / maxFrames, task.stream);
|
||||||
|
cuda.LaunchAsync(task.cudaComputeLPC, 1, (task.nAutocorTasks * nFrames) / maxFrames, task.stream);
|
||||||
|
cuda.LaunchAsync(task.cudaEstimateResidual, residualPartCount, (task.nResidualTasks / threads_y * nFrames) / maxFrames, task.stream);
|
||||||
cuda.LaunchAsync(task.cudaChooseBestMethod, 1, (nBestTasks * nFrames) / maxFrames, task.stream);
|
cuda.LaunchAsync(task.cudaChooseBestMethod, 1, (nBestTasks * nFrames) / maxFrames, task.stream);
|
||||||
if (channels == 2 && channelsCount == 4)
|
if (channels == 2 && channelsCount == 4)
|
||||||
{
|
{
|
||||||
@@ -1091,60 +1134,12 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
else
|
else
|
||||||
cuda.LaunchAsync(task.cudaCopyBestMethod, 1, (nBestTasks * nFrames) / maxFrames, task.stream);
|
cuda.LaunchAsync(task.cudaCopyBestMethod, 1, (nBestTasks * nFrames) / maxFrames, task.stream);
|
||||||
if (!encode_on_cpu)
|
if (!encode_on_cpu)
|
||||||
cuda.LaunchAsync(task.cudaEncodeResidual, partCount, (nBestTasks * nFrames) / maxFrames, task.stream);
|
cuda.LaunchAsync(task.cudaEncodeResidual, residualPartCount, (nBestTasks * nFrames) / maxFrames, task.stream);
|
||||||
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nBestTasks * nFrames) / maxFrames)), task.stream);
|
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nBestTasks * nFrames) / maxFrames)), task.stream);
|
||||||
if (!encode_on_cpu)
|
if (!encode_on_cpu)
|
||||||
cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channels), task.stream);
|
cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channels), task.stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe void compute_autocorellation(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
|
|
||||||
{
|
|
||||||
if (blocksize <= 4)
|
|
||||||
return;
|
|
||||||
|
|
||||||
int partSize = 256 + 128;// (2 * 256 - max_order) & ~31;
|
|
||||||
int partCount = (blocksize + partSize - 1) / partSize;
|
|
||||||
if (partCount > maxAutocorParts)
|
|
||||||
throw new Exception("internal error");
|
|
||||||
|
|
||||||
cuda.SetParameter(task.cudaStereoDecorr, 0, (uint)task.cudaSamples.Pointer);
|
|
||||||
cuda.SetParameter(task.cudaStereoDecorr, sizeof(uint), (uint)MAX_BLOCKSIZE);
|
|
||||||
cuda.SetParameterSize(task.cudaStereoDecorr, sizeof(uint) * 2U);
|
|
||||||
cuda.SetFunctionBlockShape(task.cudaStereoDecorr, 256, 1, 1);
|
|
||||||
|
|
||||||
cuda.SetParameter(task.cudaFindWastedBits, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
|
|
||||||
cuda.SetParameter(task.cudaFindWastedBits, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
|
|
||||||
cuda.SetParameter(task.cudaFindWastedBits, 2 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
|
|
||||||
cuda.SetParameter(task.cudaFindWastedBits, 3 * sizeof(uint), (uint)blocksize);
|
|
||||||
cuda.SetParameterSize(task.cudaFindWastedBits, sizeof(uint) * 4U);
|
|
||||||
cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1);
|
|
||||||
|
|
||||||
cuda.SetParameter(task.cudaComputeAutocor, 0, (uint)task.cudaAutocorOutput.Pointer);
|
|
||||||
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint), (uint)task.cudaSamples.Pointer);
|
|
||||||
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer);
|
|
||||||
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 3, (uint)task.cudaAutocorTasks.Pointer);
|
|
||||||
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order);
|
|
||||||
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)blocksize);
|
|
||||||
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)partSize);
|
|
||||||
cuda.SetParameterSize(task.cudaComputeAutocor, sizeof(uint) * 7U);
|
|
||||||
cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1);
|
|
||||||
|
|
||||||
cuda.SetParameter(task.cudaComputeLPC, 0, (uint)task.cudaResidualTasks.Pointer);
|
|
||||||
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint), (uint)task.cudaAutocorOutput.Pointer);
|
|
||||||
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 2, (uint)task.cudaAutocorTasks.Pointer);
|
|
||||||
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3, (uint)max_order);
|
|
||||||
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)partCount);
|
|
||||||
cuda.SetParameterSize(task.cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2);
|
|
||||||
cuda.SetFunctionBlockShape(task.cudaComputeLPC, (partCount + 31) & ~31, 1, 1);
|
|
||||||
|
|
||||||
// issue work to the GPU
|
|
||||||
if (channels == 2 && channelsCount == 4)
|
|
||||||
cuda.LaunchAsync(task.cudaStereoDecorr, MAX_BLOCKSIZE / 256, 1, task.stream);
|
|
||||||
cuda.LaunchAsync(task.cudaFindWastedBits, (task.nResidualTasks / task.nResidualTasksPerChannel * nFrames) / maxFrames, 1, task.stream);
|
|
||||||
cuda.LaunchAsync(task.cudaComputeAutocor, partCount, (task.nAutocorTasks * nFrames) / maxFrames, task.stream);
|
|
||||||
cuda.LaunchAsync(task.cudaComputeLPC, 1, (task.nAutocorTasks * nFrames) / maxFrames, task.stream);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task)
|
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task)
|
||||||
{
|
{
|
||||||
fixed (int* r = residualBuffer)
|
fixed (int* r = residualBuffer)
|
||||||
@@ -1186,7 +1181,8 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
bool doMidside = channels == 2 && eparams.do_midside;
|
bool doMidside = channels == 2 && eparams.do_midside;
|
||||||
int channelCount = doMidside ? 2 * channels : channels;
|
int channelCount = doMidside ? 2 * channels : channels;
|
||||||
|
|
||||||
cuda.CopyHostToDeviceAsync(task.cudaSamples, task.samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channels), task.stream);
|
//cuda.CopyHostToDeviceAsync(task.cudaSamples, task.samplesBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channels), task.stream);
|
||||||
|
cuda.CopyHostToDeviceAsync(task.cudaSamplesBytes, task.samplesBytesPtr, (uint)(sizeof(short) * channels * eparams.block_size * nFrames), task.stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe void run_GPU_task(int nFrames, FlaCudaTask task)
|
unsafe void run_GPU_task(int nFrames, FlaCudaTask task)
|
||||||
@@ -1261,7 +1257,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
{
|
{
|
||||||
int decoded = verify.DecodeFrame(frame_buffer, 0, fs);
|
int decoded = verify.DecodeFrame(frame_buffer, 0, fs);
|
||||||
if (decoded != fs || verify.Remaining != (ulong)eparams.block_size)
|
if (decoded != fs || verify.Remaining != (ulong)eparams.block_size)
|
||||||
throw new Exception("validation failed!");
|
throw new Exception("validation failed! frame size mismatch");
|
||||||
fixed (int* s = task.verifyBuffer, r = verify.Samples)
|
fixed (int* s = task.verifyBuffer, r = verify.Samples)
|
||||||
{
|
{
|
||||||
for (int ch = 0; ch < channels; ch++)
|
for (int ch = 0; ch < channels; ch++)
|
||||||
@@ -1341,6 +1337,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
int* s2 = (int*)task2.samplesBufferPtr;
|
int* s2 = (int*)task2.samplesBufferPtr;
|
||||||
for (int ch = 0; ch < channelCount; ch++)
|
for (int ch = 0; ch < channelCount; ch++)
|
||||||
AudioSamples.MemCpy(s2 + ch * FlaCudaWriter.MAX_BLOCKSIZE, s1 + bs + ch * FlaCudaWriter.MAX_BLOCKSIZE, samplesInBuffer - bs);
|
AudioSamples.MemCpy(s2 + ch * FlaCudaWriter.MAX_BLOCKSIZE, s1 + bs + ch * FlaCudaWriter.MAX_BLOCKSIZE, samplesInBuffer - bs);
|
||||||
|
AudioSamples.MemCpy(((short*)task2.samplesBytesPtr), ((short*)task1.samplesBytesPtr) + bs * channels, (samplesInBuffer - bs) * channels);
|
||||||
}
|
}
|
||||||
samplesInBuffer -= bs;
|
samplesInBuffer -= bs;
|
||||||
have_data = nFrames;
|
have_data = nFrames;
|
||||||
@@ -1790,6 +1787,8 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
{
|
{
|
||||||
CUDA cuda;
|
CUDA cuda;
|
||||||
public CUfunction cudaStereoDecorr;
|
public CUfunction cudaStereoDecorr;
|
||||||
|
public CUfunction cudaChannelDecorr;
|
||||||
|
public CUfunction cudaChannelDecorr2;
|
||||||
public CUfunction cudaFindWastedBits;
|
public CUfunction cudaFindWastedBits;
|
||||||
public CUfunction cudaComputeAutocor;
|
public CUfunction cudaComputeAutocor;
|
||||||
public CUfunction cudaComputeLPC;
|
public CUfunction cudaComputeLPC;
|
||||||
@@ -1799,6 +1798,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
public CUfunction cudaCopyBestMethodStereo;
|
public CUfunction cudaCopyBestMethodStereo;
|
||||||
public CUfunction cudaEncodeResidual;
|
public CUfunction cudaEncodeResidual;
|
||||||
public CUdeviceptr cudaSamples;
|
public CUdeviceptr cudaSamples;
|
||||||
|
public CUdeviceptr cudaSamplesBytes;
|
||||||
public CUdeviceptr cudaResidual;
|
public CUdeviceptr cudaResidual;
|
||||||
public CUdeviceptr cudaAutocorTasks;
|
public CUdeviceptr cudaAutocorTasks;
|
||||||
public CUdeviceptr cudaAutocorOutput;
|
public CUdeviceptr cudaAutocorOutput;
|
||||||
@@ -1806,6 +1806,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
public CUdeviceptr cudaResidualOutput;
|
public CUdeviceptr cudaResidualOutput;
|
||||||
public CUdeviceptr cudaBestResidualTasks;
|
public CUdeviceptr cudaBestResidualTasks;
|
||||||
public IntPtr samplesBufferPtr = IntPtr.Zero;
|
public IntPtr samplesBufferPtr = IntPtr.Zero;
|
||||||
|
public IntPtr samplesBytesPtr = IntPtr.Zero;
|
||||||
public IntPtr residualBufferPtr = IntPtr.Zero;
|
public IntPtr residualBufferPtr = IntPtr.Zero;
|
||||||
public IntPtr autocorTasksPtr = IntPtr.Zero;
|
public IntPtr autocorTasksPtr = IntPtr.Zero;
|
||||||
public IntPtr residualTasksPtr = IntPtr.Zero;
|
public IntPtr residualTasksPtr = IntPtr.Zero;
|
||||||
@@ -1831,6 +1832,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames;
|
bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames;
|
||||||
samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
|
samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
|
||||||
|
|
||||||
|
cudaSamplesBytes = cuda.Allocate((uint)samplesBufferLen / 2);
|
||||||
cudaSamples = cuda.Allocate((uint)samplesBufferLen);
|
cudaSamples = cuda.Allocate((uint)samplesBufferLen);
|
||||||
cudaResidual = cuda.Allocate((uint)samplesBufferLen);
|
cudaResidual = cuda.Allocate((uint)samplesBufferLen);
|
||||||
cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen);
|
cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen);
|
||||||
@@ -1839,6 +1841,8 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
|
cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
|
||||||
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * FlaCudaWriter.maxResidualParts * FlaCudaWriter.maxFrames));
|
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * FlaCudaWriter.maxResidualParts * FlaCudaWriter.maxFrames));
|
||||||
CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen);
|
CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen);
|
||||||
|
if (cuErr == CUResult.Success)
|
||||||
|
cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, (uint)samplesBufferLen / 2);
|
||||||
if (cuErr == CUResult.Success)
|
if (cuErr == CUResult.Success)
|
||||||
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
|
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
|
||||||
if (cuErr == CUResult.Success)
|
if (cuErr == CUResult.Success)
|
||||||
@@ -1849,6 +1853,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
cuErr = CUDADriver.cuMemAllocHost(ref bestResidualTasksPtr, (uint)bestResidualTasksLen);
|
cuErr = CUDADriver.cuMemAllocHost(ref bestResidualTasksPtr, (uint)bestResidualTasksLen);
|
||||||
if (cuErr != CUResult.Success)
|
if (cuErr != CUResult.Success)
|
||||||
{
|
{
|
||||||
|
if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero;
|
||||||
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
|
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
|
||||||
if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero;
|
if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero;
|
||||||
if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
|
if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
|
||||||
@@ -1859,6 +1864,8 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
|
|
||||||
cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
|
cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
|
||||||
cudaStereoDecorr = cuda.GetModuleFunction("cudaStereoDecorr");
|
cudaStereoDecorr = cuda.GetModuleFunction("cudaStereoDecorr");
|
||||||
|
cudaChannelDecorr = cuda.GetModuleFunction("cudaChannelDecorr");
|
||||||
|
cudaChannelDecorr2 = cuda.GetModuleFunction("cudaChannelDecorr2");
|
||||||
cudaFindWastedBits = cuda.GetModuleFunction("cudaFindWastedBits");
|
cudaFindWastedBits = cuda.GetModuleFunction("cudaFindWastedBits");
|
||||||
cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
|
cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
|
||||||
cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
|
cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
|
||||||
@@ -1875,12 +1882,14 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
public void Dispose()
|
public void Dispose()
|
||||||
{
|
{
|
||||||
cuda.Free(cudaSamples);
|
cuda.Free(cudaSamples);
|
||||||
|
cuda.Free(cudaSamplesBytes);
|
||||||
cuda.Free(cudaResidual);
|
cuda.Free(cudaResidual);
|
||||||
cuda.Free(cudaAutocorTasks);
|
cuda.Free(cudaAutocorTasks);
|
||||||
cuda.Free(cudaAutocorOutput);
|
cuda.Free(cudaAutocorOutput);
|
||||||
cuda.Free(cudaResidualTasks);
|
cuda.Free(cudaResidualTasks);
|
||||||
cuda.Free(cudaResidualOutput);
|
cuda.Free(cudaResidualOutput);
|
||||||
cuda.Free(cudaBestResidualTasks);
|
cuda.Free(cudaBestResidualTasks);
|
||||||
|
CUDADriver.cuMemFreeHost(samplesBytesPtr);
|
||||||
CUDADriver.cuMemFreeHost(samplesBufferPtr);
|
CUDADriver.cuMemFreeHost(samplesBufferPtr);
|
||||||
CUDADriver.cuMemFreeHost(residualBufferPtr);
|
CUDADriver.cuMemFreeHost(residualBufferPtr);
|
||||||
CUDADriver.cuMemFreeHost(residualTasksPtr);
|
CUDADriver.cuMemFreeHost(residualTasksPtr);
|
||||||
|
|||||||
@@ -57,19 +57,47 @@ typedef struct
|
|||||||
|
|
||||||
extern "C" __global__ void cudaStereoDecorr(
|
extern "C" __global__ void cudaStereoDecorr(
|
||||||
int *samples,
|
int *samples,
|
||||||
|
short2 *src,
|
||||||
int offset
|
int offset
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const int pos = blockIdx.x * blockDim.x + threadIdx.x;
|
const int pos = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
if (pos < offset)
|
if (pos < offset)
|
||||||
{
|
{
|
||||||
int l = samples[pos];
|
short2 s = src[pos];
|
||||||
int r = samples[offset + pos];
|
samples[pos] = s.x;
|
||||||
samples[2 * offset + pos] = (l + r) >> 1;
|
samples[1 * offset + pos] = s.y;
|
||||||
samples[3 * offset + pos] = l - r;
|
samples[2 * offset + pos] = (s.x + s.y) >> 1;
|
||||||
|
samples[3 * offset + pos] = s.x - s.y;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern "C" __global__ void cudaChannelDecorr2(
|
||||||
|
int *samples,
|
||||||
|
short2 *src,
|
||||||
|
int offset
|
||||||
|
)
|
||||||
|
{
|
||||||
|
const int pos = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (pos < offset)
|
||||||
|
{
|
||||||
|
short2 s = src[pos];
|
||||||
|
samples[pos] = s.x;
|
||||||
|
samples[1 * offset + pos] = s.y;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" __global__ void cudaChannelDecorr(
|
||||||
|
int *samples,
|
||||||
|
short *src,
|
||||||
|
int offset
|
||||||
|
)
|
||||||
|
{
|
||||||
|
const int pos = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (pos < offset)
|
||||||
|
samples[blockIdx.y * offset + pos] = src[pos * gridDim.y + blockIdx.y];
|
||||||
|
}
|
||||||
|
|
||||||
extern "C" __global__ void cudaFindWastedBits(
|
extern "C" __global__ void cudaFindWastedBits(
|
||||||
encodeResidualTaskStruct *tasks,
|
encodeResidualTaskStruct *tasks,
|
||||||
int *samples,
|
int *samples,
|
||||||
@@ -471,7 +499,7 @@ extern "C" __global__ void cudaEncodeResidual(
|
|||||||
encodeResidualTaskStruct task;
|
encodeResidualTaskStruct task;
|
||||||
} shared;
|
} shared;
|
||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
if (threadIdx.x < sizeof(encodeResidualTaskStruct))
|
if (threadIdx.x < sizeof(shared.task) / sizeof(int))
|
||||||
((int*)&shared.task)[threadIdx.x] = ((int*)(&tasks[blockIdx.y]))[threadIdx.x];
|
((int*)&shared.task)[threadIdx.x] = ((int*)(&tasks[blockIdx.y]))[threadIdx.x];
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
const int partSize = blockDim.x;
|
const int partSize = blockDim.x;
|
||||||
|
|||||||
@@ -427,7 +427,7 @@ code {
|
|||||||
offset = 0
|
offset = 0
|
||||||
bytes = 8
|
bytes = 8
|
||||||
mem {
|
mem {
|
||||||
0x000000bf 0x0000001f
|
0x0000002f 0x0000001f
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
bincode {
|
bincode {
|
||||||
@@ -725,22 +725,24 @@ code {
|
|||||||
code {
|
code {
|
||||||
name = cudaStereoDecorr
|
name = cudaStereoDecorr
|
||||||
lmem = 0
|
lmem = 0
|
||||||
smem = 24
|
smem = 28
|
||||||
reg = 6
|
reg = 7
|
||||||
bar = 0
|
bar = 0
|
||||||
bincode {
|
bincode {
|
||||||
0x10004205 0x0023c780 0xa0000005 0x04000780
|
0x10004205 0x0023c780 0xa0000005 0x04000780
|
||||||
0x60014c01 0x00204780 0x3000cbfd 0x6c20c7c8
|
0x60014c05 0x00204780 0x3001cdfd 0x6c20c7c8
|
||||||
0x30000003 0x00000280 0x2000ca05 0x04200780
|
0x30000003 0x00000280 0x30020209 0xc4100780
|
||||||
0x30020009 0xc4100780 0x3002020d 0xc4100780
|
0x2000ca01 0x04208780 0xd00e0001 0x80c00780
|
||||||
0x2000c805 0x04208780 0xd00e0205 0x80c00780
|
0x2000c80d 0x04208780 0xa0000009 0x0c010780
|
||||||
0x2000c809 0x0420c780 0xd00e0409 0x80c00780
|
0x2000cc11 0x04204780 0x3001cc15 0xc4300780
|
||||||
0x3001ca0d 0xc4300780 0x20038010 0x2103ea0c
|
0xd00e0609 0xa0c00780 0x3002080d 0xc4100780
|
||||||
0x30020815 0xc4100780 0x20028210 0x20038000
|
0x20000211 0x04014780 0xa0000201 0x0c010780
|
||||||
0x2000c815 0x04214780 0x3001080d 0xec100780
|
0x2105ec18 0x2103e80c 0x30020811 0xc4100780
|
||||||
0x30020011 0xc4100780 0x20400201 0x04008780
|
0x20008414 0x20068218 0xd00e0601 0xa0c00780
|
||||||
0xd00e0a0d 0xa0c00780 0x2000c805 0x04210780
|
0x2000c805 0x04210780 0x30010a0d 0xec100780
|
||||||
0xd00e0201 0xa0c00781
|
0x30020c11 0xc4100780 0x20400409 0x04000780
|
||||||
|
0xd00e020d 0xa0c00780 0x2000c801 0x04210780
|
||||||
|
0xd00e0009 0xa0c00781
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
code {
|
code {
|
||||||
@@ -823,3 +825,39 @@ code {
|
|||||||
0xd00e0201 0xa0c00781
|
0xd00e0201 0xa0c00781
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
code {
|
||||||
|
name = cudaChannelDecorr
|
||||||
|
lmem = 0
|
||||||
|
smem = 28
|
||||||
|
reg = 5
|
||||||
|
bar = 0
|
||||||
|
bincode {
|
||||||
|
0x10004205 0x0023c780 0xa0000005 0x04000780
|
||||||
|
0x60014c05 0x00204780 0x3001cdfd 0x6c20c7c8
|
||||||
|
0x30000003 0x00000280 0xa0004e09 0x04200780
|
||||||
|
0x1000cc01 0x0423c780 0x41032a0c 0x40040210
|
||||||
|
0x3010060d 0xc4100780 0x30100811 0xc4100780
|
||||||
|
0x60024a0d 0x0020c780 0x60040011 0x00010780
|
||||||
|
0x20038400 0x20018804 0x30010001 0xc4100780
|
||||||
|
0x30020205 0xc4100780 0x2000ca01 0x04200780
|
||||||
|
0xd00e0001 0x80600780 0x2000c805 0x04204780
|
||||||
|
0xd00e0201 0xa0c00781
|
||||||
|
}
|
||||||
|
}
|
||||||
|
code {
|
||||||
|
name = cudaChannelDecorr2
|
||||||
|
lmem = 0
|
||||||
|
smem = 28
|
||||||
|
reg = 4
|
||||||
|
bar = 0
|
||||||
|
bincode {
|
||||||
|
0x10004205 0x0023c780 0xa0000005 0x04000780
|
||||||
|
0x60014c05 0x00204780 0x3001cdfd 0x6c20c7c8
|
||||||
|
0x30000003 0x00000280 0x30020209 0xc4100780
|
||||||
|
0x2000ca01 0x04208780 0xd00e0001 0x80c00780
|
||||||
|
0x2102e808 0x2101ec0c 0xa0000005 0x0c010780
|
||||||
|
0x3002060d 0xc4100780 0xd00e0405 0xa0c00780
|
||||||
|
0x2000c805 0x0420c780 0xa0000201 0x0c010780
|
||||||
|
0xd00e0201 0xa0c00781
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user