mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
optimizations
This commit is contained in:
@@ -100,9 +100,9 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
|
|
||||||
bool encode_on_cpu = true;
|
bool encode_on_cpu = true;
|
||||||
|
|
||||||
public const int MAX_BLOCKSIZE = 4608 * 8;
|
public const int MAX_BLOCKSIZE = 4608 * 16;
|
||||||
internal const int maxFrames = 32;
|
internal const int maxFrames = 32;
|
||||||
internal const int maxResidualParts = (MAX_BLOCKSIZE + 255) / 256;
|
internal const int maxResidualParts = 64; // not (MAX_BLOCKSIZE + 255) / 256!! 64 is hardcoded in cudaEstimateResidual. It's per block.
|
||||||
internal const int maxAutocorParts = (MAX_BLOCKSIZE + 255) / 256;
|
internal const int maxAutocorParts = (MAX_BLOCKSIZE + 255) / 256;
|
||||||
|
|
||||||
public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
|
public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
|
||||||
@@ -122,7 +122,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
_IO = IO;
|
_IO = IO;
|
||||||
|
|
||||||
residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)];
|
residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)];
|
||||||
windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS];
|
windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * lpc.MAX_LPC_WINDOWS];
|
||||||
md5_buffer = new byte[FlaCudaWriter.MAX_BLOCKSIZE * channels * bits_per_sample / 8];
|
md5_buffer = new byte[FlaCudaWriter.MAX_BLOCKSIZE * channels * bits_per_sample / 8];
|
||||||
|
|
||||||
eparams.flake_set_defaults(_compressionLevel);
|
eparams.flake_set_defaults(_compressionLevel);
|
||||||
@@ -809,16 +809,18 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
{
|
{
|
||||||
if ((eparams.window_function & flag) == 0 || _windowcount == lpc.MAX_LPC_WINDOWS)
|
if ((eparams.window_function & flag) == 0 || _windowcount == lpc.MAX_LPC_WINDOWS)
|
||||||
return;
|
return;
|
||||||
int sz = _windowsize;
|
|
||||||
float* pos = window + _windowcount * FlaCudaWriter.MAX_BLOCKSIZE * 2;
|
func(window + _windowcount * FlaCudaWriter.MAX_BLOCKSIZE, _windowsize);
|
||||||
do
|
//int sz = _windowsize;
|
||||||
{
|
//float* pos = window + _windowcount * FlaCudaWriter.MAX_BLOCKSIZE * 2;
|
||||||
func(pos, sz);
|
//do
|
||||||
if ((sz & 1) != 0)
|
//{
|
||||||
break;
|
// func(pos, sz);
|
||||||
pos += sz;
|
// if ((sz & 1) != 0)
|
||||||
sz >>= 1;
|
// break;
|
||||||
} while (sz >= 32);
|
// pos += sz;
|
||||||
|
// sz >>= 1;
|
||||||
|
//} while (sz >= 32);
|
||||||
_windowcount++;
|
_windowcount++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -827,7 +829,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)task.autocorTasksPtr;
|
computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)task.autocorTasksPtr;
|
||||||
task.nAutocorTasks = 0;
|
task.nAutocorTasks = 0;
|
||||||
task.nResidualTasks = 0;
|
task.nResidualTasks = 0;
|
||||||
task.nResidualTasksPerChannel = (_windowcount * eparams.max_prediction_order + 2 + eparams.max_fixed_order - eparams.min_fixed_order + 7) & ~7;
|
task.nResidualTasksPerChannel = (_windowcount * eparams.max_prediction_order + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order + 7) & ~7;
|
||||||
task.nAutocorTasksPerChannel = _windowcount;
|
task.nAutocorTasksPerChannel = _windowcount;
|
||||||
for (int iFrame = 0; iFrame < nFrames; iFrame++)
|
for (int iFrame = 0; iFrame < nFrames; iFrame++)
|
||||||
{
|
{
|
||||||
@@ -837,7 +839,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
{
|
{
|
||||||
// Autocorelation task
|
// Autocorelation task
|
||||||
autocorTasks[task.nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
|
autocorTasks[task.nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
|
||||||
autocorTasks[task.nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE;
|
autocorTasks[task.nAutocorTasks].windowOffs = iWindow * FlaCudaWriter.MAX_BLOCKSIZE;
|
||||||
autocorTasks[task.nAutocorTasks].residualOffs = eparams.max_prediction_order * iWindow + task.nResidualTasksPerChannel * (ch + iFrame * channelsCount);
|
autocorTasks[task.nAutocorTasks].residualOffs = eparams.max_prediction_order * iWindow + task.nResidualTasksPerChannel * (ch + iFrame * channelsCount);
|
||||||
autocorTasks[task.nAutocorTasks].blocksize = blocksize;
|
autocorTasks[task.nAutocorTasks].blocksize = blocksize;
|
||||||
task.nAutocorTasks++;
|
task.nAutocorTasks++;
|
||||||
@@ -855,6 +857,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Constant frames
|
// Constant frames
|
||||||
|
if (eparams.do_constant)
|
||||||
{
|
{
|
||||||
task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Constant;
|
task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Constant;
|
||||||
task.ResidualTasks[task.nResidualTasks].channel = ch;
|
task.ResidualTasks[task.nResidualTasks].channel = ch;
|
||||||
@@ -1686,6 +1689,8 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
|
|
||||||
public bool do_wasted;
|
public bool do_wasted;
|
||||||
|
|
||||||
|
public bool do_constant;
|
||||||
|
|
||||||
public WindowFunction window_function;
|
public WindowFunction window_function;
|
||||||
|
|
||||||
public bool do_md5;
|
public bool do_md5;
|
||||||
@@ -1719,55 +1724,55 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
do_verify = false;
|
do_verify = false;
|
||||||
do_seektable = true;
|
do_seektable = true;
|
||||||
do_wasted = true;
|
do_wasted = true;
|
||||||
|
do_constant = true;
|
||||||
|
|
||||||
// differences from level 7
|
// differences from level 7
|
||||||
switch (lvl)
|
switch (lvl)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
|
do_constant = false;
|
||||||
do_wasted = false;
|
do_wasted = false;
|
||||||
do_midside = false;
|
do_midside = false;
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
max_prediction_order = 5;
|
max_prediction_order = 4;
|
||||||
min_fixed_order = 2;
|
min_fixed_order = 3;
|
||||||
max_fixed_order = 2;
|
max_fixed_order = 2;
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
do_wasted = false;
|
do_wasted = false;
|
||||||
do_midside = false;
|
do_midside = false;
|
||||||
window_function = WindowFunction.Bartlett;
|
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
max_prediction_order = 7;
|
max_prediction_order = 5;
|
||||||
min_fixed_order = 3;
|
|
||||||
max_fixed_order = 2;
|
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
window_function = WindowFunction.Bartlett;
|
window_function = WindowFunction.Bartlett;
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
max_prediction_order = 5;
|
min_fixed_order = 2;
|
||||||
|
max_fixed_order = 2;
|
||||||
|
max_prediction_order = 6;
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
window_function = WindowFunction.Bartlett;
|
window_function = WindowFunction.Bartlett;
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
|
min_fixed_order = 2;
|
||||||
|
max_fixed_order = 1;
|
||||||
max_prediction_order = 7;
|
max_prediction_order = 7;
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
window_function = WindowFunction.Bartlett;
|
|
||||||
max_partition_order = 4;
|
max_partition_order = 4;
|
||||||
max_prediction_order = 8;
|
max_prediction_order = 8;
|
||||||
break;
|
break;
|
||||||
case 5:
|
case 5:
|
||||||
max_prediction_order = 8;
|
max_prediction_order = 9;
|
||||||
break;
|
break;
|
||||||
case 6:
|
case 6:
|
||||||
window_function = WindowFunction.Bartlett;
|
|
||||||
min_fixed_order = 2;
|
|
||||||
max_fixed_order = 2;
|
max_fixed_order = 2;
|
||||||
max_prediction_order = 12;
|
max_prediction_order = 10;
|
||||||
break;
|
break;
|
||||||
case 7:
|
case 7:
|
||||||
//min_fixed_order = 2;
|
min_fixed_order = 2;
|
||||||
//max_fixed_order = 2;
|
max_fixed_order = 2;
|
||||||
max_prediction_order = 10;
|
max_prediction_order = 11;
|
||||||
break;
|
break;
|
||||||
case 8:
|
case 8:
|
||||||
break;
|
break;
|
||||||
@@ -1860,7 +1865,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
cuda = _cuda;
|
cuda = _cuda;
|
||||||
|
|
||||||
autocorTasksLen = sizeof(computeAutocorTaskStruct) * channelCount * lpc.MAX_LPC_WINDOWS * FlaCudaWriter.maxFrames;
|
autocorTasksLen = sizeof(computeAutocorTaskStruct) * channelCount * lpc.MAX_LPC_WINDOWS * FlaCudaWriter.maxFrames;
|
||||||
residualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 6) * FlaCudaWriter.maxFrames;
|
residualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
|
||||||
bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames;
|
bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames;
|
||||||
samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
|
samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
|
||||||
|
|
||||||
@@ -1871,8 +1876,7 @@ namespace CUETools.Codecs.FlaCuda
|
|||||||
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
|
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
|
||||||
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
|
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
|
||||||
cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
|
cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
|
||||||
//cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 6) * (FlaCudaWriter.maxResidualParts + FlaCudaWriter.maxFrames)));
|
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FlaCudaWriter.maxResidualParts*/ * FlaCudaWriter.maxFrames));
|
||||||
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * FlaCudaWriter.maxResidualParts * FlaCudaWriter.maxFrames));
|
|
||||||
CUResult cuErr = CUResult.Success;
|
CUResult cuErr = CUResult.Success;
|
||||||
if (cuErr == CUResult.Success)
|
if (cuErr == CUResult.Success)
|
||||||
cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, (uint)samplesBufferLen/2);
|
cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, (uint)samplesBufferLen/2);
|
||||||
|
|||||||
Reference in New Issue
Block a user