optimizations

This commit is contained in:
chudov
2009-09-25 22:00:23 +00:00
parent 9866f4faa3
commit 59115cc03c

View File

@@ -100,9 +100,9 @@ namespace CUETools.Codecs.FlaCuda
bool encode_on_cpu = true; bool encode_on_cpu = true;
public const int MAX_BLOCKSIZE = 4608 * 8; public const int MAX_BLOCKSIZE = 4608 * 16;
internal const int maxFrames = 32; internal const int maxFrames = 32;
internal const int maxResidualParts = (MAX_BLOCKSIZE + 255) / 256; internal const int maxResidualParts = 64; // not (MAX_BLOCKSIZE + 255) / 256!! 64 is hardcoded in cudaEstimateResidual. It's per block.
internal const int maxAutocorParts = (MAX_BLOCKSIZE + 255) / 256; internal const int maxAutocorParts = (MAX_BLOCKSIZE + 255) / 256;
public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO) public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
@@ -122,7 +122,7 @@ namespace CUETools.Codecs.FlaCuda
_IO = IO; _IO = IO;
residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)]; residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)];
windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS]; windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * lpc.MAX_LPC_WINDOWS];
md5_buffer = new byte[FlaCudaWriter.MAX_BLOCKSIZE * channels * bits_per_sample / 8]; md5_buffer = new byte[FlaCudaWriter.MAX_BLOCKSIZE * channels * bits_per_sample / 8];
eparams.flake_set_defaults(_compressionLevel); eparams.flake_set_defaults(_compressionLevel);
@@ -809,16 +809,18 @@ namespace CUETools.Codecs.FlaCuda
{ {
if ((eparams.window_function & flag) == 0 || _windowcount == lpc.MAX_LPC_WINDOWS) if ((eparams.window_function & flag) == 0 || _windowcount == lpc.MAX_LPC_WINDOWS)
return; return;
int sz = _windowsize;
float* pos = window + _windowcount * FlaCudaWriter.MAX_BLOCKSIZE * 2; func(window + _windowcount * FlaCudaWriter.MAX_BLOCKSIZE, _windowsize);
do //int sz = _windowsize;
{ //float* pos = window + _windowcount * FlaCudaWriter.MAX_BLOCKSIZE * 2;
func(pos, sz); //do
if ((sz & 1) != 0) //{
break; // func(pos, sz);
pos += sz; // if ((sz & 1) != 0)
sz >>= 1; // break;
} while (sz >= 32); // pos += sz;
// sz >>= 1;
//} while (sz >= 32);
_windowcount++; _windowcount++;
} }
@@ -827,7 +829,7 @@ namespace CUETools.Codecs.FlaCuda
computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)task.autocorTasksPtr; computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)task.autocorTasksPtr;
task.nAutocorTasks = 0; task.nAutocorTasks = 0;
task.nResidualTasks = 0; task.nResidualTasks = 0;
task.nResidualTasksPerChannel = (_windowcount * eparams.max_prediction_order + 2 + eparams.max_fixed_order - eparams.min_fixed_order + 7) & ~7; task.nResidualTasksPerChannel = (_windowcount * eparams.max_prediction_order + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order + 7) & ~7;
task.nAutocorTasksPerChannel = _windowcount; task.nAutocorTasksPerChannel = _windowcount;
for (int iFrame = 0; iFrame < nFrames; iFrame++) for (int iFrame = 0; iFrame < nFrames; iFrame++)
{ {
@@ -837,7 +839,7 @@ namespace CUETools.Codecs.FlaCuda
{ {
// Autocorelation task // Autocorelation task
autocorTasks[task.nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize; autocorTasks[task.nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
autocorTasks[task.nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE; autocorTasks[task.nAutocorTasks].windowOffs = iWindow * FlaCudaWriter.MAX_BLOCKSIZE;
autocorTasks[task.nAutocorTasks].residualOffs = eparams.max_prediction_order * iWindow + task.nResidualTasksPerChannel * (ch + iFrame * channelsCount); autocorTasks[task.nAutocorTasks].residualOffs = eparams.max_prediction_order * iWindow + task.nResidualTasksPerChannel * (ch + iFrame * channelsCount);
autocorTasks[task.nAutocorTasks].blocksize = blocksize; autocorTasks[task.nAutocorTasks].blocksize = blocksize;
task.nAutocorTasks++; task.nAutocorTasks++;
@@ -855,6 +857,7 @@ namespace CUETools.Codecs.FlaCuda
} }
} }
// Constant frames // Constant frames
if (eparams.do_constant)
{ {
task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Constant; task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Constant;
task.ResidualTasks[task.nResidualTasks].channel = ch; task.ResidualTasks[task.nResidualTasks].channel = ch;
@@ -1686,6 +1689,8 @@ namespace CUETools.Codecs.FlaCuda
public bool do_wasted; public bool do_wasted;
public bool do_constant;
public WindowFunction window_function; public WindowFunction window_function;
public bool do_md5; public bool do_md5;
@@ -1719,55 +1724,55 @@ namespace CUETools.Codecs.FlaCuda
do_verify = false; do_verify = false;
do_seektable = true; do_seektable = true;
do_wasted = true; do_wasted = true;
do_constant = true;
// differences from level 7 // differences from level 7
switch (lvl) switch (lvl)
{ {
case 0: case 0:
do_constant = false;
do_wasted = false; do_wasted = false;
do_midside = false; do_midside = false;
max_partition_order = 4; max_partition_order = 4;
max_prediction_order = 5; max_prediction_order = 4;
min_fixed_order = 2; min_fixed_order = 3;
max_fixed_order = 2; max_fixed_order = 2;
break; break;
case 1: case 1:
do_wasted = false; do_wasted = false;
do_midside = false; do_midside = false;
window_function = WindowFunction.Bartlett;
max_partition_order = 4; max_partition_order = 4;
max_prediction_order = 7; max_prediction_order = 5;
min_fixed_order = 3;
max_fixed_order = 2;
break; break;
case 2: case 2:
window_function = WindowFunction.Bartlett; window_function = WindowFunction.Bartlett;
max_partition_order = 4; max_partition_order = 4;
max_prediction_order = 5; min_fixed_order = 2;
max_fixed_order = 2;
max_prediction_order = 6;
break; break;
case 3: case 3:
window_function = WindowFunction.Bartlett; window_function = WindowFunction.Bartlett;
max_partition_order = 4; max_partition_order = 4;
min_fixed_order = 2;
max_fixed_order = 1;
max_prediction_order = 7; max_prediction_order = 7;
break; break;
case 4: case 4:
window_function = WindowFunction.Bartlett;
max_partition_order = 4; max_partition_order = 4;
max_prediction_order = 8; max_prediction_order = 8;
break; break;
case 5: case 5:
max_prediction_order = 8; max_prediction_order = 9;
break; break;
case 6: case 6:
window_function = WindowFunction.Bartlett;
min_fixed_order = 2;
max_fixed_order = 2; max_fixed_order = 2;
max_prediction_order = 12; max_prediction_order = 10;
break; break;
case 7: case 7:
//min_fixed_order = 2; min_fixed_order = 2;
//max_fixed_order = 2; max_fixed_order = 2;
max_prediction_order = 10; max_prediction_order = 11;
break; break;
case 8: case 8:
break; break;
@@ -1860,7 +1865,7 @@ namespace CUETools.Codecs.FlaCuda
cuda = _cuda; cuda = _cuda;
autocorTasksLen = sizeof(computeAutocorTaskStruct) * channelCount * lpc.MAX_LPC_WINDOWS * FlaCudaWriter.maxFrames; autocorTasksLen = sizeof(computeAutocorTaskStruct) * channelCount * lpc.MAX_LPC_WINDOWS * FlaCudaWriter.maxFrames;
residualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 6) * FlaCudaWriter.maxFrames; residualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames; bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames;
samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount; samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
@@ -1871,8 +1876,7 @@ namespace CUETools.Codecs.FlaCuda
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames))); cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen); cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen); cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
//cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 6) * (FlaCudaWriter.maxResidualParts + FlaCudaWriter.maxFrames))); cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FlaCudaWriter.maxResidualParts*/ * FlaCudaWriter.maxFrames));
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * FlaCudaWriter.maxResidualParts * FlaCudaWriter.maxFrames));
CUResult cuErr = CUResult.Success; CUResult cuErr = CUResult.Success;
if (cuErr == CUResult.Success) if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, (uint)samplesBufferLen/2); cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, (uint)samplesBufferLen/2);