mirror of https://github.com/claunia/cuetools.net.git
optimizations
@@ -98,7 +98,9 @@ namespace CUETools.Codecs.FlaCuda

CUdeviceptr cudaWindow;

bool encode_on_cpu = true;
bool encode_on_cpu = false;

bool do_lattice = false;

public const int MAX_BLOCKSIZE = 4096 * 16;
internal const int maxFrames = 128;
@@ -167,6 +169,30 @@ namespace CUETools.Codecs.FlaCuda
}
}

public bool GPUOnly
{
get
{
return !encode_on_cpu;
}
set
{
encode_on_cpu = !value;
}
}

public bool UseLattice
{
get
{
return do_lattice;
}
set
{
do_lattice = value;
}
}

//[DllImport("kernel32.dll")]
//static extern bool GetThreadTimes(IntPtr hThread, out long lpCreationTime, out long lpExitTime, out long lpKernelTime, out long lpUserTime);
//[DllImport("kernel32.dll")]
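The two properties added above only invert or forward the private flags: GPUOnly is !encode_on_cpu and UseLattice is do_lattice. A minimal usage sketch, assuming an already constructed FlaCudaWriter; only the type and property names come from this diff, everything else is illustrative:

static class EncoderConfigExample
{
    // Illustrative only; how the flags are consumed is defined elsewhere in the encoder.
    public static void Configure(CUETools.Codecs.FlaCuda.FlaCudaWriter writer)
    {
        writer.GPUOnly = true;      // sets encode_on_cpu = false
        writer.UseLattice = false;  // leaves do_lattice false; the lattice LPC kernel is only launched when it is set
    }
}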
@@ -829,10 +855,8 @@ namespace CUETools.Codecs.FlaCuda
_windowcount++;
}

unsafe void initialize_autocorTasks(int blocksize, int channelsCount, int nFrames, FlaCudaTask task)
unsafe void initializeSubframeTasks(int blocksize, int channelsCount, int nFrames, FlaCudaTask task)
{
computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)task.autocorTasksPtr;
task.nAutocorTasks = 0;
task.nResidualTasks = 0;
task.nResidualTasksPerChannel = (_windowcount * eparams.max_prediction_order + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order + 7) & ~7;
task.nAutocorTasksPerChannel = _windowcount;
@@ -842,12 +866,6 @@ namespace CUETools.Codecs.FlaCuda
{
for (int iWindow = 0; iWindow < _windowcount; iWindow++)
{
// Autocorelation task
autocorTasks[task.nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
autocorTasks[task.nAutocorTasks].windowOffs = iWindow * FlaCudaWriter.MAX_BLOCKSIZE;
autocorTasks[task.nAutocorTasks].residualOffs = eparams.max_prediction_order * iWindow + task.nResidualTasksPerChannel * (ch + iFrame * channelsCount);
autocorTasks[task.nAutocorTasks].blocksize = blocksize;
task.nAutocorTasks++;
// LPC tasks
for (int order = 1; order <= eparams.max_prediction_order; order++)
{
@@ -859,6 +877,7 @@ namespace CUETools.Codecs.FlaCuda
task.ResidualTasks[task.nResidualTasks].residualOrder = order;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].windowOffs = iWindow * FlaCudaWriter.MAX_BLOCKSIZE;
task.nResidualTasks++;
}
}
@@ -930,12 +949,9 @@ namespace CUETools.Codecs.FlaCuda
}
}
}
if (sizeof(encodeResidualTaskStruct) * task.nResidualTasks > task.residualTasksLen)
if (sizeof(FlaCudaSubframeTask) * task.nResidualTasks > task.residualTasksLen)
throw new Exception("oops");
if (sizeof(computeAutocorTaskStruct) * task.nAutocorTasks > task.autocorTasksLen)
throw new Exception("oops");
cuda.CopyHostToDeviceAsync(task.cudaAutocorTasks, task.autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * task.nAutocorTasks), task.stream);
cuda.CopyHostToDeviceAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * task.nResidualTasks), task.stream);
cuda.CopyHostToDeviceAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(FlaCudaSubframeTask) * task.nResidualTasks), task.stream);
task.frameSize = blocksize;
}
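The renamed initializeSubframeTasks rounds the per-channel task count up to a multiple of 8, so the estimator kernels, which hold eight tasks per block in shared memory, always see full groups. A standalone sketch of that arithmetic; the breakdown in the comments is an interpretation and the example values are made up:

static class TaskCountExample
{
    public static int ResidualTasksPerChannel(int windowCount, int maxPredictionOrder,
                                              bool doConstant, int maxFixedOrder, int minFixedOrder)
    {
        int raw = windowCount * maxPredictionOrder   // one LPC task per window and order
                + 1                                  // plus one extra (verbatim) task
                + (doConstant ? 1 : 0)               // plus an optional constant task
                + maxFixedOrder - minFixedOrder;     // plus the range of fixed orders
        return (raw + 7) & ~7;                       // round up to a multiple of 8
    }
    // Example: ResidualTasksPerChannel(2, 12, true, 2, 0) == 32
}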
@@ -1078,7 +1094,7 @@ namespace CUETools.Codecs.FlaCuda
return;

//int autocorPartSize = (2 * 256 - eparams.max_prediction_order) & ~15;
int autocorPartSize = 256 + 128;
int autocorPartSize = 32 * 15;
int autocorPartCount = (task.frameSize + autocorPartSize - 1) / autocorPartSize;
if (autocorPartCount > maxAutocorParts)
throw new Exception("internal error");
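The new fixed partition size, 32 * 15 = 480 samples, matches the rewritten cudaComputeAutocor kernel below, where each of the 32 x-threads accumulates a 15-term dot product per pass; the part count is a plain ceiling division. An illustrative sketch of the same arithmetic:

static class AutocorPartitioningExample
{
    public const int AutocorPartSize = 32 * 15; // 480 samples handled per kernel block

    public static int PartCount(int frameSize) =>
        (frameSize + AutocorPartSize - 1) / AutocorPartSize; // ceiling division

    // PartCount(4096) == 9
}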
@@ -1115,6 +1131,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
|
||||
CUfunction cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : task.cudaChannelDecorr;
|
||||
CUfunction cudaCalcPartition = calcPartitionPartSize >= 128 ? task.cudaCalcLargePartition : calcPartitionPartSize == 16 && task.frameSize >= 256 ? task.cudaCalcPartition16 : task.cudaCalcPartition;
|
||||
CUfunction cudaEstimateResidual = eparams.max_prediction_order <= 8 ? task.cudaEstimateResidual8 : eparams.max_prediction_order <= 12 ? task.cudaEstimateResidual12 : task.cudaEstimateResidual;
|
||||
|
||||
cuda.SetParameter(cudaChannelDecorr, 0 * sizeof(uint), (uint)task.cudaSamples.Pointer);
|
||||
cuda.SetParameter(cudaChannelDecorr, 1 * sizeof(uint), (uint)task.cudaSamplesBytes.Pointer);
|
||||
@@ -1130,40 +1147,39 @@ namespace CUETools.Codecs.FlaCuda
|
||||
cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1);
|
||||
|
||||
cuda.SetParameter(task.cudaComputeAutocor, 0, (uint)task.cudaAutocorOutput.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint), (uint)task.cudaSamples.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 3, (uint)task.cudaAutocorTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4, (uint)eparams.max_prediction_order);
|
||||
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)task.frameSize);
|
||||
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)autocorPartSize);
|
||||
cuda.SetParameterSize(task.cudaComputeAutocor, sizeof(uint) * 7U);
|
||||
cuda.SetParameter(task.cudaComputeAutocor, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeAutocor, 2 * sizeof(uint), (uint)cudaWindow.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeAutocor, 3 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeAutocor, 4 * sizeof(uint), (uint)eparams.max_prediction_order);
|
||||
cuda.SetParameter(task.cudaComputeAutocor, 5 * sizeof(uint), (uint)task.nAutocorTasksPerChannel - 1);
|
||||
cuda.SetParameter(task.cudaComputeAutocor, 6 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
|
||||
cuda.SetParameterSize(task.cudaComputeAutocor, 7U * sizeof(uint));
|
||||
cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1);
|
||||
|
||||
cuda.SetParameter(task.cudaComputeLPC, 0, (uint)task.cudaResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint), (uint)task.cudaAutocorOutput.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 2, (uint)task.cudaAutocorTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3, (uint)eparams.max_prediction_order);
|
||||
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)autocorPartCount);
|
||||
cuda.SetParameterSize(task.cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2);
|
||||
cuda.SetParameter(task.cudaComputeLPC, 1 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
|
||||
cuda.SetParameter(task.cudaComputeLPC, 2 * sizeof(uint), (uint)task.cudaAutocorOutput.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeLPC, 3 * sizeof(uint), (uint)eparams.max_prediction_order);
|
||||
cuda.SetParameter(task.cudaComputeLPC, 4 * sizeof(uint), (uint)autocorPartCount);
|
||||
cuda.SetParameterSize(task.cudaComputeLPC, 5U * sizeof(uint));
|
||||
cuda.SetFunctionBlockShape(task.cudaComputeLPC, (autocorPartCount + 31) & ~31, 1, 1);
|
||||
|
||||
cuda.SetParameter(task.cudaComputeLPCLattice, 0, (uint)task.cudaResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeLPCLattice, 1 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
|
||||
cuda.SetParameter(task.cudaComputeLPCLattice, 2 * sizeof(uint), (uint)task.cudaSamples.Pointer);
|
||||
cuda.SetParameter(task.cudaComputeLPCLattice, 3 * sizeof(uint), (uint)_windowcount);
|
||||
//cuda.SetParameter(task.cudaComputeLPCLattice, 3 * sizeof(uint), (uint)task.frameSize);
|
||||
cuda.SetParameter(task.cudaComputeLPCLattice, 4 * sizeof(uint), (uint)eparams.max_prediction_order);
|
||||
cuda.SetParameterSize(task.cudaComputeLPCLattice, 5U * sizeof(uint));
|
||||
cuda.SetFunctionBlockShape(task.cudaComputeLPCLattice, 256, 1, 1);
|
||||
|
||||
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 0, (uint)task.cudaResidualOutput.Pointer);
|
||||
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 1, (uint)task.cudaSamples.Pointer);
|
||||
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 2, (uint)task.cudaResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 3, (uint)eparams.max_prediction_order);
|
||||
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 4, (uint)task.frameSize);
|
||||
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 5, (uint)residualPartSize);
|
||||
cuda.SetParameterSize(task.cudaEstimateResidual, sizeof(uint) * 6);
|
||||
cuda.SetFunctionBlockShape(task.cudaEstimateResidual, 32, threads_y, 1);
|
||||
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 0, (uint)task.cudaResidualOutput.Pointer);
|
||||
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 1, (uint)task.cudaSamples.Pointer);
|
||||
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 2, (uint)task.cudaResidualTasks.Pointer);
|
||||
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 3, (uint)eparams.max_prediction_order);
|
||||
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 4, (uint)task.frameSize);
|
||||
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 5, (uint)residualPartSize);
|
||||
cuda.SetParameterSize(cudaEstimateResidual, sizeof(uint) * 6);
|
||||
cuda.SetFunctionBlockShape(cudaEstimateResidual, 32, threads_y, 1);
|
||||
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
|
||||
@@ -1220,16 +1236,16 @@ namespace CUETools.Codecs.FlaCuda
|
||||
|
||||
// issue work to the GPU
|
||||
cuda.LaunchAsync(cudaChannelDecorr, (task.frameCount * task.frameSize + 255) / 256, channels == 2 ? 1 : channels, task.stream);
|
||||
if (task.frameSize <= 512 && eparams.max_prediction_order <= 12)
|
||||
if (do_lattice && task.frameSize <= 512 && eparams.max_prediction_order <= 12)
|
||||
cuda.LaunchAsync(task.cudaComputeLPCLattice, 1, channelsCount * task.frameCount, task.stream);
|
||||
else
|
||||
{
|
||||
if (eparams.do_wasted)
|
||||
cuda.LaunchAsync(task.cudaFindWastedBits, channelsCount * task.frameCount, 1, task.stream);
|
||||
cuda.LaunchAsync(task.cudaComputeAutocor, autocorPartCount, task.nAutocorTasksPerChannel * channelsCount * task.frameCount, task.stream);
|
||||
cuda.LaunchAsync(task.cudaComputeLPC, 1, task.nAutocorTasksPerChannel * channelsCount * task.frameCount, task.stream);
|
||||
cuda.LaunchAsync(task.cudaComputeLPC, task.nAutocorTasksPerChannel, channelsCount * task.frameCount, task.stream);
|
||||
}
|
||||
cuda.LaunchAsync(task.cudaEstimateResidual, residualPartCount, task.nResidualTasksPerChannel * channelsCount * task.frameCount / threads_y, task.stream);
|
||||
cuda.LaunchAsync(cudaEstimateResidual, residualPartCount, task.nResidualTasksPerChannel * channelsCount * task.frameCount / threads_y, task.stream);
|
||||
cuda.LaunchAsync(task.cudaChooseBestMethod, 1, channelsCount * task.frameCount, task.stream);
|
||||
if (channels == 2 && channelsCount == 4)
|
||||
cuda.LaunchAsync(task.cudaCopyBestMethodStereo, 1, task.frameCount, task.stream);
|
||||
@@ -1250,7 +1266,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
cuda.CopyDeviceToHostAsync(task.cudaBestRiceParams, task.bestRiceParamsPtr, (uint)(sizeof(int) * (1 << max_porder) * channels * task.frameCount), task.stream);
|
||||
task.max_porder = max_porder;
|
||||
}
|
||||
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * channels * task.frameCount), task.stream);
|
||||
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(FlaCudaSubframeTask) * channels * task.frameCount), task.stream);
|
||||
}
|
||||
|
||||
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task)
|
||||
@@ -1298,6 +1314,13 @@ namespace CUETools.Codecs.FlaCuda
|
||||
task.frameCount = nFrames;
|
||||
task.frameSize = blocksize;
|
||||
cuda.CopyHostToDeviceAsync(task.cudaSamplesBytes, task.samplesBytesPtr, (uint)(sizeof(short) * channels * blocksize * nFrames), task.stream);
|
||||
if (verify != null)
|
||||
{
|
||||
int* r = (int*)task.samplesBufferPtr;
|
||||
fixed (int* s = task.verifyBuffer)
|
||||
for (int ch = 0; ch < channels; ch++)
|
||||
AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, task.frameSize * task.frameCount);
|
||||
}
|
||||
}
|
||||
|
||||
unsafe void run_GPU_task(FlaCudaTask task)
|
||||
@@ -1320,15 +1343,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
cuda.CopyHostToDevice<float>(cudaWindow, windowBuffer);
|
||||
}
|
||||
if (task.nResidualTasks == 0)
|
||||
initialize_autocorTasks(task.frameSize, channelsCount, max_frames, task);
|
||||
|
||||
if (verify != null)
|
||||
{
|
||||
int* r = (int*)task.samplesBufferPtr;
|
||||
fixed (int* s = task.verifyBuffer)
|
||||
for (int ch = 0; ch < channels; ch++)
|
||||
AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, task.frameSize * task.frameCount);
|
||||
}
|
||||
initializeSubframeTasks(task.frameSize, channelsCount, max_frames, task);
|
||||
|
||||
estimate_residual(task, channelsCount);
|
||||
}
|
||||
@@ -1691,13 +1706,14 @@ namespace CUETools.Codecs.FlaCuda
|
||||
md5 = new MD5CryptoServiceProvider();
|
||||
|
||||
if (eparams.do_verify)
|
||||
{
|
||||
verify = new FlakeReader(channels, bits_per_sample);
|
||||
verify.DoCRC = false;
|
||||
}
|
||||
|
||||
frame_buffer = new byte[max_frame_size + 1];
|
||||
frame_writer = new BitWriter(frame_buffer, 0, max_frame_size + 1);
|
||||
|
||||
encode_on_cpu = eparams.lpc_max_precision_search <= 1;
|
||||
|
||||
return header_len;
|
||||
}
|
||||
}
|
||||
@@ -1891,16 +1907,7 @@ namespace CUETools.Codecs.FlaCuda
}
}

unsafe struct computeAutocorTaskStruct
{
public int samplesOffs;
public int windowOffs;
public int residualOffs;
public int blocksize;
public fixed int reserved[12];
};

unsafe struct encodeResidualTaskStruct
unsafe struct FlaCudaSubframeTask
{
public int residualOrder;
public int samplesOffs;
@@ -1916,7 +1923,8 @@ namespace CUETools.Codecs.FlaCuda
public int wbits;
public int abits;
public int porder;
public fixed int reserved[2];
public int windowOffs;
public fixed int reserved[1];
public fixed int coefs[32];
};
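The managed struct above is copied to the GPU as raw bytes, so it has to stay field-for-field identical to the FlaCudaSubframeTask typedef in the kernel header further down. A hedged sanity check; the 16-int size of the metadata part is an inference from the kernels' sizeof(FlaCudaSubframeData)/sizeof(int) copies, not something this diff states:

static class TaskLayoutCheckExample
{
    public static unsafe void CheckTaskLayout()
    {
        // Assumption: 16 ints of subframe metadata followed by 32 coefficients.
        const int expectedInts = 16 + 32;
        if (sizeof(FlaCudaSubframeTask) != expectedInts * sizeof(int))
            throw new System.Exception("host and device FlaCudaSubframeTask layouts disagree");
    }
}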
@@ -1931,6 +1939,8 @@ namespace CUETools.Codecs.FlaCuda
|
||||
public CUfunction cudaComputeLPC;
|
||||
public CUfunction cudaComputeLPCLattice;
|
||||
public CUfunction cudaEstimateResidual;
|
||||
public CUfunction cudaEstimateResidual8;
|
||||
public CUfunction cudaEstimateResidual12;
|
||||
public CUfunction cudaChooseBestMethod;
|
||||
public CUfunction cudaCopyBestMethod;
|
||||
public CUfunction cudaCopyBestMethodStereo;
|
||||
@@ -1947,7 +1957,6 @@ namespace CUETools.Codecs.FlaCuda
|
||||
public CUdeviceptr cudaPartitions;
|
||||
public CUdeviceptr cudaRiceParams;
|
||||
public CUdeviceptr cudaBestRiceParams;
|
||||
public CUdeviceptr cudaAutocorTasks;
|
||||
public CUdeviceptr cudaAutocorOutput;
|
||||
public CUdeviceptr cudaResidualTasks;
|
||||
public CUdeviceptr cudaResidualOutput;
|
||||
@@ -1956,7 +1965,6 @@ namespace CUETools.Codecs.FlaCuda
|
||||
public IntPtr samplesBufferPtr = IntPtr.Zero;
|
||||
public IntPtr residualBufferPtr = IntPtr.Zero;
|
||||
public IntPtr bestRiceParamsPtr = IntPtr.Zero;
|
||||
public IntPtr autocorTasksPtr = IntPtr.Zero;
|
||||
public IntPtr residualTasksPtr = IntPtr.Zero;
|
||||
public IntPtr bestResidualTasksPtr = IntPtr.Zero;
|
||||
public CUstream stream;
|
||||
@@ -1964,12 +1972,10 @@ namespace CUETools.Codecs.FlaCuda
|
||||
public int frameSize = 0;
|
||||
public int frameCount = 0;
|
||||
public FlacFrame frame;
|
||||
public int autocorTasksLen;
|
||||
public int residualTasksLen;
|
||||
public int bestResidualTasksLen;
|
||||
public int samplesBufferLen;
|
||||
public int nResidualTasks = 0;
|
||||
public int nAutocorTasks = 0;
|
||||
public int nResidualTasksPerChannel = 0;
|
||||
public int nAutocorTasksPerChannel = 0;
|
||||
public int max_porder = 0;
|
||||
@@ -1978,9 +1984,8 @@ namespace CUETools.Codecs.FlaCuda
|
||||
{
|
||||
cuda = _cuda;
|
||||
|
||||
autocorTasksLen = sizeof(computeAutocorTaskStruct) * channelCount * lpc.MAX_LPC_WINDOWS * FlaCudaWriter.maxFrames;
|
||||
residualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
|
||||
bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames;
|
||||
residualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
|
||||
bestResidualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * FlaCudaWriter.maxFrames;
|
||||
samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
|
||||
int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FlaCudaWriter.maxFrames;
|
||||
int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FlaCudaWriter.maxFrames;
|
||||
@@ -1991,7 +1996,6 @@ namespace CUETools.Codecs.FlaCuda
|
||||
cudaPartitions = cuda.Allocate((uint)partitionsLen);
|
||||
cudaRiceParams = cuda.Allocate((uint)riceParamsLen);
|
||||
cudaBestRiceParams = cuda.Allocate((uint)riceParamsLen / 4);
|
||||
cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen);
|
||||
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
|
||||
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
|
||||
cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
|
||||
@@ -2005,8 +2009,6 @@ namespace CUETools.Codecs.FlaCuda
|
||||
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
|
||||
if (cuErr == CUResult.Success)
|
||||
cuErr = CUDADriver.cuMemAllocHost(ref bestRiceParamsPtr, (uint)riceParamsLen / 4);
|
||||
if (cuErr == CUResult.Success)
|
||||
cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)autocorTasksLen);
|
||||
if (cuErr == CUResult.Success)
|
||||
cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)residualTasksLen);
|
||||
if (cuErr == CUResult.Success)
|
||||
@@ -2017,7 +2019,6 @@ namespace CUETools.Codecs.FlaCuda
|
||||
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
|
||||
if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero;
|
||||
if (bestRiceParamsPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestRiceParamsPtr); bestRiceParamsPtr = IntPtr.Zero;
|
||||
if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
|
||||
if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
|
||||
if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero;
|
||||
throw new CUDAException(cuErr);
|
||||
@@ -2031,6 +2032,8 @@ namespace CUETools.Codecs.FlaCuda
|
||||
cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
|
||||
cudaComputeLPCLattice = cuda.GetModuleFunction("cudaComputeLPCLattice");
|
||||
cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
|
||||
cudaEstimateResidual8 = cuda.GetModuleFunction("cudaEstimateResidual8");
|
||||
cudaEstimateResidual12 = cuda.GetModuleFunction("cudaEstimateResidual12");
|
||||
cudaChooseBestMethod = cuda.GetModuleFunction("cudaChooseBestMethod");
|
||||
cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod");
|
||||
cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo");
|
||||
@@ -2053,7 +2056,6 @@ namespace CUETools.Codecs.FlaCuda
|
||||
cuda.Free(cudaSamplesBytes);
|
||||
cuda.Free(cudaResidual);
|
||||
cuda.Free(cudaPartitions);
|
||||
cuda.Free(cudaAutocorTasks);
|
||||
cuda.Free(cudaAutocorOutput);
|
||||
cuda.Free(cudaResidualTasks);
|
||||
cuda.Free(cudaResidualOutput);
|
||||
@@ -2064,23 +2066,22 @@ namespace CUETools.Codecs.FlaCuda
|
||||
CUDADriver.cuMemFreeHost(bestRiceParamsPtr);
|
||||
CUDADriver.cuMemFreeHost(residualTasksPtr);
|
||||
CUDADriver.cuMemFreeHost(bestResidualTasksPtr);
|
||||
CUDADriver.cuMemFreeHost(autocorTasksPtr);
|
||||
cuda.DestroyStream(stream);
|
||||
}
|
||||
|
||||
public unsafe encodeResidualTaskStruct* ResidualTasks
|
||||
public unsafe FlaCudaSubframeTask* ResidualTasks
|
||||
{
|
||||
get
|
||||
{
|
||||
return (encodeResidualTaskStruct*)residualTasksPtr;
|
||||
return (FlaCudaSubframeTask*)residualTasksPtr;
|
||||
}
|
||||
}
|
||||
|
||||
public unsafe encodeResidualTaskStruct* BestResidualTasks
|
||||
public unsafe FlaCudaSubframeTask* BestResidualTasks
|
||||
{
|
||||
get
|
||||
{
|
||||
return (encodeResidualTaskStruct*)bestResidualTasksPtr;
|
||||
return (FlaCudaSubframeTask*)bestResidualTasksPtr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,15 +20,6 @@
#ifndef _FLACUDA_KERNEL_H_
#define _FLACUDA_KERNEL_H_

typedef struct
{
int samplesOffs;
int windowOffs;
int residualOffs;
int blocksize;
int reserved[12];
} computeAutocorTaskStruct;

typedef enum
{
Constant = 0,
@@ -53,9 +44,15 @@ typedef struct
int wbits;
int abits;
int porder;
int reserved[2];
int windowOffs;
int reserved[1];
} FlaCudaSubframeData;

typedef struct
{
FlaCudaSubframeData data;
int coefs[32]; // fixme: should be short?
} encodeResidualTaskStruct;
} FlaCudaSubframeTask;

#define SUM16(buf,tid,op) buf[tid] op buf[tid + 8]; buf[tid] op buf[tid + 4]; buf[tid] op buf[tid + 2]; buf[tid] op buf[tid + 1];
#define SUM32(buf,tid,op) buf[tid] op buf[tid + 16]; SUM16(buf,tid,op)
@@ -110,7 +107,7 @@ extern "C" __global__ void cudaChannelDecorr(
}

extern "C" __global__ void cudaFindWastedBits(
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int *samples,
int tasksPerChannel,
int blocksize
@@ -119,11 +116,11 @@ extern "C" __global__ void cudaFindWastedBits(
__shared__ struct {
volatile int wbits[256];
volatile int abits[256];
encodeResidualTaskStruct task;
FlaCudaSubframeData task;
} shared;

if (threadIdx.x < 16)
((int*)&shared.task)[threadIdx.x] = ((int*)(&tasks[blockIdx.x * tasksPerChannel]))[threadIdx.x];
if (threadIdx.x < sizeof(shared.task) / sizeof(int))
((int*)&shared.task)[threadIdx.x] = ((int*)(&tasks[blockIdx.x * tasksPerChannel].data))[threadIdx.x];
shared.wbits[threadIdx.x] = 0;
shared.abits[threadIdx.x] = 0;
__syncthreads();
@@ -144,72 +141,86 @@ extern "C" __global__ void cudaFindWastedBits(
__syncthreads();

if (threadIdx.x < tasksPerChannel)
tasks[blockIdx.x * tasksPerChannel + threadIdx.x].wbits = shared.task.wbits;
tasks[blockIdx.x * tasksPerChannel + threadIdx.x].data.wbits = shared.task.wbits;
if (threadIdx.x < tasksPerChannel)
tasks[blockIdx.x * tasksPerChannel + threadIdx.x].abits = shared.task.abits;
tasks[blockIdx.x * tasksPerChannel + threadIdx.x].data.abits = shared.task.abits;
}
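cudaFindWastedBits ORs together all samples of a subframe: trailing zero bits common to every sample become wbits (they can be shifted out before prediction), and the remaining magnitude bits become abits. A scalar C# reference of that computation, assuming this reading of the kernel:

using System.Numerics;

static class WastedBitsExample
{
    // Scalar reference of the per-subframe wbits/abits values the kernel derives.
    public static void FindWastedBits(int[] samples, out int wbits, out int abits)
    {
        int orAll = 0, orMag = 0;
        foreach (int s in samples)
        {
            orAll |= s;
            orMag |= s ^ (s >> 31); // sign-folded magnitude, as in the kernels
        }
        wbits = orAll == 0 ? 0 : BitOperations.TrailingZeroCount((uint)orAll);
        abits = (orMag == 0 ? 0 : 32 - BitOperations.LeadingZeroCount((uint)orMag)) - wbits;
    }
}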
extern "C" __global__ void cudaComputeAutocor(
|
||||
float *output,
|
||||
const int *samples,
|
||||
const float *window,
|
||||
computeAutocorTaskStruct *tasks,
|
||||
int max_order, // should be <= 32
|
||||
int frameSize,
|
||||
int partSize // should be <= 2*blockDim - max_order
|
||||
FlaCudaSubframeTask *tasks,
|
||||
const int max_order, // should be <= 32
|
||||
const int windowcount, // windows (log2: 0,1)
|
||||
const int taskCount // tasks per block
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
float data[512];
|
||||
volatile float product[256];
|
||||
computeAutocorTaskStruct task;
|
||||
FlaCudaSubframeData task;
|
||||
volatile float result[33];
|
||||
} shared;
|
||||
const int tid = threadIdx.x + (threadIdx.y * 32);
|
||||
// fetch task data
|
||||
if (tid < sizeof(shared.task) / sizeof(int))
|
||||
((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
|
||||
((int*)&shared.task)[tid] = ((int*)(tasks + __mul24(taskCount, blockIdx.y >> windowcount) + __mul24(max_order, blockIdx.y & ((1 << windowcount)-1))))[tid];
|
||||
__syncthreads();
|
||||
|
||||
// fetch samples
|
||||
{
|
||||
const int pos = blockIdx.x * partSize;
|
||||
const int dataLen = min(frameSize - pos, partSize + max_order);
|
||||
const int pos = __mul24(blockIdx.x, 15) * 32;
|
||||
const int dataLen = min(shared.task.blocksize - pos, 15 * 32 + max_order);
|
||||
const int pos2 = pos + tid;
|
||||
|
||||
shared.data[tid] = tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] * window[shared.task.windowOffs + pos + tid]: 0.0f;
|
||||
shared.data[tid + 256] = tid + 256 < dataLen ? samples[shared.task.samplesOffs + pos + tid + 256] * window[shared.task.windowOffs + pos + tid + 256]: 0.0f;
|
||||
shared.data[tid] = tid < dataLen ? samples[shared.task.samplesOffs + pos2] * window[shared.task.windowOffs + pos2]: 0.0f;
|
||||
shared.data[tid + 256] = tid + 256 < dataLen ? samples[shared.task.samplesOffs + pos2 + 256] * window[shared.task.windowOffs + pos2 + 256]: 0.0f;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
const int ptr = __mul24(threadIdx.x, 15);
|
||||
for (int lag = threadIdx.y; lag <= max_order; lag += 8)
|
||||
{
|
||||
const int productLen = min(frameSize - blockIdx.x * partSize - lag, partSize);
|
||||
shared.product[tid] = 0.0;
|
||||
for (int ptr = threadIdx.x; ptr < productLen + threadIdx.x; ptr += 128)
|
||||
shared.product[tid] += ((ptr < productLen) * shared.data[ptr] * shared.data[ptr + lag]
|
||||
+ (ptr + 32 < productLen) * shared.data[ptr + 32] * shared.data[ptr + 32 + lag])
|
||||
+ ((ptr + 64 < productLen) * shared.data[ptr + 64] * shared.data[ptr + 64 + lag]
|
||||
+ (ptr + 96 < productLen) * shared.data[ptr + 96] * shared.data[ptr + 96 + lag]);
|
||||
// product sum: reduction in shared mem
|
||||
//shared.product[tid] += shared.product[tid + 16];
|
||||
shared.product[tid] = (shared.product[tid] + shared.product[tid + 16]) + (shared.product[tid + 8] + shared.product[tid + 24]);
|
||||
shared.product[tid] = (shared.product[tid] + shared.product[tid + 4]) + (shared.product[tid + 2] + shared.product[tid + 6]);
|
||||
//const int productLen = min(shared.task.blocksize - blockIdx.x * partSize - lag, partSize);
|
||||
const int ptr2 = ptr + lag;
|
||||
shared.product[tid] =
|
||||
shared.data[ptr + 0] * shared.data[ptr2 + 0] +
|
||||
shared.data[ptr + 1] * shared.data[ptr2 + 1] +
|
||||
shared.data[ptr + 2] * shared.data[ptr2 + 2] +
|
||||
shared.data[ptr + 3] * shared.data[ptr2 + 3] +
|
||||
shared.data[ptr + 4] * shared.data[ptr2 + 4] +
|
||||
shared.data[ptr + 5] * shared.data[ptr2 + 5] +
|
||||
shared.data[ptr + 6] * shared.data[ptr2 + 6] +
|
||||
shared.data[ptr + 7] * shared.data[ptr2 + 7] +
|
||||
shared.data[ptr + 8] * shared.data[ptr2 + 8] +
|
||||
shared.data[ptr + 9] * shared.data[ptr2 + 9] +
|
||||
shared.data[ptr + 10] * shared.data[ptr2 + 10] +
|
||||
shared.data[ptr + 11] * shared.data[ptr2 + 11] +
|
||||
shared.data[ptr + 12] * shared.data[ptr2 + 12] +
|
||||
shared.data[ptr + 13] * shared.data[ptr2 + 13] +
|
||||
shared.data[ptr + 14] * shared.data[ptr2 + 14];
|
||||
shared.product[tid] = shared.product[tid] + shared.product[tid + 8] + shared.product[tid + 16] + shared.product[tid + 24];
|
||||
shared.product[tid] = shared.product[tid] + shared.product[tid + 2] + shared.product[tid + 4] + shared.product[tid + 6];
|
||||
// return results
|
||||
if (threadIdx.x == 0)
|
||||
output[(blockIdx.x + blockIdx.y * gridDim.x) * (max_order + 1) + lag] = shared.product[tid] + shared.product[tid + 1];
|
||||
shared.result[lag] = shared.product[tid] + shared.product[tid + 1];
|
||||
}
|
||||
__syncthreads();
|
||||
if (tid <= max_order)
|
||||
output[(blockIdx.x + blockIdx.y * gridDim.x) * (max_order + 1) + tid] = shared.result[tid];
|
||||
}
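The rewritten cudaComputeAutocor applies the analysis window to one 480-sample partition in shared memory and accumulates lagged products for lags 0..max_order; cudaComputeLPC later adds the per-partition results. A scalar C# reference of the quantity each partition contributes, for illustration only:

static class AutocorrelationExample
{
    // Windowed autocorrelation over one block of samples, lags 0..maxOrder.
    public static float[] WindowedAutocorrelation(int[] samples, float[] window, int maxOrder)
    {
        var data = new float[samples.Length];
        for (int i = 0; i < samples.Length; i++)
            data[i] = samples[i] * window[i];          // apply the window

        var autoc = new float[maxOrder + 1];
        for (int lag = 0; lag <= maxOrder; lag++)
            for (int i = 0; i + lag < data.Length; i++)
                autoc[lag] += data[i] * data[i + lag]; // sum of lagged products
        return autoc;
    }
}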
|
||||
|
||||
extern "C" __global__ void cudaComputeLPC(
|
||||
encodeResidualTaskStruct *output,
|
||||
FlaCudaSubframeTask *tasks,
|
||||
int taskCount, // tasks per block
|
||||
float*autoc,
|
||||
computeAutocorTaskStruct *tasks,
|
||||
int max_order, // should be <= 32
|
||||
int partCount // should be <= blockDim?
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
computeAutocorTaskStruct task;
|
||||
encodeResidualTaskStruct task2;
|
||||
FlaCudaSubframeData task;
|
||||
volatile float ldr[32];
|
||||
volatile int bits[32];
|
||||
volatile float autoc[33];
|
||||
@@ -223,16 +234,13 @@ extern "C" __global__ void cudaComputeLPC(
|
||||
|
||||
// fetch task data
|
||||
if (tid < sizeof(shared.task) / sizeof(int))
|
||||
((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
|
||||
__syncthreads();
|
||||
if (tid < sizeof(shared.task2) / sizeof(int))
|
||||
((int*)&shared.task2)[tid] = ((int*)(output + shared.task.residualOffs))[tid];
|
||||
((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.x * max_order + blockIdx.y * taskCount))[tid];
|
||||
__syncthreads();
|
||||
|
||||
// add up parts
|
||||
for (int order = 0; order <= max_order; order++)
|
||||
{
|
||||
shared.parts[tid] = tid < partCount ? autoc[(blockIdx.y * partCount + tid) * (max_order + 1) + order] : 0;
|
||||
shared.parts[tid] = tid < partCount ? autoc[((blockIdx.y * gridDim.x + blockIdx.x) * partCount + tid) * (max_order + 1) + order] : 0;
|
||||
__syncthreads();
|
||||
if (tid < 64 && blockDim.x > 64) shared.parts[tid] += shared.parts[tid + 64];
|
||||
__syncthreads();
|
||||
@@ -275,8 +283,8 @@ extern "C" __global__ void cudaComputeLPC(
|
||||
|
||||
// Quantization
|
||||
//int precision = 13 - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576);
|
||||
int precision = max(3, min(13 - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task2.abits));
|
||||
int taskNo = shared.task.residualOffs + order;
|
||||
int precision = max(3, min(min(13 - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), __clz(order) + 1 - shared.task.abits));
|
||||
int taskNo = blockIdx.x * max_order + blockIdx.y * taskCount + order;
|
||||
shared.bits[tid] = __mul24((33 - __clz(__float2int_rn(fabs(shared.ldr[tid]) * (1 << 15))) - precision), tid <= order);
|
||||
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
|
||||
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
|
||||
@@ -288,9 +296,9 @@ extern "C" __global__ void cudaComputeLPC(
|
||||
// reverse coefs
|
||||
int coef = max(-(1 << precision),min((1 << precision)-1,__float2int_rn(-shared.ldr[order - tid] * (1 << sh))));
|
||||
if (tid <= order)
|
||||
output[taskNo].coefs[tid] = coef;
|
||||
tasks[taskNo].coefs[tid] = coef;
|
||||
if (tid == 0)
|
||||
output[taskNo].shift = sh;
|
||||
tasks[taskNo].data.shift = sh;
|
||||
shared.bits[tid] = __mul24(33 - __clz(coef ^ (coef >> 31)), tid <= order);
|
||||
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
|
||||
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
|
||||
@@ -299,13 +307,13 @@ extern "C" __global__ void cudaComputeLPC(
|
||||
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
|
||||
int cbits = shared.bits[0];
|
||||
if (tid == 0)
|
||||
output[taskNo].cbits = cbits;
|
||||
tasks[taskNo].data.cbits = cbits;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void cudaComputeLPCLattice(
|
||||
encodeResidualTaskStruct *tasks,
|
||||
FlaCudaSubframeTask *tasks,
|
||||
const int taskCount, // tasks per block
|
||||
const int *samples,
|
||||
const int precisions,
|
||||
@@ -313,7 +321,7 @@ extern "C" __global__ void cudaComputeLPCLattice(
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
volatile encodeResidualTaskStruct task;
|
||||
volatile FlaCudaSubframeTask task;
|
||||
volatile float F[512];
|
||||
volatile float lpc[12][32];
|
||||
union {
|
||||
@@ -328,25 +336,25 @@ extern "C" __global__ void cudaComputeLPCLattice(
|
||||
__syncthreads();
|
||||
|
||||
// F = samples; B = samples
|
||||
//int frameSize = shared.task.blocksize;
|
||||
int s1 = threadIdx.x < shared.task.blocksize ? samples[shared.task.samplesOffs + threadIdx.x] : 0;
|
||||
int s2 = threadIdx.x + 256 < shared.task.blocksize ? samples[shared.task.samplesOffs + threadIdx.x + 256] : 0;
|
||||
//int frameSize = shared.task.data.blocksize;
|
||||
int s1 = threadIdx.x < shared.task.data.blocksize ? samples[shared.task.data.samplesOffs + threadIdx.x] : 0;
|
||||
int s2 = threadIdx.x + 256 < shared.task.data.blocksize ? samples[shared.task.data.samplesOffs + threadIdx.x + 256] : 0;
|
||||
shared.tmpi[threadIdx.x] = s1|s2;
|
||||
__syncthreads();
|
||||
SUM256(shared.tmpi,threadIdx.x,|=);
|
||||
if (threadIdx.x == 0)
|
||||
shared.task.wbits = max(0,__ffs(shared.tmpi[0]) - 1);
|
||||
shared.task.data.wbits = max(0,__ffs(shared.tmpi[0]) - 1);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < taskCount)
|
||||
tasks[blockIdx.y * taskCount + threadIdx.x].wbits = shared.task.wbits;
|
||||
tasks[blockIdx.y * taskCount + threadIdx.x].data.wbits = shared.task.data.wbits;
|
||||
shared.tmpi[threadIdx.x] = (s1 ^ (s1 >> 31)) | (s2 ^ (s2 >> 31));
|
||||
__syncthreads();
|
||||
SUM256(shared.tmpi,threadIdx.x,|=);
|
||||
if (threadIdx.x == 0)
|
||||
shared.task.abits = 32 - __clz(shared.tmpi[0]) - shared.task.wbits;
|
||||
shared.task.data.abits = 32 - __clz(shared.tmpi[0]) - shared.task.data.wbits;
|
||||
__syncthreads();
|
||||
s1 >>= shared.task.wbits;
|
||||
s2 >>= shared.task.wbits;
|
||||
s1 >>= shared.task.data.wbits;
|
||||
s2 >>= shared.task.data.wbits;
|
||||
shared.F[threadIdx.x] = s1;
|
||||
shared.F[threadIdx.x + 256] = s2;
|
||||
__syncthreads();
|
||||
@@ -354,10 +362,10 @@ extern "C" __global__ void cudaComputeLPCLattice(
|
||||
for (int order = 1; order <= max_order; order++)
|
||||
{
|
||||
// reff = F(order+1:frameSize) * B(1:frameSize-order)' / DEN
|
||||
float f1 = (threadIdx.x + order < shared.task.blocksize) * shared.F[threadIdx.x + order];
|
||||
float f2 = (threadIdx.x + 256 + order < shared.task.blocksize) * shared.F[threadIdx.x + 256 + order];
|
||||
s1 *= (threadIdx.x + order < shared.task.blocksize);
|
||||
s2 *= (threadIdx.x + 256 + order < shared.task.blocksize);
|
||||
float f1 = (threadIdx.x + order < shared.task.data.blocksize) * shared.F[threadIdx.x + order];
|
||||
float f2 = (threadIdx.x + 256 + order < shared.task.data.blocksize) * shared.F[threadIdx.x + 256 + order];
|
||||
s1 *= (threadIdx.x + order < shared.task.data.blocksize);
|
||||
s2 *= (threadIdx.x + 256 + order < shared.task.data.blocksize);
|
||||
|
||||
// DEN = F(order+1:frameSize) * F(order+1:frameSize)' + B(1:frameSize-order) * B(1:frameSize-order)' (BURG)
|
||||
shared.tmp[threadIdx.x] = FSQR(f1) + FSQR(f2) + FSQR(s1) + FSQR(s2);
|
||||
@@ -388,9 +396,9 @@ extern "C" __global__ void cudaComputeLPCLattice(
|
||||
// F1 = F(order+1:frameSize) - reff * B(1:frameSize-order)
|
||||
// B(1:frameSize-order) = B(1:frameSize-order) - reff * F(order+1:frameSize)
|
||||
// F(order+1:frameSize) = F1
|
||||
if (threadIdx.x < shared.task.blocksize - order)
|
||||
if (threadIdx.x < shared.task.data.blocksize - order)
|
||||
shared.F[order + threadIdx.x] -= reff * s1;
|
||||
if (threadIdx.x + 256 < shared.task.blocksize - order)
|
||||
if (threadIdx.x + 256 < shared.task.data.blocksize - order)
|
||||
shared.F[order + threadIdx.x + 256] -= reff * s2;
|
||||
s1 -= reff * f1;
|
||||
s2 -= reff * f2;
|
||||
@@ -408,17 +416,17 @@ extern "C" __global__ void cudaComputeLPCLattice(
|
||||
// OR reduction
|
||||
SUM32(shared.tmpi,threadIdx.x,|=);
|
||||
// choose precision
|
||||
//int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - __float2int_rn(shared.PE[order - 1])
|
||||
int cbits = max(3, min(10, shared.task.abits)) - precision;// + precision); // - __float2int_rn(shared.PE[order - 1])
|
||||
//int cbits = max(3, min(10, 5 + (shared.task.data.abits >> 1))); // - __float2int_rn(shared.PE[order - 1])
|
||||
int cbits = max(3, min(10, shared.task.data.abits)) - precision;// + precision); // - __float2int_rn(shared.PE[order - 1])
|
||||
// calculate shift based on precision and number of leading zeroes in coeffs
|
||||
int shift = max(0,min(15, __clz(shared.tmpi[threadIdx.x & ~31]) - 18 + cbits));
|
||||
//if (shared.task.abits + 32 - __clz(order) < shift
|
||||
//int shift = max(0,min(15, (shared.task.abits >> 2) - 14 + __clz(shared.tmpi[threadIdx.x & ~31]) + ((32 - __clz(order))>>1)));
|
||||
//if (shared.task.data.abits + 32 - __clz(order) < shift
|
||||
//int shift = max(0,min(15, (shared.task.data.abits >> 2) - 14 + __clz(shared.tmpi[threadIdx.x & ~31]) + ((32 - __clz(order))>>1)));
|
||||
// quantize coeffs with given shift
|
||||
coef = cn <= order ? max(-(1 << (cbits - 1)), min((1 << (cbits - 1)) -1, __float2int_rn(shared.lpc[order][order - cn] * (1 << shift)))) : 0;
|
||||
// error correction
|
||||
//shared.tmp[threadIdx.x] = (threadIdx.x != 0) * (shared.arp[threadIdx.x - 1]*(1 << shared.task.shift) - shared.task.coefs[threadIdx.x - 1]);
|
||||
//shared.task.coefs[threadIdx.x] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, __float2int_rn((shared.arp[threadIdx.x]) * (1 << shared.task.shift) + shared.tmp[threadIdx.x])));
|
||||
//shared.tmp[threadIdx.x] = (threadIdx.x != 0) * (shared.arp[threadIdx.x - 1]*(1 << shared.task.data.shift) - shared.task.data.coefs[threadIdx.x - 1]);
|
||||
//shared.task.data.coefs[threadIdx.x] = max(-(1 << (shared.task.data.cbits - 1)), min((1 << (shared.task.data.cbits - 1))-1, __float2int_rn((shared.arp[threadIdx.x]) * (1 << shared.task.data.shift) + shared.tmp[threadIdx.x])));
|
||||
// remove sign bits
|
||||
shared.tmpi[threadIdx.x] = coef ^ (coef >> 31);
|
||||
// OR reduction
|
||||
@@ -429,16 +437,16 @@ extern "C" __global__ void cudaComputeLPCLattice(
|
||||
// output shift, cbits and output coeffs
|
||||
int taskNo = taskCount * blockIdx.y + order + precision * max_order;
|
||||
if (cn == 0)
|
||||
tasks[taskNo].shift = shift;
|
||||
tasks[taskNo].data.shift = shift;
|
||||
if (cn == 0)
|
||||
tasks[taskNo].cbits = cbits;
|
||||
tasks[taskNo].data.cbits = cbits;
|
||||
if (cn <= order)
|
||||
tasks[taskNo].coefs[cn] = coef;
|
||||
}
|
||||
}
|
||||
|
||||
//extern "C" __global__ void cudaComputeLPCLattice512(
|
||||
// encodeResidualTaskStruct *tasks,
|
||||
// FlaCudaSubframeTask *tasks,
|
||||
// const int taskCount, // tasks per block
|
||||
// const int *samples,
|
||||
// const int frameSize, // <= 512
|
||||
@@ -446,7 +454,7 @@ extern "C" __global__ void cudaComputeLPCLattice(
|
||||
//)
|
||||
//{
|
||||
// __shared__ struct {
|
||||
// encodeResidualTaskStruct task;
|
||||
// FlaCudaSubframeTask task;
|
||||
// float F[512];
|
||||
// float B[512];
|
||||
// float lpc[32][32];
|
||||
@@ -463,7 +471,7 @@ extern "C" __global__ void cudaComputeLPCLattice(
|
||||
// __syncthreads();
|
||||
//
|
||||
// // F = samples; B = samples
|
||||
// shared.F[threadIdx.x] = threadIdx.x < frameSize ? samples[shared.task.samplesOffs + threadIdx.x] >> shared.task.wbits : 0.0f;
|
||||
// shared.F[threadIdx.x] = threadIdx.x < frameSize ? samples[shared.task.data.samplesOffs + threadIdx.x] >> shared.task.data.wbits : 0.0f;
|
||||
// shared.B[threadIdx.x] = shared.F[threadIdx.x];
|
||||
// __syncthreads();
|
||||
//
|
||||
@@ -538,7 +546,7 @@ extern "C" __global__ void cudaComputeLPCLattice(
|
||||
// {
|
||||
// // Quantization
|
||||
// int cn = threadIdx.x & 31;
|
||||
// int precision = 10 - (order > 8) - min(2, shared.task.wbits);
|
||||
// int precision = 10 - (order > 8) - min(2, shared.task.data.wbits);
|
||||
// int taskNo = taskCount * blockIdx.y + order - 1;
|
||||
// shared.bits[threadIdx.x] = __mul24((33 - __clz(__float2int_rn(fabs(shared.lpc[order - 1][cn]) * (1 << 15))) - precision), cn < order);
|
||||
// shared.bits[threadIdx.x] = max(shared.bits[threadIdx.x], shared.bits[threadIdx.x + 16]);
|
||||
@@ -571,7 +579,7 @@ extern "C" __global__ void cudaComputeLPCLattice(
|
||||
extern "C" __global__ void cudaEstimateResidual(
|
||||
int*output,
|
||||
int*samples,
|
||||
encodeResidualTaskStruct *tasks,
|
||||
FlaCudaSubframeTask *tasks,
|
||||
int max_order,
|
||||
int frameSize,
|
||||
int partSize // should be blockDim.x * blockDim.y == 256
|
||||
@@ -580,10 +588,11 @@ extern "C" __global__ void cudaEstimateResidual(
|
||||
__shared__ struct {
|
||||
int data[32*9];
|
||||
volatile int residual[32*8];
|
||||
encodeResidualTaskStruct task[8];
|
||||
FlaCudaSubframeData task[8];
|
||||
int coefs[32*8];
|
||||
} shared;
|
||||
const int tid = threadIdx.x + threadIdx.y * blockDim.x;
|
||||
if (threadIdx.x < 16)
|
||||
const int tid = threadIdx.x + threadIdx.y * 32;
|
||||
if (threadIdx.x < sizeof(FlaCudaSubframeData)/sizeof(int))
|
||||
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[blockIdx.y * blockDim.y + threadIdx.y]))[threadIdx.x];
|
||||
__syncthreads();
|
||||
const int pos = blockIdx.x * partSize;
|
||||
@@ -592,46 +601,169 @@ extern "C" __global__ void cudaEstimateResidual(
|
||||
// fetch samples
|
||||
shared.data[tid] = tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] >> shared.task[0].wbits : 0;
|
||||
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task[0].samplesOffs + pos + tid + partSize] >> shared.task[0].wbits : 0;
|
||||
const int residualLen = max(0,min(frameSize - pos - shared.task[threadIdx.y].residualOrder, partSize));
|
||||
|
||||
__syncthreads();
|
||||
|
||||
shared.residual[tid] = 0;
|
||||
shared.task[threadIdx.y].coefs[threadIdx.x] = threadIdx.x < max_order ? tasks[blockIdx.y * blockDim.y + threadIdx.y].coefs[threadIdx.x] : 0;
|
||||
shared.coefs[tid] = threadIdx.x < shared.task[threadIdx.y].residualOrder ? tasks[blockIdx.y * blockDim.y + threadIdx.y].coefs[threadIdx.x] : 0;
|
||||
|
||||
const int residualLen = max(0,min(frameSize - pos - shared.task[threadIdx.y].residualOrder, partSize));
|
||||
for (int i = blockDim.y * (shared.task[threadIdx.y].type == Verbatim); i < blockDim.y; i++) // += 32
|
||||
{
|
||||
int ptr = threadIdx.x + (i<<5);
|
||||
// compute residual
|
||||
int *co = &shared.coefs[threadIdx.y << 5];
|
||||
int ptr = threadIdx.x + (i << 5) + shared.task[threadIdx.y].residualOrder;
|
||||
int sum = 0;
|
||||
int c = 0;
|
||||
for (c = 0; c < shared.task[threadIdx.y].residualOrder; c++)
|
||||
sum += __mul24(shared.data[ptr + c], shared.task[threadIdx.y].coefs[c]);
|
||||
sum = shared.data[ptr + c] - (sum >> shared.task[threadIdx.y].shift);
|
||||
shared.residual[tid] += __mul24(ptr < residualLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
|
||||
for (int c = -shared.task[threadIdx.y].residualOrder; c < 0; c++)
|
||||
sum += __mul24(shared.data[ptr + c], *(co++));
|
||||
sum = shared.data[ptr] - (sum >> shared.task[threadIdx.y].shift);
|
||||
shared.residual[tid] += __mul24(ptr < dataLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
|
||||
}
|
||||
|
||||
// enable this line when using blockDim.x == 64
|
||||
//__syncthreads(); if (threadIdx.x < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
|
||||
shared.residual[tid] += shared.residual[tid + 16];
|
||||
shared.residual[tid] += shared.residual[tid + 8];
|
||||
shared.residual[tid] += shared.residual[tid + 4];
|
||||
shared.residual[tid] += shared.residual[tid + 2];
|
||||
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
|
||||
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
|
||||
shared.residual[tid] += shared.residual[tid + 1];
|
||||
|
||||
// rice parameter search
|
||||
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y * blockDim.x] != 0) *
|
||||
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y * blockDim.x] - (residualLen >> 1)) >> threadIdx.x));
|
||||
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 8]);
|
||||
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 4]);
|
||||
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 2]);
|
||||
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 1]);
|
||||
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
|
||||
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
|
||||
shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
|
||||
if (threadIdx.x == 0)
|
||||
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid];
|
||||
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
|
||||
}
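cudaEstimateResidual scores each candidate predictor by summing the zigzag-coded residuals, then searches for the Rice parameter k that minimizes residualLen * (k + 1) + ((sum - residualLen / 2) >> k), with k >= 15 penalized. A scalar sketch of that cost search, mirroring the expression in the kernel:

using System;

static class RiceEstimateExample
{
    // Estimated bit cost of Rice-coding residualLen residuals whose zigzag-coded sum is sum.
    public static int EstimateRiceBits(int sum, int residualLen)
    {
        int best = int.MaxValue;
        for (int k = 0; k < 15; k++) // k >= 15 receives a large penalty in the kernel
            best = Math.Min(best, residualLen * (k + 1) + ((sum - (residualLen >> 1)) >> k));
        return best;
    }
}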
|
||||
|
||||
extern "C" __global__ void cudaEstimateResidual8(
|
||||
int*output,
|
||||
int*samples,
|
||||
FlaCudaSubframeTask *tasks,
|
||||
int max_order,
|
||||
int frameSize,
|
||||
int partSize // should be blockDim.x * blockDim.y == 256
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
int data[32*9];
|
||||
volatile int residual[32*8];
|
||||
FlaCudaSubframeData task[8];
|
||||
int coefs[32*8];
|
||||
} shared;
|
||||
const int tid = threadIdx.x + threadIdx.y * 32;
|
||||
if (threadIdx.x < sizeof(FlaCudaSubframeData)/sizeof(int))
|
||||
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[blockIdx.y * blockDim.y + threadIdx.y]))[threadIdx.x];
|
||||
__syncthreads();
|
||||
const int pos = blockIdx.x * partSize;
|
||||
const int dataLen = min(frameSize - pos, partSize + max_order);
|
||||
|
||||
// fetch samples
|
||||
shared.data[tid] = tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] >> shared.task[0].wbits : 0;
|
||||
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task[0].samplesOffs + pos + tid + partSize] >> shared.task[0].wbits : 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
shared.residual[tid] = 0;
|
||||
shared.coefs[tid] = threadIdx.x < shared.task[threadIdx.y].residualOrder ? tasks[blockIdx.y * blockDim.y + threadIdx.y].coefs[threadIdx.x] : 0;
|
||||
|
||||
const int residualLen = max(0,min(frameSize - pos - shared.task[threadIdx.y].residualOrder, partSize));
|
||||
const int ptr2 = threadIdx.y << 5;
|
||||
int s = 0;
|
||||
for (int ptr = threadIdx.x + blockDim.y * 32 * (shared.task[threadIdx.y].type == Verbatim); ptr < blockDim.y * 32 + threadIdx.x; ptr += 32)
|
||||
{
|
||||
// compute residual
|
||||
int sum =
|
||||
__mul24(shared.data[ptr + 0], shared.coefs[ptr2 + 0]) +
|
||||
__mul24(shared.data[ptr + 1], shared.coefs[ptr2 + 1]) +
|
||||
__mul24(shared.data[ptr + 2], shared.coefs[ptr2 + 2]) +
|
||||
__mul24(shared.data[ptr + 3], shared.coefs[ptr2 + 3]) +
|
||||
__mul24(shared.data[ptr + 4], shared.coefs[ptr2 + 4]) +
|
||||
__mul24(shared.data[ptr + 5], shared.coefs[ptr2 + 5]) +
|
||||
__mul24(shared.data[ptr + 6], shared.coefs[ptr2 + 6]) +
|
||||
__mul24(shared.data[ptr + 7], shared.coefs[ptr2 + 7]);
|
||||
sum = shared.data[ptr + shared.task[threadIdx.y].residualOrder] - (sum >> shared.task[threadIdx.y].shift);
|
||||
s += __mul24(ptr < residualLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
|
||||
}
|
||||
|
||||
shared.residual[tid] = s;
|
||||
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
|
||||
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
|
||||
shared.residual[tid] += shared.residual[tid + 1];
|
||||
|
||||
// rice parameter search
|
||||
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
|
||||
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
|
||||
shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
|
||||
if (threadIdx.x == 0)
|
||||
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
|
||||
}
|
||||
|
||||
extern "C" __global__ void cudaEstimateResidual12(
|
||||
int*output,
|
||||
int*samples,
|
||||
FlaCudaSubframeTask *tasks,
|
||||
int max_order,
|
||||
int frameSize,
|
||||
int partSize // should be blockDim.x * blockDim.y == 256
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
int data[32*9];
|
||||
volatile int residual[32*8];
|
||||
FlaCudaSubframeData task[8];
|
||||
int coefs[8*32];
|
||||
} shared;
|
||||
const int tid = threadIdx.x + threadIdx.y * 32;
|
||||
if (threadIdx.x < sizeof(FlaCudaSubframeData)/sizeof(int))
|
||||
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[blockIdx.y * blockDim.y + threadIdx.y]))[threadIdx.x];
|
||||
__syncthreads();
|
||||
const int pos = blockIdx.x * partSize;
|
||||
const int dataLen = min(frameSize - pos, partSize + max_order);
|
||||
|
||||
// fetch samples
|
||||
shared.data[tid] = tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] >> shared.task[0].wbits : 0;
|
||||
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task[0].samplesOffs + pos + tid + partSize] >> shared.task[0].wbits : 0;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
shared.residual[tid] = 0;
|
||||
shared.coefs[tid] = threadIdx.x < shared.task[threadIdx.y].residualOrder ? tasks[blockIdx.y * blockDim.y + threadIdx.y].coefs[threadIdx.x] : 0;
|
||||
|
||||
const int residualLen = shared.task[threadIdx.y].type == Verbatim ? 0 : max(0,min(frameSize - pos - shared.task[threadIdx.y].residualOrder, partSize));
|
||||
const int ptr2 = threadIdx.y << 5;
|
||||
int s = 0;
|
||||
for (int ptr = threadIdx.x; ptr < residualLen; ptr += 32)
|
||||
{
|
||||
// compute residual
|
||||
int sum =
|
||||
__mul24(shared.data[ptr + 0], shared.coefs[ptr2 + 0]) +
|
||||
__mul24(shared.data[ptr + 1], shared.coefs[ptr2 + 1]) +
|
||||
__mul24(shared.data[ptr + 2], shared.coefs[ptr2 + 2]) +
|
||||
__mul24(shared.data[ptr + 3], shared.coefs[ptr2 + 3]) +
|
||||
__mul24(shared.data[ptr + 4], shared.coefs[ptr2 + 4]) +
|
||||
__mul24(shared.data[ptr + 5], shared.coefs[ptr2 + 5]) +
|
||||
__mul24(shared.data[ptr + 6], shared.coefs[ptr2 + 6]) +
|
||||
__mul24(shared.data[ptr + 7], shared.coefs[ptr2 + 7]) +
|
||||
__mul24(shared.data[ptr + 8], shared.coefs[ptr2 + 8]) +
|
||||
__mul24(shared.data[ptr + 9], shared.coefs[ptr2 + 9]) +
|
||||
__mul24(shared.data[ptr + 10], shared.coefs[ptr2 + 10]) +
|
||||
__mul24(shared.data[ptr + 11], shared.coefs[ptr2 + 11]);
|
||||
sum = shared.data[ptr + shared.task[threadIdx.y].residualOrder] - (sum >> shared.task[threadIdx.y].shift);
|
||||
s += min(0x7fffff,(sum << 1) ^ (sum >> 31));
|
||||
}
|
||||
|
||||
shared.residual[tid] = s;
|
||||
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
|
||||
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
|
||||
shared.residual[tid] += shared.residual[tid + 1];
|
||||
|
||||
// rice parameter search
|
||||
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
|
||||
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
|
||||
shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
|
||||
if (threadIdx.x == 0)
|
||||
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
|
||||
}

extern "C" __global__ void cudaChooseBestMethod(
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int *residual,
int partCount, // <= blockDim.y (256)
int taskCount
@@ -641,7 +773,7 @@ extern "C" __global__ void cudaChooseBestMethod(
volatile int index[128];
volatile int length[256];
volatile int partLen[256];
volatile encodeResidualTaskStruct task[8];
volatile FlaCudaSubframeTask task[8];
} shared;
const int tid = threadIdx.x + threadIdx.y * 32;

@@ -666,13 +798,13 @@ extern "C" __global__ void cudaChooseBestMethod(
// return sum
if (threadIdx.x == 0)
{
int obits = shared.task[threadIdx.y].obits - shared.task[threadIdx.y].wbits;
int obits = shared.task[threadIdx.y].data.obits - shared.task[threadIdx.y].data.wbits;
shared.length[task + threadIdx.y] =
min(obits * shared.task[threadIdx.y].blocksize,
shared.task[threadIdx.y].type == Fixed ? shared.task[threadIdx.y].residualOrder * obits + 6 + (4 * partCount/2) + shared.partLen[threadIdx.y * 32] :
shared.task[threadIdx.y].type == LPC ? shared.task[threadIdx.y].residualOrder * obits + 4 + 5 + shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[threadIdx.y * 32] :
shared.task[threadIdx.y].type == Constant ? obits * (1 + shared.task[threadIdx.y].blocksize * (shared.partLen[threadIdx.y * 32] != 0)) :
obits * shared.task[threadIdx.y].blocksize);
min(obits * shared.task[threadIdx.y].data.blocksize,
shared.task[threadIdx.y].data.type == Fixed ? shared.task[threadIdx.y].data.residualOrder * obits + 6 + (4 * partCount/2) + shared.partLen[threadIdx.y * 32] :
shared.task[threadIdx.y].data.type == LPC ? shared.task[threadIdx.y].data.residualOrder * obits + 4 + 5 + shared.task[threadIdx.y].data.residualOrder * shared.task[threadIdx.y].data.cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[threadIdx.y * 32] :
shared.task[threadIdx.y].data.type == Constant ? obits * (1 + shared.task[threadIdx.y].data.blocksize * (shared.partLen[threadIdx.y * 32] != 0)) :
obits * shared.task[threadIdx.y].data.blocksize);
}
}
//shared.index[threadIdx.x] = threadIdx.x;
@@ -681,7 +813,7 @@ extern "C" __global__ void cudaChooseBestMethod(
__syncthreads();

if (tid < taskCount)
tasks[tid + taskCount * blockIdx.y].size = shared.length[tid];
tasks[tid + taskCount * blockIdx.y].data.size = shared.length[tid];

__syncthreads();
int l1 = shared.length[tid];
@@ -709,13 +841,13 @@ extern "C" __global__ void cudaChooseBestMethod(
shared.length[tid] = l1 = min(l1, l2);
}
if (tid == 0)
tasks[taskCount * blockIdx.y].best_index = taskCount * blockIdx.y + shared.index[shared.length[1] < shared.length[0]];
tasks[taskCount * blockIdx.y].data.best_index = taskCount * blockIdx.y + shared.index[shared.length[1] < shared.length[0]];
}
}
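The per-subframe size estimate that cudaChooseBestMethod minimizes can be read as the following host-side sketch; the enum values and the helper name are illustrative, only the bit counts follow the kernel above.

enum SubframeType { Constant, Verbatim, Fixed, LPC };

static int estimate_subframe_bits(enum SubframeType type, int obits, int blocksize,
                                  int order, int cbits, int part_count, int part_len)
{
    int est;
    switch (type)
    {
    case Fixed:    /* warm-up samples + residual coding header + partition bits */
        est = order * obits + 6 + 4 * part_count / 2 + part_len; break;
    case LPC:      /* adds precision (4), shift (5) and the quantized coefficients */
        est = order * obits + 4 + 5 + order * cbits + 6 + 4 * part_count / 2 + part_len; break;
    case Constant: /* one sample, unless the residual shows the block is not constant */
        est = obits * (1 + blocksize * (part_len != 0)); break;
    default:       /* Verbatim */
        est = obits * blocksize; break;
    }
    /* never worse than storing the subframe verbatim */
    return est < obits * blocksize ? est : obits * blocksize;
}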

extern "C" __global__ void cudaCopyBestMethod(
encodeResidualTaskStruct *tasks_out,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks_out,
FlaCudaSubframeTask *tasks,
int count
)
{
@@ -723,15 +855,15 @@ extern "C" __global__ void cudaCopyBestMethod(
int best_index;
} shared;
if (threadIdx.x == 0)
shared.best_index = tasks[count * blockIdx.y].best_index;
shared.best_index = tasks[count * blockIdx.y].data.best_index;
__syncthreads();
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
if (threadIdx.x < sizeof(FlaCudaSubframeTask)/sizeof(int))
((int*)(tasks_out + blockIdx.y))[threadIdx.x] = ((int*)(tasks + shared.best_index))[threadIdx.x];
}

extern "C" __global__ void cudaCopyBestMethodStereo(
encodeResidualTaskStruct *tasks_out,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks_out,
FlaCudaSubframeTask *tasks,
int count
)
{
@@ -741,9 +873,9 @@ extern "C" __global__ void cudaCopyBestMethodStereo(
int lr_index[2];
} shared;
if (threadIdx.x < 4)
shared.best_index[threadIdx.x] = tasks[count * (blockIdx.y * 4 + threadIdx.x)].best_index;
shared.best_index[threadIdx.x] = tasks[count * (blockIdx.y * 4 + threadIdx.x)].data.best_index;
if (threadIdx.x < 4)
shared.best_size[threadIdx.x] = tasks[shared.best_index[threadIdx.x]].size;
shared.best_size[threadIdx.x] = tasks[shared.best_index[threadIdx.x]].data.size;
__syncthreads();
if (threadIdx.x == 0)
{
@@ -774,25 +906,25 @@ extern "C" __global__ void cudaCopyBestMethodStereo(
}
}
__syncthreads();
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
if (threadIdx.x < sizeof(FlaCudaSubframeTask)/sizeof(int))
((int*)(tasks_out + 2 * blockIdx.y))[threadIdx.x] = ((int*)(tasks + shared.lr_index[0]))[threadIdx.x];
if (threadIdx.x == 0)
tasks_out[2 * blockIdx.y].residualOffs = tasks[shared.best_index[0]].residualOffs;
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
tasks_out[2 * blockIdx.y].data.residualOffs = tasks[shared.best_index[0]].data.residualOffs;
if (threadIdx.x < sizeof(FlaCudaSubframeTask)/sizeof(int))
((int*)(tasks_out + 2 * blockIdx.y + 1))[threadIdx.x] = ((int*)(tasks + shared.lr_index[1]))[threadIdx.x];
if (threadIdx.x == 0)
tasks_out[2 * blockIdx.y + 1].residualOffs = tasks[shared.best_index[1]].residualOffs;
tasks_out[2 * blockIdx.y + 1].data.residualOffs = tasks[shared.best_index[1]].data.residualOffs;
}

extern "C" __global__ void cudaEncodeResidual(
int*output,
int*samples,
encodeResidualTaskStruct *tasks
FlaCudaSubframeTask *tasks
)
{
__shared__ struct {
int data[256 + 32];
encodeResidualTaskStruct task;
FlaCudaSubframeTask task;
} shared;
const int tid = threadIdx.x;
if (threadIdx.x < sizeof(shared.task) / sizeof(int))
@@ -800,40 +932,40 @@ extern "C" __global__ void cudaEncodeResidual(
__syncthreads();
const int partSize = blockDim.x;
const int pos = blockIdx.x * partSize;
const int dataLen = min(shared.task.blocksize - pos, partSize + shared.task.residualOrder);
const int dataLen = min(shared.task.data.blocksize - pos, partSize + shared.task.data.residualOrder);

// fetch samples
shared.data[tid] = tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] >> shared.task.wbits : 0;
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task.samplesOffs + pos + tid + partSize] >> shared.task.wbits : 0;
const int residualLen = max(0,min(shared.task.blocksize - pos - shared.task.residualOrder, partSize));
shared.data[tid] = tid < dataLen ? samples[shared.task.data.samplesOffs + pos + tid] >> shared.task.data.wbits : 0;
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task.data.samplesOffs + pos + tid + partSize] >> shared.task.data.wbits : 0;
const int residualLen = max(0,min(shared.task.data.blocksize - pos - shared.task.data.residualOrder, partSize));

__syncthreads();
// compute residual
int sum = 0;
for (int c = 0; c < shared.task.residualOrder; c++)
for (int c = 0; c < shared.task.data.residualOrder; c++)
sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
__syncthreads();
shared.data[tid + shared.task.residualOrder] -= (sum >> shared.task.shift);
shared.data[tid + shared.task.data.residualOrder] -= (sum >> shared.task.data.shift);
__syncthreads();
if (tid >= shared.task.residualOrder && tid < residualLen + shared.task.residualOrder)
output[shared.task.residualOffs + pos + tid] = shared.data[tid];
if (tid + 256 < residualLen + shared.task.residualOrder)
output[shared.task.residualOffs + pos + tid + 256] = shared.data[tid + 256];
if (tid >= shared.task.data.residualOrder && tid < residualLen + shared.task.data.residualOrder)
output[shared.task.data.residualOffs + pos + tid] = shared.data[tid];
if (tid + 256 < residualLen + shared.task.data.residualOrder)
output[shared.task.data.residualOffs + pos + tid + 256] = shared.data[tid + 256];
}
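A plain C reference (hypothetical helper, not from the commit) of the residual that cudaEncodeResidual produces per sample: the input minus the shifted dot product of the previous order samples with the quantized LPC coefficients.

static void encode_residual_ref(const int *samples, int *residual, int blocksize,
                                const int *coefs, int order, int shift)
{
    for (int i = order; i < blocksize; i++)
    {
        long long sum = 0;
        for (int c = 0; c < order; c++)
            sum += (long long)coefs[c] * samples[i - order + c];
        residual[i] = samples[i] - (int)(sum >> shift);
    }
}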

extern "C" __global__ void cudaCalcPartition(
int* partition_lengths,
int* residual,
int* samples,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int max_porder, // <= 8
int psize, // == (shared.task.blocksize >> max_porder), < 256
int psize, // == (shared.task.data.blocksize >> max_porder), < 256
int parts_per_block // == 256 / psize, > 0, <= 16
)
{
__shared__ struct {
int data[256+32];
encodeResidualTaskStruct task;
FlaCudaSubframeTask task;
} shared;
const int tid = threadIdx.x + (threadIdx.y << 4);
if (tid < sizeof(shared.task) / sizeof(int))
@@ -844,18 +976,18 @@ extern "C" __global__ void cudaCalcPartition(
const int offs = blockIdx.x * psize * parts_per_block + tid;

// fetch samples
if (tid < 32) shared.data[tid] = min(offs, tid + shared.task.residualOrder) >= 32 ? samples[shared.task.samplesOffs + offs - 32] >> shared.task.wbits : 0;
shared.data[32 + tid] = tid < parts * psize ? samples[shared.task.samplesOffs + offs] >> shared.task.wbits : 0;
if (tid < 32) shared.data[tid] = min(offs, tid + shared.task.data.residualOrder) >= 32 ? samples[shared.task.data.samplesOffs + offs - 32] >> shared.task.data.wbits : 0;
shared.data[32 + tid] = tid < parts * psize ? samples[shared.task.data.samplesOffs + offs] >> shared.task.data.wbits : 0;
__syncthreads();

// compute residual
int s = 0;
for (int c = -shared.task.residualOrder; c < 0; c++)
s += __mul24(shared.data[32 + tid + c], shared.task.coefs[shared.task.residualOrder + c]);
s = shared.data[32 + tid] - (s >> shared.task.shift);
for (int c = -shared.task.data.residualOrder; c < 0; c++)
s += __mul24(shared.data[32 + tid + c], shared.task.coefs[shared.task.data.residualOrder + c]);
s = shared.data[32 + tid] - (s >> shared.task.data.shift);

if (offs >= shared.task.residualOrder && tid < parts * psize)
residual[shared.task.residualOffs + offs] = s;
if (offs >= shared.task.data.residualOrder && tid < parts * psize)
residual[shared.task.data.residualOffs + offs] = s;
else
s = 0;

@@ -878,7 +1010,7 @@ extern "C" __global__ void cudaCalcPartition(
//shared.data[tid] = s;
__syncthreads();

s = (psize - shared.task.residualOrder * (threadIdx.x + blockIdx.x == 0)) * (threadIdx.y + 1);
s = (psize - shared.task.data.residualOrder * (threadIdx.x + blockIdx.x == 0)) * (threadIdx.y + 1);
int dpos = __mul24(threadIdx.x, psize + 1);
//int dpos = __mul24(threadIdx.x, psize);
// calc number of unary bits for part threadIdx.x with rice parameter threadIdx.y
@@ -896,7 +1028,7 @@ extern "C" __global__ void cudaCalcPartition16(
int* partition_lengths,
int* residual,
int* samples,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int max_porder, // <= 8
int psize, // == 16
int parts_per_block // == 16
@@ -904,7 +1036,7 @@ extern "C" __global__ void cudaCalcPartition16(
{
__shared__ struct {
int data[256+32];
encodeResidualTaskStruct task;
FlaCudaSubframeTask task;
} shared;
const int tid = threadIdx.x + (threadIdx.y << 4);
if (tid < sizeof(shared.task) / sizeof(int))
@@ -914,18 +1046,30 @@ extern "C" __global__ void cudaCalcPartition16(
const int offs = (blockIdx.x << 8) + tid;

// fetch samples
if (tid < 32) shared.data[tid] = min(offs, tid + shared.task.residualOrder) >= 32 ? samples[shared.task.samplesOffs + offs - 32] >> shared.task.wbits : 0;
shared.data[32 + tid] = samples[shared.task.samplesOffs + offs] >> shared.task.wbits;
if (tid < 32) shared.data[tid] = min(offs, tid + shared.task.data.residualOrder) >= 32 ? samples[shared.task.data.samplesOffs + offs - 32] >> shared.task.data.wbits : 0;
shared.data[32 + tid] = samples[shared.task.data.samplesOffs + offs] >> shared.task.data.wbits;
// if (tid < 32 && tid >= shared.task.data.residualOrder)
//shared.task.coefs[tid] = 0;
__syncthreads();

// compute residual
int s = 0;
for (int c = -shared.task.residualOrder; c < 0; c++)
s += __mul24(shared.data[32 + tid + c], shared.task.coefs[shared.task.residualOrder + c]);
s = shared.data[32 + tid] - (s >> shared.task.shift);
for (int c = -shared.task.data.residualOrder; c < 0; c++)
s += __mul24(shared.data[32 + tid + c], shared.task.coefs[shared.task.data.residualOrder + c]);
// int spos = 32 + tid - shared.task.data.residualOrder;
// int s=
//__mul24(shared.data[spos + 0], shared.task.coefs[0]) + __mul24(shared.data[spos + 1], shared.task.coefs[1]) +
//__mul24(shared.data[spos + 2], shared.task.coefs[2]) + __mul24(shared.data[spos + 3], shared.task.coefs[3]) +
//__mul24(shared.data[spos + 4], shared.task.coefs[4]) + __mul24(shared.data[spos + 5], shared.task.coefs[5]) +
//__mul24(shared.data[spos + 6], shared.task.coefs[6]) + __mul24(shared.data[spos + 7], shared.task.coefs[7]) +
//__mul24(shared.data[spos + 8], shared.task.coefs[8]) + __mul24(shared.data[spos + 9], shared.task.coefs[9]) +
//__mul24(shared.data[spos + 10], shared.task.coefs[10]) + __mul24(shared.data[spos + 11], shared.task.coefs[11]) +
//__mul24(shared.data[spos + 12], shared.task.coefs[12]) + __mul24(shared.data[spos + 13], shared.task.coefs[13]) +
//__mul24(shared.data[spos + 14], shared.task.coefs[14]) + __mul24(shared.data[spos + 15], shared.task.coefs[15]);
s = shared.data[32 + tid] - (s >> shared.task.data.shift);

if (offs >= shared.task.residualOrder)
residual[shared.task.residualOffs + offs] = s;
if (blockIdx.x != 0 || tid >= shared.task.data.residualOrder)
residual[shared.task.data.residualOffs + (blockIdx.x << 8) + tid] = s;
else
s = 0;

@@ -937,27 +1081,27 @@ extern "C" __global__ void cudaCalcPartition16(

// calc number of unary bits for part threadIdx.x with rice parameter threadIdx.y
int dpos = __mul24(threadIdx.x, 17);
s =
(shared.data[dpos + 0] >> threadIdx.y) + (shared.data[dpos + 1] >> threadIdx.y) +
(shared.data[dpos + 2] >> threadIdx.y) + (shared.data[dpos + 3] >> threadIdx.y) +
(shared.data[dpos + 4] >> threadIdx.y) + (shared.data[dpos + 5] >> threadIdx.y) +
(shared.data[dpos + 6] >> threadIdx.y) + (shared.data[dpos + 7] >> threadIdx.y) +
(shared.data[dpos + 8] >> threadIdx.y) + (shared.data[dpos + 9] >> threadIdx.y) +
(shared.data[dpos + 10] >> threadIdx.y) + (shared.data[dpos + 11] >> threadIdx.y) +
(shared.data[dpos + 12] >> threadIdx.y) + (shared.data[dpos + 13] >> threadIdx.y) +
(shared.data[dpos + 14] >> threadIdx.y) + (shared.data[dpos + 15] >> threadIdx.y);
int sum =
(shared.data[dpos + 0] >> threadIdx.y) + (shared.data[dpos + 1] >> threadIdx.y) +
(shared.data[dpos + 2] >> threadIdx.y) + (shared.data[dpos + 3] >> threadIdx.y) +
(shared.data[dpos + 4] >> threadIdx.y) + (shared.data[dpos + 5] >> threadIdx.y) +
(shared.data[dpos + 6] >> threadIdx.y) + (shared.data[dpos + 7] >> threadIdx.y) +
(shared.data[dpos + 8] >> threadIdx.y) + (shared.data[dpos + 9] >> threadIdx.y) +
(shared.data[dpos + 10] >> threadIdx.y) + (shared.data[dpos + 11] >> threadIdx.y) +
(shared.data[dpos + 12] >> threadIdx.y) + (shared.data[dpos + 13] >> threadIdx.y) +
(shared.data[dpos + 14] >> threadIdx.y) + (shared.data[dpos + 15] >> threadIdx.y);

// output length
const int pos = ((15 * blockIdx.y + threadIdx.y) << (max_porder + 1)) + (blockIdx.x << 4) + threadIdx.x;
if (threadIdx.y <= 14)
partition_lengths[pos] = s + (16 - shared.task.residualOrder * (threadIdx.x + blockIdx.x == 0)) * (threadIdx.y + 1);
partition_lengths[pos] = sum + (16 - shared.task.data.residualOrder * (threadIdx.x + blockIdx.x == 0)) * (threadIdx.y + 1);
}

extern "C" __global__ void cudaCalcLargePartition(
int* partition_lengths,
int* residual,
int* samples,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int max_porder, // <= 8
int psize, // >= 128
int parts_per_block // == 1
@@ -966,7 +1110,7 @@ extern "C" __global__ void cudaCalcLargePartition(
__shared__ struct {
int data[256];
volatile int length[256];
encodeResidualTaskStruct task;
FlaCudaSubframeTask task;
} shared;
const int tid = threadIdx.x + (threadIdx.y << 4);
if (tid < sizeof(shared.task) / sizeof(int))
@@ -978,7 +1122,7 @@ extern "C" __global__ void cudaCalcLargePartition(
{
// fetch residual
int offs = blockIdx.x * psize + pos + tid;
int s = (offs >= shared.task.residualOrder && pos + tid < psize) ? residual[shared.task.residualOffs + offs] : 0;
int s = (offs >= shared.task.data.residualOrder && pos + tid < psize) ? residual[shared.task.data.residualOffs + offs] : 0;
// convert to unsigned
shared.data[tid] = min(0xfffff, (s << 1) ^ (s >> 31));
__syncthreads();
@@ -996,7 +1140,7 @@ extern "C" __global__ void cudaCalcLargePartition(
// output length
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (threadIdx.y << (max_porder + 1));
if (threadIdx.y <= 14 && threadIdx.x == 0)
partition_lengths[pos + blockIdx.x] = min(0xfffff,shared.length[tid]) + (psize - shared.task.residualOrder * (blockIdx.x == 0)) * (threadIdx.y + 1);
partition_lengths[pos + blockIdx.x] = min(0xfffff,shared.length[tid]) + (psize - shared.task.data.residualOrder * (blockIdx.x == 0)) * (threadIdx.y + 1);
}
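The (s << 1) ^ (s >> 31) expression above is the usual zigzag map from signed residuals to unsigned magnitudes, so that a right shift by k approximates the Rice remainder bits; a standalone sketch (the function name is illustrative):

/* zigzag: 0, -1, 1, -2, 2, ... map to 0, 1, 2, 3, 4, ... */
static unsigned zigzag(int s)
{
    return ((unsigned)s << 1) ^ (unsigned)(s >> 31);
}
/* e.g. zigzag(0) == 0, zigzag(-1) == 1, zigzag(1) == 2, zigzag(-2) == 3 */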

// Sums partition lengths for a certain k == blockIdx.x
@@ -1086,7 +1230,7 @@ extern "C" __global__ void cudaFindRiceParameter(

extern "C" __global__ void cudaFindPartitionOrder(
int* best_rice_parameters,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int* rice_parameters,
int max_porder
)
@@ -1096,7 +1240,8 @@ extern "C" __global__ void cudaFindPartitionOrder(
volatile int tmp[256];
int length[32];
int index[32];
encodeResidualTaskStruct task;
//char4 ch[64];
FlaCudaSubframeTask task;
} shared;
const int pos = (blockIdx.y << (max_porder + 2)) + (2 << max_porder);
if (threadIdx.x < sizeof(shared.task) / sizeof(int))
@@ -1130,22 +1275,36 @@ extern "C" __global__ void cudaFindPartitionOrder(
shared.length[threadIdx.x] = l1 = min(l1, l2);
}
if (threadIdx.x == 0)
tasks[blockIdx.y].porder = shared.index[0];
tasks[blockIdx.y].data.porder = shared.index[0];
if (threadIdx.x == 0)
{
int obits = shared.task.obits - shared.task.wbits;
tasks[blockIdx.y].size =
shared.task.type == Fixed ? shared.task.residualOrder * obits + 6 + l1 :
shared.task.type == LPC ? shared.task.residualOrder * obits + 6 + l1 + 4 + 5 + shared.task.residualOrder * shared.task.cbits :
shared.task.type == Constant ? obits : obits * shared.task.blocksize;
int obits = shared.task.data.obits - shared.task.data.wbits;
tasks[blockIdx.y].data.size =
shared.task.data.type == Fixed ? shared.task.data.residualOrder * obits + 6 + l1 :
shared.task.data.type == LPC ? shared.task.data.residualOrder * obits + 6 + l1 + 4 + 5 + shared.task.data.residualOrder * shared.task.data.cbits :
shared.task.data.type == Constant ? obits : obits * shared.task.data.blocksize;
}
}
__syncthreads();
int porder = shared.index[0];
//shared.data[threadIdx.x] = threadIdx.x < (1 << porder) ? rice_parameters[pos - (2 << porder) + threadIdx.x] : 0;
if (threadIdx.x < (1 << porder))
best_rice_parameters[(blockIdx.y << max_porder) + threadIdx.x] = rice_parameters[pos - (2 << porder) + threadIdx.x];
// FIXME: should be bytes?
// if (threadIdx.x < (1 << porder))
//shared.tmp[threadIdx.x] = rice_parameters[pos - (2 << porder) + threadIdx.x];
// __syncthreads();
// if (threadIdx.x < max(1, (1 << porder) >> 2))
// {
//char4 ch;
//ch.x = shared.tmp[(threadIdx.x << 2)];
//ch.y = shared.tmp[(threadIdx.x << 2) + 1];
//ch.z = shared.tmp[(threadIdx.x << 2) + 2];
//ch.w = shared.tmp[(threadIdx.x << 2) + 3];
//shared.ch[threadIdx.x] = ch
// }
// __syncthreads();
// if (threadIdx.x < max(1, (1 << porder) >> 2))
//best_rice_parameters[(blockIdx.y << max_porder) + threadIdx.x] = shared.ch[threadIdx.x];
}
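cudaFindPartitionOrder scans precomputed per-order totals; the search it performs is equivalent in spirit to this host-side sketch, where the array layout and the 4-bits-per-partition Rice parameter overhead are assumptions for illustration, not taken from the kernel.

#include <limits.h>

static int find_partition_order(const int *part_cost, /* (1 << max_porder) finest-level costs */
                                int max_porder,        /* <= 8, as in the kernels above */
                                int *best_cost_out)
{
    int cost[1 << 8];
    int n = 1 << max_porder;
    for (int i = 0; i < n; i++)
        cost[i] = part_cost[i];
    int best_porder = max_porder;
    int best_cost = INT_MAX;
    for (int porder = max_porder; porder >= 0; porder--)
    {
        int total = 0;
        for (int i = 0; i < (1 << porder); i++)
            total += cost[i] + 4;                 /* 4 bits to store each Rice parameter */
        if (total < best_cost)
        {
            best_cost = total;
            best_porder = porder;
        }
        for (int i = 0; i < (1 << porder) / 2; i++)   /* fold to the next coarser order */
            cost[i] = cost[2 * i] + cost[2 * i + 1];
    }
    *best_cost_out = best_cost;
    return best_porder;
}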

#endif
File diff suppressed because it is too large