optimizations

chudov
2009-10-06 22:43:47 +00:00
parent 2bfcac5f49
commit e21ad1591d
3 changed files with 1048 additions and 626 deletions

View File

@@ -98,7 +98,9 @@ namespace CUETools.Codecs.FlaCuda
CUdeviceptr cudaWindow;
bool encode_on_cpu = true;
bool encode_on_cpu = false;
bool do_lattice = false;
public const int MAX_BLOCKSIZE = 4096 * 16;
internal const int maxFrames = 128;
@@ -167,6 +169,30 @@ namespace CUETools.Codecs.FlaCuda
}
}
public bool GPUOnly
{
get
{
return !encode_on_cpu;
}
set
{
encode_on_cpu = !value;
}
}
public bool UseLattice
{
get
{
return do_lattice;
}
set
{
do_lattice = value;
}
}
//[DllImport("kernel32.dll")]
//static extern bool GetThreadTimes(IntPtr hThread, out long lpCreationTime, out long lpExitTime, out long lpKernelTime, out long lpUserTime);
//[DllImport("kernel32.dll")]
@@ -829,10 +855,8 @@ namespace CUETools.Codecs.FlaCuda
_windowcount++;
}
unsafe void initialize_autocorTasks(int blocksize, int channelsCount, int nFrames, FlaCudaTask task)
unsafe void initializeSubframeTasks(int blocksize, int channelsCount, int nFrames, FlaCudaTask task)
{
computeAutocorTaskStruct* autocorTasks = (computeAutocorTaskStruct*)task.autocorTasksPtr;
task.nAutocorTasks = 0;
task.nResidualTasks = 0;
task.nResidualTasksPerChannel = (_windowcount * eparams.max_prediction_order + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order + 7) & ~7;
task.nAutocorTasksPerChannel = _windowcount;
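For reference, a host-side sketch (plain C) of how nResidualTasksPerChannel rounds the per-channel task count up to a multiple of 8. The breakdown in the comments is an interpretation of the expression above, and the values in main() are illustrative.

#include <stdio.h>

static int tasks_per_channel(int windowcount, int max_order, int do_constant,
                             int min_fixed_order, int max_fixed_order)
{
    int n = windowcount * max_order            /* one LPC task per window and order */
          + 1                                  /* one extra task slot (verbatim) */
          + (do_constant ? 1 : 0)              /* optional constant task */
          + max_fixed_order - min_fixed_order; /* fixed-order tasks */
    return (n + 7) & ~7;                       /* round up to a multiple of 8 */
}

int main(void)
{
    /* e.g. 2 windows, max LPC order 12, constant enabled, fixed orders 0..2 */
    printf("%d\n", tasks_per_channel(2, 12, 1, 0, 2)); /* prints 32 */
    return 0;
}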
@@ -842,12 +866,6 @@ namespace CUETools.Codecs.FlaCuda
{
for (int iWindow = 0; iWindow < _windowcount; iWindow++)
{
// Autocorrelation task
autocorTasks[task.nAutocorTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
autocorTasks[task.nAutocorTasks].windowOffs = iWindow * FlaCudaWriter.MAX_BLOCKSIZE;
autocorTasks[task.nAutocorTasks].residualOffs = eparams.max_prediction_order * iWindow + task.nResidualTasksPerChannel * (ch + iFrame * channelsCount);
autocorTasks[task.nAutocorTasks].blocksize = blocksize;
task.nAutocorTasks++;
// LPC tasks
for (int order = 1; order <= eparams.max_prediction_order; order++)
{
@@ -859,6 +877,7 @@ namespace CUETools.Codecs.FlaCuda
task.ResidualTasks[task.nResidualTasks].residualOrder = order;
task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs;
task.ResidualTasks[task.nResidualTasks].windowOffs = iWindow * FlaCudaWriter.MAX_BLOCKSIZE;
task.nResidualTasks++;
}
}
@@ -930,12 +949,9 @@ namespace CUETools.Codecs.FlaCuda
}
}
}
if (sizeof(encodeResidualTaskStruct) * task.nResidualTasks > task.residualTasksLen)
if (sizeof(FlaCudaSubframeTask) * task.nResidualTasks > task.residualTasksLen)
throw new Exception("oops");
if (sizeof(computeAutocorTaskStruct) * task.nAutocorTasks > task.autocorTasksLen)
throw new Exception("oops");
cuda.CopyHostToDeviceAsync(task.cudaAutocorTasks, task.autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * task.nAutocorTasks), task.stream);
cuda.CopyHostToDeviceAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * task.nResidualTasks), task.stream);
cuda.CopyHostToDeviceAsync(task.cudaResidualTasks, task.residualTasksPtr, (uint)(sizeof(FlaCudaSubframeTask) * task.nResidualTasks), task.stream);
task.frameSize = blocksize;
}
@@ -1078,7 +1094,7 @@ namespace CUETools.Codecs.FlaCuda
return;
//int autocorPartSize = (2 * 256 - eparams.max_prediction_order) & ~15;
int autocorPartSize = 256 + 128;
int autocorPartSize = 32 * 15;
int autocorPartCount = (task.frameSize + autocorPartSize - 1) / autocorPartSize;
if (autocorPartCount > maxAutocorParts)
throw new Exception("internal error");
@@ -1115,6 +1131,7 @@ namespace CUETools.Codecs.FlaCuda
CUfunction cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? task.cudaStereoDecorr : task.cudaChannelDecorr2) : task.cudaChannelDecorr;
CUfunction cudaCalcPartition = calcPartitionPartSize >= 128 ? task.cudaCalcLargePartition : calcPartitionPartSize == 16 && task.frameSize >= 256 ? task.cudaCalcPartition16 : task.cudaCalcPartition;
CUfunction cudaEstimateResidual = eparams.max_prediction_order <= 8 ? task.cudaEstimateResidual8 : eparams.max_prediction_order <= 12 ? task.cudaEstimateResidual12 : task.cudaEstimateResidual;
cuda.SetParameter(cudaChannelDecorr, 0 * sizeof(uint), (uint)task.cudaSamples.Pointer);
cuda.SetParameter(cudaChannelDecorr, 1 * sizeof(uint), (uint)task.cudaSamplesBytes.Pointer);
@@ -1130,40 +1147,39 @@ namespace CUETools.Codecs.FlaCuda
cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1);
cuda.SetParameter(task.cudaComputeAutocor, 0, (uint)task.cudaAutocorOutput.Pointer);
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint), (uint)task.cudaSamples.Pointer);
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer);
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 3, (uint)task.cudaAutocorTasks.Pointer);
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4, (uint)eparams.max_prediction_order);
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)task.frameSize);
cuda.SetParameter(task.cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)autocorPartSize);
cuda.SetParameterSize(task.cudaComputeAutocor, sizeof(uint) * 7U);
cuda.SetParameter(task.cudaComputeAutocor, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
cuda.SetParameter(task.cudaComputeAutocor, 2 * sizeof(uint), (uint)cudaWindow.Pointer);
cuda.SetParameter(task.cudaComputeAutocor, 3 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaComputeAutocor, 4 * sizeof(uint), (uint)eparams.max_prediction_order);
cuda.SetParameter(task.cudaComputeAutocor, 5 * sizeof(uint), (uint)task.nAutocorTasksPerChannel - 1);
cuda.SetParameter(task.cudaComputeAutocor, 6 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameterSize(task.cudaComputeAutocor, 7U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1);
cuda.SetParameter(task.cudaComputeLPC, 0, (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint), (uint)task.cudaAutocorOutput.Pointer);
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 2, (uint)task.cudaAutocorTasks.Pointer);
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3, (uint)eparams.max_prediction_order);
cuda.SetParameter(task.cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)autocorPartCount);
cuda.SetParameterSize(task.cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2);
cuda.SetParameter(task.cudaComputeLPC, 1 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameter(task.cudaComputeLPC, 2 * sizeof(uint), (uint)task.cudaAutocorOutput.Pointer);
cuda.SetParameter(task.cudaComputeLPC, 3 * sizeof(uint), (uint)eparams.max_prediction_order);
cuda.SetParameter(task.cudaComputeLPC, 4 * sizeof(uint), (uint)autocorPartCount);
cuda.SetParameterSize(task.cudaComputeLPC, 5U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaComputeLPC, (autocorPartCount + 31) & ~31, 1, 1);
cuda.SetParameter(task.cudaComputeLPCLattice, 0, (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaComputeLPCLattice, 1 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameter(task.cudaComputeLPCLattice, 2 * sizeof(uint), (uint)task.cudaSamples.Pointer);
cuda.SetParameter(task.cudaComputeLPCLattice, 3 * sizeof(uint), (uint)_windowcount);
//cuda.SetParameter(task.cudaComputeLPCLattice, 3 * sizeof(uint), (uint)task.frameSize);
cuda.SetParameter(task.cudaComputeLPCLattice, 4 * sizeof(uint), (uint)eparams.max_prediction_order);
cuda.SetParameterSize(task.cudaComputeLPCLattice, 5U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaComputeLPCLattice, 256, 1, 1);
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 0, (uint)task.cudaResidualOutput.Pointer);
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 1, (uint)task.cudaSamples.Pointer);
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 2, (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 3, (uint)eparams.max_prediction_order);
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 4, (uint)task.frameSize);
cuda.SetParameter(task.cudaEstimateResidual, sizeof(uint) * 5, (uint)residualPartSize);
cuda.SetParameterSize(task.cudaEstimateResidual, sizeof(uint) * 6);
cuda.SetFunctionBlockShape(task.cudaEstimateResidual, 32, threads_y, 1);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 0, (uint)task.cudaResidualOutput.Pointer);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 1, (uint)task.cudaSamples.Pointer);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 2, (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 3, (uint)eparams.max_prediction_order);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 4, (uint)task.frameSize);
cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 5, (uint)residualPartSize);
cuda.SetParameterSize(cudaEstimateResidual, sizeof(uint) * 6);
cuda.SetFunctionBlockShape(cudaEstimateResidual, 32, threads_y, 1);
cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
@@ -1220,16 +1236,16 @@ namespace CUETools.Codecs.FlaCuda
// issue work to the GPU
cuda.LaunchAsync(cudaChannelDecorr, (task.frameCount * task.frameSize + 255) / 256, channels == 2 ? 1 : channels, task.stream);
if (task.frameSize <= 512 && eparams.max_prediction_order <= 12)
if (do_lattice && task.frameSize <= 512 && eparams.max_prediction_order <= 12)
cuda.LaunchAsync(task.cudaComputeLPCLattice, 1, channelsCount * task.frameCount, task.stream);
else
{
if (eparams.do_wasted)
cuda.LaunchAsync(task.cudaFindWastedBits, channelsCount * task.frameCount, 1, task.stream);
cuda.LaunchAsync(task.cudaComputeAutocor, autocorPartCount, task.nAutocorTasksPerChannel * channelsCount * task.frameCount, task.stream);
cuda.LaunchAsync(task.cudaComputeLPC, 1, task.nAutocorTasksPerChannel * channelsCount * task.frameCount, task.stream);
cuda.LaunchAsync(task.cudaComputeLPC, task.nAutocorTasksPerChannel, channelsCount * task.frameCount, task.stream);
}
cuda.LaunchAsync(task.cudaEstimateResidual, residualPartCount, task.nResidualTasksPerChannel * channelsCount * task.frameCount / threads_y, task.stream);
cuda.LaunchAsync(cudaEstimateResidual, residualPartCount, task.nResidualTasksPerChannel * channelsCount * task.frameCount / threads_y, task.stream);
cuda.LaunchAsync(task.cudaChooseBestMethod, 1, channelsCount * task.frameCount, task.stream);
if (channels == 2 && channelsCount == 4)
cuda.LaunchAsync(task.cudaCopyBestMethodStereo, 1, task.frameCount, task.stream);
@@ -1250,7 +1266,7 @@ namespace CUETools.Codecs.FlaCuda
cuda.CopyDeviceToHostAsync(task.cudaBestRiceParams, task.bestRiceParamsPtr, (uint)(sizeof(int) * (1 << max_porder) * channels * task.frameCount), task.stream);
task.max_porder = max_porder;
}
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * channels * task.frameCount), task.stream);
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(FlaCudaSubframeTask) * channels * task.frameCount), task.stream);
}
unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task)
@@ -1298,6 +1314,13 @@ namespace CUETools.Codecs.FlaCuda
task.frameCount = nFrames;
task.frameSize = blocksize;
cuda.CopyHostToDeviceAsync(task.cudaSamplesBytes, task.samplesBytesPtr, (uint)(sizeof(short) * channels * blocksize * nFrames), task.stream);
if (verify != null)
{
int* r = (int*)task.samplesBufferPtr;
fixed (int* s = task.verifyBuffer)
for (int ch = 0; ch < channels; ch++)
AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, task.frameSize * task.frameCount);
}
}
unsafe void run_GPU_task(FlaCudaTask task)
@@ -1320,15 +1343,7 @@ namespace CUETools.Codecs.FlaCuda
cuda.CopyHostToDevice<float>(cudaWindow, windowBuffer);
}
if (task.nResidualTasks == 0)
initialize_autocorTasks(task.frameSize, channelsCount, max_frames, task);
if (verify != null)
{
int* r = (int*)task.samplesBufferPtr;
fixed (int* s = task.verifyBuffer)
for (int ch = 0; ch < channels; ch++)
AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, task.frameSize * task.frameCount);
}
initializeSubframeTasks(task.frameSize, channelsCount, max_frames, task);
estimate_residual(task, channelsCount);
}
@@ -1691,13 +1706,14 @@ namespace CUETools.Codecs.FlaCuda
md5 = new MD5CryptoServiceProvider();
if (eparams.do_verify)
{
verify = new FlakeReader(channels, bits_per_sample);
verify.DoCRC = false;
}
frame_buffer = new byte[max_frame_size + 1];
frame_writer = new BitWriter(frame_buffer, 0, max_frame_size + 1);
encode_on_cpu = eparams.lpc_max_precision_search <= 1;
return header_len;
}
}
@@ -1891,16 +1907,7 @@ namespace CUETools.Codecs.FlaCuda
}
}
unsafe struct computeAutocorTaskStruct
{
public int samplesOffs;
public int windowOffs;
public int residualOffs;
public int blocksize;
public fixed int reserved[12];
};
unsafe struct encodeResidualTaskStruct
unsafe struct FlaCudaSubframeTask
{
public int residualOrder;
public int samplesOffs;
@@ -1916,7 +1923,8 @@ namespace CUETools.Codecs.FlaCuda
public int wbits;
public int abits;
public int porder;
public fixed int reserved[2];
public int windowOffs;
public fixed int reserved[1];
public fixed int coefs[32];
};
@@ -1931,6 +1939,8 @@ namespace CUETools.Codecs.FlaCuda
public CUfunction cudaComputeLPC;
public CUfunction cudaComputeLPCLattice;
public CUfunction cudaEstimateResidual;
public CUfunction cudaEstimateResidual8;
public CUfunction cudaEstimateResidual12;
public CUfunction cudaChooseBestMethod;
public CUfunction cudaCopyBestMethod;
public CUfunction cudaCopyBestMethodStereo;
@@ -1947,7 +1957,6 @@ namespace CUETools.Codecs.FlaCuda
public CUdeviceptr cudaPartitions;
public CUdeviceptr cudaRiceParams;
public CUdeviceptr cudaBestRiceParams;
public CUdeviceptr cudaAutocorTasks;
public CUdeviceptr cudaAutocorOutput;
public CUdeviceptr cudaResidualTasks;
public CUdeviceptr cudaResidualOutput;
@@ -1956,7 +1965,6 @@ namespace CUETools.Codecs.FlaCuda
public IntPtr samplesBufferPtr = IntPtr.Zero;
public IntPtr residualBufferPtr = IntPtr.Zero;
public IntPtr bestRiceParamsPtr = IntPtr.Zero;
public IntPtr autocorTasksPtr = IntPtr.Zero;
public IntPtr residualTasksPtr = IntPtr.Zero;
public IntPtr bestResidualTasksPtr = IntPtr.Zero;
public CUstream stream;
@@ -1964,12 +1972,10 @@ namespace CUETools.Codecs.FlaCuda
public int frameSize = 0;
public int frameCount = 0;
public FlacFrame frame;
public int autocorTasksLen;
public int residualTasksLen;
public int bestResidualTasksLen;
public int samplesBufferLen;
public int nResidualTasks = 0;
public int nAutocorTasks = 0;
public int nResidualTasksPerChannel = 0;
public int nAutocorTasksPerChannel = 0;
public int max_porder = 0;
@@ -1978,9 +1984,8 @@ namespace CUETools.Codecs.FlaCuda
{
cuda = _cuda;
autocorTasksLen = sizeof(computeAutocorTaskStruct) * channelCount * lpc.MAX_LPC_WINDOWS * FlaCudaWriter.maxFrames;
residualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
bestResidualTasksLen = sizeof(encodeResidualTaskStruct) * channelCount * FlaCudaWriter.maxFrames;
residualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
bestResidualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * FlaCudaWriter.maxFrames;
samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;
int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FlaCudaWriter.maxFrames;
int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FlaCudaWriter.maxFrames;
@@ -1991,7 +1996,6 @@ namespace CUETools.Codecs.FlaCuda
cudaPartitions = cuda.Allocate((uint)partitionsLen);
cudaRiceParams = cuda.Allocate((uint)riceParamsLen);
cudaBestRiceParams = cuda.Allocate((uint)riceParamsLen / 4);
cudaAutocorTasks = cuda.Allocate((uint)autocorTasksLen);
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
@@ -2005,8 +2009,6 @@ namespace CUETools.Codecs.FlaCuda
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref bestRiceParamsPtr, (uint)riceParamsLen / 4);
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)autocorTasksLen);
if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)residualTasksLen);
if (cuErr == CUResult.Success)
@@ -2017,7 +2019,6 @@ namespace CUETools.Codecs.FlaCuda
if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero;
if (bestRiceParamsPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestRiceParamsPtr); bestRiceParamsPtr = IntPtr.Zero;
if (autocorTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(autocorTasksPtr); autocorTasksPtr = IntPtr.Zero;
if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero;
if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero;
throw new CUDAException(cuErr);
@@ -2031,6 +2032,8 @@ namespace CUETools.Codecs.FlaCuda
cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
cudaComputeLPCLattice = cuda.GetModuleFunction("cudaComputeLPCLattice");
cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
cudaEstimateResidual8 = cuda.GetModuleFunction("cudaEstimateResidual8");
cudaEstimateResidual12 = cuda.GetModuleFunction("cudaEstimateResidual12");
cudaChooseBestMethod = cuda.GetModuleFunction("cudaChooseBestMethod");
cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod");
cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo");
@@ -2053,7 +2056,6 @@ namespace CUETools.Codecs.FlaCuda
cuda.Free(cudaSamplesBytes);
cuda.Free(cudaResidual);
cuda.Free(cudaPartitions);
cuda.Free(cudaAutocorTasks);
cuda.Free(cudaAutocorOutput);
cuda.Free(cudaResidualTasks);
cuda.Free(cudaResidualOutput);
@@ -2064,23 +2066,22 @@ namespace CUETools.Codecs.FlaCuda
CUDADriver.cuMemFreeHost(bestRiceParamsPtr);
CUDADriver.cuMemFreeHost(residualTasksPtr);
CUDADriver.cuMemFreeHost(bestResidualTasksPtr);
CUDADriver.cuMemFreeHost(autocorTasksPtr);
cuda.DestroyStream(stream);
}
public unsafe encodeResidualTaskStruct* ResidualTasks
public unsafe FlaCudaSubframeTask* ResidualTasks
{
get
{
return (encodeResidualTaskStruct*)residualTasksPtr;
return (FlaCudaSubframeTask*)residualTasksPtr;
}
}
public unsafe encodeResidualTaskStruct* BestResidualTasks
public unsafe FlaCudaSubframeTask* BestResidualTasks
{
get
{
return (encodeResidualTaskStruct*)bestResidualTasksPtr;
return (FlaCudaSubframeTask*)bestResidualTasksPtr;
}
}
}

View File

@@ -20,15 +20,6 @@
#ifndef _FLACUDA_KERNEL_H_
#define _FLACUDA_KERNEL_H_
typedef struct
{
int samplesOffs;
int windowOffs;
int residualOffs;
int blocksize;
int reserved[12];
} computeAutocorTaskStruct;
typedef enum
{
Constant = 0,
@@ -53,9 +44,15 @@ typedef struct
int wbits;
int abits;
int porder;
int reserved[2];
int windowOffs;
int reserved[1];
} FlaCudaSubframeData;
typedef struct
{
FlaCudaSubframeData data;
int coefs[32]; // fixme: should be short?
} encodeResidualTaskStruct;
} FlaCudaSubframeTask;
#define SUM16(buf,tid,op) buf[tid] op buf[tid + 8]; buf[tid] op buf[tid + 4]; buf[tid] op buf[tid + 2]; buf[tid] op buf[tid + 1];
#define SUM32(buf,tid,op) buf[tid] op buf[tid + 16]; SUM16(buf,tid,op)
@@ -110,7 +107,7 @@ extern "C" __global__ void cudaChannelDecorr(
}
extern "C" __global__ void cudaFindWastedBits(
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int *samples,
int tasksPerChannel,
int blocksize
@@ -119,11 +116,11 @@ extern "C" __global__ void cudaFindWastedBits(
__shared__ struct {
volatile int wbits[256];
volatile int abits[256];
encodeResidualTaskStruct task;
FlaCudaSubframeData task;
} shared;
if (threadIdx.x < 16)
((int*)&shared.task)[threadIdx.x] = ((int*)(&tasks[blockIdx.x * tasksPerChannel]))[threadIdx.x];
if (threadIdx.x < sizeof(shared.task) / sizeof(int))
((int*)&shared.task)[threadIdx.x] = ((int*)(&tasks[blockIdx.x * tasksPerChannel].data))[threadIdx.x];
shared.wbits[threadIdx.x] = 0;
shared.abits[threadIdx.x] = 0;
__syncthreads();
@@ -144,72 +141,86 @@ extern "C" __global__ void cudaFindWastedBits(
__syncthreads();
if (threadIdx.x < tasksPerChannel)
tasks[blockIdx.x * tasksPerChannel + threadIdx.x].wbits = shared.task.wbits;
tasks[blockIdx.x * tasksPerChannel + threadIdx.x].data.wbits = shared.task.wbits;
if (threadIdx.x < tasksPerChannel)
tasks[blockIdx.x * tasksPerChannel + threadIdx.x].abits = shared.task.abits;
tasks[blockIdx.x * tasksPerChannel + threadIdx.x].data.abits = shared.task.abits;
}
extern "C" __global__ void cudaComputeAutocor(
float *output,
const int *samples,
const float *window,
computeAutocorTaskStruct *tasks,
int max_order, // should be <= 32
int frameSize,
int partSize // should be <= 2*blockDim - max_order
FlaCudaSubframeTask *tasks,
const int max_order, // should be <= 32
const int windowcount, // windows (log2: 0,1)
const int taskCount // tasks per block
)
{
__shared__ struct {
float data[512];
volatile float product[256];
computeAutocorTaskStruct task;
FlaCudaSubframeData task;
volatile float result[33];
} shared;
const int tid = threadIdx.x + (threadIdx.y * 32);
// fetch task data
if (tid < sizeof(shared.task) / sizeof(int))
((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
((int*)&shared.task)[tid] = ((int*)(tasks + __mul24(taskCount, blockIdx.y >> windowcount) + __mul24(max_order, blockIdx.y & ((1 << windowcount)-1))))[tid];
__syncthreads();
// fetch samples
{
const int pos = blockIdx.x * partSize;
const int dataLen = min(frameSize - pos, partSize + max_order);
const int pos = __mul24(blockIdx.x, 15) * 32;
const int dataLen = min(shared.task.blocksize - pos, 15 * 32 + max_order);
const int pos2 = pos + tid;
shared.data[tid] = tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] * window[shared.task.windowOffs + pos + tid]: 0.0f;
shared.data[tid + 256] = tid + 256 < dataLen ? samples[shared.task.samplesOffs + pos + tid + 256] * window[shared.task.windowOffs + pos + tid + 256]: 0.0f;
shared.data[tid] = tid < dataLen ? samples[shared.task.samplesOffs + pos2] * window[shared.task.windowOffs + pos2]: 0.0f;
shared.data[tid + 256] = tid + 256 < dataLen ? samples[shared.task.samplesOffs + pos2 + 256] * window[shared.task.windowOffs + pos2 + 256]: 0.0f;
}
__syncthreads();
const int ptr = __mul24(threadIdx.x, 15);
for (int lag = threadIdx.y; lag <= max_order; lag += 8)
{
const int productLen = min(frameSize - blockIdx.x * partSize - lag, partSize);
shared.product[tid] = 0.0;
for (int ptr = threadIdx.x; ptr < productLen + threadIdx.x; ptr += 128)
shared.product[tid] += ((ptr < productLen) * shared.data[ptr] * shared.data[ptr + lag]
+ (ptr + 32 < productLen) * shared.data[ptr + 32] * shared.data[ptr + 32 + lag])
+ ((ptr + 64 < productLen) * shared.data[ptr + 64] * shared.data[ptr + 64 + lag]
+ (ptr + 96 < productLen) * shared.data[ptr + 96] * shared.data[ptr + 96 + lag]);
// product sum: reduction in shared mem
//shared.product[tid] += shared.product[tid + 16];
shared.product[tid] = (shared.product[tid] + shared.product[tid + 16]) + (shared.product[tid + 8] + shared.product[tid + 24]);
shared.product[tid] = (shared.product[tid] + shared.product[tid + 4]) + (shared.product[tid + 2] + shared.product[tid + 6]);
//const int productLen = min(shared.task.blocksize - blockIdx.x * partSize - lag, partSize);
const int ptr2 = ptr + lag;
shared.product[tid] =
shared.data[ptr + 0] * shared.data[ptr2 + 0] +
shared.data[ptr + 1] * shared.data[ptr2 + 1] +
shared.data[ptr + 2] * shared.data[ptr2 + 2] +
shared.data[ptr + 3] * shared.data[ptr2 + 3] +
shared.data[ptr + 4] * shared.data[ptr2 + 4] +
shared.data[ptr + 5] * shared.data[ptr2 + 5] +
shared.data[ptr + 6] * shared.data[ptr2 + 6] +
shared.data[ptr + 7] * shared.data[ptr2 + 7] +
shared.data[ptr + 8] * shared.data[ptr2 + 8] +
shared.data[ptr + 9] * shared.data[ptr2 + 9] +
shared.data[ptr + 10] * shared.data[ptr2 + 10] +
shared.data[ptr + 11] * shared.data[ptr2 + 11] +
shared.data[ptr + 12] * shared.data[ptr2 + 12] +
shared.data[ptr + 13] * shared.data[ptr2 + 13] +
shared.data[ptr + 14] * shared.data[ptr2 + 14];
shared.product[tid] = shared.product[tid] + shared.product[tid + 8] + shared.product[tid + 16] + shared.product[tid + 24];
shared.product[tid] = shared.product[tid] + shared.product[tid + 2] + shared.product[tid + 4] + shared.product[tid + 6];
// return results
if (threadIdx.x == 0)
output[(blockIdx.x + blockIdx.y * gridDim.x) * (max_order + 1) + lag] = shared.product[tid] + shared.product[tid + 1];
shared.result[lag] = shared.product[tid] + shared.product[tid + 1];
}
__syncthreads();
if (tid <= max_order)
output[(blockIdx.x + blockIdx.y * gridDim.x) * (max_order + 1) + tid] = shared.result[tid];
}
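For reference, the per-part values cudaComputeAutocor writes (one part is 32*15 = 480 samples, matching autocorPartSize on the host side) sum to the windowed autocorrelation of the frame; cudaComputeLPC adds the parts. A plain-C sketch of the whole-frame quantity, with illustrative names:

static void autocor_ref(float *autoc, const int *samples, const float *window,
                        int blocksize, int max_order)
{
    /* autoc[lag] = sum over i of (samples[i]*window[i]) * (samples[i+lag]*window[i+lag]) */
    for (int lag = 0; lag <= max_order; lag++) {
        double sum = 0.0;
        for (int i = 0; i + lag < blocksize; i++)
            sum += (double)(samples[i] * window[i]) *
                   (double)(samples[i + lag] * window[i + lag]);
        autoc[lag] = (float)sum;
    }
}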
extern "C" __global__ void cudaComputeLPC(
encodeResidualTaskStruct *output,
FlaCudaSubframeTask *tasks,
int taskCount, // tasks per block
float*autoc,
computeAutocorTaskStruct *tasks,
int max_order, // should be <= 32
int partCount // should be <= blockDim?
)
{
__shared__ struct {
computeAutocorTaskStruct task;
encodeResidualTaskStruct task2;
FlaCudaSubframeData task;
volatile float ldr[32];
volatile int bits[32];
volatile float autoc[33];
@@ -223,16 +234,13 @@ extern "C" __global__ void cudaComputeLPC(
// fetch task data
if (tid < sizeof(shared.task) / sizeof(int))
((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
__syncthreads();
if (tid < sizeof(shared.task2) / sizeof(int))
((int*)&shared.task2)[tid] = ((int*)(output + shared.task.residualOffs))[tid];
((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.x * max_order + blockIdx.y * taskCount))[tid];
__syncthreads();
// add up parts
for (int order = 0; order <= max_order; order++)
{
shared.parts[tid] = tid < partCount ? autoc[(blockIdx.y * partCount + tid) * (max_order + 1) + order] : 0;
shared.parts[tid] = tid < partCount ? autoc[((blockIdx.y * gridDim.x + blockIdx.x) * partCount + tid) * (max_order + 1) + order] : 0;
__syncthreads();
if (tid < 64 && blockDim.x > 64) shared.parts[tid] += shared.parts[tid + 64];
__syncthreads();
@@ -275,8 +283,8 @@ extern "C" __global__ void cudaComputeLPC(
// Quantization
//int precision = 13 - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576);
int precision = max(3, min(13 - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task2.abits));
int taskNo = shared.task.residualOffs + order;
int precision = max(3, min(min(13 - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), __clz(order) + 1 - shared.task.abits));
int taskNo = blockIdx.x * max_order + blockIdx.y * taskCount + order;
shared.bits[tid] = __mul24((33 - __clz(__float2int_rn(fabs(shared.ldr[tid]) * (1 << 15))) - precision), tid <= order);
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
@@ -288,9 +296,9 @@ extern "C" __global__ void cudaComputeLPC(
// reverse coefs
int coef = max(-(1 << precision),min((1 << precision)-1,__float2int_rn(-shared.ldr[order - tid] * (1 << sh))));
if (tid <= order)
output[taskNo].coefs[tid] = coef;
tasks[taskNo].coefs[tid] = coef;
if (tid == 0)
output[taskNo].shift = sh;
tasks[taskNo].data.shift = sh;
shared.bits[tid] = __mul24(33 - __clz(coef ^ (coef >> 31)), tid <= order);
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
@@ -299,13 +307,13 @@ extern "C" __global__ void cudaComputeLPC(
shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
int cbits = shared.bits[0];
if (tid == 0)
output[taskNo].cbits = cbits;
tasks[taskNo].data.cbits = cbits;
}
}
}
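A plain-C sketch of the coefficient quantization step in cudaComputeLPC: pick a shift so the largest |coefficient| still fits the chosen precision, then round and clamp each coefficient. This is simplified; the kernel derives the shift with __clz on the scaled float and also reverses the coefficient order.

#include <math.h>

static int quantize_lpc(const float *lpc, int order, int precision,
                        int max_shift, int *coefs)
{
    float cmax = 0.0f;
    for (int i = 0; i < order; i++)
        if (fabsf(lpc[i]) > cmax) cmax = fabsf(lpc[i]);

    int lim = 1 << precision;                 /* clamp range is [-lim, lim-1] */
    int shift = max_shift;                    /* e.g. 15, as in the kernel */
    while (shift > 0 && cmax * (float)(1 << shift) >= (float)lim)
        shift--;

    for (int i = 0; i < order; i++) {
        int c = (int)lrintf(lpc[i] * (float)(1 << shift));
        if (c > lim - 1) c = lim - 1;
        if (c < -lim)    c = -lim;
        coefs[i] = c;
    }
    return shift;
}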
extern "C" __global__ void cudaComputeLPCLattice(
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
const int taskCount, // tasks per block
const int *samples,
const int precisions,
@@ -313,7 +321,7 @@ extern "C" __global__ void cudaComputeLPCLattice(
)
{
__shared__ struct {
volatile encodeResidualTaskStruct task;
volatile FlaCudaSubframeTask task;
volatile float F[512];
volatile float lpc[12][32];
union {
@@ -328,25 +336,25 @@ extern "C" __global__ void cudaComputeLPCLattice(
__syncthreads();
// F = samples; B = samples
//int frameSize = shared.task.blocksize;
int s1 = threadIdx.x < shared.task.blocksize ? samples[shared.task.samplesOffs + threadIdx.x] : 0;
int s2 = threadIdx.x + 256 < shared.task.blocksize ? samples[shared.task.samplesOffs + threadIdx.x + 256] : 0;
//int frameSize = shared.task.data.blocksize;
int s1 = threadIdx.x < shared.task.data.blocksize ? samples[shared.task.data.samplesOffs + threadIdx.x] : 0;
int s2 = threadIdx.x + 256 < shared.task.data.blocksize ? samples[shared.task.data.samplesOffs + threadIdx.x + 256] : 0;
shared.tmpi[threadIdx.x] = s1|s2;
__syncthreads();
SUM256(shared.tmpi,threadIdx.x,|=);
if (threadIdx.x == 0)
shared.task.wbits = max(0,__ffs(shared.tmpi[0]) - 1);
shared.task.data.wbits = max(0,__ffs(shared.tmpi[0]) - 1);
__syncthreads();
if (threadIdx.x < taskCount)
tasks[blockIdx.y * taskCount + threadIdx.x].wbits = shared.task.wbits;
tasks[blockIdx.y * taskCount + threadIdx.x].data.wbits = shared.task.data.wbits;
shared.tmpi[threadIdx.x] = (s1 ^ (s1 >> 31)) | (s2 ^ (s2 >> 31));
__syncthreads();
SUM256(shared.tmpi,threadIdx.x,|=);
if (threadIdx.x == 0)
shared.task.abits = 32 - __clz(shared.tmpi[0]) - shared.task.wbits;
shared.task.data.abits = 32 - __clz(shared.tmpi[0]) - shared.task.data.wbits;
__syncthreads();
s1 >>= shared.task.wbits;
s2 >>= shared.task.wbits;
s1 >>= shared.task.data.wbits;
s2 >>= shared.task.data.wbits;
shared.F[threadIdx.x] = s1;
shared.F[threadIdx.x + 256] = s2;
__syncthreads();
@@ -354,10 +362,10 @@ extern "C" __global__ void cudaComputeLPCLattice(
for (int order = 1; order <= max_order; order++)
{
// reff = F(order+1:frameSize) * B(1:frameSize-order)' / DEN
float f1 = (threadIdx.x + order < shared.task.blocksize) * shared.F[threadIdx.x + order];
float f2 = (threadIdx.x + 256 + order < shared.task.blocksize) * shared.F[threadIdx.x + 256 + order];
s1 *= (threadIdx.x + order < shared.task.blocksize);
s2 *= (threadIdx.x + 256 + order < shared.task.blocksize);
float f1 = (threadIdx.x + order < shared.task.data.blocksize) * shared.F[threadIdx.x + order];
float f2 = (threadIdx.x + 256 + order < shared.task.data.blocksize) * shared.F[threadIdx.x + 256 + order];
s1 *= (threadIdx.x + order < shared.task.data.blocksize);
s2 *= (threadIdx.x + 256 + order < shared.task.data.blocksize);
// DEN = F(order+1:frameSize) * F(order+1:frameSize)' + B(1:frameSize-order) * B(1:frameSize-order)' (BURG)
shared.tmp[threadIdx.x] = FSQR(f1) + FSQR(f2) + FSQR(s1) + FSQR(s2);
@@ -388,9 +396,9 @@ extern "C" __global__ void cudaComputeLPCLattice(
// F1 = F(order+1:frameSize) - reff * B(1:frameSize-order)
// B(1:frameSize-order) = B(1:frameSize-order) - reff * F(order+1:frameSize)
// F(order+1:frameSize) = F1
if (threadIdx.x < shared.task.blocksize - order)
if (threadIdx.x < shared.task.data.blocksize - order)
shared.F[order + threadIdx.x] -= reff * s1;
if (threadIdx.x + 256 < shared.task.blocksize - order)
if (threadIdx.x + 256 < shared.task.data.blocksize - order)
shared.F[order + threadIdx.x + 256] -= reff * s2;
s1 -= reff * f1;
s2 -= reff * f2;
@@ -408,17 +416,17 @@ extern "C" __global__ void cudaComputeLPCLattice(
// OR reduction
SUM32(shared.tmpi,threadIdx.x,|=);
// choose precision
//int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - __float2int_rn(shared.PE[order - 1])
int cbits = max(3, min(10, shared.task.abits)) - precision;// + precision); // - __float2int_rn(shared.PE[order - 1])
//int cbits = max(3, min(10, 5 + (shared.task.data.abits >> 1))); // - __float2int_rn(shared.PE[order - 1])
int cbits = max(3, min(10, shared.task.data.abits)) - precision;// + precision); // - __float2int_rn(shared.PE[order - 1])
// calculate shift based on precision and number of leading zeroes in coeffs
int shift = max(0,min(15, __clz(shared.tmpi[threadIdx.x & ~31]) - 18 + cbits));
//if (shared.task.abits + 32 - __clz(order) < shift
//int shift = max(0,min(15, (shared.task.abits >> 2) - 14 + __clz(shared.tmpi[threadIdx.x & ~31]) + ((32 - __clz(order))>>1)));
//if (shared.task.data.abits + 32 - __clz(order) < shift
//int shift = max(0,min(15, (shared.task.data.abits >> 2) - 14 + __clz(shared.tmpi[threadIdx.x & ~31]) + ((32 - __clz(order))>>1)));
// quantize coeffs with given shift
coef = cn <= order ? max(-(1 << (cbits - 1)), min((1 << (cbits - 1)) -1, __float2int_rn(shared.lpc[order][order - cn] * (1 << shift)))) : 0;
// error correction
//shared.tmp[threadIdx.x] = (threadIdx.x != 0) * (shared.arp[threadIdx.x - 1]*(1 << shared.task.shift) - shared.task.coefs[threadIdx.x - 1]);
//shared.task.coefs[threadIdx.x] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, __float2int_rn((shared.arp[threadIdx.x]) * (1 << shared.task.shift) + shared.tmp[threadIdx.x])));
//shared.tmp[threadIdx.x] = (threadIdx.x != 0) * (shared.arp[threadIdx.x - 1]*(1 << shared.task.data.shift) - shared.task.data.coefs[threadIdx.x - 1]);
//shared.task.data.coefs[threadIdx.x] = max(-(1 << (shared.task.data.cbits - 1)), min((1 << (shared.task.data.cbits - 1))-1, __float2int_rn((shared.arp[threadIdx.x]) * (1 << shared.task.data.shift) + shared.tmp[threadIdx.x])));
// remove sign bits
shared.tmpi[threadIdx.x] = coef ^ (coef >> 31);
// OR reduction
@@ -429,16 +437,16 @@ extern "C" __global__ void cudaComputeLPCLattice(
// output shift, cbits and output coeffs
int taskNo = taskCount * blockIdx.y + order + precision * max_order;
if (cn == 0)
tasks[taskNo].shift = shift;
tasks[taskNo].data.shift = shift;
if (cn == 0)
tasks[taskNo].cbits = cbits;
tasks[taskNo].data.cbits = cbits;
if (cn <= order)
tasks[taskNo].coefs[cn] = coef;
}
}
//extern "C" __global__ void cudaComputeLPCLattice512(
// encodeResidualTaskStruct *tasks,
// FlaCudaSubframeTask *tasks,
// const int taskCount, // tasks per block
// const int *samples,
// const int frameSize, // <= 512
@@ -446,7 +454,7 @@ extern "C" __global__ void cudaComputeLPCLattice(
//)
//{
// __shared__ struct {
// encodeResidualTaskStruct task;
// FlaCudaSubframeTask task;
// float F[512];
// float B[512];
// float lpc[32][32];
@@ -463,7 +471,7 @@ extern "C" __global__ void cudaComputeLPCLattice(
// __syncthreads();
//
// // F = samples; B = samples
// shared.F[threadIdx.x] = threadIdx.x < frameSize ? samples[shared.task.samplesOffs + threadIdx.x] >> shared.task.wbits : 0.0f;
// shared.F[threadIdx.x] = threadIdx.x < frameSize ? samples[shared.task.data.samplesOffs + threadIdx.x] >> shared.task.data.wbits : 0.0f;
// shared.B[threadIdx.x] = shared.F[threadIdx.x];
// __syncthreads();
//
@@ -538,7 +546,7 @@ extern "C" __global__ void cudaComputeLPCLattice(
// {
// // Quantization
// int cn = threadIdx.x & 31;
// int precision = 10 - (order > 8) - min(2, shared.task.wbits);
// int precision = 10 - (order > 8) - min(2, shared.task.data.wbits);
// int taskNo = taskCount * blockIdx.y + order - 1;
// shared.bits[threadIdx.x] = __mul24((33 - __clz(__float2int_rn(fabs(shared.lpc[order - 1][cn]) * (1 << 15))) - precision), cn < order);
// shared.bits[threadIdx.x] = max(shared.bits[threadIdx.x], shared.bits[threadIdx.x + 16]);
@@ -571,7 +579,7 @@ extern "C" __global__ void cudaComputeLPCLattice(
extern "C" __global__ void cudaEstimateResidual(
int*output,
int*samples,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int max_order,
int frameSize,
int partSize // should be blockDim.x * blockDim.y == 256
@@ -580,10 +588,11 @@ extern "C" __global__ void cudaEstimateResidual(
__shared__ struct {
int data[32*9];
volatile int residual[32*8];
encodeResidualTaskStruct task[8];
FlaCudaSubframeData task[8];
int coefs[32*8];
} shared;
const int tid = threadIdx.x + threadIdx.y * blockDim.x;
if (threadIdx.x < 16)
const int tid = threadIdx.x + threadIdx.y * 32;
if (threadIdx.x < sizeof(FlaCudaSubframeData)/sizeof(int))
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[blockIdx.y * blockDim.y + threadIdx.y]))[threadIdx.x];
__syncthreads();
const int pos = blockIdx.x * partSize;
@@ -592,46 +601,169 @@ extern "C" __global__ void cudaEstimateResidual(
// fetch samples
shared.data[tid] = tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] >> shared.task[0].wbits : 0;
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task[0].samplesOffs + pos + tid + partSize] >> shared.task[0].wbits : 0;
const int residualLen = max(0,min(frameSize - pos - shared.task[threadIdx.y].residualOrder, partSize));
__syncthreads();
shared.residual[tid] = 0;
shared.task[threadIdx.y].coefs[threadIdx.x] = threadIdx.x < max_order ? tasks[blockIdx.y * blockDim.y + threadIdx.y].coefs[threadIdx.x] : 0;
shared.coefs[tid] = threadIdx.x < shared.task[threadIdx.y].residualOrder ? tasks[blockIdx.y * blockDim.y + threadIdx.y].coefs[threadIdx.x] : 0;
const int residualLen = max(0,min(frameSize - pos - shared.task[threadIdx.y].residualOrder, partSize));
for (int i = blockDim.y * (shared.task[threadIdx.y].type == Verbatim); i < blockDim.y; i++) // += 32
{
int ptr = threadIdx.x + (i<<5);
// compute residual
int *co = &shared.coefs[threadIdx.y << 5];
int ptr = threadIdx.x + (i << 5) + shared.task[threadIdx.y].residualOrder;
int sum = 0;
int c = 0;
for (c = 0; c < shared.task[threadIdx.y].residualOrder; c++)
sum += __mul24(shared.data[ptr + c], shared.task[threadIdx.y].coefs[c]);
sum = shared.data[ptr + c] - (sum >> shared.task[threadIdx.y].shift);
shared.residual[tid] += __mul24(ptr < residualLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
for (int c = -shared.task[threadIdx.y].residualOrder; c < 0; c++)
sum += __mul24(shared.data[ptr + c], *(co++));
sum = shared.data[ptr] - (sum >> shared.task[threadIdx.y].shift);
shared.residual[tid] += __mul24(ptr < dataLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
}
// enable this line when using blockDim.x == 64
//__syncthreads(); if (threadIdx.x < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
shared.residual[tid] += shared.residual[tid + 16];
shared.residual[tid] += shared.residual[tid + 8];
shared.residual[tid] += shared.residual[tid + 4];
shared.residual[tid] += shared.residual[tid + 2];
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
shared.residual[tid] += shared.residual[tid + 1];
// rice parameter search
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y * blockDim.x] != 0) *
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y * blockDim.x] - (residualLen >> 1)) >> threadIdx.x));
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 8]);
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 4]);
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 2]);
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 1]);
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
if (threadIdx.x == 0)
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid];
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
}
extern "C" __global__ void cudaEstimateResidual8(
int*output,
int*samples,
FlaCudaSubframeTask *tasks,
int max_order,
int frameSize,
int partSize // should be blockDim.x * blockDim.y == 256
)
{
__shared__ struct {
int data[32*9];
volatile int residual[32*8];
FlaCudaSubframeData task[8];
int coefs[32*8];
} shared;
const int tid = threadIdx.x + threadIdx.y * 32;
if (threadIdx.x < sizeof(FlaCudaSubframeData)/sizeof(int))
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[blockIdx.y * blockDim.y + threadIdx.y]))[threadIdx.x];
__syncthreads();
const int pos = blockIdx.x * partSize;
const int dataLen = min(frameSize - pos, partSize + max_order);
// fetch samples
shared.data[tid] = tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] >> shared.task[0].wbits : 0;
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task[0].samplesOffs + pos + tid + partSize] >> shared.task[0].wbits : 0;
__syncthreads();
shared.residual[tid] = 0;
shared.coefs[tid] = threadIdx.x < shared.task[threadIdx.y].residualOrder ? tasks[blockIdx.y * blockDim.y + threadIdx.y].coefs[threadIdx.x] : 0;
const int residualLen = max(0,min(frameSize - pos - shared.task[threadIdx.y].residualOrder, partSize));
const int ptr2 = threadIdx.y << 5;
int s = 0;
for (int ptr = threadIdx.x + blockDim.y * 32 * (shared.task[threadIdx.y].type == Verbatim); ptr < blockDim.y * 32 + threadIdx.x; ptr += 32)
{
// compute residual
int sum =
__mul24(shared.data[ptr + 0], shared.coefs[ptr2 + 0]) +
__mul24(shared.data[ptr + 1], shared.coefs[ptr2 + 1]) +
__mul24(shared.data[ptr + 2], shared.coefs[ptr2 + 2]) +
__mul24(shared.data[ptr + 3], shared.coefs[ptr2 + 3]) +
__mul24(shared.data[ptr + 4], shared.coefs[ptr2 + 4]) +
__mul24(shared.data[ptr + 5], shared.coefs[ptr2 + 5]) +
__mul24(shared.data[ptr + 6], shared.coefs[ptr2 + 6]) +
__mul24(shared.data[ptr + 7], shared.coefs[ptr2 + 7]);
sum = shared.data[ptr + shared.task[threadIdx.y].residualOrder] - (sum >> shared.task[threadIdx.y].shift);
s += __mul24(ptr < residualLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
}
shared.residual[tid] = s;
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
shared.residual[tid] += shared.residual[tid + 1];
// rice parameter search
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
if (threadIdx.x == 0)
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
}
extern "C" __global__ void cudaEstimateResidual12(
int*output,
int*samples,
FlaCudaSubframeTask *tasks,
int max_order,
int frameSize,
int partSize // should be blockDim.x * blockDim.y == 256
)
{
__shared__ struct {
int data[32*9];
volatile int residual[32*8];
FlaCudaSubframeData task[8];
int coefs[8*32];
} shared;
const int tid = threadIdx.x + threadIdx.y * 32;
if (threadIdx.x < sizeof(FlaCudaSubframeData)/sizeof(int))
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[blockIdx.y * blockDim.y + threadIdx.y]))[threadIdx.x];
__syncthreads();
const int pos = blockIdx.x * partSize;
const int dataLen = min(frameSize - pos, partSize + max_order);
// fetch samples
shared.data[tid] = tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] >> shared.task[0].wbits : 0;
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task[0].samplesOffs + pos + tid + partSize] >> shared.task[0].wbits : 0;
__syncthreads();
shared.residual[tid] = 0;
shared.coefs[tid] = threadIdx.x < shared.task[threadIdx.y].residualOrder ? tasks[blockIdx.y * blockDim.y + threadIdx.y].coefs[threadIdx.x] : 0;
const int residualLen = shared.task[threadIdx.y].type == Verbatim ? 0 : max(0,min(frameSize - pos - shared.task[threadIdx.y].residualOrder, partSize));
const int ptr2 = threadIdx.y << 5;
int s = 0;
for (int ptr = threadIdx.x; ptr < residualLen; ptr += 32)
{
// compute residual
int sum =
__mul24(shared.data[ptr + 0], shared.coefs[ptr2 + 0]) +
__mul24(shared.data[ptr + 1], shared.coefs[ptr2 + 1]) +
__mul24(shared.data[ptr + 2], shared.coefs[ptr2 + 2]) +
__mul24(shared.data[ptr + 3], shared.coefs[ptr2 + 3]) +
__mul24(shared.data[ptr + 4], shared.coefs[ptr2 + 4]) +
__mul24(shared.data[ptr + 5], shared.coefs[ptr2 + 5]) +
__mul24(shared.data[ptr + 6], shared.coefs[ptr2 + 6]) +
__mul24(shared.data[ptr + 7], shared.coefs[ptr2 + 7]) +
__mul24(shared.data[ptr + 8], shared.coefs[ptr2 + 8]) +
__mul24(shared.data[ptr + 9], shared.coefs[ptr2 + 9]) +
__mul24(shared.data[ptr + 10], shared.coefs[ptr2 + 10]) +
__mul24(shared.data[ptr + 11], shared.coefs[ptr2 + 11]);
sum = shared.data[ptr + shared.task[threadIdx.y].residualOrder] - (sum >> shared.task[threadIdx.y].shift);
s += min(0x7fffff,(sum << 1) ^ (sum >> 31));
}
shared.residual[tid] = s;
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
shared.residual[tid] += shared.residual[tid + 1];
// rice parameter search
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
if (threadIdx.x == 0)
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
}
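A plain-C sketch of the Rice-parameter cost estimate these kernels compute per task: zigzag-map and sum the residuals (clamping each term, as the kernel does), then for each candidate parameter k approximate the coded size as n*(k+1) bits plus the remaining high bits of the sum.

static int rice_estimate(const int *res, int n)
{
    long long sum = 0;
    for (int i = 0; i < n; i++) {
        int u = (res[i] << 1) ^ (res[i] >> 31);     /* zigzag to non-negative */
        sum += u < 0x7fffff ? u : 0x7fffff;         /* per-sample clamp */
    }
    long long adj = sum - (n >> 1);
    if (adj < 0) adj = 0;                           /* avoid shifting a negative value */
    long long best = -1;
    for (int k = 0; k < 15; k++) {                  /* k >= 15 is rejected by the kernel */
        long long bits = (long long)n * (k + 1) + (adj >> k);
        if (best < 0 || bits < best) best = bits;
    }
    return (int)best;
}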
extern "C" __global__ void cudaChooseBestMethod(
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int *residual,
int partCount, // <= blockDim.y (256)
int taskCount
@@ -641,7 +773,7 @@ extern "C" __global__ void cudaChooseBestMethod(
volatile int index[128];
volatile int length[256];
volatile int partLen[256];
volatile encodeResidualTaskStruct task[8];
volatile FlaCudaSubframeTask task[8];
} shared;
const int tid = threadIdx.x + threadIdx.y * 32;
@@ -666,13 +798,13 @@ extern "C" __global__ void cudaChooseBestMethod(
// return sum
if (threadIdx.x == 0)
{
int obits = shared.task[threadIdx.y].obits - shared.task[threadIdx.y].wbits;
int obits = shared.task[threadIdx.y].data.obits - shared.task[threadIdx.y].data.wbits;
shared.length[task + threadIdx.y] =
min(obits * shared.task[threadIdx.y].blocksize,
shared.task[threadIdx.y].type == Fixed ? shared.task[threadIdx.y].residualOrder * obits + 6 + (4 * partCount/2) + shared.partLen[threadIdx.y * 32] :
shared.task[threadIdx.y].type == LPC ? shared.task[threadIdx.y].residualOrder * obits + 4 + 5 + shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[threadIdx.y * 32] :
shared.task[threadIdx.y].type == Constant ? obits * (1 + shared.task[threadIdx.y].blocksize * (shared.partLen[threadIdx.y * 32] != 0)) :
obits * shared.task[threadIdx.y].blocksize);
min(obits * shared.task[threadIdx.y].data.blocksize,
shared.task[threadIdx.y].data.type == Fixed ? shared.task[threadIdx.y].data.residualOrder * obits + 6 + (4 * partCount/2) + shared.partLen[threadIdx.y * 32] :
shared.task[threadIdx.y].data.type == LPC ? shared.task[threadIdx.y].data.residualOrder * obits + 4 + 5 + shared.task[threadIdx.y].data.residualOrder * shared.task[threadIdx.y].data.cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[threadIdx.y * 32] :
shared.task[threadIdx.y].data.type == Constant ? obits * (1 + shared.task[threadIdx.y].data.blocksize * (shared.partLen[threadIdx.y * 32] != 0)) :
obits * shared.task[threadIdx.y].data.blocksize);
}
}
//shared.index[threadIdx.x] = threadIdx.x;
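A plain-C sketch of the per-subframe size estimate above (obits here is the effective sample width, i.e. obits - wbits). Only Constant = 0 is visible in the header diff, so the other type constants below are illustrative.

enum { EST_CONSTANT, EST_VERBATIM, EST_FIXED, EST_LPC };   /* illustrative ordering */

static int subframe_size_estimate(int type, int obits, int blocksize, int order,
                                  int cbits, int part_count, int part_len)
{
    int verbatim = obits * blocksize;
    int size;
    switch (type) {
    case EST_FIXED:    /* warm-up samples + header bits + Rice parameters + partition sums */
        size = order * obits + 6 + (4 * part_count / 2) + part_len;
        break;
    case EST_LPC:      /* adds precision/shift fields and the quantized coefficients */
        size = order * obits + 4 + 5 + order * cbits + 6
             + (4 * part_count / 2) + part_len;
        break;
    case EST_CONSTANT: /* one sample, unless the residual shows the block is not constant */
        size = obits * (1 + blocksize * (part_len != 0));
        break;
    default:
        size = verbatim;
        break;
    }
    return size < verbatim ? size : verbatim;   /* never worse than verbatim */
}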
@@ -681,7 +813,7 @@ extern "C" __global__ void cudaChooseBestMethod(
__syncthreads();
if (tid < taskCount)
tasks[tid + taskCount * blockIdx.y].size = shared.length[tid];
tasks[tid + taskCount * blockIdx.y].data.size = shared.length[tid];
__syncthreads();
int l1 = shared.length[tid];
@@ -709,13 +841,13 @@ extern "C" __global__ void cudaChooseBestMethod(
shared.length[tid] = l1 = min(l1, l2);
}
if (tid == 0)
tasks[taskCount * blockIdx.y].best_index = taskCount * blockIdx.y + shared.index[shared.length[1] < shared.length[0]];
tasks[taskCount * blockIdx.y].data.best_index = taskCount * blockIdx.y + shared.index[shared.length[1] < shared.length[0]];
}
}
extern "C" __global__ void cudaCopyBestMethod(
encodeResidualTaskStruct *tasks_out,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks_out,
FlaCudaSubframeTask *tasks,
int count
)
{
@@ -723,15 +855,15 @@ extern "C" __global__ void cudaCopyBestMethod(
int best_index;
} shared;
if (threadIdx.x == 0)
shared.best_index = tasks[count * blockIdx.y].best_index;
shared.best_index = tasks[count * blockIdx.y].data.best_index;
__syncthreads();
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
if (threadIdx.x < sizeof(FlaCudaSubframeTask)/sizeof(int))
((int*)(tasks_out + blockIdx.y))[threadIdx.x] = ((int*)(tasks + shared.best_index))[threadIdx.x];
}
extern "C" __global__ void cudaCopyBestMethodStereo(
encodeResidualTaskStruct *tasks_out,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks_out,
FlaCudaSubframeTask *tasks,
int count
)
{
@@ -741,9 +873,9 @@ extern "C" __global__ void cudaCopyBestMethodStereo(
int lr_index[2];
} shared;
if (threadIdx.x < 4)
shared.best_index[threadIdx.x] = tasks[count * (blockIdx.y * 4 + threadIdx.x)].best_index;
shared.best_index[threadIdx.x] = tasks[count * (blockIdx.y * 4 + threadIdx.x)].data.best_index;
if (threadIdx.x < 4)
shared.best_size[threadIdx.x] = tasks[shared.best_index[threadIdx.x]].size;
shared.best_size[threadIdx.x] = tasks[shared.best_index[threadIdx.x]].data.size;
__syncthreads();
if (threadIdx.x == 0)
{
@@ -774,25 +906,25 @@ extern "C" __global__ void cudaCopyBestMethodStereo(
}
}
__syncthreads();
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
if (threadIdx.x < sizeof(FlaCudaSubframeTask)/sizeof(int))
((int*)(tasks_out + 2 * blockIdx.y))[threadIdx.x] = ((int*)(tasks + shared.lr_index[0]))[threadIdx.x];
if (threadIdx.x == 0)
tasks_out[2 * blockIdx.y].residualOffs = tasks[shared.best_index[0]].residualOffs;
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
tasks_out[2 * blockIdx.y].data.residualOffs = tasks[shared.best_index[0]].data.residualOffs;
if (threadIdx.x < sizeof(FlaCudaSubframeTask)/sizeof(int))
((int*)(tasks_out + 2 * blockIdx.y + 1))[threadIdx.x] = ((int*)(tasks + shared.lr_index[1]))[threadIdx.x];
if (threadIdx.x == 0)
tasks_out[2 * blockIdx.y + 1].residualOffs = tasks[shared.best_index[1]].residualOffs;
tasks_out[2 * blockIdx.y + 1].data.residualOffs = tasks[shared.best_index[1]].data.residualOffs;
}
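A plain-C sketch of the stereo mode choice behind this kernel: given the best sizes found for the four decorrelated channels, pick the cheapest of the four stereo modes. The left/right/mid/side channel ordering used here is an assumption for illustration.

static void choose_stereo(const int size[4] /* assumed: left, right, mid, side */,
                          int *ch0, int *ch1)
{
    int lr = size[0] + size[1];     /* left/right */
    int ls = size[0] + size[3];     /* left/side  */
    int sr = size[3] + size[1];     /* side/right */
    int ms = size[2] + size[3];     /* mid/side   */
    int best = lr; *ch0 = 0; *ch1 = 1;
    if (ls < best) { best = ls; *ch0 = 0; *ch1 = 3; }
    if (sr < best) { best = sr; *ch0 = 3; *ch1 = 1; }
    if (ms < best) { best = ms; *ch0 = 2; *ch1 = 3; }
}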
extern "C" __global__ void cudaEncodeResidual(
int*output,
int*samples,
encodeResidualTaskStruct *tasks
FlaCudaSubframeTask *tasks
)
{
__shared__ struct {
int data[256 + 32];
encodeResidualTaskStruct task;
FlaCudaSubframeTask task;
} shared;
const int tid = threadIdx.x;
if (threadIdx.x < sizeof(shared.task) / sizeof(int))
@@ -800,40 +932,40 @@ extern "C" __global__ void cudaEncodeResidual(
__syncthreads();
const int partSize = blockDim.x;
const int pos = blockIdx.x * partSize;
const int dataLen = min(shared.task.blocksize - pos, partSize + shared.task.residualOrder);
const int dataLen = min(shared.task.data.blocksize - pos, partSize + shared.task.data.residualOrder);
// fetch samples
shared.data[tid] = tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] >> shared.task.wbits : 0;
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task.samplesOffs + pos + tid + partSize] >> shared.task.wbits : 0;
const int residualLen = max(0,min(shared.task.blocksize - pos - shared.task.residualOrder, partSize));
shared.data[tid] = tid < dataLen ? samples[shared.task.data.samplesOffs + pos + tid] >> shared.task.data.wbits : 0;
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task.data.samplesOffs + pos + tid + partSize] >> shared.task.data.wbits : 0;
const int residualLen = max(0,min(shared.task.data.blocksize - pos - shared.task.data.residualOrder, partSize));
__syncthreads();
// compute residual
int sum = 0;
for (int c = 0; c < shared.task.residualOrder; c++)
for (int c = 0; c < shared.task.data.residualOrder; c++)
sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
__syncthreads();
shared.data[tid + shared.task.residualOrder] -= (sum >> shared.task.shift);
shared.data[tid + shared.task.data.residualOrder] -= (sum >> shared.task.data.shift);
__syncthreads();
if (tid >= shared.task.residualOrder && tid < residualLen + shared.task.residualOrder)
output[shared.task.residualOffs + pos + tid] = shared.data[tid];
if (tid + 256 < residualLen + shared.task.residualOrder)
output[shared.task.residualOffs + pos + tid + 256] = shared.data[tid + 256];
if (tid >= shared.task.data.residualOrder && tid < residualLen + shared.task.data.residualOrder)
output[shared.task.data.residualOffs + pos + tid] = shared.data[tid];
if (tid + 256 < residualLen + shared.task.data.residualOrder)
output[shared.task.data.residualOffs + pos + tid + 256] = shared.data[tid + 256];
}
extern "C" __global__ void cudaCalcPartition(
int* partition_lengths,
int* residual,
int* samples,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int max_porder, // <= 8
int psize, // == (shared.task.blocksize >> max_porder), < 256
int psize, // == (shared.task.data.blocksize >> max_porder), < 256
int parts_per_block // == 256 / psize, > 0, <= 16
)
{
__shared__ struct {
int data[256+32];
encodeResidualTaskStruct task;
FlaCudaSubframeTask task;
} shared;
const int tid = threadIdx.x + (threadIdx.y << 4);
if (tid < sizeof(shared.task) / sizeof(int))
@@ -844,18 +976,18 @@ extern "C" __global__ void cudaCalcPartition(
const int offs = blockIdx.x * psize * parts_per_block + tid;
// fetch samples
if (tid < 32) shared.data[tid] = min(offs, tid + shared.task.residualOrder) >= 32 ? samples[shared.task.samplesOffs + offs - 32] >> shared.task.wbits : 0;
shared.data[32 + tid] = tid < parts * psize ? samples[shared.task.samplesOffs + offs] >> shared.task.wbits : 0;
if (tid < 32) shared.data[tid] = min(offs, tid + shared.task.data.residualOrder) >= 32 ? samples[shared.task.data.samplesOffs + offs - 32] >> shared.task.data.wbits : 0;
shared.data[32 + tid] = tid < parts * psize ? samples[shared.task.data.samplesOffs + offs] >> shared.task.data.wbits : 0;
__syncthreads();
// compute residual
int s = 0;
for (int c = -shared.task.residualOrder; c < 0; c++)
s += __mul24(shared.data[32 + tid + c], shared.task.coefs[shared.task.residualOrder + c]);
s = shared.data[32 + tid] - (s >> shared.task.shift);
for (int c = -shared.task.data.residualOrder; c < 0; c++)
s += __mul24(shared.data[32 + tid + c], shared.task.coefs[shared.task.data.residualOrder + c]);
s = shared.data[32 + tid] - (s >> shared.task.data.shift);
if (offs >= shared.task.residualOrder && tid < parts * psize)
residual[shared.task.residualOffs + offs] = s;
if (offs >= shared.task.data.residualOrder && tid < parts * psize)
residual[shared.task.data.residualOffs + offs] = s;
else
s = 0;
@@ -878,7 +1010,7 @@ extern "C" __global__ void cudaCalcPartition(
//shared.data[tid] = s;
__syncthreads();
s = (psize - shared.task.residualOrder * (threadIdx.x + blockIdx.x == 0)) * (threadIdx.y + 1);
s = (psize - shared.task.data.residualOrder * (threadIdx.x + blockIdx.x == 0)) * (threadIdx.y + 1);
int dpos = __mul24(threadIdx.x, psize + 1);
//int dpos = __mul24(threadIdx.x, psize);
// calc number of unary bits for part threadIdx.x with Rice parameter threadIdx.y
@@ -896,7 +1028,7 @@ extern "C" __global__ void cudaCalcPartition16(
int* partition_lengths,
int* residual,
int* samples,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int max_porder, // <= 8
int psize, // == 16
int parts_per_block // == 16
@@ -904,7 +1036,7 @@ extern "C" __global__ void cudaCalcPartition16(
{
__shared__ struct {
int data[256+32];
encodeResidualTaskStruct task;
FlaCudaSubframeTask task;
} shared;
const int tid = threadIdx.x + (threadIdx.y << 4);
if (tid < sizeof(shared.task) / sizeof(int))
@@ -914,18 +1046,30 @@ extern "C" __global__ void cudaCalcPartition16(
const int offs = (blockIdx.x << 8) + tid;
// fetch samples
if (tid < 32) shared.data[tid] = min(offs, tid + shared.task.residualOrder) >= 32 ? samples[shared.task.samplesOffs + offs - 32] >> shared.task.wbits : 0;
shared.data[32 + tid] = samples[shared.task.samplesOffs + offs] >> shared.task.wbits;
if (tid < 32) shared.data[tid] = min(offs, tid + shared.task.data.residualOrder) >= 32 ? samples[shared.task.data.samplesOffs + offs - 32] >> shared.task.data.wbits : 0;
shared.data[32 + tid] = samples[shared.task.data.samplesOffs + offs] >> shared.task.data.wbits;
// if (tid < 32 && tid >= shared.task.data.residualOrder)
//shared.task.coefs[tid] = 0;
__syncthreads();
// compute residual
int s = 0;
for (int c = -shared.task.residualOrder; c < 0; c++)
s += __mul24(shared.data[32 + tid + c], shared.task.coefs[shared.task.residualOrder + c]);
s = shared.data[32 + tid] - (s >> shared.task.shift);
for (int c = -shared.task.data.residualOrder; c < 0; c++)
s += __mul24(shared.data[32 + tid + c], shared.task.coefs[shared.task.data.residualOrder + c]);
// int spos = 32 + tid - shared.task.data.residualOrder;
// int s=
//__mul24(shared.data[spos + 0], shared.task.coefs[0]) + __mul24(shared.data[spos + 1], shared.task.coefs[1]) +
//__mul24(shared.data[spos + 2], shared.task.coefs[2]) + __mul24(shared.data[spos + 3], shared.task.coefs[3]) +
//__mul24(shared.data[spos + 4], shared.task.coefs[4]) + __mul24(shared.data[spos + 5], shared.task.coefs[5]) +
//__mul24(shared.data[spos + 6], shared.task.coefs[6]) + __mul24(shared.data[spos + 7], shared.task.coefs[7]) +
//__mul24(shared.data[spos + 8], shared.task.coefs[8]) + __mul24(shared.data[spos + 9], shared.task.coefs[9]) +
//__mul24(shared.data[spos + 10], shared.task.coefs[10]) + __mul24(shared.data[spos + 11], shared.task.coefs[11]) +
//__mul24(shared.data[spos + 12], shared.task.coefs[12]) + __mul24(shared.data[spos + 13], shared.task.coefs[13]) +
//__mul24(shared.data[spos + 14], shared.task.coefs[14]) + __mul24(shared.data[spos + 15], shared.task.coefs[15]);
s = shared.data[32 + tid] - (s >> shared.task.data.shift);
if (offs >= shared.task.residualOrder)
residual[shared.task.residualOffs + offs] = s;
if (blockIdx.x != 0 || tid >= shared.task.data.residualOrder)
residual[shared.task.data.residualOffs + (blockIdx.x << 8) + tid] = s;
else
s = 0;
@@ -937,27 +1081,27 @@ extern "C" __global__ void cudaCalcPartition16(
// calc number of unary bits for part threadIdx.x with Rice parameter threadIdx.y
int dpos = __mul24(threadIdx.x, 17);
s =
(shared.data[dpos + 0] >> threadIdx.y) + (shared.data[dpos + 1] >> threadIdx.y) +
(shared.data[dpos + 2] >> threadIdx.y) + (shared.data[dpos + 3] >> threadIdx.y) +
(shared.data[dpos + 4] >> threadIdx.y) + (shared.data[dpos + 5] >> threadIdx.y) +
(shared.data[dpos + 6] >> threadIdx.y) + (shared.data[dpos + 7] >> threadIdx.y) +
(shared.data[dpos + 8] >> threadIdx.y) + (shared.data[dpos + 9] >> threadIdx.y) +
(shared.data[dpos + 10] >> threadIdx.y) + (shared.data[dpos + 11] >> threadIdx.y) +
(shared.data[dpos + 12] >> threadIdx.y) + (shared.data[dpos + 13] >> threadIdx.y) +
(shared.data[dpos + 14] >> threadIdx.y) + (shared.data[dpos + 15] >> threadIdx.y);
int sum =
(shared.data[dpos + 0] >> threadIdx.y) + (shared.data[dpos + 1] >> threadIdx.y) +
(shared.data[dpos + 2] >> threadIdx.y) + (shared.data[dpos + 3] >> threadIdx.y) +
(shared.data[dpos + 4] >> threadIdx.y) + (shared.data[dpos + 5] >> threadIdx.y) +
(shared.data[dpos + 6] >> threadIdx.y) + (shared.data[dpos + 7] >> threadIdx.y) +
(shared.data[dpos + 8] >> threadIdx.y) + (shared.data[dpos + 9] >> threadIdx.y) +
(shared.data[dpos + 10] >> threadIdx.y) + (shared.data[dpos + 11] >> threadIdx.y) +
(shared.data[dpos + 12] >> threadIdx.y) + (shared.data[dpos + 13] >> threadIdx.y) +
(shared.data[dpos + 14] >> threadIdx.y) + (shared.data[dpos + 15] >> threadIdx.y);
// output length
const int pos = ((15 * blockIdx.y + threadIdx.y) << (max_porder + 1)) + (blockIdx.x << 4) + threadIdx.x;
if (threadIdx.y <= 14)
partition_lengths[pos] = s + (16 - shared.task.residualOrder * (threadIdx.x + blockIdx.x == 0)) * (threadIdx.y + 1);
partition_lengths[pos] = sum + (16 - shared.task.data.residualOrder * (threadIdx.x + blockIdx.x == 0)) * (threadIdx.y + 1);
}
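// Illustrative sketch (not from this commit): the Rice size estimate the
// partition kernels accumulate. Residuals are zigzag-mapped to unsigned,
// the unary quotients (u >> k) are summed, and (k + 1) bits per sample cover
// the k-bit remainder plus the unary stop bit; the first partition of a
// subframe holds `order` fewer residuals because of the warmup samples.
static int rice_partition_bits(const int *res, int n, int k)
{
    int bits = 0;
    for (int i = 0; i < n; i++)
    {
        // zigzag map, same arithmetic-shift trick as the kernels
        unsigned u = ((unsigned)res[i] << 1) ^ (unsigned)(res[i] >> 31);
        bits += (int)(u >> k);           // unary quotient bits
    }
    return bits + (k + 1) * n;           // remainder + stop bit per sample
}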
extern "C" __global__ void cudaCalcLargePartition(
int* partition_lengths,
int* residual,
int* samples,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int max_porder, // <= 8
int psize, // >= 128
int parts_per_block // == 1
@@ -966,7 +1110,7 @@ extern "C" __global__ void cudaCalcLargePartition(
__shared__ struct {
int data[256];
volatile int length[256];
encodeResidualTaskStruct task;
FlaCudaSubframeTask task;
} shared;
const int tid = threadIdx.x + (threadIdx.y << 4);
if (tid < sizeof(shared.task) / sizeof(int))
@@ -978,7 +1122,7 @@ extern "C" __global__ void cudaCalcLargePartition(
{
// fetch residual
int offs = blockIdx.x * psize + pos + tid;
int s = (offs >= shared.task.residualOrder && pos + tid < psize) ? residual[shared.task.residualOffs + offs] : 0;
int s = (offs >= shared.task.data.residualOrder && pos + tid < psize) ? residual[shared.task.data.residualOffs + offs] : 0;
// convert to unsigned
shared.data[tid] = min(0xfffff, (s << 1) ^ (s >> 31));
__syncthreads();
@@ -996,7 +1140,7 @@ extern "C" __global__ void cudaCalcLargePartition(
// output length
const int pos = (15 << (max_porder + 1)) * blockIdx.y + (threadIdx.y << (max_porder + 1));
if (threadIdx.y <= 14 && threadIdx.x == 0)
partition_lengths[pos + blockIdx.x] = min(0xfffff,shared.length[tid]) + (psize - shared.task.residualOrder * (blockIdx.x == 0)) * (threadIdx.y + 1);
partition_lengths[pos + blockIdx.x] = min(0xfffff,shared.length[tid]) + (psize - shared.task.data.residualOrder * (blockIdx.x == 0)) * (threadIdx.y + 1);
}
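// Illustrative sketch (not from this commit): for a fixed Rice parameter k,
// partition lengths at order p - 1 are adjacent pairs from order p summed,
// which is what the summation kernels below exploit before the best parameter
// and partition order are chosen.
static void merge_partition_lengths(const int *len_p, int *len_pm1, int nparts_p)
{
    for (int i = 0; i < nparts_p / 2; i++)
        len_pm1[i] = len_p[2 * i] + len_p[2 * i + 1];
}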
// Sums partition lengths for a certain k == blockIdx.x
@@ -1086,7 +1230,7 @@ extern "C" __global__ void cudaFindRiceParameter(
extern "C" __global__ void cudaFindPartitionOrder(
int* best_rice_parameters,
encodeResidualTaskStruct *tasks,
FlaCudaSubframeTask *tasks,
int* rice_parameters,
int max_porder
)
@@ -1096,7 +1240,8 @@ extern "C" __global__ void cudaFindPartitionOrder(
volatile int tmp[256];
int length[32];
int index[32];
encodeResidualTaskStruct task;
//char4 ch[64];
FlaCudaSubframeTask task;
} shared;
const int pos = (blockIdx.y << (max_porder + 2)) + (2 << max_porder);
if (threadIdx.x < sizeof(shared.task) / sizeof(int))
@@ -1130,22 +1275,36 @@ extern "C" __global__ void cudaFindPartitionOrder(
shared.length[threadIdx.x] = l1 = min(l1, l2);
}
if (threadIdx.x == 0)
tasks[blockIdx.y].porder = shared.index[0];
tasks[blockIdx.y].data.porder = shared.index[0];
if (threadIdx.x == 0)
{
int obits = shared.task.obits - shared.task.wbits;
tasks[blockIdx.y].size =
shared.task.type == Fixed ? shared.task.residualOrder * obits + 6 + l1 :
shared.task.type == LPC ? shared.task.residualOrder * obits + 6 + l1 + 4 + 5 + shared.task.residualOrder * shared.task.cbits :
shared.task.type == Constant ? obits : obits * shared.task.blocksize;
int obits = shared.task.data.obits - shared.task.data.wbits;
tasks[blockIdx.y].data.size =
shared.task.data.type == Fixed ? shared.task.data.residualOrder * obits + 6 + l1 :
shared.task.data.type == LPC ? shared.task.data.residualOrder * obits + 6 + l1 + 4 + 5 + shared.task.data.residualOrder * shared.task.data.cbits :
shared.task.data.type == Constant ? obits : obits * shared.task.data.blocksize;
}
}
__syncthreads();
int porder = shared.index[0];
//shared.data[threadIdx.x] = threadIdx.x < (1 << porder) ? rice_parameters[pos - (2 << porder) + threadIdx.x] : 0;
if (threadIdx.x < (1 << porder))
best_rice_parameters[(blockIdx.y << max_porder) + threadIdx.x] = rice_parameters[pos - (2 << porder) + threadIdx.x];
// FIXME: should be bytes?
// if (threadIdx.x < (1 << porder))
//shared.tmp[threadIdx.x] = rice_parameters[pos - (2 << porder) + threadIdx.x];
// __syncthreads();
// if (threadIdx.x < max(1, (1 << porder) >> 2))
// {
//char4 ch;
//ch.x = shared.tmp[(threadIdx.x << 2)];
//ch.y = shared.tmp[(threadIdx.x << 2) + 1];
//ch.z = shared.tmp[(threadIdx.x << 2) + 2];
//ch.w = shared.tmp[(threadIdx.x << 2) + 3];
//shared.ch[threadIdx.x] = ch
// }
// __syncthreads();
// if (threadIdx.x < max(1, (1 << porder) >> 2))
//best_rice_parameters[(blockIdx.y << max_porder) + threadIdx.x] = shared.ch[threadIdx.x];
}
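// Illustrative sketch (not from this commit): the subframe size model that the
// final formula in cudaFindPartitionOrder encodes. The constant 6 is the 2-bit
// residual coding method plus the 4-bit partition order; LPC additionally
// stores a 4-bit precision, a 5-bit shift and `order` coefficients of `cbits`
// bits each. obits is the sample width minus wasted bits. Names are hypothetical.
enum SubframeType { Constant, Verbatim, Fixed, LPC };
static int subframe_bits(enum SubframeType type, int order, int obits,
                         int cbits, int residual_bits, int blocksize)
{
    switch (type)
    {
    case Constant: return obits;
    case Verbatim: return obits * blocksize;
    case Fixed:    return order * obits + 6 + residual_bits;
    case LPC:      return order * obits + 6 + residual_bits + 4 + 5 + order * cbits;
    }
    return 0;
}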
#endif

File diff suppressed because it is too large