mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
testing on Fermi
This commit is contained in:
@@ -37,6 +37,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
{
|
||||
this.DoVerify = false;
|
||||
this.GPUOnly = true;
|
||||
this.MappedMemory = false;
|
||||
this.DoMD5 = true;
|
||||
this.GroupSize = 64;
|
||||
this.DeviceType = OpenCLDeviceType.GPU;
|
||||
@@ -56,6 +57,10 @@ namespace CUETools.Codecs.FLACCL
|
||||
[SRDescription(typeof(Properties.Resources), "DescriptionGPUOnly")]
|
||||
public bool GPUOnly { get; set; }
|
||||
|
||||
[DefaultValue(false)]
|
||||
[SRDescription(typeof(Properties.Resources), "DescriptionMappedMemory")]
|
||||
public bool MappedMemory { get; set; }
|
||||
|
||||
[DefaultValue(64)]
|
||||
[SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")]
|
||||
public int GroupSize { get; set; }
|
||||
@@ -247,6 +252,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
{
|
||||
_settings.GroupSize = 1;
|
||||
_settings.GPUOnly = false;
|
||||
_settings.MappedMemory = true;
|
||||
}
|
||||
eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
|
||||
}
|
||||
@@ -956,7 +962,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
if ((eparams.window_function & flag) == 0 || task.nWindowFunctions == lpc.MAX_LPC_WINDOWS)
|
||||
return;
|
||||
|
||||
func(((float*)task.clWindowFunctions.HostPtr) + task.nWindowFunctions * task.frameSize, task.frameSize);
|
||||
func(((float*)task.clWindowFunctionsPtr) + task.nWindowFunctions * task.frameSize, task.frameSize);
|
||||
//int sz = _windowsize;
|
||||
//float* pos = window + _windowcount * FLACCLWriter.MAX_BLOCKSIZE * 2;
|
||||
//do
|
||||
@@ -983,12 +989,16 @@ namespace CUETools.Codecs.FLACCL
|
||||
calculate_window(task, lpc.window_bartlett, WindowFunction.Bartlett);
|
||||
if (task.nWindowFunctions == 0)
|
||||
throw new Exception("invalid windowfunction");
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, true, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctions.HostPtr);
|
||||
if (!_settings.MappedMemory)
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, true, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr);
|
||||
}
|
||||
|
||||
task.nResidualTasks = 0;
|
||||
task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window);
|
||||
task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + (eparams.do_constant ? 1 : 0) + Math.Max(0, 1 + eparams.max_fixed_order - eparams.min_fixed_order);
|
||||
if (task.nResidualTasksPerChannel > 32)
|
||||
throw new Exception("too many tasks");
|
||||
|
||||
//if (task.nResidualTasksPerChannel >= 4)
|
||||
// task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7;
|
||||
for (int iFrame = 0; iFrame < nFrames; iFrame++)
|
||||
@@ -1082,7 +1092,8 @@ namespace CUETools.Codecs.FLACCL
|
||||
if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen)
|
||||
throw new Exception("oops");
|
||||
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasks.HostPtr);
|
||||
if (!_settings.MappedMemory)
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr);
|
||||
}
|
||||
|
||||
unsafe void encode_residual(FLACCLTask task)
|
||||
@@ -1192,7 +1203,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
for (int ch = 0; ch < channels; ch++)
|
||||
{
|
||||
int index = ch + iFrame * channels;
|
||||
frame.subframes[ch].best.residual = ((int*)task.clResidual.HostPtr) + task.BestResidualTasks[index].residualOffs;
|
||||
frame.subframes[ch].best.residual = ((int*)task.clResidualPtr) + task.BestResidualTasks[index].residualOffs;
|
||||
frame.subframes[ch].best.type = SubframeType.Verbatim;
|
||||
frame.subframes[ch].best.size = (uint)(frame.subframes[ch].obits * frame.blocksize);
|
||||
frame.subframes[ch].wbits = 0;
|
||||
@@ -1213,7 +1224,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
|
||||
if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
|
||||
{
|
||||
int* riceParams = ((int*)task.clBestRiceParams.HostPtr) + (index << task.max_porder);
|
||||
int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder);
|
||||
fixed (int* dstParams = frame.subframes[ch].best.rc.rparams)
|
||||
AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder));
|
||||
//for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++)
|
||||
@@ -1240,7 +1251,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
unsafe void unpack_samples(FLACCLTask task, int count)
|
||||
{
|
||||
int iFrame = task.frame.frame_number;
|
||||
short* src = ((short*)task.clSamplesBytes.HostPtr) + iFrame * channels * task.frameSize;
|
||||
short* src = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize;
|
||||
|
||||
switch (task.frame.ch_mode)
|
||||
{
|
||||
@@ -1327,7 +1338,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
for (int ch = 0; ch < channelCount; ch++)
|
||||
task.frame.subframes[ch].Init(
|
||||
smp + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize,
|
||||
((int*)task.clResidual.HostPtr) + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize,
|
||||
((int*)task.clResidualPtr) + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize,
|
||||
_pcm.BitsPerSample + (doMidside && ch == 3 ? 1 : 0), 0);
|
||||
|
||||
select_best_methods(task.frame, channelCount, iFrame, task);
|
||||
@@ -1360,7 +1371,8 @@ namespace CUETools.Codecs.FLACCL
|
||||
task.framePos = frame_pos;
|
||||
frame_count += nFrames;
|
||||
frame_pos += nFrames * blocksize;
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, sizeof(short) * channels * blocksize * nFrames, task.clSamplesBytes.HostPtr);
|
||||
if (!_settings.MappedMemory)
|
||||
task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, sizeof(short) * channels * blocksize * nFrames, task.clSamplesBytesPtr);
|
||||
//task.openCLCQ.EnqueueUnmapMemObject(task.clSamplesBytes, task.clSamplesBytes.HostPtr);
|
||||
//task.openCLCQ.EnqueueMapBuffer(task.clSamplesBytes, true, MapFlags.WRITE, 0, task.samplesBufferLen / 2);
|
||||
}
|
||||
@@ -1402,7 +1414,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
{
|
||||
for (int ch = 0; ch < channels; ch++)
|
||||
{
|
||||
short* res = ((short*)task.clSamplesBytes.HostPtr) + iFrame * channels * task.frameSize + ch;
|
||||
short* res = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize + ch;
|
||||
int* smp = r + ch * Flake.MAX_BLOCKSIZE;
|
||||
for (int i = task.frameSize; i > 0; i--)
|
||||
{
|
||||
@@ -1589,7 +1601,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
int block = Math.Min(buff.Length - pos, eparams.block_size * max_frames - samplesInBuffer);
|
||||
|
||||
fixed (byte* buf = buff.Bytes)
|
||||
AudioSamples.MemCpy(((byte*)task1.clSamplesBytes.HostPtr) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign);
|
||||
AudioSamples.MemCpy(((byte*)task1.clSamplesBytesPtr) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign);
|
||||
|
||||
samplesInBuffer += block;
|
||||
pos += block;
|
||||
@@ -1703,8 +1715,8 @@ namespace CUETools.Codecs.FLACCL
|
||||
samplesInBuffer -= bs;
|
||||
if (samplesInBuffer > 0)
|
||||
AudioSamples.MemCpy(
|
||||
((byte*)task2.clSamplesBytes.HostPtr),
|
||||
((byte*)task1.clSamplesBytes.HostPtr) + bs * _pcm.BlockAlign,
|
||||
((byte*)task2.clSamplesBytesPtr),
|
||||
((byte*)task1.clSamplesBytesPtr) + bs * _pcm.BlockAlign,
|
||||
samplesInBuffer * _pcm.BlockAlign);
|
||||
FLACCLTask tmp = task1;
|
||||
task1 = task2;
|
||||
@@ -2226,6 +2238,21 @@ namespace CUETools.Codecs.FLACCL
|
||||
public Mem clResidualOutput;
|
||||
public Mem clBestResidualTasks;
|
||||
public Mem clWindowFunctions;
|
||||
|
||||
public Mem clSamplesBytesPinned;
|
||||
public Mem clResidualPinned;
|
||||
public Mem clBestRiceParamsPinned;
|
||||
public Mem clResidualTasksPinned;
|
||||
public Mem clBestResidualTasksPinned;
|
||||
public Mem clWindowFunctionsPinned;
|
||||
|
||||
public IntPtr clSamplesBytesPtr;
|
||||
public IntPtr clResidualPtr;
|
||||
public IntPtr clBestRiceParamsPtr;
|
||||
public IntPtr clResidualTasksPtr;
|
||||
public IntPtr clBestResidualTasksPtr;
|
||||
public IntPtr clWindowFunctionsPtr;
|
||||
|
||||
public int[] samplesBuffer;
|
||||
public byte[] outputBuffer;
|
||||
public int outputSize = 0;
|
||||
@@ -2268,25 +2295,73 @@ namespace CUETools.Codecs.FLACCL
|
||||
#endif
|
||||
openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLProgram.Context.Devices[0], prop);
|
||||
|
||||
residualTasksLen = sizeof(FLACCLSubframeTask) * channelsCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames;
|
||||
int MAX_ORDER = this.writer.eparams.max_prediction_order;
|
||||
residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * FLACCLWriter.maxFrames;
|
||||
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * FLACCLWriter.maxFrames;
|
||||
samplesBufferLen = sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channelsCount;
|
||||
int partitionsLen = sizeof(int) * (30 << 8) * channels * FLACCLWriter.maxFrames;
|
||||
int riceParamsLen = sizeof(int) * (4 << 8) * channels * FLACCLWriter.maxFrames;
|
||||
int lpcDataLen = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelsCount * FLACCLWriter.maxFrames;
|
||||
int autocorLen = sizeof(float) * (MAX_ORDER + 1) * lpc.MAX_LPC_WINDOWS * channelsCount * FLACCLWriter.maxFrames;
|
||||
int lpcDataLen = autocorLen * 32;
|
||||
int resOutLen = sizeof(int) * channelsCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * FLACCLWriter.maxFrames;
|
||||
int wndLen = sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE /** 2*/ * lpc.MAX_LPC_WINDOWS;
|
||||
|
||||
clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY | MemFlags.ALLOC_HOST_PTR, (uint)samplesBufferLen / 2);
|
||||
clSamples = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen);
|
||||
if (!writer._settings.MappedMemory)
|
||||
{
|
||||
clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen / 2);
|
||||
clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen);
|
||||
clBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen / 4);
|
||||
clResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualTasksLen);
|
||||
clBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, bestResidualTasksLen);
|
||||
clWindowFunctions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, wndLen);
|
||||
|
||||
clSamplesBytesPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen / 2);
|
||||
clResidualPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen);
|
||||
clBestRiceParamsPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4);
|
||||
clResidualTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen);
|
||||
clBestResidualTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen);
|
||||
clWindowFunctionsPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, wndLen);
|
||||
|
||||
clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytesPinned, true, MapFlags.WRITE, 0, samplesBufferLen / 2);
|
||||
clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidualPinned, true, MapFlags.WRITE, 0, samplesBufferLen);
|
||||
clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParamsPinned, true, MapFlags.WRITE, 0, riceParamsLen / 4);
|
||||
clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasksPinned, true, MapFlags.WRITE, 0, residualTasksLen);
|
||||
clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasksPinned, true, MapFlags.WRITE, 0, bestResidualTasksLen);
|
||||
clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctionsPinned, true, MapFlags.WRITE, 0, wndLen);
|
||||
}
|
||||
else
|
||||
{
|
||||
clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, (uint)samplesBufferLen / 2);
|
||||
clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen);
|
||||
clLPCData = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, lpcDataLen);
|
||||
clPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen);
|
||||
clRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen);
|
||||
clBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4);
|
||||
clAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelsCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FLACCLWriter.maxFrames);
|
||||
clResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen);
|
||||
clBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen);
|
||||
clResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(int) * channelsCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FLACCLWriter.maxResidualParts*/ * FLACCLWriter.maxFrames);
|
||||
clWindowFunctions = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY | MemFlags.ALLOC_HOST_PTR, sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE /** 2*/ * lpc.MAX_LPC_WINDOWS);
|
||||
clWindowFunctions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, wndLen);
|
||||
|
||||
clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.WRITE, 0, samplesBufferLen / 2);
|
||||
clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidual, true, MapFlags.WRITE, 0, samplesBufferLen);
|
||||
clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParams, true, MapFlags.WRITE, 0, riceParamsLen / 4);
|
||||
clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasks, true, MapFlags.WRITE, 0, residualTasksLen);
|
||||
clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasks, true, MapFlags.WRITE, 0, bestResidualTasksLen);
|
||||
clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctions, true, MapFlags.WRITE, 0, wndLen);
|
||||
|
||||
//clSamplesBytesPtr = clSamplesBytes.HostPtr;
|
||||
//clResidualPtr = clResidual.HostPtr;
|
||||
//clBestRiceParamsPtr = clBestRiceParams.HostPtr;
|
||||
//clResidualTasksPtr = clResidualTasks.HostPtr;
|
||||
//clBestResidualTasksPtr = clBestResidualTasks.HostPtr;
|
||||
//clWindowFunctionsPtr = clWindowFunctions.HostPtr;
|
||||
}
|
||||
|
||||
clSamples = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen);
|
||||
clLPCData = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, lpcDataLen);
|
||||
clAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, autocorLen);
|
||||
clResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, resOutLen);
|
||||
if (writer._settings.GPUOnly)
|
||||
{
|
||||
clPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen);
|
||||
clRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen);
|
||||
}
|
||||
|
||||
//openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.WRITE, 0, samplesBufferLen / 2);
|
||||
|
||||
@@ -2337,6 +2412,8 @@ namespace CUETools.Codecs.FLACCL
|
||||
workThread = null;
|
||||
}
|
||||
|
||||
openCLCQ.Finish();
|
||||
|
||||
clComputeAutocor.Dispose();
|
||||
clStereoDecorr.Dispose();
|
||||
//cudaChannelDecorr.Dispose();
|
||||
@@ -2357,13 +2434,53 @@ namespace CUETools.Codecs.FLACCL
|
||||
clSumPartition.Dispose();
|
||||
clFindRiceParameter.Dispose();
|
||||
clFindPartitionOrder.Dispose();
|
||||
|
||||
clPartitions.Dispose();
|
||||
clRiceParams.Dispose();
|
||||
}
|
||||
|
||||
if (!writer._settings.MappedMemory)
|
||||
{
|
||||
if (clSamplesBytesPtr != IntPtr.Zero)
|
||||
openCLCQ.EnqueueUnmapMemObject(clSamplesBytesPinned, clSamplesBytesPtr);
|
||||
clSamplesBytesPtr = IntPtr.Zero;
|
||||
if (clResidualPtr != IntPtr.Zero)
|
||||
openCLCQ.EnqueueUnmapMemObject(clResidualPinned, clResidualPtr);
|
||||
clResidualPtr = IntPtr.Zero;
|
||||
if (clBestRiceParamsPtr != IntPtr.Zero)
|
||||
openCLCQ.EnqueueUnmapMemObject(clBestRiceParamsPinned, clBestRiceParamsPtr);
|
||||
clBestRiceParamsPtr = IntPtr.Zero;
|
||||
if (clResidualTasksPtr != IntPtr.Zero)
|
||||
openCLCQ.EnqueueUnmapMemObject(clResidualTasksPinned, clResidualTasksPtr);
|
||||
clResidualTasksPtr = IntPtr.Zero;
|
||||
if (clBestResidualTasksPtr != IntPtr.Zero)
|
||||
openCLCQ.EnqueueUnmapMemObject(clBestResidualTasksPinned, clBestResidualTasksPtr);
|
||||
clBestResidualTasksPtr = IntPtr.Zero;
|
||||
if (clWindowFunctionsPtr != IntPtr.Zero)
|
||||
openCLCQ.EnqueueUnmapMemObject(clWindowFunctionsPinned, clWindowFunctionsPtr);
|
||||
clWindowFunctionsPtr = IntPtr.Zero;
|
||||
|
||||
clSamplesBytesPinned.Dispose();
|
||||
clResidualPinned.Dispose();
|
||||
clBestRiceParamsPinned.Dispose();
|
||||
clResidualTasksPinned.Dispose();
|
||||
clBestResidualTasksPinned.Dispose();
|
||||
clWindowFunctionsPinned.Dispose();
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLCQ.EnqueueUnmapMemObject(clSamplesBytes, clSamplesBytesPtr);
|
||||
openCLCQ.EnqueueUnmapMemObject(clResidual, clResidualPtr);
|
||||
openCLCQ.EnqueueUnmapMemObject(clBestRiceParams, clBestRiceParamsPtr);
|
||||
openCLCQ.EnqueueUnmapMemObject(clResidualTasks, clResidualTasksPtr);
|
||||
openCLCQ.EnqueueUnmapMemObject(clBestResidualTasks, clBestResidualTasksPtr);
|
||||
openCLCQ.EnqueueUnmapMemObject(clWindowFunctions, clWindowFunctionsPtr);
|
||||
}
|
||||
|
||||
clSamples.Dispose();
|
||||
clSamplesBytes.Dispose();
|
||||
clLPCData.Dispose();
|
||||
clResidual.Dispose();
|
||||
clPartitions.Dispose();
|
||||
clAutocorOutput.Dispose();
|
||||
clResidualTasks.Dispose();
|
||||
clResidualOutput.Dispose();
|
||||
@@ -2371,13 +2488,15 @@ namespace CUETools.Codecs.FLACCL
|
||||
clWindowFunctions.Dispose();
|
||||
|
||||
openCLCQ.Dispose();
|
||||
|
||||
GC.SuppressFinalize(this);
|
||||
}
|
||||
|
||||
public unsafe FLACCLSubframeTask* ResidualTasks
|
||||
{
|
||||
get
|
||||
{
|
||||
return (FLACCLSubframeTask*)clResidualTasks.HostPtr;
|
||||
return (FLACCLSubframeTask*)clResidualTasksPtr;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2385,7 +2504,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
{
|
||||
get
|
||||
{
|
||||
return (FLACCLSubframeTask*)clBestResidualTasks.HostPtr;
|
||||
return (FLACCLSubframeTask*)clBestResidualTasksPtr;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2398,18 +2517,18 @@ namespace CUETools.Codecs.FLACCL
|
||||
this.max_porder--;
|
||||
|
||||
if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel
|
||||
Kernel cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? clStereoDecorr : clChannelDecorr2) : null;// cudaChannelDecorr;
|
||||
Kernel clChannelDecorr = channels == 2 ? (channelsCount == 4 ? clStereoDecorr : clChannelDecorr2) : null;// cudaChannelDecorr;
|
||||
|
||||
// openCLCQ.EnqueueMapBuffer(cudaSamplesBytes
|
||||
//openCLCQ.EnqueueUnmapMemObject(cudaSamplesBytes, cudaSamplesBytes.HostPtr);
|
||||
|
||||
// issue work to the GPU
|
||||
cudaChannelDecorr.SetArgs(
|
||||
clChannelDecorr.SetArgs(
|
||||
clSamples,
|
||||
clSamplesBytes,
|
||||
FLACCLWriter.MAX_BLOCKSIZE);
|
||||
|
||||
openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 0, frameSize * frameCount);
|
||||
openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, frameSize * frameCount);
|
||||
|
||||
if (eparams.do_wasted)
|
||||
{
|
||||
@@ -2442,6 +2561,11 @@ namespace CUETools.Codecs.FLACCL
|
||||
clLPCData,
|
||||
nWindowFunctions);
|
||||
|
||||
//openCLCQ.EnqueueNDRangeKernel(
|
||||
// clComputeLPC,
|
||||
// 64, 1,
|
||||
// nWindowFunctions,
|
||||
// frameCount);
|
||||
openCLCQ.EnqueueNDRangeKernel(
|
||||
clComputeLPC,
|
||||
Math.Min(groupSize, 32), 1,
|
||||
@@ -2580,18 +2704,44 @@ namespace CUETools.Codecs.FLACCL
|
||||
groupSize,
|
||||
channels * frameCount);
|
||||
|
||||
openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParams.HostPtr);
|
||||
openCLCQ.EnqueueReadBuffer(clResidual, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clResidual.HostPtr);
|
||||
//openCLCQ.EnqueueMapBuffer(clBestRiceParams, false, MapFlags.READ, 0, sizeof(int) * (1 << max_porder) * channels * frameCount);
|
||||
//openCLCQ.EnqueueUnmapMemObject(clBestRiceParams, clBestRiceParams.HostPtr);
|
||||
//openCLCQ.EnqueueMapBuffer(clResidual, false, MapFlags.READ, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels);
|
||||
//openCLCQ.EnqueueUnmapMemObject(clResidual, clResidual.HostPtr);
|
||||
if (!writer._settings.MappedMemory)
|
||||
openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParamsPtr);
|
||||
if (!writer._settings.MappedMemory)
|
||||
openCLCQ.EnqueueReadBuffer(clResidual, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clResidualPtr);
|
||||
}
|
||||
if (!writer._settings.MappedMemory)
|
||||
openCLCQ.EnqueueReadBuffer(clBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * frameCount, clBestResidualTasksPtr);
|
||||
}
|
||||
}
|
||||
openCLCQ.EnqueueReadBuffer(clBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * frameCount, clBestResidualTasks.HostPtr);
|
||||
//openCLCQ.EnqueueMapBuffer(clBestResidualTasks, false, MapFlags.READ, 0, sizeof(FLACCLSubframeTask) * channels * frameCount);
|
||||
//openCLCQ.EnqueueUnmapMemObject(clBestResidualTasks, clBestResidualTasks.HostPtr);
|
||||
|
||||
//openCLCQ.EnqueueMapBuffer(clSamplesBytes, false, MapFlags.WRITE, 0, samplesBufferLen / 2);
|
||||
public static class OpenCLExtensions
|
||||
{
|
||||
public static void SetArgs(this Kernel kernel, params object[] args)
|
||||
{
|
||||
int i = 0;
|
||||
foreach (object arg in args)
|
||||
{
|
||||
if (arg is int)
|
||||
kernel.SetArg(i, (int)arg);
|
||||
else if (arg is Mem)
|
||||
kernel.SetArg(i, (Mem)arg);
|
||||
else
|
||||
throw new ArgumentException("Invalid argument type", arg.GetType().ToString());
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
public static void EnqueueNDRangeKernel(this CommandQueue queue, Kernel kernel, long localSize, long globalSize)
|
||||
{
|
||||
if (localSize == 0)
|
||||
queue.EnqueueNDRangeKernel(kernel, 1, null, new long[] { globalSize }, null);
|
||||
else
|
||||
queue.EnqueueNDRangeKernel(kernel, 1, null, new long[] { localSize * globalSize }, new long[] { localSize });
|
||||
}
|
||||
|
||||
public static void EnqueueNDRangeKernel(this CommandQueue queue, Kernel kernel, long localSizeX, long localSizeY, long globalSizeX, long globalSizeY)
|
||||
{
|
||||
queue.EnqueueNDRangeKernel(kernel, 2, null, new long[] { localSizeX * globalSizeX, localSizeY * globalSizeY }, new long[] { localSizeX, localSizeY });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,12 +20,19 @@
|
||||
#ifndef _FLACCL_KERNEL_H_
|
||||
#define _FLACCL_KERNEL_H_
|
||||
|
||||
#ifdef DEBUG
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
#endif
|
||||
#undef DEBUG
|
||||
|
||||
//#define AMD
|
||||
|
||||
//#ifdef DEBUG
|
||||
//#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
//#endif
|
||||
|
||||
//#pragma OPENCL EXTENSION cl_amd_fp64 : enable
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
|
||||
|
||||
typedef enum
|
||||
{
|
||||
Constant = 0,
|
||||
@@ -59,6 +66,8 @@ typedef struct
|
||||
int coefs[32]; // fixme: should be short?
|
||||
} FLACCLSubframeTask;
|
||||
|
||||
#define iclamp(a,b,c) max(b,min(a,c))
|
||||
|
||||
__kernel void clStereoDecorr(
|
||||
__global int *samples,
|
||||
__global short2 *src,
|
||||
@@ -181,8 +190,10 @@ void clComputeAutocor(
|
||||
|
||||
int tid0 = tid % (GROUP_SIZE >> 2);
|
||||
int tid1 = tid / (GROUP_SIZE >> 2);
|
||||
#ifdef ATI
|
||||
__local float4 * dptr = ((__local float4 *)&data[0]) + tid0;
|
||||
__local float4 * dptr1 = ((__local float4 *)&data[tid1]) + tid0;
|
||||
#endif
|
||||
|
||||
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
||||
{
|
||||
@@ -192,8 +203,15 @@ void clComputeAutocor(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int ord4 = 0; ord4 < (MAX_ORDER / 4 + 1); ord4 ++)
|
||||
#ifdef ATI
|
||||
product[ord4 * GROUP_SIZE + tid] += dot(dptr[0], dptr1[ord4]);
|
||||
|
||||
#else
|
||||
product[ord4 * GROUP_SIZE + tid] +=
|
||||
data[tid0*4 + 0] * data[tid0*4 + ord4*4 + tid1 + 0] +
|
||||
data[tid0*4 + 1] * data[tid0*4 + ord4*4 + tid1 + 1] +
|
||||
data[tid0*4 + 2] * data[tid0*4 + ord4*4 + tid1 + 2] +
|
||||
data[tid0*4 + 3] * data[tid0*4 + ord4*4 + tid1 + 3];
|
||||
#endif
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
data[tid] = nextData;
|
||||
@@ -223,8 +241,8 @@ void clComputeLPC(
|
||||
volatile float autoc[33];
|
||||
} shared;
|
||||
const int tid = get_local_id(0);// + get_local_id(1) * 32;
|
||||
int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
|
||||
int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
|
||||
int lpcOffs = autocOffs * 32;
|
||||
|
||||
if (get_local_id(0) <= MAX_ORDER)
|
||||
shared.autoc[get_local_id(0)] = autoc[autocOffs + get_local_id(0)];
|
||||
@@ -272,11 +290,12 @@ void clComputeLPC(
|
||||
shared.error[order] = error;
|
||||
|
||||
// Levinson-Durbin recursion
|
||||
float ldr =
|
||||
select(0.0f, reff * shared.ldr[order - 1 - get_local_id(0)], get_local_id(0) < order) +
|
||||
select(0.0f, reff, get_local_id(0) == order);
|
||||
float ldr = shared.ldr[get_local_id(0)];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
shared.ldr[get_local_id(0)] += ldr;
|
||||
if (get_local_id(0) < order)
|
||||
shared.ldr[order - 1 - get_local_id(0)] += reff * ldr;
|
||||
if (get_local_id(0) == order)
|
||||
shared.ldr[get_local_id(0)] += reff;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Output coeffs
|
||||
@@ -329,7 +348,7 @@ void clQuantizeLPC(
|
||||
|
||||
// Load prediction error estimates
|
||||
if (tid < MAX_ORDER)
|
||||
shared.error[tid] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log(shared.task.blocksize);
|
||||
shared.error[tid] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)shared.task.blocksize);
|
||||
//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
@@ -387,7 +406,7 @@ void clQuantizeLPC(
|
||||
// get 15 bits of each coeff
|
||||
int coef = convert_int_rte(lpc * (1 << 15));
|
||||
// remove sign bits
|
||||
atomic_or(shared.maxcoef + i, coef ^ (coef >> 31));
|
||||
atom_or(shared.maxcoef + i, coef ^ (coef >> 31));
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//SUM32(shared.tmpi,tid,|=);
|
||||
// choose precision
|
||||
@@ -402,12 +421,12 @@ void clQuantizeLPC(
|
||||
//if (shared.task.abits + 32 - clz(order) < shift
|
||||
//int shift = max(0,min(15, (shared.task.abits >> 2) - 14 + clz(shared.tmpi[tid & ~31]) + ((32 - clz(order))>>1)));
|
||||
// quantize coeffs with given shift
|
||||
coef = convert_int_rte(clamp(lpc * (1 << shift), -1 << (cbits - 1), 1 << (cbits - 1)));
|
||||
coef = convert_int_rte(clamp(lpc * (1 << shift), (float)(-1 << (cbits - 1)), (float)(1 << (cbits - 1))));
|
||||
// error correction
|
||||
//shared.tmp[tid] = (tid != 0) * (shared.arp[tid - 1]*(1 << shared.task.shift) - shared.task.coefs[tid - 1]);
|
||||
//shared.task.coefs[tid] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, convert_int_rte((shared.arp[tid]) * (1 << shared.task.shift) + shared.tmp[tid])));
|
||||
// remove sign bits
|
||||
atomic_or(shared.maxcoef2 + i, coef ^ (coef >> 31));
|
||||
atom_or(shared.maxcoef2 + i, coef ^ (coef >> 31));
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
// calculate actual number of bits (+1 for sign)
|
||||
cbits = 1 + 32 - clz(shared.maxcoef2[i]);
|
||||
@@ -452,14 +471,16 @@ void clEstimateResidual(
|
||||
psum[tid] = 0;
|
||||
data[tid] = 0.0f;
|
||||
|
||||
int partOrder = clz(64) - clz(bs - 1) + 1;
|
||||
int partOrder = max(1, clz(64) - clz(bs - 1) + 1);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
#ifdef AMD
|
||||
float4 cptr0 = vload4(0, &fcoef[0]);
|
||||
float4 cptr1 = vload4(1, &fcoef[0]);
|
||||
#if MAX_ORDER > 8
|
||||
float4 cptr2 = vload4(2, &fcoef[0]);
|
||||
#endif
|
||||
#endif
|
||||
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
||||
{
|
||||
@@ -471,6 +492,7 @@ void clEstimateResidual(
|
||||
|
||||
// compute residual
|
||||
__local float* dptr = &data[tid + GROUP_SIZE - ro];
|
||||
#ifdef AMD
|
||||
float4 sum = cptr0 * vload4(0, dptr)
|
||||
+ cptr1 * vload4(1, dptr)
|
||||
#if MAX_ORDER > 8
|
||||
@@ -488,20 +510,28 @@ void clEstimateResidual(
|
||||
;
|
||||
|
||||
int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w);
|
||||
#else
|
||||
float sum =
|
||||
fcoef[0] * dptr[0] + fcoef[1] * dptr[1] + fcoef[2] * dptr[2] + fcoef[3] * dptr[3] +
|
||||
fcoef[4] * dptr[4] + fcoef[5] * dptr[5] + fcoef[6] * dptr[6] + fcoef[7] * dptr[7] +
|
||||
fcoef[8] * dptr[8] + fcoef[9] * dptr[9] + fcoef[10] * dptr[10] + fcoef[11] * dptr[11] ;
|
||||
int t = convert_int_rte(nextData + sum);
|
||||
#endif
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
data[tid] = nextData;
|
||||
// ensure we're within frame bounds
|
||||
t = select(0, t, offs >= ro && offs < bs);
|
||||
// overflow protection
|
||||
t = clamp(t, -0x7fffff, 0x7fffff);
|
||||
t = iclamp(t, -0x7fffff, 0x7fffff);
|
||||
// convert to unsigned
|
||||
atomic_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31));
|
||||
if (offs < bs)
|
||||
atom_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31));
|
||||
}
|
||||
|
||||
// calculate rice partition bit length for every (1 << partOrder) samples
|
||||
if (tid < 64)
|
||||
{
|
||||
int k = clamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
|
||||
int k = iclamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
|
||||
psum[tid] = (k << partOrder) + (psum[tid] >> k);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -657,11 +687,11 @@ void clEncodeResidual(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local int4 * cptr = (__local int4 *)&task.coefs[0];
|
||||
int4 cptr0 = cptr[0];
|
||||
int4 cptr0 = vload4(0, &task.coefs[0]);
|
||||
#if MAX_ORDER > 4
|
||||
int4 cptr1 = cptr[1];
|
||||
int4 cptr1 = vload4(1, &task.coefs[0]);
|
||||
#if MAX_ORDER > 8
|
||||
int4 cptr2 = cptr[2];
|
||||
int4 cptr2 = vload4(2, &task.coefs[0]);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -675,19 +705,19 @@ void clEncodeResidual(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// compute residual
|
||||
__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
|
||||
int4 sum = dptr[0] * cptr0
|
||||
__local int* dptr = &data[tid + GROUP_SIZE - ro];
|
||||
int4 sum = cptr0 * vload4(0, dptr)
|
||||
#if MAX_ORDER > 4
|
||||
+ dptr[1] * cptr1
|
||||
+ cptr1 * vload4(1, dptr)
|
||||
#if MAX_ORDER > 8
|
||||
+ dptr[2] * cptr2
|
||||
+ cptr2 * vload4(2, dptr)
|
||||
#if MAX_ORDER > 12
|
||||
+ dptr[3] * cptr[3]
|
||||
+ vload4(3, &task.coefs[0]) * vload4(3, dptr)
|
||||
#if MAX_ORDER > 16
|
||||
+ dptr[4] * cptr[4]
|
||||
+ dptr[5] * cptr[5]
|
||||
+ dptr[6] * cptr[6]
|
||||
+ dptr[7] * cptr[7]
|
||||
+ vload4(4, &task.coefs[0]) * vload4(4, dptr)
|
||||
+ vload4(5, &task.coefs[0]) * vload4(5, dptr)
|
||||
+ vload4(6, &task.coefs[0]) * vload4(6, dptr)
|
||||
+ vload4(7, &task.coefs[0]) * vload4(7, dptr)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
@@ -732,13 +762,13 @@ void clCalcPartition(
|
||||
// fetch residual
|
||||
int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0;
|
||||
// overflow protection
|
||||
s = clamp(s, -0x7fffff, 0x7fffff);
|
||||
s = iclamp(s, -0x7fffff, 0x7fffff);
|
||||
// convert to unsigned
|
||||
s = (s << 1) ^ (s >> 31);
|
||||
// calc number of unary bits for each residual sample with each rice paramater
|
||||
int part = (offs - start) / psize + (tid & 1) * (GROUP_SIZE / 16);
|
||||
for (int k = 0; k <= 14; k++)
|
||||
atomic_add(&pl[part][k], s >> k);
|
||||
atom_add(&pl[part][k], s >> k);
|
||||
//pl[part][k] += s >> k;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -788,12 +818,11 @@ void clCalcPartition16(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local int4 * cptr = (__local int4 *)&task.coefs[0];
|
||||
int4 cptr0 = cptr[0];
|
||||
int4 cptr0 = vload4(0, &task.coefs[0]);
|
||||
#if MAX_ORDER > 4
|
||||
int4 cptr1 = cptr[1];
|
||||
int4 cptr1 = vload4(1, &task.coefs[0]);
|
||||
#if MAX_ORDER > 8
|
||||
int4 cptr2 = cptr[2];
|
||||
int4 cptr2 = vload4(2, &task.coefs[0]);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -807,19 +836,19 @@ void clCalcPartition16(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// compute residual
|
||||
__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
|
||||
int4 sum = dptr[0] * cptr0
|
||||
__local int* dptr = &data[tid + GROUP_SIZE - ro];
|
||||
int4 sum = cptr0 * vload4(0, dptr)
|
||||
#if MAX_ORDER > 4
|
||||
+ dptr[1] * cptr1
|
||||
+ cptr1 * vload4(1, dptr)
|
||||
#if MAX_ORDER > 8
|
||||
+ dptr[2] * cptr2
|
||||
+ cptr2 * vload4(2, dptr)
|
||||
#if MAX_ORDER > 12
|
||||
+ dptr[3] * cptr[3]
|
||||
+ vload4(3, &task.coefs[0]) * vload4(3, dptr)
|
||||
#if MAX_ORDER > 16
|
||||
+ dptr[4] * cptr[4]
|
||||
+ dptr[5] * cptr[5]
|
||||
+ dptr[6] * cptr[6]
|
||||
+ dptr[7] * cptr[7]
|
||||
+ vload4(4, &task.coefs[0]) * vload4(4, dptr)
|
||||
+ vload4(5, &task.coefs[0]) * vload4(5, dptr)
|
||||
+ vload4(6, &task.coefs[0]) * vload4(6, dptr)
|
||||
+ vload4(7, &task.coefs[0]) * vload4(7, dptr)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
@@ -833,11 +862,11 @@ void clCalcPartition16(
|
||||
|
||||
//int s = select(0, residual[task.data.residualOffs + offs], offs >= ro && offs < bs);
|
||||
|
||||
s = clamp(s, -0x7fffff, 0x7fffff);
|
||||
s = iclamp(s, -0x7fffff, 0x7fffff);
|
||||
// convert to unsigned
|
||||
res[tid] = (s << 1) ^ (s >> 31);
|
||||
|
||||
// for (int k = 0; k < 15; k++) atomic_add(&pl[x][k], s >> k);
|
||||
// for (int k = 0; k < 15; k++) atom_add(&pl[x][k], s >> k);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
data[tid] = nextData;
|
||||
@@ -943,7 +972,7 @@ void clFindPartitionOrder(
|
||||
{
|
||||
int len = rice_parameters[pos + offs];
|
||||
int porder = 31 - clz(lim - offs);
|
||||
atomic_add(&partlen[porder], len);
|
||||
atom_add(&partlen[porder], len);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
|
||||
@@ -152,7 +152,23 @@ void clComputeAutocor(
|
||||
for (int tid = 0; tid < len; tid++)
|
||||
data1[tid] = samples[task.samplesOffs + tid] * window[windowOffs + tid];
|
||||
data1[len] = 0.0f;
|
||||
__global float * pout = &output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1)];
|
||||
for (int l = 1; l < MAX_ORDER; l++)
|
||||
data1[len + l] = 0.0f;
|
||||
|
||||
// double ac0 = 0.0, ac1 = 0.0, ac2 = 0.0, ac3 = 0.0;
|
||||
// for (int j = 0; j < len; j++)
|
||||
// {
|
||||
//float dj = data1[j];
|
||||
//ac0 += dj * dj;
|
||||
//ac1 += dj * data1[j + 1];
|
||||
//ac2 += dj * data1[j + 2];
|
||||
//ac3 += dj * data1[j + 3];
|
||||
// }
|
||||
// pout[0] = ac0;
|
||||
// pout[1] = ac1;
|
||||
// pout[2] = ac2;
|
||||
// pout[3] = ac3;
|
||||
for (int i = 0; i <= MAX_ORDER; ++i)
|
||||
{
|
||||
double temp = 1.0;
|
||||
@@ -164,7 +180,7 @@ void clComputeAutocor(
|
||||
temp += pdata[i] * pdata[0];
|
||||
temp2 += pdata[i + 1] * pdata[1];
|
||||
}
|
||||
output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + i] = temp + temp2;
|
||||
pout[i] = temp + temp2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user