mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
FLACCL: hi-res audio encoding optimisation
This commit is contained in:
@@ -354,6 +354,19 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
if (inited)
|
if (inited)
|
||||||
{
|
{
|
||||||
_IO.Close();
|
_IO.Close();
|
||||||
|
if (task2.frameCount > 0)
|
||||||
|
{
|
||||||
|
if (cpu_tasks != null)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < cpu_tasks.Length; i++)
|
||||||
|
{
|
||||||
|
wait_for_cpu_task();
|
||||||
|
oldest_cpu_task = (oldest_cpu_task + 1) % cpu_tasks.Length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
task2.openCLCQ.Finish(); // cuda.SynchronizeStream(task2.stream);
|
||||||
|
task2.frameCount = 0;
|
||||||
|
}
|
||||||
task1.Dispose();
|
task1.Dispose();
|
||||||
task2.Dispose();
|
task2.Dispose();
|
||||||
if (cpu_tasks != null)
|
if (cpu_tasks != null)
|
||||||
@@ -685,6 +698,38 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Special case when (n >> pmax) == 32
|
||||||
|
/// </summary>
|
||||||
|
/// <param name="pmin"></param>
|
||||||
|
/// <param name="pmax"></param>
|
||||||
|
/// <param name="data"></param>
|
||||||
|
/// <param name="n"></param>
|
||||||
|
/// <param name="pred_order"></param>
|
||||||
|
/// <param name="sums"></param>
|
||||||
|
static unsafe void calc_sums32(int pmin, int pmax, uint* data, uint n, uint pred_order, ulong* sums)
|
||||||
|
{
|
||||||
|
int parts = (1 << pmax);
|
||||||
|
uint* res = data + pred_order;
|
||||||
|
uint cnt = 32 - pred_order;
|
||||||
|
ulong sum = 0UL;
|
||||||
|
for (uint j = cnt; j > 0; j--)
|
||||||
|
sum += *(res++);
|
||||||
|
sums[0] = sum;
|
||||||
|
for (int i = 1; i < parts; i++)
|
||||||
|
{
|
||||||
|
sums[i] = 0UL +
|
||||||
|
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||||
|
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||||
|
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||||
|
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||||
|
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||||
|
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||||
|
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||||
|
*(res++) + *(res++) + *(res++) + *(res++);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Special case when (n >> pmax) == 18
|
/// Special case when (n >> pmax) == 18
|
||||||
/// </summary>
|
/// </summary>
|
||||||
@@ -728,7 +773,9 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
udata[i] = (uint)((data[i] << 1) ^ (data[i] >> 31));
|
udata[i] = (uint)((data[i] << 1) ^ (data[i] >> 31));
|
||||||
|
|
||||||
// sums for highest level
|
// sums for highest level
|
||||||
if ((n >> pmax) == 18)
|
if ((n >> pmax) == 32)
|
||||||
|
calc_sums32(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
|
||||||
|
else if ((n >> pmax) == 18)
|
||||||
calc_sums18(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
|
calc_sums18(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
|
||||||
else if ((n >> pmax) == 16)
|
else if ((n >> pmax) == 16)
|
||||||
calc_sums16(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
|
calc_sums16(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
|
||||||
@@ -1726,6 +1773,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
OCLMan.Defines =
|
OCLMan.Defines =
|
||||||
"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
|
"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
|
||||||
"#define GROUP_SIZE " + groupSize.ToString() + "\n" +
|
"#define GROUP_SIZE " + groupSize.ToString() + "\n" +
|
||||||
|
"#define GROUP_SIZE_LOG " + BitReader.log2i(groupSize).ToString() + "\n" +
|
||||||
"#define FLACCL_VERSION \"" + Vendor + "\"\n" +
|
"#define FLACCL_VERSION \"" + Vendor + "\"\n" +
|
||||||
(UseGPUOnly ? "#define DO_PARTITIONS\n" : "") +
|
(UseGPUOnly ? "#define DO_PARTITIONS\n" : "") +
|
||||||
(UseGPURice ? "#define DO_RICE\n" : "") +
|
(UseGPURice ? "#define DO_RICE\n" : "") +
|
||||||
@@ -2437,6 +2485,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
public Kernel clEncodeResidual;
|
public Kernel clEncodeResidual;
|
||||||
public Kernel clCalcPartition;
|
public Kernel clCalcPartition;
|
||||||
public Kernel clCalcPartition16;
|
public Kernel clCalcPartition16;
|
||||||
|
public Kernel clCalcPartition32;
|
||||||
public Kernel clSumPartition;
|
public Kernel clSumPartition;
|
||||||
public Kernel clFindRiceParameter;
|
public Kernel clFindRiceParameter;
|
||||||
public Kernel clFindPartitionOrder;
|
public Kernel clFindPartitionOrder;
|
||||||
@@ -2530,7 +2579,8 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
int MAX_CHANNELSIZE = MAX_FRAMES * ((writer.m_blockSize + 3) & ~3);
|
int MAX_CHANNELSIZE = MAX_FRAMES * ((writer.m_blockSize + 3) & ~3);
|
||||||
residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES;
|
residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES;
|
||||||
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES;
|
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES;
|
||||||
int samplesBufferLen = writer.Settings.PCM.BlockAlign * MAX_CHANNELSIZE * channelsCount;
|
int samplesBytesLen = writer.Settings.PCM.BlockAlign * MAX_CHANNELSIZE;
|
||||||
|
int samplesBufferLen = sizeof(int) * MAX_CHANNELSIZE * channelsCount;
|
||||||
int residualBufferLen = sizeof(int) * MAX_CHANNELSIZE * channels; // need to adjust residualOffset?
|
int residualBufferLen = sizeof(int) * MAX_CHANNELSIZE * channels; // need to adjust residualOffset?
|
||||||
int partitionsLen = sizeof(int) * ((writer.Settings.PCM.BitsPerSample > 16 ? 31 : 15) * 2 << 8) * channels * MAX_FRAMES;
|
int partitionsLen = sizeof(int) * ((writer.Settings.PCM.BitsPerSample > 16 ? 31 : 15) * 2 << 8) * channels * MAX_FRAMES;
|
||||||
int riceParamsLen = sizeof(int) * (4 << 8) * channels * MAX_FRAMES;
|
int riceParamsLen = sizeof(int) * (4 << 8) * channels * MAX_FRAMES;
|
||||||
@@ -2543,7 +2593,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
|
|
||||||
if (!this.UseMappedMemory)
|
if (!this.UseMappedMemory)
|
||||||
{
|
{
|
||||||
clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen / 2);
|
clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBytesLen);
|
||||||
clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualBufferLen);
|
clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualBufferLen);
|
||||||
clBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen / 4);
|
clBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen / 4);
|
||||||
clResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualTasksLen);
|
clResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualTasksLen);
|
||||||
@@ -2552,7 +2602,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
clSelectedTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen);
|
clSelectedTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, selectedLen);
|
||||||
clRiceOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceLen);
|
clRiceOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceLen);
|
||||||
|
|
||||||
clSamplesBytesPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen / 2);
|
clSamplesBytesPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBytesLen);
|
||||||
clResidualPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualBufferLen);
|
clResidualPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualBufferLen);
|
||||||
clBestRiceParamsPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4);
|
clBestRiceParamsPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4);
|
||||||
clResidualTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen);
|
clResidualTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen);
|
||||||
@@ -2561,7 +2611,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
clSelectedTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen);
|
clSelectedTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen);
|
||||||
clRiceOutputPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen);
|
clRiceOutputPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen);
|
||||||
|
|
||||||
clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytesPinned, true, MapFlags.READ_WRITE, 0, samplesBufferLen / 2);
|
clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytesPinned, true, MapFlags.READ_WRITE, 0, samplesBytesLen);
|
||||||
clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidualPinned, true, MapFlags.READ_WRITE, 0, residualBufferLen);
|
clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidualPinned, true, MapFlags.READ_WRITE, 0, residualBufferLen);
|
||||||
clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParamsPinned, true, MapFlags.READ_WRITE, 0, riceParamsLen / 4);
|
clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParamsPinned, true, MapFlags.READ_WRITE, 0, riceParamsLen / 4);
|
||||||
clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasksPinned, true, MapFlags.READ_WRITE, 0, residualTasksLen);
|
clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasksPinned, true, MapFlags.READ_WRITE, 0, residualTasksLen);
|
||||||
@@ -2572,7 +2622,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, (uint)samplesBufferLen / 2);
|
clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, (uint)samplesBytesLen);
|
||||||
clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualBufferLen);
|
clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualBufferLen);
|
||||||
clBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4);
|
clBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4);
|
||||||
clResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen);
|
clResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen);
|
||||||
@@ -2581,7 +2631,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
clSelectedTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen);
|
clSelectedTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen);
|
||||||
clRiceOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen);
|
clRiceOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen);
|
||||||
|
|
||||||
clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.READ_WRITE, 0, samplesBufferLen / 2);
|
clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.READ_WRITE, 0, samplesBytesLen);
|
||||||
clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidual, true, MapFlags.READ_WRITE, 0, residualBufferLen);
|
clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidual, true, MapFlags.READ_WRITE, 0, residualBufferLen);
|
||||||
clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParams, true, MapFlags.READ_WRITE, 0, riceParamsLen / 4);
|
clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParams, true, MapFlags.READ_WRITE, 0, riceParamsLen / 4);
|
||||||
clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasks, true, MapFlags.READ_WRITE, 0, residualTasksLen);
|
clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasks, true, MapFlags.READ_WRITE, 0, residualTasksLen);
|
||||||
@@ -2630,6 +2680,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
{
|
{
|
||||||
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
|
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
|
||||||
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
|
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
|
||||||
|
clCalcPartition32 = openCLProgram.CreateKernel("clCalcPartition32");
|
||||||
}
|
}
|
||||||
clSumPartition = openCLProgram.CreateKernel("clSumPartition");
|
clSumPartition = openCLProgram.CreateKernel("clSumPartition");
|
||||||
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
|
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
|
||||||
@@ -2684,6 +2735,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
{
|
{
|
||||||
clCalcPartition.Dispose();
|
clCalcPartition.Dispose();
|
||||||
clCalcPartition16.Dispose();
|
clCalcPartition16.Dispose();
|
||||||
|
clCalcPartition32.Dispose();
|
||||||
}
|
}
|
||||||
clSumPartition.Dispose();
|
clSumPartition.Dispose();
|
||||||
clFindRiceParameter.Dispose();
|
clFindRiceParameter.Dispose();
|
||||||
@@ -2759,7 +2811,9 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
clSelectedTasks.Dispose();
|
clSelectedTasks.Dispose();
|
||||||
clRiceOutput.Dispose();
|
clRiceOutput.Dispose();
|
||||||
|
|
||||||
|
openCLCQ.Finish();
|
||||||
openCLCQ.Dispose();
|
openCLCQ.Dispose();
|
||||||
|
openCLCQ = null;
|
||||||
|
|
||||||
GC.SuppressFinalize(this);
|
GC.SuppressFinalize(this);
|
||||||
}
|
}
|
||||||
@@ -2808,6 +2862,9 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
// channelSize == blockSize * nFrames;
|
||||||
|
// clSamplesBytes length = blockSize * nFrames * blockalign
|
||||||
|
// clSamples length = 4 * blockSize * nFrames * channels
|
||||||
clChannelDecorrX.SetArgs(
|
clChannelDecorrX.SetArgs(
|
||||||
clSamples,
|
clSamples,
|
||||||
clSamplesBytes,
|
clSamplesBytes,
|
||||||
@@ -2982,6 +3039,18 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
clCalcPartition16,
|
clCalcPartition16,
|
||||||
groupSize, channels * frameCount);
|
groupSize, channels * frameCount);
|
||||||
}
|
}
|
||||||
|
else if (frameSize >> max_porder == 32)
|
||||||
|
{
|
||||||
|
clCalcPartition32.SetArgs(
|
||||||
|
clPartitions,
|
||||||
|
clResidual,
|
||||||
|
clBestResidualTasks,
|
||||||
|
max_porder);
|
||||||
|
|
||||||
|
openCLCQ.EnqueueNDRangeKernel(
|
||||||
|
clCalcPartition32,
|
||||||
|
groupSize, channels * frameCount);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
clCalcPartition.SetArgs(
|
clCalcPartition.SetArgs(
|
||||||
|
|||||||
@@ -1519,12 +1519,76 @@ void clCalcPartition16(
|
|||||||
|
|
||||||
for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16)
|
for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16)
|
||||||
{
|
{
|
||||||
int k1 = k0 + (tid >> 3), x1 = tid & 7;
|
int k1 = k0 + (tid >> (GROUP_SIZE_LOG - 4)), x1 = tid & ((1 << (GROUP_SIZE_LOG - 4)) - 1);
|
||||||
if (k1 <= MAX_RICE_PARAM && (pos >> 4) + x1 < (1 << max_porder))
|
if (k1 <= MAX_RICE_PARAM && (pos >> 4) + x1 < (1 << max_porder))
|
||||||
partition_lengths[((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1];
|
partition_lengths[((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||||
|
void clCalcPartition32(
|
||||||
|
__global unsigned int *partition_lengths,
|
||||||
|
__global int *residual,
|
||||||
|
__global FLACCLSubframeTask *tasks,
|
||||||
|
int max_porder // <= 8
|
||||||
|
)
|
||||||
|
{
|
||||||
|
__local FLACCLSubframeData task;
|
||||||
|
__local unsigned int res[GROUP_SIZE];
|
||||||
|
__local unsigned int pl[GROUP_SIZE >> 5][32];
|
||||||
|
const int tid = get_local_id(0);
|
||||||
|
if (tid < sizeof(task) / sizeof(int))
|
||||||
|
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
int bs = task.blocksize;
|
||||||
|
int ro = task.residualOrder;
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
||||||
|
{
|
||||||
|
int offs = pos + tid;
|
||||||
|
// fetch residual
|
||||||
|
int s = (offs >= ro && offs < bs) ? residual[task.residualOffs + offs] : 0;
|
||||||
|
// convert to unsigned
|
||||||
|
res[tid] = (s << 1) ^ (s >> 31);
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
// we must ensure that psize * (t >> k) doesn't overflow;
|
||||||
|
uint4 lim = 0x07ffffffU;
|
||||||
|
int x = tid >> 5;
|
||||||
|
__local uint * chunk = &res[x << 5];
|
||||||
|
// calc number of unary bits for each group of 32 residual samples
|
||||||
|
// with each rice parameter.
|
||||||
|
int k = tid & 31;
|
||||||
|
uint4 rsum
|
||||||
|
= min(lim, vload4(0,chunk) >> k)
|
||||||
|
+ min(lim, vload4(1,chunk) >> k)
|
||||||
|
+ min(lim, vload4(2,chunk) >> k)
|
||||||
|
+ min(lim, vload4(3,chunk) >> k)
|
||||||
|
+ min(lim, vload4(4,chunk) >> k)
|
||||||
|
+ min(lim, vload4(5,chunk) >> k)
|
||||||
|
+ min(lim, vload4(6,chunk) >> k)
|
||||||
|
+ min(lim, vload4(7,chunk) >> k)
|
||||||
|
;
|
||||||
|
uint rs = rsum.x + rsum.y + rsum.z + rsum.w;
|
||||||
|
|
||||||
|
// We can safely limit length here to 0x007fffffU, not causing length
|
||||||
|
// mismatch, because any such length would cause Verbatim frame anyway.
|
||||||
|
// And this limit protects us from overflows when calculating larger
|
||||||
|
// partitions, as we can have a maximum of 2^8 partitions, resulting
|
||||||
|
// in maximum partition length of 0x7fffffffU + change.
|
||||||
|
if (k <= MAX_RICE_PARAM) pl[x][k] = min(0x007fffffU, rs) + (uint)(32 - select(0, ro, offs < 32)) * (k + 1);
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
int k1 = (tid >> (GROUP_SIZE_LOG - 5)), x1 = tid & ((1 << (GROUP_SIZE_LOG - 5)) - 1);
|
||||||
|
if (k1 <= MAX_RICE_PARAM && (pos >> 5) + x1 < (1 << max_porder))
|
||||||
|
partition_lengths[((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 5) + x1] = pl[x1][k1];
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef FLACCL_CPU
|
#ifdef FLACCL_CPU
|
||||||
|
|||||||
Reference in New Issue
Block a user