Mirror of https://github.com/claunia/cuetools.net.git, synced 2025-12-16 18:14:25 +00:00
optimizations
@@ -39,7 +39,7 @@ namespace CUETools.Codecs.FLACCL
 this.GPUOnly = true;
 this.MappedMemory = false;
 this.DoMD5 = true;
-this.GroupSize = 64;
+this.GroupSize = 128;
 this.DeviceType = OpenCLDeviceType.GPU;
 }
 
@@ -61,7 +61,7 @@ namespace CUETools.Codecs.FLACCL
 [SRDescription(typeof(Properties.Resources), "DescriptionMappedMemory")]
 public bool MappedMemory { get; set; }
 
-[DefaultValue(64)]
+[DefaultValue(128)]
 [SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")]
 public int GroupSize { get; set; }
 
@@ -1504,6 +1504,7 @@ namespace CUETools.Codecs.FLACCL
 OCLMan.Defines =
 "#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
 "#define GROUP_SIZE " + groupSize.ToString() + "\n" +
+"#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
 #if DEBUG
 "#define DEBUG\n" +
 #endif
@@ -2112,9 +2113,9 @@ namespace CUETools.Codecs.FLACCL
 do_constant = false;
 do_midside = false;
 window_function = WindowFunction.Bartlett;
+orders_per_window = 1;
 min_fixed_order = 2;
 max_fixed_order = 2;
-orders_per_window = 1;
 max_prediction_order = 8;
 max_partition_order = 4;
 break;
@@ -2235,7 +2236,6 @@ namespace CUETools.Codecs.FLACCL
 public Mem clBestRiceParams;
 public Mem clAutocorOutput;
 public Mem clResidualTasks;
-public Mem clResidualOutput;
 public Mem clBestResidualTasks;
 public Mem clWindowFunctions;
 
@@ -2356,7 +2356,6 @@ namespace CUETools.Codecs.FLACCL
 clSamples = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen);
 clLPCData = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, lpcDataLen);
 clAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, autocorLen);
-clResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, resOutLen);
 if (writer._settings.GPUOnly)
 {
 clPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen);
@@ -2483,7 +2482,6 @@ namespace CUETools.Codecs.FLACCL
 clResidual.Dispose();
 clAutocorOutput.Dispose();
 clResidualTasks.Dispose();
-clResidualOutput.Dispose();
 clBestResidualTasks.Dispose();
 clWindowFunctions.Dispose();
 
@@ -2587,7 +2585,6 @@ namespace CUETools.Codecs.FLACCL
 channelsCount * frameCount);
 
 clEstimateResidual.SetArgs(
-clResidualOutput,
 clSamples,
 clResidualTasks);
 
@@ -2598,7 +2595,6 @@ namespace CUETools.Codecs.FLACCL
 
 clChooseBestMethod.SetArgs(
 clResidualTasks,
-clResidualOutput,
 nResidualTasksPerChannel);
 
 openCLCQ.EnqueueNDRangeKernel(
@@ -2714,6 +2710,7 @@ namespace CUETools.Codecs.FLACCL
 }
 }
 
+#if HJHKHJ
 public static class OpenCLExtensions
 {
 public static void SetArgs(this Kernel kernel, params object[] args)
@@ -2744,4 +2741,5 @@ namespace CUETools.Codecs.FLACCL
 queue.EnqueueNDRangeKernel(kernel, 2, null, new long[] { localSizeX * globalSizeX, localSizeY * globalSizeY }, new long[] { localSizeX, localSizeY });
 }
 }
+#endif
 }

@@ -20,15 +20,16 @@
 #ifndef _FLACCL_KERNEL_H_
 #define _FLACCL_KERNEL_H_
 
-#undef DEBUG
-//#define AMD
-//#ifdef DEBUG
-//#pragma OPENCL EXTENSION cl_amd_printf : enable
-//#endif
+#if defined(__Cedar__) || defined(__Redwood__) || defined(__Juniper__) || defined(__Cypress__)
+#define AMD
+#ifdef DEBUG
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#endif
 
 //#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#define iclamp(a,b,c) clamp(a,b,c)
+#else
+#define iclamp(a,b,c) max(b,min(a,c))
+#endif
 
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
@@ -66,8 +67,6 @@ typedef struct
 int coefs[32]; // fixme: should be short?
 } FLACCLSubframeTask;
 
-#define iclamp(a,b,c) max(b,min(a,c))
-
 __kernel void clStereoDecorr(
 __global int *samples,
 __global short2 *src,
@@ -172,15 +171,11 @@ void clComputeAutocor(
 )
 {
 __local float data[GROUP_SIZE * 2];
-__local float product[(MAX_ORDER / 4 + 1) * GROUP_SIZE];
 __local FLACCLSubframeData task;
 const int tid = get_local_id(0);
 // fetch task data
 if (tid < sizeof(task) / sizeof(int))
 ((__local int*)&task)[tid] = ((__global int*)(tasks + taskCount * get_group_id(0)))[tid];
-
-for (int ord4 = 0; ord4 < (MAX_ORDER / 4 + 1); ord4 ++)
-product[ord4 * GROUP_SIZE + tid] = 0.0f;
 barrier(CLK_LOCAL_MEM_FENCE);
 
 int bs = task.blocksize;
@@ -188,13 +183,9 @@ void clComputeAutocor(
 
 data[tid] = tid < bs ? samples[task.samplesOffs + tid] * window[windowOffs + tid] : 0.0f;
 
-int tid0 = tid % (GROUP_SIZE >> 2);
-int tid1 = tid / (GROUP_SIZE >> 2);
-#ifdef ATI
-__local float4 * dptr = ((__local float4 *)&data[0]) + tid0;
-__local float4 * dptr1 = ((__local float4 *)&data[tid1]) + tid0;
-#endif
-
+const int THREADS_FOR_ORDERS = MAX_ORDER < 8 ? 8 : MAX_ORDER < 16 ? 16 : MAX_ORDER < 32 ? 32 : 64;
+float corr = 0.0f;
+float corr1 = 0.0f;
 for (int pos = 0; pos < bs; pos += GROUP_SIZE)
 {
 // fetch samples
@@ -202,29 +193,40 @@ void clComputeAutocor(
 data[tid + GROUP_SIZE] = nextData;
 barrier(CLK_LOCAL_MEM_FENCE);
 
-for (int ord4 = 0; ord4 < (MAX_ORDER / 4 + 1); ord4 ++)
-#ifdef ATI
-product[ord4 * GROUP_SIZE + tid] += dot(dptr[0], dptr1[ord4]);
+#ifdef XXXAMD
+__local float * dptr = &data[tid & ~(THREADS_FOR_ORDERS - 1)];
+float4 res = 0.0f;
+for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
+res += vload4(i, dptr) * vload4(i, &data[tid]);
+corr += res.x + res.y + res.w + res.z;
 #else
-product[ord4 * GROUP_SIZE + tid] +=
-data[tid0*4 + 0] * data[tid0*4 + ord4*4 + tid1 + 0] +
-data[tid0*4 + 1] * data[tid0*4 + ord4*4 + tid1 + 1] +
-data[tid0*4 + 2] * data[tid0*4 + ord4*4 + tid1 + 2] +
-data[tid0*4 + 3] * data[tid0*4 + ord4*4 + tid1 + 3];
+int tid1 = tid & ~(THREADS_FOR_ORDERS - 1);
+float res = 0.0f;
+for (int i = 0; i < THREADS_FOR_ORDERS; i++)
+res += data[tid1 + i] * data[tid + i];
+corr += res;
 #endif
-barrier(CLK_LOCAL_MEM_FENCE);
+if (THREADS_FOR_ORDERS > 8 && (pos & (GROUP_SIZE * 7)) == 0)
+{
+corr1 += corr;
+corr = 0.0f;
+}
+
+barrier(CLK_LOCAL_MEM_FENCE);
 data[tid] = nextData;
 }
-for (int ord4 = 0; ord4 < (MAX_ORDER / 4 + 1); ord4 ++)
-for (int l = (GROUP_SIZE >> 3); l > 0; l >>= 1)
+data[tid] = corr + corr1;
+barrier(CLK_LOCAL_MEM_FENCE);
+for (int i = GROUP_SIZE / 2; i >= THREADS_FOR_ORDERS; i >>= 1)
 {
-if (tid0 < l)
-product[ord4 * GROUP_SIZE + tid] += product[ord4 * GROUP_SIZE + tid + l];
+if (tid < i)
+data[tid] += data[tid + i];
 barrier(CLK_LOCAL_MEM_FENCE);
 }
 
 if (tid <= MAX_ORDER)
-output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = product[tid * (GROUP_SIZE >> 2)];
+output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = data[tid];
 }
 
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
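Note on the clComputeAutocor rewrite above: the per-order `product[]` scratch array is gone. Each work-item now accumulates one lag's partial dot product over the window-weighted samples in `corr` (periodically flushed into `corr1` to limit single-precision rounding drift), and the shared-memory reduction at the end sums matching lags across subgroups before thread `tid` writes lag `tid`. For reference, the quantity being accumulated is the plain autocorrelation of the windowed signal; a serial C sketch of it, with illustrative names not taken from the FLACCL source:

    #include <stddef.h>

    /* Reference (serial) version of what the kernel accumulates in parallel:
     * autoc[lag] = sum over i of (w[i]*x[i]) * (w[i+lag]*x[i+lag])
     * for lag = 0..max_order, where "windowed" already holds w[i]*x[i].
     * The kernel does these sums in float and flushes corr into corr1 every
     * few tiles; the double accumulator here just keeps the reference exact. */
    void autocorrelation_ref(const float *windowed, size_t n,
                             int max_order, float *autoc)
    {
        for (int lag = 0; lag <= max_order; lag++) {
            double sum = 0.0;
            for (size_t i = 0; i + lag < n; i++)
                sum += (double)windowed[i] * windowed[i + lag];
            autoc[lag] = (float)sum;
        }
    }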
@@ -326,7 +328,6 @@ void clQuantizeLPC(
 volatile float error[64];
 volatile int maxcoef[32];
 volatile int maxcoef2[32];
-volatile int lpcOffs;
 } shared;
 
 const int tid = get_local_id(0);
@@ -334,9 +335,8 @@ void clQuantizeLPC(
 // fetch task data
 if (tid < sizeof(shared.task) / sizeof(int))
 ((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
-if (tid == 0)
-shared.lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
 barrier(CLK_LOCAL_MEM_FENCE);
+const int lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
 
 // Select best orders based on Akaike's Criteria
 shared.index[tid] = min(MAX_ORDER - 1, tid);
@@ -348,8 +348,8 @@ void clQuantizeLPC(
 
 // Load prediction error estimates
 if (tid < MAX_ORDER)
-shared.error[tid] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)shared.task.blocksize);
-//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
+shared.error[tid] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)shared.task.blocksize);
+//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
 barrier(CLK_LOCAL_MEM_FENCE);
 
 // Sort using bitonic sort
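The hunk above only renames `shared.lpcOffs` to a per-thread `lpcOffs`, but the surrounding context line is the interesting part: candidate LPC orders are ranked by an Akaike-style criterion, the block size times the log of the prediction error estimate for that order, plus a penalty that grows with the order, and the bitonic sort then picks the most promising ones to quantize. A plain-C restatement of that score (function and parameter names are illustrative; the 4.12f weight is the constant the kernel uses; lower is better):

    #include <math.h>

    /* Mirrors shared.error[tid] = blocksize * log(err) + order * 4.12f * log(blocksize):
     * the first term rewards a smaller prediction error, the second penalizes the
     * extra warm-up samples and coefficients a higher order costs. */
    float order_score(float prediction_error, int order, int blocksize)
    {
        return blocksize * logf(prediction_error)
             + order * 4.12f * logf((float)blocksize);
    }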
@@ -402,7 +402,7 @@ void clQuantizeLPC(
 for (int i = 0; i < taskCountLPC; i ++)
 {
 int order = shared.index[i >> precisions];
-float lpc = tid <= order ? lpcs[shared.lpcOffs + order * 32 + tid] : 0.0f;
+float lpc = tid <= order ? lpcs[lpcOffs + order * 32 + tid] : 0.0f;
 // get 15 bits of each coeff
 int coef = convert_int_rte(lpc * (1 << 15));
 // remove sign bits
@@ -446,7 +446,6 @@ void clQuantizeLPC(
 
 __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void clEstimateResidual(
-__global int*output,
 __global int*samples,
 __global FLACCLSubframeTask *tasks
 )
@@ -524,7 +523,7 @@ void clEstimateResidual(
 // overflow protection
 t = iclamp(t, -0x7fffff, 0x7fffff);
 // convert to unsigned
-if (offs < bs)
+//if (offs < bs)
 atom_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31));
 }
 
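In the context line above, each residual `t` is folded to a non-negative value with `(t << 1) ^ (t >> 31)` before being atomically added to the partition sums used for Rice-parameter estimation; this is the usual zigzag mapping (0, -1, 1, -2, 2, ... become 0, 1, 2, 3, 4, ...). A minimal C equivalent, with casts added so the shift of a negative value is well defined outside OpenCL C:

    #include <assert.h>
    #include <stdint.h>

    /* Zigzag-fold a signed 32-bit residual to an unsigned magnitude.
     * (t >> 31) assumes arithmetic right shift of negative ints, which
     * mainstream compilers and the OpenCL kernel both provide. */
    uint32_t zigzag32(int32_t t)
    {
        return ((uint32_t)t << 1) ^ (uint32_t)(t >> 31);
    }

    int main(void)
    {
        assert(zigzag32(0) == 0);
        assert(zigzag32(-1) == 1);
        assert(zigzag32(1) == 2);
        assert(zigzag32(-2) == 3);
        return 0;
    }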
@@ -542,52 +541,39 @@ void clEstimateResidual(
 barrier(CLK_LOCAL_MEM_FENCE);
 }
 if (tid == 0)
-output[get_group_id(0)] = psum[0] + (bs - ro);
+{
+int pl = psum[0] + (bs - ro);
+int obits = task.data.obits - task.data.wbits;
+int len = min(obits * task.data.blocksize,
+task.data.type == Fixed ? task.data.residualOrder * obits + 6 + (4 * 1/2) + pl :
+task.data.type == LPC ? task.data.residualOrder * obits + 4 + 5 + task.data.residualOrder * task.data.cbits + 6 + (4 * 1/2)/* << porder */ + pl :
+task.data.type == Constant ? obits * select(1, task.data.blocksize, pl != task.data.blocksize - task.data.residualOrder) :
+obits * task.data.blocksize);
+tasks[get_group_id(0)].data.size = len;
+}
 }
 
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
 void clChooseBestMethod(
 __global FLACCLSubframeTask *tasks,
-__global int *residual,
 int taskCount
 )
 {
 int best_length = 0x7fffffff;
 int best_index = 0;
-__local int partLen[32];
-__local FLACCLSubframeData task;
 const int tid = get_local_id(0);
 
-// fetch part sum
-if (tid < taskCount)
-partLen[tid] = residual[tid + taskCount * get_group_id(0)];
-barrier(CLK_LOCAL_MEM_FENCE);
 for (int taskNo = 0; taskNo < taskCount; taskNo++)
 {
-// fetch task data
-if (tid < sizeof(task) / sizeof(int))
-((__local int*)&task)[tid] = ((__global int*)(&tasks[taskNo + taskCount * get_group_id(0)].data))[tid];
-
-barrier(CLK_LOCAL_MEM_FENCE);
-
 if (tid == 0)
 {
-int pl = partLen[taskNo];
-int obits = task.obits - task.wbits;
-int len = min(obits * task.blocksize,
-task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + pl :
-task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + pl :
-task.type == Constant ? obits * select(1, task.blocksize, pl != task.blocksize - task.residualOrder) :
-obits * task.blocksize);
-
-tasks[taskNo + taskCount * get_group_id(0)].data.size = len;
+int len = tasks[taskNo + taskCount * get_group_id(0)].data.size;
 if (len < best_length)
 {
 best_length = len;
 best_index = taskNo;
 }
 }
 
 barrier(CLK_LOCAL_MEM_FENCE);
 }
 
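The hunk above moves the encoded-size estimate for each candidate subframe out of clChooseBestMethod and into clEstimateResidual, which now stores it in `task.data.size`; clChooseBestMethod then only compares the stored sizes, which is why the `residual` argument and the `clResidualOutput` buffer disappear throughout this commit. A C sketch of that size estimate as it appears in the diff (the enum values and the function name are illustrative, the constants are the header-bit costs the kernel uses, and `(4 * 1/2)` is kept verbatim, which evaluates to 2 in integer arithmetic):

    enum SubframeType { Constant, Verbatim, Fixed, LPC };

    /* Rough encoded size in bits for one subframe candidate, mirroring the
     * expression stored into task.data.size: warm-up samples plus header and
     * coefficient bits plus the estimated partition length, capped at the
     * verbatim cost obits * blocksize. */
    int estimate_subframe_bits(enum SubframeType type, int blocksize,
                               int obits, int order, int cbits, int partLen)
    {
        int verbatim = obits * blocksize;
        int est;
        switch (type) {
        case Fixed:
            est = order * obits + 6 + (4 * 1 / 2) + partLen;
            break;
        case LPC:
            est = order * obits + 4 + 5 + order * cbits + 6 + (4 * 1 / 2) + partLen;
            break;
        case Constant:
            est = obits * (partLen != blocksize - order ? blocksize : 1);
            break;
        default: /* Verbatim */
            est = verbatim;
            break;
        }
        return est < verbatim ? est : verbatim;
    }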
@@ -686,7 +672,6 @@ void clEncodeResidual(
 
 barrier(CLK_LOCAL_MEM_FENCE);
 
-__local int4 * cptr = (__local int4 *)&task.coefs[0];
 int4 cptr0 = vload4(0, &task.coefs[0]);
 #if MAX_ORDER > 4
 int4 cptr1 = vload4(1, &task.coefs[0]);
@@ -813,19 +798,12 @@ void clCalcPartition16(
 if (tid >= ro && tid < 32)
 task.coefs[tid] = 0;
 
-int k = tid % 16;
+int k = tid & 15;
 int x = tid / 16;
 
 barrier(CLK_LOCAL_MEM_FENCE);
 
 int4 cptr0 = vload4(0, &task.coefs[0]);
-#if MAX_ORDER > 4
-int4 cptr1 = vload4(1, &task.coefs[0]);
-#if MAX_ORDER > 8
-int4 cptr2 = vload4(2, &task.coefs[0]);
-#endif
-#endif
-
 data[tid] = 0;
 for (int pos = 0; pos < bs; pos += GROUP_SIZE)
 {
@@ -839,9 +817,9 @@ void clCalcPartition16(
 __local int* dptr = &data[tid + GROUP_SIZE - ro];
 int4 sum = cptr0 * vload4(0, dptr)
 #if MAX_ORDER > 4
-+ cptr1 * vload4(1, dptr)
++ vload4(1, &task.coefs[0]) * vload4(1, dptr)
 #if MAX_ORDER > 8
-+ cptr2 * vload4(2, dptr)
++ vload4(2, &task.coefs[0]) * vload4(2, dptr)
 #if MAX_ORDER > 12
 + vload4(3, &task.coefs[0]) * vload4(3, dptr)
 #if MAX_ORDER > 16
@@ -872,8 +850,8 @@ void clCalcPartition16(
 data[tid] = nextData;
 
 // calc number of unary bits for each residual sample with each rice paramater
-__local int4 * chunk = (__local int4 *)&res[x << 4];
-sum = (chunk[0] >> k) + (chunk[1] >> k) + (chunk[2] >> k) + (chunk[3] >> k);
+__local int * chunk = &res[x << 4];
+sum = (vload4(0,chunk) >> k) + (vload4(1,chunk) >> k) + (vload4(2,chunk) >> k) + (vload4(3,chunk) >> k);
 s = sum.x + sum.y + sum.z + sum.w;
 
 const int lpos = (15 << (max_porder + 1)) * get_group_id(0) + (k << (max_porder + 1)) + offs / 16;

@@ -329,7 +329,6 @@ void clQuantizeLPC(
 
 __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(1, 1, 1)))
 void clEstimateResidual(
-__global int*output,
 __global int*samples,
 __global FLACCLSubframeTask *tasks
 )
@@ -412,13 +411,18 @@ void clEstimateResidual(
 int k = clamp(clz(1 << (12 - EPO)) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
 total += (k << (12 - EPO)) + (res >> k);
 }
-output[get_group_id(0)] = min(0x7ffffff, total) + (bs - ro);
+int partLen = min(0x7ffffff, total) + (bs - ro);
+int obits = task.data.obits - task.data.wbits;
+tasks[get_group_id(0)].data.size = min(obits * bs,
+task.data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen :
+task.data.type == LPC ? ro * obits + 4 + 5 + ro * task.data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
+task.data.type == Constant ? obits * select(1, bs, partLen != bs - ro) :
+obits * bs);
 }
 
 __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
 void clChooseBestMethod(
 __global FLACCLSubframeTask *tasks,
-__global int *residual,
 int taskCount
 )
 {
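The context line at the top of this hunk estimates a Rice parameter per group of residuals from leading-zero counts: `k` is roughly log2 of the group's magnitude sum divided by the group size, and the cost charged is `k` bits per sample plus the unary quotient bits `res >> k` (the remaining one stop bit per sample is apparently what the `(bs - ro)` term adds). A hedged C reading of that estimate (the helper name and `__builtin_clz` are illustrative; the kernel uses OpenCL's `clz` and `clamp`, with `12 - EPO` playing the role of log2 of the group size):

    #include <stdint.h>

    /* Estimated Rice cost in bits for a group of 2^n_log2 residuals whose
     * zigzagged magnitudes sum to s, excluding the per-sample stop bits.
     * k ~ floor(log2(s)) - n_log2, clamped to [0, 14], as in the kernel's
     * clz(1 << (12 - EPO)) - clz(res) expression. */
    unsigned rice_cost_estimate(uint32_t s, unsigned n_log2)
    {
        int k = 0;
        if (s != 0) {                            /* clz(0) is undefined for the builtin */
            k = (31 - __builtin_clz(s)) - (int)n_log2;
            if (k < 0) k = 0;
            if (k > 14) k = 14;
        }
        /* k binary bits for each of the 2^n_log2 samples, plus the quotients */
        return ((unsigned)k << n_log2) + (s >> k);
    }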
@@ -426,19 +430,7 @@ void clChooseBestMethod(
 int best_no = 0;
 for (int taskNo = 0; taskNo < taskCount; taskNo++)
 {
-// fetch task data
-__global FLACCLSubframeTask* ptask = tasks + taskNo + taskCount * get_group_id(0);
-// fetch part sum
-int partLen = residual[taskNo + taskCount * get_group_id(0)];
-int obits = ptask->data.obits - ptask->data.wbits;
-int bs = ptask->data.blocksize;
-int ro = ptask->data.residualOrder;
-int len = min(obits * bs,
-ptask->data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen :
-ptask->data.type == LPC ? ro * obits + 4 + 5 + ro * ptask->data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
-ptask->data.type == Constant ? obits * select(1, bs, partLen != bs - ro) :
-obits * bs);
-ptask->data.size = len;
+int len = tasks[taskNo + taskCount * get_group_id(0)].data.size;
 if (len < best_length)
 {
 best_length = len;