mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
Intel OpenCL
This commit is contained in:
@@ -1593,6 +1593,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
#if DEBUG
|
#if DEBUG
|
||||||
"#define DEBUG\n" +
|
"#define DEBUG\n" +
|
||||||
#endif
|
#endif
|
||||||
|
(_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") +
|
||||||
_settings.Defines + "\n";
|
_settings.Defines + "\n";
|
||||||
// The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
|
// The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
|
||||||
OCLMan.BuildOptions = "";
|
OCLMan.BuildOptions = "";
|
||||||
|
|||||||
@@ -20,7 +20,7 @@
|
|||||||
#ifndef _FLACCL_KERNEL_H_
|
#ifndef _FLACCL_KERNEL_H_
|
||||||
#define _FLACCL_KERNEL_H_
|
#define _FLACCL_KERNEL_H_
|
||||||
|
|
||||||
#if defined(__Cedar__) || defined(__Redwood__) || defined(__Juniper__) || defined(__Cypress__) || defined(__ATI_RV770__) || defined(__ATI_RV730__) || defined(__ATI_RV710__)
|
#if defined(__Cedar__) || defined(__Redwood__) || defined(__Juniper__) || defined(__Cypress__) || defined(__ATI_RV770__) || defined(__ATI_RV730__) || defined(__ATI_RV710__) || defined(__CPU__)
|
||||||
#define AMD
|
#define AMD
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -152,7 +152,7 @@ __kernel void clChannelDecorr2(
|
|||||||
#define __ffs(a) (32 - clz(a & (-a)))
|
#define __ffs(a) (32 - clz(a & (-a)))
|
||||||
//#define __ffs(a) (33 - clz(~a & (a - 1)))
|
//#define __ffs(a) (33 - clz(~a & (a - 1)))
|
||||||
|
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
void clFindWastedBits(
|
void clFindWastedBits(
|
||||||
__global FLACCLSubframeTask *tasks,
|
__global FLACCLSubframeTask *tasks,
|
||||||
@@ -228,7 +228,7 @@ void clFindWastedBits(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
#define TEMPBLOCK 128
|
#define TEMPBLOCK 128
|
||||||
#define STORE_AC(ro, val) if (ro <= MAX_ORDER) pout[ro] = val;
|
#define STORE_AC(ro, val) if (ro <= MAX_ORDER) pout[ro] = val;
|
||||||
#define STORE_AC4(ro, val) STORE_AC(ro*4+0, val##ro.x) STORE_AC(ro*4+1, val##ro.y) STORE_AC(ro*4+2, val##ro.z) STORE_AC(ro*4+3, val##ro.w)
|
#define STORE_AC4(ro, val) STORE_AC(ro*4+0, val##ro.x) STORE_AC(ro*4+1, val##ro.y) STORE_AC(ro*4+2, val##ro.z) STORE_AC(ro*4+3, val##ro.w)
|
||||||
@@ -381,7 +381,7 @@ void clComputeAutocor(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
void clComputeLPC(
|
void clComputeLPC(
|
||||||
__global float *pautoc,
|
__global float *pautoc,
|
||||||
@@ -525,7 +525,7 @@ void clComputeLPC(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
void clQuantizeLPC(
|
void clQuantizeLPC(
|
||||||
__global FLACCLSubframeTask *tasks,
|
__global FLACCLSubframeTask *tasks,
|
||||||
@@ -545,7 +545,7 @@ void clQuantizeLPC(
|
|||||||
// Load prediction error estimates based on Akaike's Criteria
|
// Load prediction error estimates based on Akaike's Criteria
|
||||||
for (int tid = 0; tid < MAX_ORDER; tid++)
|
for (int tid = 0; tid < MAX_ORDER; tid++)
|
||||||
{
|
{
|
||||||
error[tid] = bs * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log(bs);
|
error[tid] = bs * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)bs);
|
||||||
best_orders[tid] = tid;
|
best_orders[tid] = tid;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -588,7 +588,7 @@ void clQuantizeLPC(
|
|||||||
{
|
{
|
||||||
float lpc = lpcs[lpcOffs + order * 32 + tid];
|
float lpc = lpcs[lpcOffs + order * 32 + tid];
|
||||||
// quantize coeffs with given shift
|
// quantize coeffs with given shift
|
||||||
int c = convert_int_rte(clamp(lpc * (1 << shift), -1 << (cbits - 1), 1 << (cbits - 1)));
|
int c = convert_int_rte(clamp(lpc * (1 << shift), (float)(-1 << (cbits - 1)), (float)(1 << (cbits - 1))));
|
||||||
// remove sign bits
|
// remove sign bits
|
||||||
tmpi |= c ^ (c >> 31);
|
tmpi |= c ^ (c >> 31);
|
||||||
tasks[taskNo].coefs[tid] = c;
|
tasks[taskNo].coefs[tid] = c;
|
||||||
@@ -764,7 +764,7 @@ void clQuantizeLPC(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
inline int calc_residual(__global int *ptr, int * coefs, int ro)
|
inline int calc_residual(__global int *ptr, int * coefs, int ro)
|
||||||
{
|
{
|
||||||
int sum = 0;
|
int sum = 0;
|
||||||
@@ -796,7 +796,7 @@ inline int calc_residual(__global int *ptr, int * coefs, int ro)
|
|||||||
default: ENCODE_N(ro, action) \
|
default: ENCODE_N(ro, action) \
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__(( vec_type_hint (int4))) __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
void clEstimateResidual(
|
void clEstimateResidual(
|
||||||
__global int*samples,
|
__global int*samples,
|
||||||
__global int*selectedTasks,
|
__global int*selectedTasks,
|
||||||
@@ -811,13 +811,30 @@ void clEstimateResidual(
|
|||||||
int len[1 << EPO]; // blocksize / 64!!!!
|
int len[1 << EPO]; // blocksize / 64!!!!
|
||||||
|
|
||||||
__global int *data = &samples[task.data.samplesOffs];
|
__global int *data = &samples[task.data.samplesOffs];
|
||||||
// for (int i = ro; i < 32; i++)
|
for (int i = ro; i < 32; i++)
|
||||||
//task.coefs[i] = 0;
|
task.coefs[i] = 0;
|
||||||
for (int i = 0; i < 1 << EPO; i++)
|
for (int i = 0; i < 1 << EPO; i++)
|
||||||
len[i] = 0;
|
len[i] = 0;
|
||||||
|
|
||||||
|
#ifdef AMD
|
||||||
SWITCH_N((t = clamp(t, -0x7fffff, 0x7fffff), len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31)))
|
SWITCH_N((t = clamp(t, -0x7fffff, 0x7fffff), len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31)))
|
||||||
|
#else
|
||||||
|
int4 c0 = vload4(0, &task.coefs[0]);
|
||||||
|
int4 c1 = vload4(1, &task.coefs[0]);
|
||||||
|
int4 c2 = vload4(2, &task.coefs[0]);
|
||||||
|
|
||||||
|
for (int pos = ro; pos < bs; pos ++)
|
||||||
|
{
|
||||||
|
__global int * dptr = data + pos - ro;
|
||||||
|
int4 sum
|
||||||
|
= c0 * vload4(0, dptr)
|
||||||
|
+ c1 * vload4(1, dptr)
|
||||||
|
+ c2 * vload4(2, dptr);
|
||||||
|
int t = (data[pos] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift)) >> task.data.wbits;
|
||||||
|
t = iclamp(t, -0x7fffff, 0x7fffff);
|
||||||
|
len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
int total = 0;
|
int total = 0;
|
||||||
for (int i = 0; i < 1 << EPO; i++)
|
for (int i = 0; i < 1 << EPO; i++)
|
||||||
{
|
{
|
||||||
@@ -1052,7 +1069,7 @@ void clChooseBestMethod(
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef DO_PARTITIONS
|
#ifdef DO_PARTITIONS
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
// get_group_id(0) == task index
|
// get_group_id(0) == task index
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
void clEncodeResidual(
|
void clEncodeResidual(
|
||||||
@@ -1144,7 +1161,7 @@ void clEncodeResidual(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
void clCalcPartition(
|
void clCalcPartition(
|
||||||
__global int *partition_lengths,
|
__global int *partition_lengths,
|
||||||
@@ -1232,7 +1249,7 @@ void clCalcPartition(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
// get_group_id(0) == task index
|
// get_group_id(0) == task index
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
void clCalcPartition16(
|
void clCalcPartition16(
|
||||||
@@ -1346,7 +1363,7 @@ void clCalcPartition16(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
// Sums partition lengths for a certain k == get_group_id(0)
|
// Sums partition lengths for a certain k == get_group_id(0)
|
||||||
// get_group_id(0) == k
|
// get_group_id(0) == k
|
||||||
// get_group_id(1) == task index
|
// get_group_id(1) == task index
|
||||||
@@ -1405,7 +1422,7 @@ void clSumPartition(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
// Finds optimal rice parameter for each partition.
|
// Finds optimal rice parameter for each partition.
|
||||||
// get_group_id(0) == task index
|
// get_group_id(0) == task index
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
@@ -1469,7 +1486,7 @@ void clFindRiceParameter(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
// get_group_id(0) == task index
|
// get_group_id(0) == task index
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
void clFindPartitionOrder(
|
void clFindPartitionOrder(
|
||||||
@@ -1603,10 +1620,10 @@ void clFindPartitionOrder(
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DO_RICE
|
#ifdef DO_RICE
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
typedef struct BitWriter_t
|
typedef struct BitWriter_t
|
||||||
{
|
{
|
||||||
__global int *buffer;
|
__global unsigned int *buffer;
|
||||||
unsigned int bit_buf;
|
unsigned int bit_buf;
|
||||||
int bit_left;
|
int bit_left;
|
||||||
int buf_ptr;
|
int buf_ptr;
|
||||||
@@ -1695,7 +1712,7 @@ void clRiceEncoding(
|
|||||||
int max_porder
|
int max_porder
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
#ifdef __CPU__
|
#ifdef FLACCL_CPU
|
||||||
__global FLACCLSubframeTask* task = tasks + get_group_id(0);
|
__global FLACCLSubframeTask* task = tasks + get_group_id(0);
|
||||||
if (task->data.type == Fixed || task->data.type == LPC)
|
if (task->data.type == Fixed || task->data.type == LPC)
|
||||||
{
|
{
|
||||||
@@ -1804,7 +1821,7 @@ void clRiceEncoding(
|
|||||||
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
|
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
|
||||||
mypos[tid] = mylen;
|
mypos[tid] = mylen;
|
||||||
// Inclusive scan(+)
|
// Inclusive scan(+)
|
||||||
#if 0
|
#if 1
|
||||||
int lane = (tid & (WARP_SIZE - 1));
|
int lane = (tid & (WARP_SIZE - 1));
|
||||||
for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
|
for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
|
||||||
mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)];
|
mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)];
|
||||||
@@ -1863,9 +1880,9 @@ void clRiceEncoding(
|
|||||||
start = mypos[GROUP_SIZE - 1];
|
start = mypos[GROUP_SIZE - 1];
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
unsigned int bb = data[tid];
|
unsigned int bb = data[tid];
|
||||||
// bb = (bb >> 24) | ((bb >> 8) & 0xff00U) | ((bb << 8) & 0xff0000U) | (bb << 24);
|
bb = (bb >> 24) | ((bb >> 8) & 0xff00U) | ((bb << 8) & 0xff0000U) | (bb << 24);
|
||||||
if ((start32 + tid) * 32 <= start)
|
if ((start32 + tid) * 32 <= start)
|
||||||
output[start32 + tid] = 0U;
|
output[start32 + tid] = bb;
|
||||||
unsigned int remainder = data[start / 32 - start32];
|
unsigned int remainder = data[start / 32 - start32];
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
data[tid] = select(0U, remainder, tid == 0);
|
data[tid] = select(0U, remainder, tid == 0);
|
||||||
|
|||||||
Reference in New Issue
Block a user