mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
FLACCL: cleanup
This commit is contained in:
@@ -206,7 +206,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
bool inited = false;
|
||||
|
||||
OpenCLManager OCLMan;
|
||||
Program openCLProgram;
|
||||
CLProgram openCLProgram;
|
||||
|
||||
FLACCLTask task1;
|
||||
FLACCLTask task2;
|
||||
@@ -1735,10 +1735,17 @@ namespace CUETools.Codecs.FLACCL
|
||||
#if DEBUG
|
||||
"#define DEBUG\n" +
|
||||
#endif
|
||||
(m_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") +
|
||||
(m_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") +
|
||||
"#define OPENCL_PLATFORM \"" + OpenCL.GetPlatform(platformId).Name + "\"\n" +
|
||||
m_settings.Defines + "\n";
|
||||
|
||||
var exts = new string[] { "cl_khr_local_int32_base_atomics", "cl_khr_local_int32_extended_atomics", "cl_khr_fp64", "cl_amd_fp64" };
|
||||
var exts = new string[] {
|
||||
"cl_khr_fp64",
|
||||
"cl_amd_fp64",
|
||||
#if DEBUG
|
||||
"cl_amd_printf",
|
||||
#endif
|
||||
};
|
||||
foreach (string extension in exts)
|
||||
if (OCLMan.Context.Devices[0].Extensions.Contains(extension))
|
||||
{
|
||||
@@ -2413,7 +2420,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
|
||||
internal class FLACCLTask
|
||||
{
|
||||
Program openCLProgram;
|
||||
CLProgram openCLProgram;
|
||||
public CommandQueue openCLCQ;
|
||||
public Kernel clStereoDecorr;
|
||||
//public Kernel cudaChannelDecorr;
|
||||
@@ -2501,7 +2508,7 @@ namespace CUETools.Codecs.FLACCL
|
||||
public bool UseGPURice = false;
|
||||
public bool UseMappedMemory = false;
|
||||
|
||||
unsafe public FLACCLTask(Program _openCLProgram, int channelsCount, int max_frame_size, FLACCLWriter writer, int groupSize, bool gpuOnly, bool gpuRice)
|
||||
unsafe public FLACCLTask(CLProgram _openCLProgram, int channelsCount, int max_frame_size, FLACCLWriter writer, int groupSize, bool gpuOnly, bool gpuRice)
|
||||
{
|
||||
this.UseGPUOnly = gpuOnly;
|
||||
this.UseGPURice = gpuOnly && gpuRice;
|
||||
@@ -3080,7 +3087,6 @@ namespace CUETools.Codecs.FLACCL
|
||||
}
|
||||
}
|
||||
|
||||
#if LKJLKJLJK
|
||||
public static class OpenCLExtensions
|
||||
{
|
||||
public static void SetArgs(this Kernel kernel, params object[] args)
|
||||
@@ -3111,5 +3117,13 @@ namespace CUETools.Codecs.FLACCL
|
||||
queue.EnqueueNDRangeKernel(kernel, 2, null, new long[] { localSizeX * globalSizeX, localSizeY * globalSizeY }, new long[] { localSizeX, localSizeY });
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace System.Runtime.CompilerServices
|
||||
{
|
||||
[AttributeUsageAttribute(AttributeTargets.Assembly | AttributeTargets.Class | AttributeTargets.Method)]
|
||||
internal sealed class ExtensionAttribute : Attribute
|
||||
{
|
||||
public ExtensionAttribute() { }
|
||||
}
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -20,16 +20,16 @@
|
||||
#ifndef _FLACCL_KERNEL_H_
|
||||
#define _FLACCL_KERNEL_H_
|
||||
|
||||
#if defined(__Cedar__) || defined(__Redwood__) || defined(__Juniper__) || defined(__Cypress__) || defined(__ATI_RV770__) || defined(__ATI_RV730__) || defined(__ATI_RV710__) || defined(__CPU__)
|
||||
#if __OPENCL_VERSION__ < 110
|
||||
#error OpenCL 1.1+ required!
|
||||
#endif
|
||||
|
||||
#if defined(__WinterPark__) || defined(__BeaverCreek__) || defined(__Turks__) || defined(__Caicos__) || defined(__Tahiti__) || defined(__Pitcairn__) || defined(__Capeverde__)
|
||||
#define AMD
|
||||
#elif defined(__Cayman__) || defined(__Barts__) || defined(__Cypress__) || defined(__Juniper__) || defined(__Redwood__) || defined(__Cedar__)
|
||||
#define AMD
|
||||
#elif defined(__ATI_RV770__) || defined(__ATI_RV730__) || defined(__ATI_RV710__)
|
||||
#define AMD
|
||||
#endif
|
||||
|
||||
#if defined(AMD) && defined(DEBUG)
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_cl_khr_local_int32_base_atomics) && defined(HAVE_cl_khr_local_int32_extended_atomics)
|
||||
#define HAVE_ATOM
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_cl_khr_fp64) || defined(HAVE_cl_amd_fp64)
|
||||
@@ -51,18 +51,6 @@
|
||||
#define ZEROFD 0.0f
|
||||
#endif
|
||||
|
||||
|
||||
//#if __OPENCL_VERSION__ == 110
|
||||
#ifdef AMD
|
||||
#define iclamp(a,b,c) clamp(a,b,c)
|
||||
#else
|
||||
#define iclamp(a,b,c) max(b,min(a,c))
|
||||
#endif
|
||||
|
||||
#ifndef M_PI_F
|
||||
#define M_PI_F M_PI
|
||||
#endif
|
||||
|
||||
#define WARP_SIZE 32
|
||||
|
||||
#if BITS_PER_SAMPLE > 16
|
||||
@@ -708,9 +696,6 @@ void clQuantizeLPC(
|
||||
volatile int index[64];
|
||||
volatile float error[64];
|
||||
volatile int maxcoef[32];
|
||||
#ifndef HAVE_ATOM
|
||||
volatile int tmp[32];
|
||||
#endif
|
||||
// volatile int best8;
|
||||
} shared;
|
||||
|
||||
@@ -800,20 +785,7 @@ void clQuantizeLPC(
|
||||
// get 15 bits of each coeff
|
||||
int coef = convert_int_rte(lpc * (1 << 15));
|
||||
// remove sign bits
|
||||
#ifdef HAVE_ATOM
|
||||
atom_or(shared.maxcoef + i, coef ^ (coef >> 31));
|
||||
#else
|
||||
shared.tmp[tid] = coef ^ (coef >> 31);
|
||||
if (tid < 16)
|
||||
{
|
||||
shared.tmp[tid] |= shared.tmp[tid + 16];
|
||||
shared.tmp[tid] |= shared.tmp[tid + 8];
|
||||
shared.tmp[tid] |= shared.tmp[tid + 4];
|
||||
shared.tmp[tid] |= shared.tmp[tid + 2];
|
||||
if (tid == 0)
|
||||
shared.maxcoef[i] = shared.tmp[tid] | shared.tmp[tid + 1];
|
||||
}
|
||||
#endif
|
||||
atomic_or(shared.maxcoef + i, coef ^ (coef >> 31));
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int cbits = min(51 - 2 * clz(shared.task.blocksize), shared.task.abits);
|
||||
@@ -821,17 +793,17 @@ void clQuantizeLPC(
|
||||
// Limit cbits so that 32-bit arithmetics will be enough when calculating residual
|
||||
cbits = min(cbits, clz(order) + 1 - shared.task.obits);
|
||||
#endif
|
||||
cbits = iclamp(cbits - minprecision + (i - ((i >> precisions) << precisions)), 3, 15);
|
||||
cbits = clamp(cbits - minprecision + (i - ((i >> precisions) << precisions)), 3, 15);
|
||||
|
||||
// Calculate shift based on precision and number of leading zeroes in coeffs.
|
||||
// We know that if shifted by 15, coefs require
|
||||
// 33 - clz(shared.maxcoef[i]) bits;
|
||||
// So to get the desired cbits, we need to shift coefs by
|
||||
// 15 + cbits - (33 - clz(shared.maxcoef[i]));
|
||||
int shift = iclamp(clz(shared.maxcoef[i]) - 18 + cbits, 0, 15);
|
||||
int shift = clamp(clz(shared.maxcoef[i]) - 18 + cbits, 0, 15);
|
||||
|
||||
int lim = (1 << (cbits - 1)) - 1;
|
||||
coef = iclamp(convert_int_rte(lpc * (1 << shift)), -lim, lim);
|
||||
coef = clamp(convert_int_rte(lpc * (1 << shift)), -lim, lim);
|
||||
|
||||
// output shift, cbits and output coeffs
|
||||
int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
|
||||
@@ -847,31 +819,6 @@ void clQuantizeLPC(
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef FLACCL_CPU
|
||||
inline int fastclz(int iv)
|
||||
{
|
||||
unsigned int v = (unsigned int)iv;
|
||||
int x = (0 != (v >> 16)) * 16;
|
||||
x += (0 != (v >> (x + 8))) * 8;
|
||||
x += (0 != (v >> (x + 4))) * 4;
|
||||
x += (0 != (v >> (x + 2))) * 2;
|
||||
x += (0 != (v >> (x + 1)));
|
||||
x += (0 != (v >> x));
|
||||
return 32 - x;
|
||||
}
|
||||
#else
|
||||
inline int fastclz(int iv)
|
||||
{
|
||||
return clz(iv);
|
||||
}
|
||||
#endif
|
||||
inline int fastclz64(long iv)
|
||||
{
|
||||
unsigned long v = (unsigned long)iv;
|
||||
int x = (0 != (v >> 32)) * 32;
|
||||
return 32 - x + fastclz(v >> x);
|
||||
}
|
||||
|
||||
#ifdef FLACCL_CPU
|
||||
#define TEMPBLOCK1 TEMPBLOCK
|
||||
|
||||
@@ -1012,7 +959,7 @@ void clEstimateResidual(
|
||||
for (int i = 0; i < ERPARTS; i++)
|
||||
{
|
||||
int res = convert_int_sat_rte(len[i] * 2);
|
||||
int k = iclamp(31 - fastclz(res) - 6, 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
|
||||
int k = clamp(31 - clz(res) - 6, 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
|
||||
total += (k << 6) + (res >> k);
|
||||
}
|
||||
int partLen = min(0x7ffffff, total) + (bs - ro);
|
||||
@@ -1034,7 +981,7 @@ void clEstimateResidual(
|
||||
)
|
||||
{
|
||||
__local float data[GROUP_SIZE * 2 + 32];
|
||||
#if !defined(AMD) || !defined(HAVE_ATOM)
|
||||
#if !defined(AMD)
|
||||
__local volatile uint idata[GROUP_SIZE + 16];
|
||||
#endif
|
||||
__local FLACCLSubframeTask task;
|
||||
@@ -1112,14 +1059,14 @@ void clEstimateResidual(
|
||||
t = select(0U, t, offs >= ro);
|
||||
// overflow protection
|
||||
t = min(t, 0x7ffffffU);
|
||||
#if !defined(AMD) || !defined(HAVE_ATOM)
|
||||
#if !defined(AMD)
|
||||
idata[tid] = t;
|
||||
for (int l = 16; l > 1; l >>= 1)
|
||||
idata[tid] += idata[tid + l];
|
||||
if ((tid & 31) == 0)
|
||||
psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1];
|
||||
#else
|
||||
atom_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
|
||||
atomic_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
|
||||
#endif
|
||||
}
|
||||
if (pos < bs)
|
||||
@@ -1158,14 +1105,14 @@ void clEstimateResidual(
|
||||
t = select(0U, t, offs >= ro && offs < bs);
|
||||
// overflow protection
|
||||
t = min(t, 0x7ffffffU);
|
||||
#if !defined(AMD) || !defined(HAVE_ATOM)
|
||||
#if !defined(AMD)
|
||||
idata[tid] = t;
|
||||
for (int l = 16; l > 1; l >>= 1)
|
||||
idata[tid] += idata[tid + l];
|
||||
if ((tid & 31) == 0)
|
||||
psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1];
|
||||
#else
|
||||
atom_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
|
||||
atomic_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1183,7 +1130,7 @@ void clEstimateResidual(
|
||||
//if (offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
|
||||
// psum[offs] = pl;
|
||||
// }
|
||||
int k = iclamp(31 - fastclz(pl) - (ESTPARTLOG + 1), 0, MAX_RICE_PARAM); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32)
|
||||
int k = clamp(31 - (int)clz(pl) - (ESTPARTLOG + 1), 0, MAX_RICE_PARAM); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32)
|
||||
if (tid < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
|
||||
psum[tid] = (k << (ESTPARTLOG + 1)) + (pl >> k);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -1491,7 +1438,7 @@ void clCalcPartition(
|
||||
// we must ensure that psize * (t >> k) doesn't overflow;
|
||||
uint lim = 0x7fffffffU / (uint)psize;
|
||||
for (int k = 0; k <= MAX_RICE_PARAM; k++)
|
||||
atom_add(&pl[part][k], min(lim, t >> k));
|
||||
atomic_add(&pl[part][k], min(lim, t >> k));
|
||||
//pl[part][k] += s >> k;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -1666,7 +1613,7 @@ void clFindRiceParameter(
|
||||
|
||||
ulong pl = ppl[pos];
|
||||
int ps = (bs >> porder) - ro;
|
||||
int k = iclamp(63 - fastclz64(pl / max(1, ps)), 0, MAX_RICE_PARAM);
|
||||
int k = clamp(63 - (int)clz(pl / max(1, ps)), 0, MAX_RICE_PARAM);
|
||||
int plk = ps * (k + 1) + (int)(pl >> k);
|
||||
|
||||
// output rice parameter
|
||||
@@ -1679,7 +1626,7 @@ void clFindRiceParameter(
|
||||
for (int offs = pos + 1; offs < fin; offs++)
|
||||
{
|
||||
pl = ppl[offs];
|
||||
k = iclamp(63 - fastclz64(pl / ps), 0, MAX_RICE_PARAM);
|
||||
k = clamp(63 - (int)clz(pl / ps), 0, MAX_RICE_PARAM);
|
||||
plk = ps * (k + 1) + (int)(pl >> k);
|
||||
|
||||
// output rice parameter
|
||||
@@ -1815,7 +1762,7 @@ void clFindPartitionOrder(
|
||||
{
|
||||
int len = rice_parameters[pos + offs];
|
||||
int porder = 31 - clz(lim - offs);
|
||||
atom_add(&partlen[porder], len);
|
||||
atomic_add(&partlen[porder], len);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
@@ -1891,11 +1838,7 @@ inline void flush(BitWriter *bw)
|
||||
|
||||
inline int len_utf8(int n)
|
||||
{
|
||||
#ifdef FLACCL_CPU
|
||||
int bts = 31 - fastclz(n);
|
||||
#else
|
||||
int bts = 31 - clz(n);
|
||||
#endif
|
||||
return select(8, 8 * ((bts + 4) / 5), bts > 6);
|
||||
}
|
||||
|
||||
@@ -2050,11 +1993,7 @@ void clRiceEncoding(
|
||||
unsigned int bb = bw.bit_buf << bw.bit_left;
|
||||
bw.bit_buf = 0;
|
||||
bw.bit_left += (32 - b);
|
||||
#ifdef AMD
|
||||
bw.buffer[bw.buf_ptr++] = as_int(as_char4(bb).wzyx);
|
||||
#else
|
||||
bw.buffer[bw.buf_ptr++] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000);
|
||||
#endif
|
||||
}
|
||||
bits -= b;
|
||||
}
|
||||
@@ -2069,11 +2008,7 @@ void clRiceEncoding(
|
||||
unsigned int bb = (bw.bit_buf << bw.bit_left) | (val >> (bits - bw.bit_left));
|
||||
bw.bit_buf = val;
|
||||
bw.bit_left += (32 - bits);
|
||||
#ifdef AMD
|
||||
bw.buffer[bw.buf_ptr++] = as_int(as_char4(bb).wzyx);
|
||||
#else
|
||||
bw.buffer[bw.buf_ptr++] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000);
|
||||
#endif
|
||||
}
|
||||
////if (get_group_id(0) == 0) printf("%x ", v);
|
||||
//writebits(&bw, (v >> k) + 1, 1);
|
||||
@@ -2171,8 +2106,8 @@ void clRiceEncoding(
|
||||
uint kval = (uint)k << (32 - RICE_PARAM_BITS);
|
||||
uint kval0 = kval >> kpos1;
|
||||
uint kval1 = kval << (32 - kpos1);
|
||||
if (kval0) atom_or(&data[kpos0], kval0);
|
||||
if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1);
|
||||
if (kval0) atomic_or(&data[kpos0], kval0);
|
||||
if (kpos1 && kval1) atomic_or(&data[kpos0 + 1], kval1);
|
||||
}
|
||||
int qpos = mp - k - 1;
|
||||
int qpos0 = (qpos >> 5) - start32;
|
||||
@@ -2180,8 +2115,8 @@ void clRiceEncoding(
|
||||
uint qval = (1U << 31) | (v << (31 - k));
|
||||
uint qval0 = qval >> qpos1;
|
||||
uint qval1= qval << (32 - qpos1);
|
||||
if (qval0) atom_or(&data[qpos0], qval0);
|
||||
if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1);
|
||||
if (qval0) atomic_or(&data[qpos0], qval0);
|
||||
if (qpos1 && qval1) atomic_or(&data[qpos0 + 1], qval1);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if ((start32 + tid) * 32 <= start)
|
||||
@@ -2232,8 +2167,8 @@ void clRiceEncoding(
|
||||
uint kval = (uint)k << (32 - RICE_PARAM_BITS);
|
||||
uint kval0 = kval >> kpos1;
|
||||
uint kval1 = kval << (32 - kpos1);
|
||||
if (kval0) atom_or(&data[kpos0], kval0);
|
||||
if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1);
|
||||
if (kval0) atomic_or(&data[kpos0], kval0);
|
||||
if (kpos1 && kval1) atomic_or(&data[kpos0 + 1], kval1);
|
||||
}
|
||||
int qpos = mp - k - 1;
|
||||
int qpos0 = (qpos >> 5) - start32;
|
||||
@@ -2241,8 +2176,8 @@ void clRiceEncoding(
|
||||
uint qval = (1U << 31) | (v << (31 - k));
|
||||
uint qval0 = qval >> qpos1;
|
||||
uint qval1= qval << (32 - qpos1);
|
||||
if (qval0) atom_or(&data[qpos0], qval0);
|
||||
if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1);
|
||||
if (qval0) atomic_or(&data[qpos0], qval0);
|
||||
if (qpos1 && qval1) atomic_or(&data[qpos0 + 1], qval1);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if ((start32 + tid) * 32 <= start)
|
||||
|
||||
Reference in New Issue
Block a user