FLACCL: cleanup

2025-12-16 18:14:25 +00:00 · 2013-05-30 22:14:16 -04:00
parent db81eee039
commit 351705f551
6 changed files with 53 additions and 104 deletions
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -206,7 +206,7 @@ namespace CUETools.Codecs.FLACCL
        bool inited = false;

        OpenCLManager OCLMan;
-        Program openCLProgram;
+        CLProgram openCLProgram;

        FLACCLTask task1;
        FLACCLTask task2;
@@ -1735,10 +1735,17 @@ namespace CUETools.Codecs.FLACCL
 #if DEBUG
 					"#define DEBUG\n" +
 #endif
- (m_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") +
+                    (m_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") +
+                    "#define OPENCL_PLATFORM \"" + OpenCL.GetPlatform(platformId).Name + "\"\n" +
                    m_settings.Defines + "\n";

-                var exts = new string[] { "cl_khr_local_int32_base_atomics", "cl_khr_local_int32_extended_atomics", "cl_khr_fp64", "cl_amd_fp64" };
+                var exts = new string[] { 
+                    "cl_khr_fp64", 
+                    "cl_amd_fp64", 
+#if DEBUG
+                    "cl_amd_printf",
+#endif
+                };
                foreach (string extension in exts)
                    if (OCLMan.Context.Devices[0].Extensions.Contains(extension))
                    {
@@ -2413,7 +2420,7 @@ namespace CUETools.Codecs.FLACCL

    internal class FLACCLTask
    {
-        Program openCLProgram;
+        CLProgram openCLProgram;
        public CommandQueue openCLCQ;
        public Kernel clStereoDecorr;
        //public Kernel cudaChannelDecorr;
@@ -2501,7 +2508,7 @@ namespace CUETools.Codecs.FLACCL
        public bool UseGPURice = false;
        public bool UseMappedMemory = false;

-        unsafe public FLACCLTask(Program _openCLProgram, int channelsCount, int max_frame_size, FLACCLWriter writer, int groupSize, bool gpuOnly, bool gpuRice)
+        unsafe public FLACCLTask(CLProgram _openCLProgram, int channelsCount, int max_frame_size, FLACCLWriter writer, int groupSize, bool gpuOnly, bool gpuRice)
        {
            this.UseGPUOnly = gpuOnly;
            this.UseGPURice = gpuOnly && gpuRice;
@@ -3080,7 +3087,6 @@ namespace CUETools.Codecs.FLACCL
        }
    }

-#if LKJLKJLJK
    public static class OpenCLExtensions
    {
        public static void SetArgs(this Kernel kernel, params object[] args)
@@ -3111,5 +3117,13 @@ namespace CUETools.Codecs.FLACCL
            queue.EnqueueNDRangeKernel(kernel, 2, null, new long[] { localSizeX * globalSizeX, localSizeY * globalSizeY }, new long[] { localSizeX, localSizeY });
        }
    }
-#endif
+}
+
+namespace System.Runtime.CompilerServices
+{
+    [AttributeUsageAttribute(AttributeTargets.Assembly | AttributeTargets.Class | AttributeTargets.Method)]
+    internal sealed class ExtensionAttribute : Attribute
+    {
+        public ExtensionAttribute() { }
+    }
 }
--- a/CUETools.Codecs.FLACCL/OpenCLNet.dll
+++ b/CUETools.Codecs.FLACCL/OpenCLNet.dll
--- a/CUETools.Codecs.FLACCL/OpenCLNet.pdb
+++ b/CUETools.Codecs.FLACCL/OpenCLNet.pdb
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -20,16 +20,16 @@
 #ifndef _FLACCL_KERNEL_H_
 #define _FLACCL_KERNEL_H_

-#if defined(__Cedar__) || defined(__Redwood__) || defined(__Juniper__) || defined(__Cypress__) || defined(__ATI_RV770__) || defined(__ATI_RV730__) || defined(__ATI_RV710__) || defined(__CPU__)
+#if __OPENCL_VERSION__ < 110
+#error OpenCL 1.1+ required!
+#endif
+
+#if defined(__WinterPark__) || defined(__BeaverCreek__) || defined(__Turks__) || defined(__Caicos__) || defined(__Tahiti__) || defined(__Pitcairn__) || defined(__Capeverde__)
+#define AMD
+#elif defined(__Cayman__) || defined(__Barts__) || defined(__Cypress__) || defined(__Juniper__) || defined(__Redwood__) || defined(__Cedar__)
+#define AMD
+#elif defined(__ATI_RV770__) || defined(__ATI_RV730__) || defined(__ATI_RV710__)
 #define AMD
-#endif
-
-#if defined(AMD) && defined(DEBUG)
-#pragma OPENCL EXTENSION cl_amd_printf : enable
-#endif
-
-#if defined(HAVE_cl_khr_local_int32_base_atomics) && defined(HAVE_cl_khr_local_int32_extended_atomics)
-#define HAVE_ATOM
 #endif

 #if defined(HAVE_cl_khr_fp64) || defined(HAVE_cl_amd_fp64)
@@ -51,18 +51,6 @@
 #define ZEROFD 0.0f
 #endif

-
-//#if __OPENCL_VERSION__ == 110
-#ifdef AMD
-#define iclamp(a,b,c) clamp(a,b,c)
-#else
-#define iclamp(a,b,c) max(b,min(a,c))
-#endif
-
-#ifndef M_PI_F
-#define M_PI_F M_PI
-#endif
-
 #define WARP_SIZE 32

 #if BITS_PER_SAMPLE > 16
@@ -708,9 +696,6 @@ void clQuantizeLPC(
 	volatile int index[64];
 	volatile float error[64];
 	volatile int maxcoef[32];
-#ifndef HAVE_ATOM
-	volatile int tmp[32];
-#endif
 //    	volatile int best8;
    } shared;

@@ -800,20 +785,7 @@ void clQuantizeLPC(
 	// get 15 bits of each coeff
 	int coef = convert_int_rte(lpc * (1 << 15));
 	// remove sign bits
-#ifdef HAVE_ATOM
-	atom_or(shared.maxcoef + i, coef ^ (coef >> 31));
-#else
-	shared.tmp[tid] = coef ^ (coef >> 31);
-	if (tid < 16)
-	{
-	    shared.tmp[tid] |= shared.tmp[tid + 16];
-	    shared.tmp[tid] |= shared.tmp[tid + 8];
-	    shared.tmp[tid] |= shared.tmp[tid + 4];
-	    shared.tmp[tid] |= shared.tmp[tid + 2];
-	    if (tid == 0)
-		shared.maxcoef[i] = shared.tmp[tid] | shared.tmp[tid + 1];
-	}
-#endif
+	atomic_or(shared.maxcoef + i, coef ^ (coef >> 31));
 	barrier(CLK_LOCAL_MEM_FENCE);

        int cbits = min(51 - 2 * clz(shared.task.blocksize), shared.task.abits);
@@ -821,17 +793,17 @@ void clQuantizeLPC(
 	// Limit cbits so that 32-bit arithmetics will be enough when calculating residual
        cbits = min(cbits, clz(order) + 1 - shared.task.obits);
 #endif
-        cbits = iclamp(cbits - minprecision + (i - ((i >> precisions) << precisions)), 3, 15);
+        cbits = clamp(cbits - minprecision + (i - ((i >> precisions) << precisions)), 3, 15);

 	// Calculate shift based on precision and number of leading zeroes in coeffs.
 	// We know that if shifted by 15, coefs require 
 	// 33 - clz(shared.maxcoef[i]) bits;
 	// So to get the desired cbits, we need to shift coefs by 
 	// 15 + cbits - (33 - clz(shared.maxcoef[i]));
-	int shift = iclamp(clz(shared.maxcoef[i]) - 18 + cbits, 0, 15);
+	int shift = clamp(clz(shared.maxcoef[i]) - 18 + cbits, 0, 15);

 	int lim = (1 << (cbits - 1)) - 1;
-	coef = iclamp(convert_int_rte(lpc * (1 << shift)), -lim, lim);
+	coef = clamp(convert_int_rte(lpc * (1 << shift)), -lim, lim);

 	// output shift, cbits and output coeffs
 	int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
@@ -847,31 +819,6 @@ void clQuantizeLPC(
 }
 #endif

-#ifdef FLACCL_CPU
-inline int fastclz(int iv)
-{
-    unsigned int v = (unsigned int)iv;
-    int x = (0 != (v >> 16)) * 16;
-    x += (0 != (v >> (x + 8))) * 8;
-    x += (0 != (v >> (x + 4))) * 4;
-    x += (0 != (v >> (x + 2))) * 2;
-    x += (0 != (v >> (x + 1)));
-    x += (0 != (v >> x));
-    return 32 - x;
-}
-#else
-inline int fastclz(int iv)
-{
-    return clz(iv);
-}
-#endif
-inline int fastclz64(long iv)
-{
-    unsigned long v = (unsigned long)iv;
-    int x = (0 != (v >> 32)) * 32;
-    return 32 - x + fastclz(v >> x);
-}
-
 #ifdef FLACCL_CPU
 #define TEMPBLOCK1 TEMPBLOCK

@@ -1012,7 +959,7 @@ void clEstimateResidual(
    for (int i = 0; i < ERPARTS; i++)
    {
 	int res = convert_int_sat_rte(len[i] * 2);
-	int k = iclamp(31 - fastclz(res) - 6, 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
+	int k = clamp(31 - clz(res) - 6, 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
 	total += (k << 6) + (res >> k);
    }
    int partLen = min(0x7ffffff, total) + (bs - ro);
@@ -1034,7 +981,7 @@ void clEstimateResidual(
    )
 {
    __local float data[GROUP_SIZE * 2 + 32];
-#if !defined(AMD) || !defined(HAVE_ATOM)
+#if !defined(AMD)
    __local volatile uint idata[GROUP_SIZE + 16];
 #endif
    __local FLACCLSubframeTask task;
@@ -1112,14 +1059,14 @@ void clEstimateResidual(
 	t = select(0U, t, offs >= ro);
 	// overflow protection
 	t = min(t, 0x7ffffffU);
-#if !defined(AMD) || !defined(HAVE_ATOM)
+#if !defined(AMD)
 	idata[tid] = t;
 	for (int l = 16; l > 1; l >>= 1)
 	    idata[tid] += idata[tid + l];
 	if ((tid & 31) == 0)
 	    psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1];
 #else
-	atom_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
+	atomic_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
 #endif
    }
    if (pos < bs)
@@ -1158,14 +1105,14 @@ void clEstimateResidual(
 	t = select(0U, t, offs >= ro && offs < bs);
 	// overflow protection
 	t = min(t, 0x7ffffffU);
-#if !defined(AMD) || !defined(HAVE_ATOM)
+#if !defined(AMD)
 	idata[tid] = t;
 	for (int l = 16; l > 1; l >>= 1)
 	    idata[tid] += idata[tid + l];
 	if ((tid & 31) == 0)
 	    psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1];
 #else
-	atom_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
+	atomic_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
 #endif
    }

@@ -1183,7 +1130,7 @@ void clEstimateResidual(
 	//if (offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
 	//    psum[offs] = pl;
 //   }
-    int k = iclamp(31 - fastclz(pl) - (ESTPARTLOG + 1), 0, MAX_RICE_PARAM); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32)
+    int k = clamp(31 - (int)clz(pl) - (ESTPARTLOG + 1), 0, MAX_RICE_PARAM); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32)
    if (tid < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
 	psum[tid] = (k << (ESTPARTLOG + 1)) + (pl >> k);
    barrier(CLK_LOCAL_MEM_FENCE);
@@ -1491,7 +1438,7 @@ void clCalcPartition(
 	// we must ensure that psize * (t >> k) doesn't overflow;
 	uint lim = 0x7fffffffU / (uint)psize;
 	for (int k = 0; k <= MAX_RICE_PARAM; k++)
-	    atom_add(&pl[part][k], min(lim, t >> k));
+	    atomic_add(&pl[part][k], min(lim, t >> k));
 	    //pl[part][k] += s >> k;
    }   
    barrier(CLK_LOCAL_MEM_FENCE);
@@ -1666,7 +1613,7 @@ void clFindRiceParameter(

 	ulong pl = ppl[pos];
 	int ps = (bs >> porder) - ro;
-	int k = iclamp(63 - fastclz64(pl / max(1, ps)), 0, MAX_RICE_PARAM);
+	int k = clamp(63 - (int)clz(pl / max(1, ps)), 0, MAX_RICE_PARAM);
 	int plk = ps * (k + 1) + (int)(pl >> k);

 	// output rice parameter
@@ -1679,7 +1626,7 @@ void clFindRiceParameter(
 	for (int offs = pos + 1; offs < fin; offs++)
 	{
 	    pl = ppl[offs];
-	    k = iclamp(63 - fastclz64(pl / ps), 0, MAX_RICE_PARAM);
+	    k = clamp(63 - (int)clz(pl / ps), 0, MAX_RICE_PARAM);
 	    plk = ps * (k + 1) + (int)(pl >> k);
 	
 	    // output rice parameter
@@ -1815,7 +1762,7 @@ void clFindPartitionOrder(
    {
 	int len = rice_parameters[pos + offs];
 	int porder = 31 - clz(lim - offs);
-	atom_add(&partlen[porder], len);
+	atomic_add(&partlen[porder], len);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

@@ -1891,11 +1838,7 @@ inline void flush(BitWriter *bw)

 inline int len_utf8(int n)
 {
-#ifdef FLACCL_CPU
-    int bts = 31 - fastclz(n);
-#else
    int bts = 31 - clz(n);
-#endif
    return select(8, 8 * ((bts + 4) / 5), bts > 6);
 }

@@ -2050,11 +1993,7 @@ void clRiceEncoding(
 			unsigned int bb = bw.bit_buf << bw.bit_left;
 			bw.bit_buf = 0;
 			bw.bit_left += (32 - b);
-#ifdef AMD
 			bw.buffer[bw.buf_ptr++] = as_int(as_char4(bb).wzyx);
-#else
-			bw.buffer[bw.buf_ptr++] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000);
-#endif
 		    }
 		    bits -= b;
 		}
@@ -2069,11 +2008,7 @@ void clRiceEncoding(
 		    unsigned int bb = (bw.bit_buf << bw.bit_left) | (val >> (bits - bw.bit_left));
 		    bw.bit_buf = val;
 		    bw.bit_left += (32 - bits);
-#ifdef AMD
 		    bw.buffer[bw.buf_ptr++] = as_int(as_char4(bb).wzyx);
-#else
-		    bw.buffer[bw.buf_ptr++] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000);
-#endif
 		}
 		////if (get_group_id(0) == 0) printf("%x ", v);
 		//writebits(&bw, (v >> k) + 1, 1);
@@ -2171,8 +2106,8 @@ void clRiceEncoding(
 		uint kval = (uint)k << (32 - RICE_PARAM_BITS);
 		uint kval0 = kval >> kpos1;
 		uint kval1 = kval << (32 - kpos1);
-		if (kval0) atom_or(&data[kpos0], kval0);
-		if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1);
+		if (kval0) atomic_or(&data[kpos0], kval0);
+		if (kpos1 && kval1) atomic_or(&data[kpos0 + 1], kval1);
 	    }
 	    int qpos = mp - k - 1;
 	    int qpos0 = (qpos >> 5) - start32;
@@ -2180,8 +2115,8 @@ void clRiceEncoding(
 	    uint qval = (1U << 31) | (v << (31 - k));
 	    uint qval0 = qval >> qpos1;
 	    uint qval1= qval << (32 - qpos1);
-	    if (qval0) atom_or(&data[qpos0], qval0);
-	    if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1);
+	    if (qval0) atomic_or(&data[qpos0], qval0);
+	    if (qpos1 && qval1) atomic_or(&data[qpos0 + 1], qval1);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	if ((start32 + tid) * 32 <= start)
@@ -2232,8 +2167,8 @@ void clRiceEncoding(
 		uint kval = (uint)k << (32 - RICE_PARAM_BITS);
 		uint kval0 = kval >> kpos1;
 		uint kval1 = kval << (32 - kpos1);
-		if (kval0) atom_or(&data[kpos0], kval0);
-		if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1);
+		if (kval0) atomic_or(&data[kpos0], kval0);
+		if (kpos1 && kval1) atomic_or(&data[kpos0 + 1], kval1);
 	    }
 	    int qpos = mp - k - 1;
 	    int qpos0 = (qpos >> 5) - start32;
@@ -2241,8 +2176,8 @@ void clRiceEncoding(
 	    uint qval = (1U << 31) | (v << (31 - k));
 	    uint qval0 = qval >> qpos1;
 	    uint qval1= qval << (32 - qpos1);
-	    if (qval0) atom_or(&data[qpos0], qval0);
-	    if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1);
+	    if (qval0) atomic_or(&data[qpos0], qval0);
+	    if (qpos1 && qval1) atomic_or(&data[qpos0 + 1], qval1);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	if ((start32 + tid) * 32 <= start)