From 987d48a2bddb3ba861220d749ed05ff6066afb1c Mon Sep 17 00:00:00 2001
From: chudov <devnull@localhost>
Date: Wed, 17 Nov 2010 05:26:59 +0000
Subject: [PATCH] autodetecting atomics support

---
 CUETools.Codecs.FLACCL/FLACCLWriter.cs |  5 +-
 CUETools.Codecs.FLACCL/flac.cl         | 93 +++++++++++++++++---------
 2 files changed, 64 insertions(+), 34 deletions(-)

diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
index a0df6e5..4542b3a 100644
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -162,7 +162,6 @@ namespace CUETools.Codecs.FLACCL
 		bool inited = false;
 
 		OpenCLManager OCLMan;
-		Context openCLContext;
 		Program openCLProgram;
 
 		FLACCLTask task1;
@@ -1615,7 +1614,9 @@ namespace CUETools.Codecs.FLACCL
 				}
 				OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
 
-				openCLContext = OCLMan.Context;
+				if (OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics"))
+					OCLMan.Defines += "#define HAVE_ATOM\n";
+
 				try
 				{
 					openCLProgram = OCLMan.CompileFile("flac.cl");
diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl
index 398b77b..43c0056 100644
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -36,8 +36,12 @@
 #endif
 #endif
 
+#define WARP_SIZE 32
+
+#ifdef HAVE_ATOM
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
+#endif
 
 typedef enum
 {
@@ -605,6 +609,9 @@ void clQuantizeLPC(
 	volatile float error[64];
 	volatile int maxcoef[32];
 	volatile int maxcoef2[32];
+#ifndef HAVE_ATOM
+	volatile int tmp[32];
+#endif
     } shared;
 
     const int tid = get_local_id(0);
@@ -683,7 +690,20 @@ void clQuantizeLPC(
 	// get 15 bits of each coeff
 	int coef = convert_int_rte(lpc * (1 << 15));
 	// remove sign bits
+#ifdef HAVE_ATOM
 	atom_or(shared.maxcoef + i, coef ^ (coef >> 31));
+#else
+	shared.tmp[tid] = coef ^ (coef >> 31);
+	if (tid < 16)
+	{
+	    shared.tmp[tid] |= shared.tmp[tid + 16];
+	    shared.tmp[tid] |= shared.tmp[tid + 8];
+	    shared.tmp[tid] |= shared.tmp[tid + 4];
+	    shared.tmp[tid] |= shared.tmp[tid + 2];
+	    if (tid == 0)
+		shared.maxcoef[i] = shared.tmp[tid] | shared.tmp[tid + 1];
+	}
+#endif
 	barrier(CLK_LOCAL_MEM_FENCE);
 	//SUM32(shared.tmpi,tid,|=);
 	// choose precision	
@@ -703,7 +723,20 @@ void clQuantizeLPC(
 	//shared.tmp[tid] = (tid != 0) * (shared.arp[tid - 1]*(1 << shared.task.shift) - shared.task.coefs[tid - 1]);
 	//shared.task.coefs[tid] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, convert_int_rte((shared.arp[tid]) * (1 << shared.task.shift) + shared.tmp[tid])));
 	// remove sign bits
+#ifdef HAVE_ATOM
 	atom_or(shared.maxcoef2 + i, coef ^ (coef >> 31));
+#else
+	shared.tmp[tid] = coef ^ (coef >> 31);
+	if (tid < 16)
+	{
+	    shared.tmp[tid] |= shared.tmp[tid + 16];
+	    shared.tmp[tid] |= shared.tmp[tid + 8];
+	    shared.tmp[tid] |= shared.tmp[tid + 4];
+	    shared.tmp[tid] |= shared.tmp[tid + 2];
+	    if (tid == 0)
+		shared.maxcoef2[i] = shared.tmp[tid] | shared.tmp[tid + 1];
+	}
+#endif
 	barrier(CLK_LOCAL_MEM_FENCE);
 	// calculate actual number of bits (+1 for sign)
 	cbits = 1 + 32 - clz(shared.maxcoef2[i]);
@@ -800,6 +833,9 @@ void clEstimateResidual(
     )
 {
     __local float data[GROUP_SIZE * 2 + 32];
+#if !defined(AMD) || !defined(HAVE_ATOM)
+    __local volatile int idata[GROUP_SIZE];
+#endif
     __local FLACCLSubframeTask task;
     __local int psum[64];
     __local float fcoef[32];
@@ -827,7 +863,7 @@ void clEstimateResidual(
     if (tid < 32)
 	data[GROUP_SIZE * 2 + tid] = 0.0f;
 
-    int partOrder = max(1, clz(64) - clz(bs - 1) + 1);
+    int partOrder = max(6, clz(64) - clz(bs - 1) + 1);
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -876,32 +912,31 @@ void clEstimateResidual(
 
 	int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w);
 	barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef AMD
 	data[tid] = nextData;
 	// ensure we're within frame bounds
 	t = select(0, t, offs >= ro && offs < bs);
 	// overflow protection
 	t = iclamp(t, -0x7fffff, 0x7fffff);
 	// convert to unsigned
-	atom_add(&psum[min(63,offs >> partOrder)], (t << 1) ^ (t >> 31));
-#else
-	// ensure we're within frame bounds
-	t = select(0, t, offs >= ro && offs < bs);
-	// overflow protection
-	t = iclamp(t, -0x7fffff, 0x7fffff);
+	t = (t << 1) ^ (t >> 31);
+#if !defined(AMD) || !defined(HAVE_ATOM)
 	// convert to unsigned
-	data[tid] = (t << 1) ^ (t >> 31);
+	idata[tid] = t;
 	barrier(CLK_LOCAL_MEM_FENCE);
 	int ps = (1 << partOrder) - 1;
-	for (int l = 1 << (partOrder - 1); l > 0; l >>= 1)
+	int lane = tid & ps;
+	for (int l = 1 << (partOrder - 1); l > WARP_SIZE; l >>= 1)
 	{
-             if ((tid & ps) < l) 
-		data[tid] += data[tid + l];
+            if (lane < l) idata[tid] += idata[tid + l];
 	    barrier(CLK_LOCAL_MEM_FENCE);
 	}
-	if ((tid & ps) == 0)
-		psum[min(63,offs >> partOrder)] += data[tid];
-	data[tid] = nextData;
+	if (lane < WARP_SIZE)
+	    for (int l = WARP_SIZE; l > 0; l >>= 1)
+		idata[tid] += idata[tid + l];
+	if (lane == 0)
+	    psum[min(63,offs >> partOrder)] += idata[tid];
+#else
+	atom_add(&psum[min(63,offs >> partOrder)], t);
 #endif
     }
 
@@ -1735,40 +1770,35 @@ void clRiceEncoding(
 	flush(&bw);
     }
 #else
-#define WARP_SIZE 32
-    __local FLACCLSubframeData task;
-    __local int riceparams[256];
-    __local int mypos[GROUP_SIZE+1];
     __local unsigned int data[GROUP_SIZE];
-    __local int start;
+    __local int mypos[GROUP_SIZE+1];
+    __local FLACCLSubframeData task;
 
     int tid = get_local_id(0);
     if (tid < sizeof(task) / sizeof(int))
 	((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
     barrier(CLK_LOCAL_MEM_FENCE);
-    for (int offs = tid; offs < (1 << task.porder); offs += GROUP_SIZE)
-	riceparams[offs] = best_rice_parameters[(get_group_id(0) << max_porder) + offs];
-    if (tid == 0)
-	start = task.encodingOffset;
     if (tid == 0)
 	mypos[GROUP_SIZE] = 0;
     data[tid] = 0;
     barrier(CLK_LOCAL_MEM_FENCE);
-    int bs = task.blocksize;
-    int partlen = bs >> task.porder;
+    const int bs = task.blocksize;
+    int start = task.encodingOffset;
     for (int pos = 0; pos < bs; pos += GROUP_SIZE)
     {
 	int offs = pos + tid;
 	int v = offs < bs ? residual[task.residualOffs + offs] : 0;
-	int k = offs < bs ? riceparams[offs / partlen] : 0;
-	int pstart = offs == task.residualOrder || (offs % partlen) == 0;
+	int part = (offs << task.porder) / bs;
+	int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0;
+	int pstart = offs == task.residualOrder || offs == (part * bs) >> task.porder;
 	v = (v << 1) ^ (v >> 31);
 	int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
 	mypos[tid] = mylen;
 	// Inclusive scan(+)
 #if 1
+	int lane = (tid & (WARP_SIZE - 1));
 	for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
-	    mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, offset <= (tid & (WARP_SIZE - 1)))];
+	    mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)];
 #if 1
 	barrier(CLK_LOCAL_MEM_FENCE);
         for (int j = GROUP_SIZE - WARP_SIZE; j > 0; j -= WARP_SIZE)
@@ -1826,8 +1856,7 @@ void clRiceEncoding(
 	unsigned int qval1= select(0, qval << (32 - qpos1), qpos1);
 	atom_or(&data[qpos0], qval0);
 	atom_or(&data[qpos0 + 1], qval1);
-	if (tid == GROUP_SIZE - 1)
-	    start = mypos[tid];
+	start = mypos[GROUP_SIZE - 1];
 	//if (get_group_id(0) == 0 && pos == 0)
 	//    printf("[%d] == %d\n", tid, mypos[tid]);
 	//if (get_group_id(0) == 0 && pos == 0)
@@ -1835,7 +1864,7 @@ void clRiceEncoding(
 	barrier(CLK_LOCAL_MEM_FENCE);
 	unsigned int bb = data[tid];
 	if ((start32 + tid) * 32 <= start)
-	    output[start32 + tid] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000);
+	    output[start32 + tid] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | (bb << 24);
 	//if (get_group_id(0) == 0 && pos == 0 && bb != 0)
 	//    printf("[%08x] == %08X\n", 0x2dc8 + (tid + start32) * 4, data[tid]);
 	int remainder = data[start / 32 - start32];