From 9ebe8afe3fb564f4c194fda022b57e004d28b935 Mon Sep 17 00:00:00 2001
From: Grigory Chudov <gchudov@gmail.com>
Date: Sun, 23 Jun 2013 23:12:47 -0400
Subject: [PATCH] FLACCL: support for Intel HD Graphics

---
 CUETools.Codecs.FLACCL/FLACCLWriter.cs |  1 +
 CUETools.Codecs.FLACCL/flac.cl         | 77 ++++++++++++++++++--------
 CUETools.Codecs/CUETools.Codecs.csproj |  1 +
 3 files changed, 55 insertions(+), 24 deletions(-)

diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
index b582c2d..394f917 100644
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -1764,6 +1764,7 @@ namespace CUETools.Codecs.FLACCL
 #endif
                     (m_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") +
                     "#define OPENCL_PLATFORM \"" + OpenCL.GetPlatform(platformId).Name + "\"\n" +
+                    "#define VENDOR_ID " + OCLMan.Context.Devices[0].VendorID + "\n" +
                     m_settings.Defines + "\n";
 
                 var exts = new string[] { 
diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl
index bff06e7..6b3f94f 100644
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -32,6 +32,18 @@
 #define AMD
 #endif
 
+#define VENDOR_ID_INTEL  0x8086
+#define VENDOR_ID_NVIDIA 0x10DE
+#define VENDOR_ID_ATIAMD 0x1002
+
+#ifndef FLACCL_CPU
+#if VENDOR_ID == VENDOR_ID_INTEL
+#define WARP_SIZE 16
+#else
+#define WARP_SIZE 32
+#endif
+#endif
+
 #if defined(HAVE_cl_khr_fp64) || defined(HAVE_cl_amd_fp64)
 #define HAVE_DOUBLE
 #define ZEROD 0.0
@@ -51,8 +63,6 @@
 #define ZEROFD 0.0f
 #endif
 
-#define WARP_SIZE 32
-
 #if BITS_PER_SAMPLE > 16
 #define MAX_RICE_PARAM 30
 #define RICE_PARAM_BITS 5
@@ -416,21 +426,17 @@ void clComputeAutocor(
 	barrier(CLK_LOCAL_MEM_FENCE);
 	data[tid] = nextData;
     }
-    if (pos < bs)
     {
 	int off = pos + tid;
 	// fetch samples
 	double nextData = off < bs ? samples[task.samplesOffs + off] * wptr[off] : ZEROD;
 	data[tid + GROUP_SIZE] = nextData;
-	barrier(CLK_LOCAL_MEM_FENCE);
+        barrier(CLK_LOCAL_MEM_FENCE);
 
 	fastdouble4 tmp = ZEROFD;
 	for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
 	    tmp += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
 	corr += (tmp.x + tmp.y) + (tmp.w + tmp.z);
-
-	barrier(CLK_LOCAL_MEM_FENCE);
-	data[tid] = nextData;
     }
 
     data[tid] = corr;
@@ -524,8 +530,7 @@ void clComputeLPC(
     int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
     int lpcOffs = autocOffs * 32;
     
-    if (get_local_id(0) <= MAX_ORDER)
-	shared.autoc[get_local_id(0)] = autoc[autocOffs + get_local_id(0)];
+    shared.autoc[get_local_id(0)] = get_local_id(0) <= MAX_ORDER ? autoc[autocOffs + get_local_id(0)] : 0;
     if (get_local_id(0) + get_local_size(0) <= MAX_ORDER)
 	shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[autocOffs + get_local_id(0) + get_local_size(0)];
 
@@ -537,7 +542,7 @@ void clComputeLPC(
     double error = shared.autoc[0];
     
 #ifdef DEBUGPRINT1
-    int magic = shared.autoc[0] == 177286873088.0f;
+    int magic = autocOffs == 0; // shared.autoc[0] == 177286873088.0f;
     if (magic && get_local_id(0) <= MAX_ORDER)
         printf("autoc[%d] == %f\n", get_local_id(0), shared.autoc[get_local_id(0)]);
 #endif
@@ -585,6 +590,10 @@ void clComputeLPC(
 	//    printf("coef[%d] == %f, autoc == %f, error == %f\n", get_local_id(0), -shared.ldr[order - get_local_id(0)], shared.autoc[get_local_id(0)], shared.error[get_local_id(0)]);
     }
     barrier(CLK_LOCAL_MEM_FENCE);
+#ifdef DEBUGPRINT1
+    if (magic && get_local_id(0) < MAX_ORDER)
+        printf("error[%d] == %f\n", get_local_id(0), shared.error[get_local_id(0)]);
+#endif
     // Output prediction error estimates
     if (get_local_id(0) < MAX_ORDER)
 	lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
@@ -1059,14 +1068,24 @@ void clEstimateResidual(
 	t = select(0U, t, offs >= ro);
 	// overflow protection
 	t = min(t, 0x7ffffffU);
-#if !defined(AMD)
-	idata[tid] = t;
-	for (int l = 16; l > 1; l >>= 1)
-	    idata[tid] += idata[tid + l];
-	if ((tid & 31) == 0)
-	    psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1];
-#else
+#if defined(AMD)
 	atomic_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
+#else
+	idata[tid] = t;
+#if WARP_SIZE <= (1 << (ESTPARTLOG - 1))
+        barrier(CLK_LOCAL_MEM_FENCE);
+	for (int l = 1 << (ESTPARTLOG - 1); l >= WARP_SIZE; l >>= 1) {
+            if (!(tid & l)) idata[tid] += idata[tid + l];
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+	for (int l = WARP_SIZE / 2; l > 1; l >>= 1)
+	    idata[tid] += idata[tid + l];
+#else
+	for (int l = 1 << (ESTPARTLOG - 1); l > 1; l >>= 1)
+	    idata[tid] += idata[tid + l];
+#endif
+	if ((tid & (1 << ESTPARTLOG) - 1) == 0)
+            psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1];
 #endif
     }
     if (pos < bs)
@@ -1105,14 +1124,24 @@ void clEstimateResidual(
 	t = select(0U, t, offs >= ro && offs < bs);
 	// overflow protection
 	t = min(t, 0x7ffffffU);
-#if !defined(AMD)
-	idata[tid] = t;
-	for (int l = 16; l > 1; l >>= 1)
-	    idata[tid] += idata[tid + l];
-	if ((tid & 31) == 0)
-	    psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1];
-#else
+#if defined(AMD)
 	atomic_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
+#else
+	idata[tid] = t;
+#if WARP_SIZE <= (1 << (ESTPARTLOG - 1))
+        barrier(CLK_LOCAL_MEM_FENCE);
+	for (int l = 1 << (ESTPARTLOG - 1); l >= WARP_SIZE; l >>= 1) {
+            if (!(tid & l)) idata[tid] += idata[tid + l];
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+	for (int l = WARP_SIZE / 2; l > 1; l >>= 1)
+	    idata[tid] += idata[tid + l];
+#else
+	for (int l = 1 << (ESTPARTLOG - 1); l > 1; l >>= 1)
+	    idata[tid] += idata[tid + l];
+#endif
+	if ((tid & (1 << ESTPARTLOG) - 1) == 0)
+	    psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1];
 #endif
     }
 
diff --git a/CUETools.Codecs/CUETools.Codecs.csproj b/CUETools.Codecs/CUETools.Codecs.csproj
index afe215c..21650f4 100644
--- a/CUETools.Codecs/CUETools.Codecs.csproj
+++ b/CUETools.Codecs/CUETools.Codecs.csproj
@@ -82,6 +82,7 @@
     <Compile Include="CyclicBuffer.cs" />
     <Compile Include="CyclicBufferInputStream.cs" />
     <Compile Include="CyclicBufferOutputStream.cs" />
+    <Compile Include="DefaultValueForMode.cs" />
     <Compile Include="DummyWriter.cs" />
     <Compile Include="IAudioDest.cs" />
     <Compile Include="AudioEncoderClass.cs" />