diff --git a/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj b/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj
index db38a62..515398c 100644
--- a/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj
+++ b/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj
@@ -67,7 +67,7 @@
   -->
-  nvcc flacuda.cu --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"
+  nvcc flacuda.cu --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"
\ No newline at end of file
diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs
index b6e5a14..8329ba7 100644
--- a/CUETools.FlaCuda/FlaCudaWriter.cs
+++ b/CUETools.FlaCuda/FlaCudaWriter.cs
@@ -1298,9 +1298,9 @@ namespace CUETools.Codecs.FlaCuda
 			int blocks_y = ((threads_x - 1) * (threads_y - 1)) / threads_y;
 			int blocks = (frame.blocksize + blocks_y * threads_y - 1) / (blocks_y * threads_y);
 			cuda.CopyHostToDevice(cudaSamples, (IntPtr)s, (uint)FlaCudaWriter.MAX_BLOCKSIZE * 4 * sizeof(int));
-			cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaSamples.Pointer);
-			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaWindow.Pointer);
-			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaAutocor.Pointer);
+			cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocor.Pointer);
+			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaSamples.Pointer);
+			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaWindow.Pointer);
 			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3, (uint)frame.blocksize);
 			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint), (uint)FlaCudaWriter.MAX_BLOCKSIZE);
 			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint) * 2, (uint)(blocks_y*threads_y));
diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu
index 6cc5d87..2de4128 100644
--- a/CUETools.FlaCuda/flacuda.cu
+++ b/CUETools.FlaCuda/flacuda.cu
@@ -36,46 +36,48 @@
 #ifndef _FLACUDA_KERNEL_H_
 #define _FLACUDA_KERNEL_H_
 
-extern "C" __global__ void cudaComputeAutocor(const int * samples, const float * window, float* output, int frameSize, int frameOffset, int blocks)
+extern "C" __global__ void cudaComputeAutocor(
+    float *output,
+    const int *samples,
+    const float *window,
+    int frameSize,
+    int frameOffset,
+    int blocks)
 {
-    extern __shared__ float fshared[];
-    float * const matrix = fshared + 513;//257;
+    __shared__ struct {
+	float data[512];
+	float matrix[512];
+    } shared;
     const int iWin = blockIdx.y >> 2;
     const int iCh = blockIdx.y & 3;
     const int smpBase = iCh * frameOffset;
     const int winBase = iWin * 2 * frameOffset;
     const int pos = blockIdx.x * blocks;
+    const int tid = threadIdx.x + threadIdx.y * blockDim.x;
 
     // fetch blockDim.x*blockDim.y samples
-    int tid = threadIdx.x + threadIdx.y * blockDim.x;
-    fshared[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] * window[winBase + pos + tid] : 0.0f;
+    shared.data[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] * window[winBase + pos + tid] : 0.0f;
     __syncthreads();
 
     float s = 0.0f;
     for (int i = 0; i < blocks; i += blockDim.y)
-	s += fshared[i + threadIdx.y] * fshared[i + threadIdx.y + threadIdx.x];
-    matrix[tid + threadIdx.y] = s;
+	s += shared.data[i + threadIdx.y] * shared.data[i + threadIdx.y + threadIdx.x];
+    shared.matrix[tid] = s;
     __syncthreads();
 
     // reduction in shared mem
     for(unsigned int s=blockDim.y/2; s>1; s>>=1)
     {
	if (threadIdx.y < s)
-	    matrix[tid + threadIdx.y] += matrix[threadIdx.x + (s + threadIdx.y) * (1 + blockDim.x)];
+	    shared.matrix[tid] += shared.matrix[tid + s * blockDim.x];
	__syncthreads();
     }
 
     // return results
     if (threadIdx.y == 0)
-	output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x] = matrix[threadIdx.x] + matrix[threadIdx.x + 1 + blockDim.x];
+	output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x] = shared.matrix[threadIdx.x] + shared.matrix[threadIdx.x + blockDim.x];
 }
 
-#ifdef __DEVICE_EMULATION__
-#define EMUSYNC __syncthreads()
-#else
-#define EMUSYNC
-#endif
-
 extern "C" __global__ void cudaEncodeResidual(
	int*output,
	int*samples,
@@ -88,6 +90,7 @@ extern "C" __global__ void cudaEncodeResidual(
	int data[256];
	int residual[256];
	int coefs[32];
+	int rice[16];
	int shift;
     } shared;
     const int smpBase = (blockIdx.y & 3) * frameOffset;
@@ -96,20 +99,20 @@ extern "C" __global__ void cudaEncodeResidual(
     const int step = __mul24(blockDim.x, blockDim.y - 1);
 
     int total = 0;
-    shared.residual[tid] = 0;
     if (threadIdx.y == 0)
	shared.coefs[threadIdx.x] = threadIdx.x <= residualOrder ? allcoefs[threadIdx.x + residualOrder * 32 + blockIdx.y * 32 * 32] : 0;
     if (tid == 0)
	shared.shift = shifts[32 * blockIdx.y + residualOrder];
-    __syncthreads();
+    if (tid == 0) shared.rice[15] = 0x1fffffff;
     for(int pos = 0; pos < frameSize - residualOrder - 1; pos += step)
     {
	// fetch blockDim.x * blockDim.y samples
-	shared.data[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] : 0;
+	shared.data[tid] = (threadIdx.y == 0 && pos != 0) ? shared.data[tid + step]
+	    : (pos + tid < frameSize ? samples[smpBase + pos + tid] : 0);
	__syncthreads();
 
	long sum = 0;
	for (int c = 0; c <= residualOrder; c++)
-	    sum += shared.data[tid + c] * shared.coefs[residualOrder - c];
+	    sum += __mul24(shared.data[tid + c], shared.coefs[residualOrder - c]);
	int res = shared.data[tid + residualOrder + 1] - (sum >> shared.shift);
	__syncthreads();
@@ -134,33 +137,22 @@ extern "C" __global__ void cudaEncodeResidual(
	{
	    if (threadIdx.x < s && threadIdx.y < s)
		shared.residual[tid] += shared.residual[tid + s] + shared.residual[tid + s * blockDim.x] + shared.residual[tid + s + s * blockDim.x];
-	    EMUSYNC;
+	    __syncthreads();
	}
 
-#ifndef __DEVICE_EMULATION__
-	if (tid < 16) // Max rice param is really 15
-#endif
-	{
-	    shared.data[tid] = limit * (tid + 1) + ((shared.residual[0] - (limit >> 1)) >> tid); EMUSYNC;
-	    //if (tid == 16) shared.rice[15] = 0x7fffffff;
-#ifndef __DEVICE_EMULATION__
-	    if (threadIdx.x < 8)
-#endif
-	    {
-		shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 8]); EMUSYNC;
-		shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 4]); EMUSYNC;
-		shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 2]); EMUSYNC;
-	    }
-	}
-	total += min(shared.data[0], shared.data[1]);
-	//total += shared.residual[0];
+	if (tid < 15) shared.rice[tid] = limit * (tid + 1) + ((shared.residual[0] - (limit >> 1)) >> tid); __syncthreads();
+	if (tid < 8) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]); __syncthreads();
+	if (tid < 4) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]); __syncthreads();
+	if (tid < 2) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]); __syncthreads();
+	if (tid < 1) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]); __syncthreads();
+	total += shared.rice[0];
     }
 
     if (tid == 0)
	output[blockIdx.x + blockIdx.y * gridDim.x] = total;
 
 #ifdef __DEVICE_EMULATION__
     if (tid == 0)
-	printf("%d,%d:2:%d\n", blockIdx.x, blockIdx.y, total);
+	printf("%d,%d:%d\n", blockIdx.x, blockIdx.y, total);
 #endif
 }