From e05fbc88f5b2bc7ed66adb1a50b516e587f47ab6 Mon Sep 17 00:00:00 2001
From: chudov
Date: Tue, 8 Sep 2009 09:51:33 +0000
Subject: [PATCH] tidying up

---
 CUETools.FlaCuda/FlaCudaWriter.cs | 55 ++++++++++++++++++++++++++-----
 CUETools.FlaCuda/flacuda.cu       | 48 ++++++++++++---------------
 2 files changed, 67 insertions(+), 36 deletions(-)

diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs
index 8329ba7..cfcdb05 100644
--- a/CUETools.FlaCuda/FlaCudaWriter.cs
+++ b/CUETools.FlaCuda/FlaCudaWriter.cs
@@ -106,7 +106,7 @@ namespace CUETools.Codecs.FlaCuda
             samplesBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)];
             residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)];
             windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS];
-            autocorBuffer3int = new int[(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * 64];
+            autocorBuffer3int = new int[(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS];
             autocorBuffer = new float[(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * 22];
 
             eparams.flake_set_defaults(_compressionLevel);
@@ -786,6 +786,37 @@ namespace CUETools.Codecs.FlaCuda
             }
         }
 
+        unsafe void encode_residual_lpc_sub(FlacFrame frame, int* coefs, int shift, int iWindow, int order, int ch)
+        {
+            frame.current.type = SubframeType.LPC;
+            frame.current.order = order;
+            frame.current.window = iWindow;
+            frame.current.shift = shift;
+            fixed (int* fcoefs = frame.current.coefs)
+                AudioSamples.MemCpy(fcoefs, coefs, order);
+
+            ulong csum = 0;
+            int cbsum = 0;
+            int cbits = 1;
+            for (int i = frame.current.order; i > 0; i--)
+            {
+                csum += (ulong)Math.Abs(coefs[i - 1]);
+                cbsum |= coefs[i - 1];
+            }
+            while (cbits < 16 && cbsum != (cbsum << (32 - cbits)) >> (32 - cbits))
+                cbits++;
+
+            if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32)
+                lpc.encode_residual_long(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift);
+            else
+                lpc.encode_residual(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift);
+
+            frame.current.size = calc_rice_params_lpc(ref frame.current.rc, eparams.min_partition_order, eparams.max_partition_order,
+                frame.current.residual, frame.blocksize, frame.current.order, frame.subframes[ch].obits, (uint)cbits);
+
+            frame.ChooseBestSubframe(ch);
+        }
+
         unsafe void encode_residual_fixed_sub(FlacFrame frame, int order, int ch)
         {
             if ((frame.subframes[ch].done_fixed & (1U << order)) != 0)
@@ -986,13 +1017,15 @@ namespace CUETools.Codecs.FlaCuda
                 predict == PredictionType.Search)
                 )
             {
-                float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER];
+                //float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER];
                 LpcContext lpc_ctx = frame.subframes[ch].lpc_ctx[best_window];
-                lpc_ctx.GetReflection(eparams.max_prediction_order, smp, n, frame.window_buffer + best_window * FlaCudaWriter.MAX_BLOCKSIZE * 2);
-                lpc_ctx.ComputeLPC(lpcs);
-
-                encode_residual_lpc_sub(frame, lpcs, best_window, best_order, ch);
+                //lpc_ctx.GetReflection(eparams.max_prediction_order, smp, n, frame.window_buffer + best_window * FlaCudaWriter.MAX_BLOCKSIZE * 2);
+                //lpc_ctx.ComputeLPC(lpcs);
+
+                fixed (int *coefs = lpc_ctx.coefs)
+                    encode_residual_lpc_sub(frame, coefs, lpc_ctx.shift, best_window, best_order, ch);
+                //encode_residual_lpc_sub(frame, lpcs, best_window, best_order, ch);
             }
         }
 
@@ -1306,7 +1339,7 @@ namespace CUETools.Codecs.FlaCuda
             cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint) * 2, (uint)(blocks_y*threads_y));
             cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 3) + sizeof(uint) * 3);
             cuda.SetFunctionBlockShape(cudaComputeAutocor, threads_x, threads_y, 1);
-            cuda.SetFunctionSharedSize(cudaComputeAutocor, (uint)(sizeof(float) * 1090));//545));
+            //cuda.SetFunctionSharedSize(cudaComputeAutocor, (uint)(sizeof(float) * 1090));//545));
             cuda.Launch(cudaComputeAutocor, blocks, 4 * _windowcount);
             cuda.CopyDeviceToHost(cudaAutocor, autocorBuffer);
             //cuda.CopyDeviceToHost(cudaAutocor, autocorBuffer3int);
@@ -1375,6 +1408,9 @@ namespace CUETools.Codecs.FlaCuda
                         frame.subframes[ch].best.order = order;
                         frame.subframes[ch].best.window = iWindow;
                         frame.subframes[ch].best.type = SubframeType.LPC;
+                        frame.subframes[ch].lpc_ctx[iWindow].shift = shift[order - 1 + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER];
+                        fixed(int *lcoefs = frame.subframes[ch].lpc_ctx[iWindow].coefs)
+                            AudioSamples.MemCpy(lcoefs, coefs + (order - 1) * lpc.MAX_LPC_ORDER + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER, order);
                     }
                 }
                 //uint[] sums_buf = new uint[Flake.MAX_PARTITION_ORDER * Flake.MAX_PARTITIONS];
@@ -1485,7 +1521,7 @@ namespace CUETools.Codecs.FlaCuda
             fixed (int* s = verifyBuffer, r = verify.Samples)
             {
                 for (int ch = 0; ch < channels; ch++)
-                    if (AudioSamples.MemCmp(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, bs))
+                    if (AudioSamples.MemCmp(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, bs))
                         throw new Exception("validation failed!");
             }
         }
@@ -1506,12 +1542,13 @@ namespace CUETools.Codecs.FlaCuda
         {
             if (!inited)
             {
-                cuda = new CUDA(0, true);
+                cuda = new CUDA(true, InitializationFlags.None);
+                cuda.CreateContext(0, CUCtxFlags.SchedSpin);
                 cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
                 cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
                 cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
                 //cudaAutocor = cuda.Allocate((uint)(sizeof(int) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 3);
-                cudaAutocor = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 64);
+                cudaAutocor = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 22);
                 cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
                 cudaCoeffs = cuda.Allocate((uint)(sizeof(int) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
                 cudaShifts = cuda.Allocate((uint)(sizeof(int) * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu
index 2de4128..e56391b 100644
--- a/CUETools.FlaCuda/flacuda.cu
+++ b/CUETools.FlaCuda/flacuda.cu
@@ -114,40 +114,34 @@ extern "C" __global__ void cudaEncodeResidual(
         for (int c = 0; c <= residualOrder; c++)
             sum += __mul24(shared.data[tid + c], shared.coefs[residualOrder - c]);
         int res = shared.data[tid + residualOrder + 1] - (sum >> shared.shift);
-        __syncthreads();
+        //__syncthreads();
         int limit = min(frameSize - pos - residualOrder - 1, step);
-        shared.residual[tid] = tid < limit ? (2 * res) ^ (res >> 31) : 0;
+        shared.residual[tid] = __mul24(tid < limit, (2 * res) ^ (res >> 31));
         __syncthreads();
 
         // reduction in shared mem
-        for(unsigned int s=blockDim.x/2; s >= blockDim.y; s>>=1)
-        {
-            if (threadIdx.x < s)
-                shared.residual[tid] += shared.residual[tid + s];
-            __syncthreads();
-        }
-        for(unsigned int s=blockDim.y/2; s >= blockDim.x; s>>=1)
-        {
-            if (threadIdx.y < s)
-                shared.residual[tid] += shared.residual[tid + s * blockDim.x];
-            __syncthreads();
-        }
-        for(unsigned int s=min(blockDim.x,blockDim.y)/2; s > 0; s>>=1)
-        {
-            if (threadIdx.x < s && threadIdx.y < s)
-                shared.residual[tid] += shared.residual[tid + s] + shared.residual[tid + s * blockDim.x] + shared.residual[tid + s + s * blockDim.x];
-            __syncthreads();
-        }
+        if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads();
+        if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads();
+        if (tid < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
+        shared.residual[tid] += shared.residual[tid + 16];
+        shared.residual[tid] += shared.residual[tid + 8];
+        shared.residual[tid] += shared.residual[tid + 4];
+        shared.residual[tid] += shared.residual[tid + 2];
+        shared.residual[tid] += shared.residual[tid + 1];
+        __syncthreads();
 
-        if (tid < 15) shared.rice[tid] = limit * (tid + 1) + ((shared.residual[0] - (limit >> 1)) >> tid); __syncthreads();
-        if (tid < 8) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]); __syncthreads();
-        if (tid < 4) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]); __syncthreads();
-        if (tid < 2) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]); __syncthreads();
-        if (tid < 1) shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]); __syncthreads();
-        total += shared.rice[0];
+        if (tid < 16)
+        {
+            shared.rice[tid] = __mul24(tid == 15, 0x7fffff) + limit * (tid + 1) + ((shared.residual[0] - (limit >> 1)) >> tid);
+            shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]);
+            shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]);
+            shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
+            total += min(shared.rice[tid], shared.rice[tid + 1]);
+        }
+        __syncthreads();
     }
-    
+    __syncthreads();
     if (tid == 0)
         output[blockIdx.x + blockIdx.y * gridDim.x] = total;
 #ifdef __DEVICE_EMULATION__
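
Background on the cost model in cudaEncodeResidual: the kernel folds each signed residual into an unsigned value with (2 * res) ^ (res >> 31) (the usual zigzag mapping), sums those values over the partition, and then scores every candidate Rice parameter k as limit * (k + 1) + ((sum - (limit >> 1)) >> k), i.e. k + 1 bits per sample plus the shifted remainder, with the 0x7fffff sentinel poisoning the k = 15 slot so it can never win the min-reduction. The host-side sketch below restates that estimate in isolation, under the assumptions just described; zigzag and rice_cost are illustrative names, not functions from the patch.

/* Sketch only, not part of the patch: reference for the per-partition
 * Rice-parameter estimate the kernel computes. Compiles as plain C. */
#include <stdio.h>

/* Map a signed residual to an unsigned magnitude, as the kernel does with
 * (2 * res) ^ (res >> 31): 0,-1,1,-2,2,... -> 0,1,2,3,4,... */
static unsigned int zigzag(int res)
{
    return (unsigned int)((2 * res) ^ (res >> 31));
}

/* Approximate cost of Rice-coding n residuals whose zigzag values sum to
 * 'sum' with parameter k: n * (k + 1) terminator/parameter bits plus the
 * shifted remainders; (n >> 1) mirrors the kernel's rounding adjustment. */
static unsigned int rice_cost(unsigned int sum, int n, int k)
{
    return (unsigned int)(n * (k + 1)) + ((sum - (unsigned int)(n >> 1)) >> k);
}

int main(void)
{
    int residuals[8] = { 3, -2, 0, 5, -7, 1, -1, 4 };
    unsigned int sum = 0;
    for (int i = 0; i < 8; i++)
        sum += zigzag(residuals[i]);

    /* Pick the k in 0..14 with the smallest estimated cost, mirroring the
     * min-reduction the kernel performs over shared.rice[]. */
    int best_k = 0;
    unsigned int best = rice_cost(sum, 8, 0);
    for (int k = 1; k < 15; k++) {
        unsigned int c = rice_cost(sum, 8, k);
        if (c < best) { best = c; best_k = k; }
    }
    printf("best k = %d, estimated bits = %u\n", best_k, best);
    return 0;
}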
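The rewritten sum reduction appears to assume a 256-thread block (first step pairs tid with tid + 128) and drops the per-step barriers once only a single warp carries live data, relying on the implicit warp-synchronous execution of the 2009-era GPUs this code targeted. A stand-alone sketch of the same pattern follows, with the conventional tid < 32 guard and a volatile pointer for the warp-synchronous tail; sumKernel, data and out are illustrative names, not part of the patch, and on current GPUs the tail would need __syncwarp() instead.

/* Sketch only, not part of the patch: unrolled shared-memory sum reduction
 * for a block of exactly 256 threads.
 * Launch example: sumKernel<<<numBlocks, 256>>>(d_data, d_out); */
__global__ void sumKernel(const int *data, int *out)
{
    __shared__ int buf[256];
    const unsigned int tid = threadIdx.x;

    // Each thread loads one element of this block's 256-element slice.
    buf[tid] = data[blockIdx.x * 256 + tid];
    __syncthreads();

    // Halve the active range while more than one warp holds live data.
    if (tid < 128) buf[tid] += buf[tid + 128];
    __syncthreads();
    if (tid < 64) buf[tid] += buf[tid + 64];
    __syncthreads();

    // Last 32 partial sums: no barriers, warp-synchronous on old hardware;
    // the volatile pointer keeps the shared-memory accesses ordered.
    if (tid < 32)
    {
        volatile int *v = buf;
        v[tid] += v[tid + 32];
        v[tid] += v[tid + 16];
        v[tid] += v[tid + 8];
        v[tid] += v[tid + 4];
        v[tid] += v[tid + 2];
        v[tid] += v[tid + 1];
    }

    if (tid == 0)
        out[blockIdx.x] = buf[0];
}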