optimizations

2025-12-16 18:14:25 +00:00 · 2009-10-27 22:12:24 +00:00
parent 28e8c70a9f
commit 8f3b3bef9f
3 changed files with 626 additions and 703 deletions
--- a/CUETools.FlaCuda/FlaCudaWriter.cs
+++ b/CUETools.FlaCuda/FlaCudaWriter.cs
@@ -557,131 +557,152 @@ namespace CUETools.Codecs.FlaCuda
 			}
 		}

-		static uint rice_encode_count(uint sum, uint n, uint k)
-		{
-			return n * (k + 1) + ((sum - (n >> 1)) >> (int)k);
-		}
-
-		//static unsafe uint find_optimal_rice_param(uint sum, uint n)
-		//{
-		//    uint* nbits = stackalloc uint[Flake.MAX_RICE_PARAM + 1];
-		//    int k_opt = 0;
-
-		//    nbits[0] = UINT32_MAX;
-		//    for (int k = 0; k <= Flake.MAX_RICE_PARAM; k++)
-		//    {
-		//        nbits[k] = rice_encode_count(sum, n, (uint)k);
-		//        if (nbits[k] < nbits[k_opt])
-		//            k_opt = k;
-		//    }
-		//    return (uint)k_opt;
-		//}
-
-		static unsafe int find_optimal_rice_param(uint sum, uint n, out uint nbits_best)
-		{
-			int k_opt = 0;
-			uint a = n;
-			uint b = sum - (n >> 1);
-			uint nbits = a + b;
-			for (int k = 1; k <= Flake.MAX_RICE_PARAM; k++)
-			{
-				a += n;
-				b >>= 1;
-				uint nbits_k = a + b;
-				if (nbits_k < nbits)
-				{
-					k_opt = k;
-					nbits = nbits_k;
-				}
-			}
-			nbits_best = nbits;
-			return k_opt;
-		}
-
-		static unsafe uint calc_optimal_rice_params(ref RiceContext rc, int porder, uint* sums, uint n, uint pred_order)
+		static unsafe uint calc_optimal_rice_params(int porder, int* parm, uint* sums, uint n, uint pred_order)
 		{
 			uint part = (1U << porder);
-			uint all_bits = 0;			
-			rc.rparams[0] = find_optimal_rice_param(sums[0], (n >> porder) - pred_order, out all_bits);
-			uint cnt = (n >> porder);
+			uint cnt = (n >> porder) - pred_order;
+			int k = cnt > 0 ? Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[0] / cnt)) : 0;
+			uint all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k);
+			parm[0] = k;
+			cnt = (n >> porder);
 			for (uint i = 1; i < part; i++)
 			{
-				uint nbits;
-				rc.rparams[i] = find_optimal_rice_param(sums[i], cnt, out nbits);
-				all_bits += nbits;
+				k = Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[i] / cnt));
+				all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k);
+				parm[i] = k;
 			}
-			all_bits += (4 * part);
-			rc.porder = porder;
-			return all_bits;
+			return all_bits + (4 * part);
 		}

-		static unsafe void calc_sums(int pmin, int pmax, int* data, uint n, uint pred_order, uint* sums)
+		static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums)
 		{
-			// sums for highest level
-			int parts = (1 << pmax);
-			int* res = data + pred_order;
-			uint cnt = (n >> pmax) - pred_order;
-			uint sum = 0;
-			for (uint j = cnt; j > 0; j--)
-			{
-				int val = *(res++);
-				sum += (uint)((val << 1) ^ (val >> 31));
-			}
-			sums[pmax * Flake.MAX_PARTITIONS + 0] = sum;
-			cnt = (n >> pmax);
-			for (int i = 1; i < parts; i++)
-			{
-				sum = 0;
-				for (uint j = cnt; j > 0; j--)
-				{
-					int val = *(res++);
-					sum += (uint)((val << 1) ^ (val >> 31));
-				}
-				sums[pmax * Flake.MAX_PARTITIONS + i] = sum;
-			}
-			// sums for lower levels
 			for (int i = pmax - 1; i >= pmin; i--)
 			{
-				parts = (1 << i);
-				for (int j = 0; j < parts; j++)
+				for (int j = 0; j < (1 << i); j++)
 				{
-					sums[i * Flake.MAX_PARTITIONS + j] = 
-						sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j] + 
+					sums[i * Flake.MAX_PARTITIONS + j] =
+						sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j] +
 						sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j + 1];
 				}
 			}
 		}

-		static unsafe uint calc_rice_params(ref RiceContext rc, ref RiceContext tmp_rc, int pmin, int pmax, int* data, uint n, uint pred_order)
+		static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
 		{
-			//uint* udata = stackalloc uint[(int)n];
+			int parts = (1 << pmax);
+			uint* res = data + pred_order;
+			uint cnt = (n >> pmax) - pred_order;
+			uint sum = 0;
+			for (uint j = cnt; j > 0; j--)
+				sum += *(res++);
+			sums[0] = sum;
+			cnt = (n >> pmax);
+			for (int i = 1; i < parts; i++)
+			{
+				sum = 0;
+				for (uint j = cnt; j > 0; j--)
+					sum += *(res++);
+				sums[i] = sum;
+			}
+		}
+
+		/// <summary>
+		/// Special case when (n >> pmax) == 18
+		/// </summary>
+		/// <param name="pmin"></param>
+		/// <param name="pmax"></param>
+		/// <param name="data"></param>
+		/// <param name="n"></param>
+		/// <param name="pred_order"></param>
+		/// <param name="sums"></param>
+		static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
+		{
+			int parts = (1 << pmax);
+			uint* res = data + pred_order;
+			uint cnt = 18 - pred_order;
+			uint sum = 0;
+			for (uint j = cnt; j > 0; j--)
+				sum += *(res++);
+			sums[0] = sum;
+			for (int i = 1; i < parts; i++)
+			{
+				sums[i] =
+					*(res++) + *(res++) + *(res++) + *(res++) +
+					*(res++) + *(res++) + *(res++) + *(res++) +
+					*(res++) + *(res++) + *(res++) + *(res++) +
+					*(res++) + *(res++) + *(res++) + *(res++) +
+					*(res++) + *(res++);
+			}
+		}
+
+		/// <summary>
+		/// Special case when (n >> pmax) == 18
+		/// </summary>
+		/// <param name="pmin"></param>
+		/// <param name="pmax"></param>
+		/// <param name="data"></param>
+		/// <param name="n"></param>
+		/// <param name="pred_order"></param>
+		/// <param name="sums"></param>
+		static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
+		{
+			int parts = (1 << pmax);
+			uint* res = data + pred_order;
+			uint cnt = 16 - pred_order;
+			uint sum = 0;
+			for (uint j = cnt; j > 0; j--)
+				sum += *(res++);
+			sums[0] = sum;
+			for (int i = 1; i < parts; i++)
+			{
+				sums[i] =
+					*(res++) + *(res++) + *(res++) + *(res++) +
+					*(res++) + *(res++) + *(res++) + *(res++) +
+					*(res++) + *(res++) + *(res++) + *(res++) +
+					*(res++) + *(res++) + *(res++) + *(res++);
+			}
+		}
+
+		static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order)
+		{
+			uint* udata = stackalloc uint[(int)n];
 			uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
+			int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS];
 			//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];

 			//assert(pmin >= 0 && pmin <= Flake.MAX_PARTITION_ORDER);
 			//assert(pmax >= 0 && pmax <= Flake.MAX_PARTITION_ORDER);
 			//assert(pmin <= pmax);

-			//for (uint i = 0; i < n; i++)
-			//    udata[i] = (uint) ((2 * data[i]) ^ (data[i] >> 31));
+			for (uint i = 0; i < n; i++)
+				udata[i] = (uint)((data[i] << 1) ^ (data[i] >> 31));

-			calc_sums(pmin, pmax, data, n, pred_order, sums);
+			// sums for highest level
+			if ((n >> pmax) == 18)
+				calc_sums18(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
+			else if ((n >> pmax) == 16)
+				calc_sums16(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
+			else
+				calc_sums(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
+			// sums for lower levels
+			calc_lower_sums(pmin, pmax, sums);

-			int opt_porder = pmin;
 			uint opt_bits = AudioSamples.UINT32_MAX;
+			int opt_porder = pmin;
 			for (int i = pmin; i <= pmax; i++)
 			{
-				uint bits = calc_optimal_rice_params(ref tmp_rc, i, sums + i * Flake.MAX_PARTITIONS, n, pred_order);
+				uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order);
 				if (bits <= opt_bits)
 				{
-					opt_porder = i;
 					opt_bits = bits;
-					RiceContext tmp_rc2 = rc;
-					rc = tmp_rc;
-					tmp_rc = tmp_rc2;
+					opt_porder = i;
 				}
 			}

+			rc.porder = opt_porder;
+			fixed (int* rparms = rc.rparams)
+				AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder));
+
 			return opt_bits;
 		}

@@ -990,7 +1011,7 @@ namespace CUETools.Codecs.FlaCuda
 							int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
 							int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
 							uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 6;
-							frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
+							frame.subframes[ch].best.size = bits + calc_rice_params(frame.subframes[ch].best.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
 						}
 						break;
 					case SubframeType.LPC:
@@ -1009,7 +1030,7 @@ namespace CUETools.Codecs.FlaCuda
 								int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
 								uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits + 6;
 								//uint oldsize = frame.subframes[ch].best.size;
-								frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
+								frame.subframes[ch].best.size = bits + calc_rice_params(frame.subframes[ch].best.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
 								//if (frame.subframes[ch].best.size > frame.subframes[ch].obits * (uint)frame.blocksize &&
 								//    oldsize <= frame.subframes[ch].obits * (uint)frame.blocksize)
 								//    throw new Exception("oops");
@@ -1213,9 +1234,10 @@ namespace CUETools.Codecs.FlaCuda

 			cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
 			cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
-			cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)residualPartCount);
-			cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
-			cuda.SetParameterSize(task.cudaChooseBestMethod, sizeof(uint) * 4U);
+			cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)residualPartSize);
+			cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)residualPartCount);
+			cuda.SetParameter(task.cudaChooseBestMethod, 4 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
+			cuda.SetParameterSize(task.cudaChooseBestMethod, 5U * sizeof(uint));
 			cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 8, 1);

 			cuda.SetParameter(task.cudaCopyBestMethod, 0, (uint)task.cudaBestResidualTasks.Pointer);
--- a/CUETools.FlaCuda/flacuda.cu
+++ b/CUETools.FlaCuda/flacuda.cu
@@ -61,6 +61,7 @@ typedef struct
 #define SUM512(buf,tid,op)  if (tid < 256) buf[tid] op buf[tid + 256]; __syncthreads(); SUM256(buf,tid,op)

 #define FSQR(s) ((s)*(s))
+#define FASTMUL(a,b) __mul24(a,b)

 extern "C" __global__ void cudaStereoDecorr(
    int *samples,
@@ -581,14 +582,8 @@ extern "C" __global__ void cudaEstimateResidual(

    shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
    shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
-    shared.residual[tid] += shared.residual[tid + 1];
-
-    // rice parameter search
-    shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
-	(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
-    shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
    if (threadIdx.x == 0)
-	output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
+	output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid] + shared.residual[tid + 1];
 }

 extern "C" __global__ void cudaEstimateResidual1(
@@ -612,7 +607,7 @@ extern "C" __global__ void cudaEstimateResidual1(
    if (tid == 0)
    {
 	shared.pos = blockIdx.x * partSize; 
-	shared.dataLen =  min(shared.task.data.blocksize - shared.pos, partSize + max_order);
+	shared.dataLen =  min(shared.task.data.blocksize - shared.pos, partSize + shared.task.data.residualOrder);
    }
    __syncthreads();

@@ -631,22 +626,10 @@ extern "C" __global__ void cudaEstimateResidual1(
    shared.residual[tid] = __mul24(ptr < shared.dataLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
    __syncthreads();
    SUM256(shared.residual, tid, +=);
-    
-    if (threadIdx.y == 0)
-    {
-	const int residualLen = max(0,min(shared.task.data.blocksize - shared.pos - shared.task.data.residualOrder, partSize));
-
-	// rice parameter search
-	shared.residual[threadIdx.x] = (shared.task.data.type != Constant || shared.residual[0] != 0) *
-	    (__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[0] - (residualLen >> 1)) >> threadIdx.x));
-	shared.residual[threadIdx.x] = min(min(shared.residual[threadIdx.x], shared.residual[threadIdx.x + 4]), min(shared.residual[threadIdx.x + 8], shared.residual[threadIdx.x + 12]));
-	if (threadIdx.x == 0)
-	    output[blockIdx.y * 64 + blockIdx.x] = min(min(shared.residual[threadIdx.x], shared.residual[threadIdx.x + 1]), min(shared.residual[threadIdx.x + 2], shared.residual[threadIdx.x + 3]));
-    }
+    if (tid == 0)
+	output[blockIdx.y * 64 + blockIdx.x] = shared.residual[0];
 }

-#define FASTMUL(a,b) __mul24(a,b)
-
 extern "C" __global__ void cudaEstimateResidual8(
    int*output,
    int*samples,
@@ -656,57 +639,61 @@ extern "C" __global__ void cudaEstimateResidual8(
    )
 {
    __shared__ struct {
-	int data[32*9];
+	volatile int data[32*9];
 	volatile int residual[32*8];
 	FlaCudaSubframeData task[8];
 	int coefs[32*8];
+	volatile int pos;
+	volatile int dataLen;
+	volatile int dataOffs;
    } shared;
    const int tid = threadIdx.x + threadIdx.y * 32;
+    const int taskNo = FASTMUL(blockIdx.y, blockDim.y) + threadIdx.y;
    if (threadIdx.x < sizeof(FlaCudaSubframeData)/sizeof(int))
-	((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[blockIdx.y * blockDim.y + threadIdx.y]))[threadIdx.x];
+	((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[taskNo]))[threadIdx.x];
+    const int ro = shared.task[threadIdx.y].residualOrder;
+    shared.coefs[tid] = threadIdx.x < ro ? tasks[taskNo].coefs[threadIdx.x] : 0;
+    if (tid == 0)
+    {
+	shared.pos = FASTMUL(blockIdx.x, partSize);
+	shared.dataLen =  min(shared.task[0].blocksize - shared.pos, partSize + max_order);
+	shared.dataOffs = shared.task[0].samplesOffs + shared.pos;
+    }
    __syncthreads();
-    const int pos = blockIdx.x * partSize;
-    const int dataLen = min(shared.task[0].blocksize - pos, partSize + max_order);

    // fetch samples
-    shared.data[tid] = tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] >> shared.task[0].wbits : 0;
-    if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task[0].samplesOffs + pos + tid + partSize] >> shared.task[0].wbits : 0;
+    if (tid < shared.dataLen)
+	shared.data[tid] = samples[shared.dataOffs + tid] >> shared.task[0].wbits;
+    if (tid + partSize < shared.dataLen)
+	shared.data[tid + partSize] = samples[shared.dataOffs + tid + partSize] >> shared.task[0].wbits;

    __syncthreads();

-    shared.residual[tid] = 0;
-    shared.coefs[tid] = threadIdx.x < shared.task[threadIdx.y].residualOrder ? tasks[blockIdx.y * blockDim.y + threadIdx.y].coefs[threadIdx.x] : 0;
-
-    const int residualLen = max(0,min(shared.task[0].blocksize - pos - shared.task[threadIdx.y].residualOrder, partSize));
+    const int residualLen = max(0,min(shared.dataLen - ro, partSize));
    const int ptr2 = threadIdx.y << 5;
    int s = 0;
-    for (int ptr = threadIdx.x + blockDim.y * 32 * (shared.task[threadIdx.y].type == Verbatim); ptr < blockDim.y * 32 + threadIdx.x; ptr += 32)
+    for (int ptr = threadIdx.x; ptr < residualLen; ptr += 32)
    {
 	// compute residual
 	int sum = 
 	    __mul24(shared.data[ptr + 0], shared.coefs[ptr2 + 0]) +
 	    __mul24(shared.data[ptr + 1], shared.coefs[ptr2 + 1]) +
 	    __mul24(shared.data[ptr + 2], shared.coefs[ptr2 + 2]) +
-	    __mul24(shared.data[ptr + 3], shared.coefs[ptr2 + 3]) +
+	    __mul24(shared.data[ptr + 3], shared.coefs[ptr2 + 3]);
+	sum +=
 	    __mul24(shared.data[ptr + 4], shared.coefs[ptr2 + 4]) +
 	    __mul24(shared.data[ptr + 5], shared.coefs[ptr2 + 5]) +
 	    __mul24(shared.data[ptr + 6], shared.coefs[ptr2 + 6]) +
 	    __mul24(shared.data[ptr + 7], shared.coefs[ptr2 + 7]);
-	sum = shared.data[ptr + shared.task[threadIdx.y].residualOrder] - (sum >> shared.task[threadIdx.y].shift);
-	s += __mul24(ptr < residualLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
+	sum = shared.data[ptr + ro] - (sum >> shared.task[threadIdx.y].shift);
+	s += min(0x7fffff,(sum << 1) ^ (sum >> 31));
    }

    shared.residual[tid] = s;
    shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
    shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
-    shared.residual[tid] += shared.residual[tid + 1];
-
-    // rice parameter search
-    shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
-	(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
-    shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
    if (threadIdx.x == 0)
-	output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
+	output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid] + shared.residual[tid + 1];
 }

 extern "C" __global__ void cudaEstimateResidual12(
@@ -727,8 +714,11 @@ extern "C" __global__ void cudaEstimateResidual12(
 	volatile int dataOffs;
    } shared;
    const int tid = threadIdx.x + threadIdx.y * 32;
+    const int taskNo = FASTMUL(blockIdx.y, blockDim.y) + threadIdx.y;
    if (threadIdx.x < sizeof(FlaCudaSubframeData)/sizeof(int))
-	((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[FASTMUL(blockIdx.y, blockDim.y) + threadIdx.y]))[threadIdx.x];
+	((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[taskNo]))[threadIdx.x];
+    const int ro = shared.task[threadIdx.y].residualOrder;
+    shared.coefs[tid] = threadIdx.x < ro ? tasks[taskNo].coefs[threadIdx.x] : 0;
    if (tid == 0)
    {
 	shared.pos = FASTMUL(blockIdx.x, partSize);
@@ -738,30 +728,30 @@ extern "C" __global__ void cudaEstimateResidual12(
    __syncthreads();

    // fetch samples
-    shared.data[tid] = tid < shared.dataLen ? samples[shared.dataOffs + tid] >> shared.task[0].wbits : 0;
-    if (tid < 32) shared.data[tid + partSize] = tid + partSize < shared.dataLen ? samples[shared.dataOffs + tid + partSize] >> shared.task[0].wbits : 0;
+    if (tid < shared.dataLen)
+	shared.data[tid] = samples[shared.dataOffs + tid] >> shared.task[0].wbits;
+    if (tid + partSize < shared.dataLen)
+	shared.data[tid + partSize] = samples[shared.dataOffs + tid + partSize] >> shared.task[0].wbits;

    __syncthreads();

-    const int ro = shared.task[threadIdx.y].residualOrder;
-    const int residualLen = max(0,min(shared.task[0].blocksize - shared.pos - ro, partSize));
+    int residualLen = max(0,min(shared.dataLen - ro, partSize));
    const int ptr2 = threadIdx.y << 5;
-    
-    shared.coefs[tid] = threadIdx.x < ro ? tasks[FASTMUL(blockIdx.y, blockDim.y) + threadIdx.y].coefs[threadIdx.x] : 0;
-
    int s = 0;
-    for (int ptr = shared.task[threadIdx.y].type == Verbatim ? residualLen : threadIdx.x; ptr < residualLen; ptr += 32)
+    for (int ptr = threadIdx.x; ptr < residualLen; ptr += 32)
    {
 	// compute residual
 	int sum =
    	    FASTMUL(shared.data[ptr + 0], shared.coefs[ptr2 + 0]) +
 	    FASTMUL(shared.data[ptr + 1], shared.coefs[ptr2 + 1]) +
 	    FASTMUL(shared.data[ptr + 2], shared.coefs[ptr2 + 2]) +
-	    FASTMUL(shared.data[ptr + 3], shared.coefs[ptr2 + 3]) +
+	    FASTMUL(shared.data[ptr + 3], shared.coefs[ptr2 + 3]);
+	sum += 
 	    FASTMUL(shared.data[ptr + 4], shared.coefs[ptr2 + 4]) +
 	    FASTMUL(shared.data[ptr + 5], shared.coefs[ptr2 + 5]) +
 	    FASTMUL(shared.data[ptr + 6], shared.coefs[ptr2 + 6]) +
-	    FASTMUL(shared.data[ptr + 7], shared.coefs[ptr2 + 7]) +
+	    FASTMUL(shared.data[ptr + 7], shared.coefs[ptr2 + 7]);
+	sum +=
 	    FASTMUL(shared.data[ptr + 8], shared.coefs[ptr2 + 8]) +
 	    FASTMUL(shared.data[ptr + 9], shared.coefs[ptr2 + 9]) +
 	    FASTMUL(shared.data[ptr + 10], shared.coefs[ptr2 + 10]) +
@@ -773,19 +763,14 @@ extern "C" __global__ void cudaEstimateResidual12(
    shared.residual[tid] = s;
    shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
    shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
-    shared.residual[tid] += shared.residual[tid + 1];
-
-    // rice parameter search
-    shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
-	(__mul24(threadIdx.x >= 15, 0x7fffff) + FASTMUL(residualLen, threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
-    shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
    if (threadIdx.x == 0)
-	output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
+	output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid] + shared.residual[tid + 1];
 }

 extern "C" __global__ void cudaChooseBestMethod(
    FlaCudaSubframeTask *tasks,
    int *residual,
+    int partSize,
    int partCount, // <= blockDim.y (256)
    int taskCount
    )
@@ -806,8 +791,18 @@ extern "C" __global__ void cudaChooseBestMethod(
 	    ((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(tasks + task + threadIdx.y + taskCount * blockIdx.y))[threadIdx.x];

 	    int sum = 0;
-	    for (int pos = 0; pos < partCount; pos += blockDim.x)
-		sum += (pos + threadIdx.x < partCount ? residual[pos + threadIdx.x + 64 * (task + threadIdx.y + taskCount * blockIdx.y)] : 0);
+	    for (int pos = threadIdx.x; pos < partCount; pos += blockDim.x)
+	    {
+		// fetch part sum
+		int psum = residual[pos + 64 * (task + threadIdx.y + taskCount * blockIdx.y)];
+		// calculate part size
+		int residualLen = max(0,min(shared.task[threadIdx.y].data.blocksize - FASTMUL(pos, partSize) - shared.task[threadIdx.y].data.residualOrder, partSize));
+		residualLen = FASTMUL(residualLen, shared.task[threadIdx.y].data.type != Constant || psum != 0);
+		// calculate rice parameter
+		int k = max(0, min(14, __float2int_rz(__log2f((psum + 0.000001f) / (residualLen + 0.000001f) + 0.5f))));
+		// calculate part bit length
+		sum += FASTMUL(residualLen, k + 1) + (psum >> k);
+	    }
 	    shared.partLen[tid] = sum;

 	    // length sum: reduction in shared mem
--- a/CUETools.FlaCuda/flacuda.cubin
+++ b/CUETools.FlaCuda/flacuda.cubin