diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
index 171eb35..c6ccff1 100644
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -1964,10 +1964,10 @@ namespace CUETools.Codecs.FLACCL
 
 		public unsafe void do_output_frames(int nFrames)
 		{
-			if (task2.frameCount > 0)
-				task2.openCLCQ.Finish();
 			send_to_GPU(task1, nFrames, eparams.block_size);
 			run_GPU_task(task1);
+			if (task2.frameCount > 0)
+				task2.openCLCQ.Finish();
 			if (task2.frameCount > 0)
 			{
 				if (cpu_tasks != null)
@@ -2681,9 +2681,11 @@ namespace CUETools.Codecs.FLACCL
 			if (UseGPUOnly)
 			{
 				clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
-				clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
 				if (openCLCQ.Device.DeviceType != DeviceType.CPU)
+				{
+					clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
 					clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
+				}
 				clSumPartition = openCLProgram.CreateKernel("clSumPartition");
 				clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
 				clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
@@ -2736,9 +2738,11 @@ namespace CUETools.Codecs.FLACCL
 			if (UseGPUOnly)
 			{
 				clEncodeResidual.Dispose();
-				clCalcPartition.Dispose();
 				if (openCLCQ.Device.DeviceType != DeviceType.CPU)
+				{
+					clCalcPartition.Dispose();
 					clCalcPartition16.Dispose();
+				}
 				clSumPartition.Dispose();
 				clFindRiceParameter.Dispose();
 				clFindPartitionOrder.Dispose();
@@ -2942,11 +2946,19 @@ namespace CUETools.Codecs.FLACCL
 					groupSize,
 					nEstimateTasksPerChannel * channelsCount * frameCount); // 1 per channel, 4 channels
 
+				int tasksToSecondEstimate = nResidualTasksPerChannel - nEstimateTasksPerChannel;
+
+				//if (nEstimateTasksPerChannel < nTasksPerWindow * nWindowFunctions)
+					//tasksToSecondEstimate -= (nEstimateTasksPerChannel / nWindowFunctions) * (nWindowFunctions - 1);
+
 				clSelectStereoTasks.SetArgs(
 					clResidualTasks,
 					clSelectedTasks,
 					clSelectedTasksSecondEstimate,
 					clSelectedTasksBestMethod,
+					nTasksPerWindow,
+					nWindowFunctions,
+					tasksToSecondEstimate,
 					nResidualTasksPerChannel,
 					nEstimateTasksPerChannel);
 
@@ -2954,7 +2966,7 @@ namespace CUETools.Codecs.FLACCL
 					clSelectStereoTasks,
 					0, frameCount);
 
-				if (nEstimateTasksPerChannel < nResidualTasksPerChannel)
+				if (tasksToSecondEstimate > 0)
 				{
 					clEstimateResidual.SetArgs(
 						clSamples,
@@ -2964,7 +2976,7 @@ namespace CUETools.Codecs.FLACCL
 					openCLCQ.EnqueueNDRangeKernel(
 						clEstimateResidual,
 						groupSize,
-						(nResidualTasksPerChannel - nEstimateTasksPerChannel) * channels * frameCount);
+						tasksToSecondEstimate * channels * frameCount);
 				}
 
 				clChooseBestMethod.SetArgs(
@@ -3003,47 +3015,46 @@ namespace CUETools.Codecs.FLACCL
 			if (UseGPUOnly)
 			{
 				clEncodeResidual.SetArgs(
+					clPartitions,
 					clResidual,
 					clSamples,
-					clBestResidualTasks);
+					clBestResidualTasks,
+					max_porder,
+					frameSize >> max_porder);
 
 				openCLCQ.EnqueueNDRangeKernel(
 					clEncodeResidual,
 					groupSize, channels * frameCount);
 
-				if ((frameSize >> max_porder == 16) && openCLCQ.Device.DeviceType != DeviceType.CPU)
+				if (openCLCQ.Device.DeviceType != DeviceType.CPU)
 				{
-					clCalcPartition16.SetArgs(
-						clPartitions,
-						clResidual,
-						clBestResidualTasks,
-						max_porder);
+					if (frameSize >> max_porder == 16)
+					{
+						clCalcPartition16.SetArgs(
+							clPartitions,
+							clResidual,
+							clBestResidualTasks,
+							max_porder);
 
-					openCLCQ.EnqueueNDRangeKernel(
-						clCalcPartition16,
-						groupSize, channels * frameCount);
-				}
-				else
-				{
-					clCalcPartition.SetArgs(
-						clPartitions,
-						clResidual,
-						clBestResidualTasks,
-						max_porder,
-						frameSize >> max_porder);
-
-					if (openCLCQ.Device.DeviceType == DeviceType.CPU)
 						openCLCQ.EnqueueNDRangeKernel(
-							clCalcPartition,
-							groupSize, 1,
-							1,
-							channels * frameCount);
+							clCalcPartition16,
+							groupSize, channels * frameCount);
+					}
 					else
+					{
+						clCalcPartition.SetArgs(
+							clPartitions,
+							clResidual,
+							clBestResidualTasks,
+							max_porder,
+							frameSize >> max_porder);
+
 						openCLCQ.EnqueueNDRangeKernel(
 							clCalcPartition,
 							groupSize, 1,
 							1 + ((1 << max_porder) - 1) / (groupSize / 16),
 							channels * frameCount);
+					}
 				}
 
 				if (max_porder > 0)
diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl
index 1fe16aa..7addadd 100644
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -896,13 +896,13 @@ inline int fastclz64(long iv)
 }
 
 #if BITS_PER_SAMPLE > 16
-typedef long residual_t;
+#define residual_t long
 #define residual_log(s) (63 - fastclz64(s))
 #define convert_bps4 convert_long4
 #define convert_bps_sat convert_int_sat
 #define bpsint4 long4
 #else
-typedef int residual_t;
+#define residual_t int
 #define residual_log(s) (31 - fastclz(s))
 #define convert_bps4
 #define convert_bps_sat
@@ -967,88 +967,120 @@ void clEstimateResidual(
 
     SWITCH_N((len[pos >> 6] += fabs((float)t)))
 #else
-    float fcoef[32];
-    for (int tid = 0; tid < 32; tid++)
-	fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
-
-    float4 fc0 = vload4(0, &fcoef[0]);
-    float4 fc1 = vload4(1, &fcoef[0]);
-#if MAX_ORDER > 8
-    float4 fc2 = vload4(2, &fcoef[0]);
-#endif
-
-#if MAX_ORDER == 8
-    float fdata[32];
-    for (int pos = 0; pos < MAX_ORDER + ro; pos++)
-	fdata[pos] = pos < MAX_ORDER ? 0.0f : (float)(data[pos - MAX_ORDER] >> task.data.wbits);
-    float4 fd0 = vload4(0, &fdata[ro]);
-    float4 fd1 = vload4(1, &fdata[ro]);
-    for (int pos = ro; pos < bs; pos ++)
+    if (ro <= 4)
     {
-	float4 sum = fc0 * fd0 + fc1 * fd1;
-	fd0 = fd0.s1230;
-	fd1 = fd1.s1230;
-	fd0.s3 = fd1.s3;
-	fd1.s3 = (float)(data[pos] >> task.data.wbits);
-	len[pos >> 6] += fabs(fd1.s3 + (sum.x + sum.y + sum.z + sum.w));
-    }
-#elif MAX_ORDER == 12
-    float fdata[32];
-    for (int pos = 0; pos < MAX_ORDER + ro; pos++)
-	fdata[pos] = pos < MAX_ORDER ? 0.0f : (float)(data[pos - MAX_ORDER] >> task.data.wbits);
-    float4 fd0 = vload4(0, &fdata[ro]);
-    float4 fd1 = vload4(1, &fdata[ro]);
-    float4 fd2 = vload4(2, &fdata[ro]);
-    for (int pos = ro; pos < bs; pos ++)
-    {
-	float4 sum = fc0 * fd0 + fc1 * fd1 + fc2 * fd2;
-	fd0 = fd0.s1230;
-	fd1 = fd1.s1230;
-	fd2 = fd2.s1230;
-	fd0.s3 = fd1.s3;
-	fd1.s3 = fd2.s3;
-	fd2.s3 = (float)(data[pos] >> task.data.wbits);
-	len[pos >> 6] += fabs(fd2.s3 + (sum.x + sum.y + sum.z + sum.w));
-    }
-#else
-    float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
-    for (int pos = 0; pos < MAX_ORDER; pos++)
-	fdata[pos] = 0.0f;
-    for (int pos = MAX_ORDER + TEMPBLOCK1; pos < MAX_ORDER + TEMPBLOCK1 + 32; pos++)
-	fdata[pos] = 0.0f;
-    for (int bpos = 0; bpos < bs; bpos += TEMPBLOCK1)
-    {
-	int end = min(bpos + TEMPBLOCK1, bs);
-
-	for (int pos = max(bpos - ro, 0); pos < max(bpos, ro); pos++)
-	    fdata[MAX_ORDER + pos - bpos] = (float)(data[pos] >> task.data.wbits);	
-
-	for (int pos = max(bpos, ro); pos < end; pos ++)
+	float fcoef[4];
+	for (int tid = 0; tid < 4; tid++)
+	    fcoef[tid] = tid + ro - 4 < 0 ? 0.0f : - ((float) task.coefs[tid + ro - 4]) / (1 << task.data.shift);
+	float4 fc0 = vload4(0, &fcoef[0]);
+	float fdata[4];
+	for (int pos = 0; pos < 4; pos++)
+	    fdata[pos] = pos + ro - 4 < 0 ? 0.0f : (float)(data[pos + ro - 4] >> task.data.wbits);
+	float4 fd0 = vload4(0, &fdata[0]);
+	for (int pos = ro; pos < bs; pos ++)
 	{
-	    float next = (float)(data[pos] >> task.data.wbits);
-	    float * dptr = fdata + pos - bpos;
-	    dptr[MAX_ORDER] = next;
-	    float4 sum 
-		= fc0 * vload4(0, dptr)
-		+ fc1 * vload4(1, dptr)
-#if MAX_ORDER > 8
-		+ fc2 * vload4(2, dptr)
-  #if MAX_ORDER > 12
-		+ vload4(3, &fcoef[0]) * vload4(3, dptr)
-    #if MAX_ORDER > 16
-		+ vload4(4, &fcoef[0]) * vload4(4, dptr)
-		+ vload4(5, &fcoef[0]) * vload4(5, dptr)
-		+ vload4(6, &fcoef[0]) * vload4(6, dptr)
-		+ vload4(7, &fcoef[0]) * vload4(7, dptr)
-    #endif
-  #endif
-#endif
-		;
-	    next += sum.x + sum.y + sum.z + sum.w;
-	    len[pos >> 6] += fabs(next);
+	    float4 sum4 = fc0 * fd0;
+	    float2 sum2 = sum4.s01 + sum4.s23;
+	    fd0 = fd0.s1230;
+	    fd0.s3 = (float)(data[pos] >> task.data.wbits);
+	    len[pos >> 6] += fabs(fd0.s3 + (sum2.x + sum2.y));
+	}
+    }
+    else if (ro <= 8)
+    {
+	float fcoef[8];
+	for (int tid = 0; tid < 8; tid++)
+	    fcoef[tid] = tid + ro - 8 < 0 ? 0.0f : - ((float) task.coefs[tid + ro - 8]) / (1 << task.data.shift);
+	float8 fc0 = vload8(0, &fcoef[0]);
+	float fdata[8];
+	for (int pos = 0; pos < 8; pos++)
+	    fdata[pos] = pos + ro - 8 < 0 ? 0.0f : (float)(data[pos + ro - 8] >> task.data.wbits);
+	float8 fd0 = vload8(0, &fdata[0]);
+	for (int pos = ro; pos < bs; pos ++)
+	{
+	    float8 sum8 = fc0 * fd0;
+	    float4 sum4 = sum8.s0123 + sum8.s4567;
+	    float2 sum2 = sum4.s01 + sum4.s23;
+	    fd0 = fd0.s12345670;
+	    fd0.s7 = (float)(data[pos] >> task.data.wbits);
+	    len[pos >> 6] += fabs(fd0.s7 + (sum2.x + sum2.y));
+	}
+    }
+    else if (ro <= 12)
+    {
+	float fcoef[12];
+	for (int tid = 0; tid < 12; tid++)
+	    fcoef[tid] = tid + ro - 12 >= 0 ? - ((float) task.coefs[tid + ro - 12]) / (1 << task.data.shift) : 0.0f;
+	float4 fc0 = vload4(0, &fcoef[0]);
+	float4 fc1 = vload4(1, &fcoef[0]);
+	float4 fc2 = vload4(2, &fcoef[0]);	
+	float fdata[12];
+	for (int pos = 0; pos < 12; pos++)
+	    fdata[pos] = pos + ro - 12 < 0 ? 0.0f : (float)(data[pos + ro - 12] >> task.data.wbits);
+	float4 fd0 = vload4(0, &fdata[0]);
+	float4 fd1 = vload4(1, &fdata[0]);
+	float4 fd2 = vload4(2, &fdata[0]);
+	for (int pos = ro; pos < bs; pos ++)
+	{
+	    float4 sum4 = fc0 * fd0 + fc1 * fd1 + fc2 * fd2;
+	    float2 sum2 = sum4.s01 + sum4.s23;
+	    fd0 = fd0.s1230;
+	    fd1 = fd1.s1230;
+	    fd2 = fd2.s1230;
+	    fd0.s3 = fd1.s3;
+	    fd1.s3 = fd2.s3;
+	    fd2.s3 = (float)(data[pos] >> task.data.wbits);
+	    len[pos >> 6] += fabs(fd2.s3 + (sum2.x + sum2.y));
+	}
+    }
+    else
+    {
+	float fcoef[32];
+	for (int tid = 0; tid < 32; tid++)
+	    fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
+
+	float4 fc0 = vload4(0, &fcoef[0]);
+	float4 fc1 = vload4(1, &fcoef[0]);
+	float4 fc2 = vload4(2, &fcoef[0]);
+
+	float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
+	for (int pos = 0; pos < MAX_ORDER; pos++)
+	    fdata[pos] = 0.0f;
+	for (int pos = MAX_ORDER + TEMPBLOCK1; pos < MAX_ORDER + TEMPBLOCK1 + 32; pos++)
+	    fdata[pos] = 0.0f;
+	for (int bpos = 0; bpos < bs; bpos += TEMPBLOCK1)
+	{
+	    int end = min(bpos + TEMPBLOCK1, bs);
+
+	    for (int pos = max(bpos - ro, 0); pos < max(bpos, ro); pos++)
+		fdata[MAX_ORDER + pos - bpos] = (float)(data[pos] >> task.data.wbits);	
+
+	    for (int pos = max(bpos, ro); pos < end; pos ++)
+	    {
+		float next = (float)(data[pos] >> task.data.wbits);
+		float * dptr = fdata + pos - bpos;
+		dptr[MAX_ORDER] = next;
+		float4 sum 
+		    = fc0 * vload4(0, dptr)
+		    + fc1 * vload4(1, dptr)
+    #if MAX_ORDER > 8
+		    + fc2 * vload4(2, dptr)
+      #if MAX_ORDER > 12
+		    + vload4(3, &fcoef[0]) * vload4(3, dptr)
+	#if MAX_ORDER > 16
+		    + vload4(4, &fcoef[0]) * vload4(4, dptr)
+		    + vload4(5, &fcoef[0]) * vload4(5, dptr)
+		    + vload4(6, &fcoef[0]) * vload4(6, dptr)
+		    + vload4(7, &fcoef[0]) * vload4(7, dptr)
+	#endif
+      #endif
+    #endif
+		    ;
+		next += sum.x + sum.y + sum.z + sum.w;
+		len[pos >> 6] += fabs(next);
+	    }
 	}
     }
-#endif
 #endif
     int total = 0;
     for (int i = 0; i < ERPARTS; i++)
@@ -1257,22 +1289,31 @@ void clSelectStereoTasks(
     __global int*selectedTasks,
     __global int*selectedTasksSecondEstimate,
     __global int*selectedTasksBestMethod,
+    int tasksWindow,
+    int windowCount,
+    int tasksToSecondEstimate,
     int taskCount,
     int selectedCount
     )
 {
     int best_size[4];
+    int best_wind[4];
     for (int ch = 0; ch < 4; ch++)
     {
 	int first_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount];
 	int best_len = tasks[first_no].data.size;
+	int best_wnd = 0;
 	for (int i = 1; i < selectedCount; i++)
 	{
 	    int task_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + i];
 	    int task_len = tasks[task_no].data.size;
+	    int task_wnd = (task_no - first_no) / tasksWindow;
+	    task_wnd = select(0, task_wnd, task_wnd < windowCount);
+	    best_wnd = select(best_wnd, task_wnd, task_len < best_len);
 	    best_len = min(task_len, best_len);
 	}
 	best_size[ch] = best_len;
+	best_wind[ch] = best_wnd;
     }
         
     int bitsBest = best_size[2] + best_size[3]; // MidSide
@@ -1291,16 +1332,17 @@ void clSelectStereoTasks(
 	int ch = select(chMask & 3, chMask >> 2, ich > 0);
 	int roffs = tasks[(get_global_id(0) * 4 + ich) * taskCount].data.samplesOffs;
 	int nonSelectedNo = 0;
-	for (int i = 0; i < taskCount; i++)
+	for (int j = taskCount - 1; j >= 0; j--)
 	{
+	    int i = select(j, (j % windowCount) * tasksWindow + (j / windowCount), j < windowCount * tasksWindow);
 	    int no = (get_global_id(0) * 4 + ch) * taskCount + i;
 	    selectedTasksBestMethod[(get_global_id(0) * 2 + ich) * taskCount + i] = no;
 	    tasks[no].data.residualOffs = roffs;
-	    int selectedFound = 0;
-	    for(int selectedNo = 0; selectedNo < selectedCount; selectedNo++)
-		selectedFound |= (selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + selectedNo] == no);
-	    if (!selectedFound)
-		selectedTasksSecondEstimate[(get_global_id(0) * 2 + ich) * (taskCount - selectedCount) + nonSelectedNo++] = no;
+	    if (j >= selectedCount)
+		tasks[no].data.size = 0x7fffffff;
+	    if (nonSelectedNo < tasksToSecondEstimate)
+		if (tasksToSecondEstimate == taskCount - selectedCount || best_wind[ch] == i / tasksWindow || i >= windowCount * tasksWindow)
+		    selectedTasksSecondEstimate[(get_global_id(0) * 2 + ich) * tasksToSecondEstimate + nonSelectedNo++] = no;
 	}
     }
 }
@@ -1330,24 +1372,42 @@ void clChooseBestMethod(
 // get_group_id(0) == task index
 __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
 void clEncodeResidual(
+    __global ulong *partition_lengths,
     __global int *residual,
     __global int *samples,
-    __global FLACCLSubframeTask *tasks
+    __global FLACCLSubframeTask *tasks,
+    int max_porder, // <= 8
+    int psize // == task.blocksize >> max_porder?
     )
 {
     FLACCLSubframeTask task = tasks[get_group_id(0)];
     int bs = task.data.blocksize;
     int ro = task.data.residualOrder;
     __global int *data = &samples[task.data.samplesOffs];
-    SWITCH_N(residual[task.data.residualOffs + pos] = convert_bps_sat(t));
+    __global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(0);
+    int r;
+    for (int p = 0; p < (1 << max_porder); p++)
+	pl[p] = 0UL;
+    __global int *rptr = residual + task.data.residualOffs;
+    if (psize == 16)
+    {
+	SWITCH_N((rptr[pos] = r = convert_bps_sat(t), pl[pos >> 4] += (uint)((r << 1) ^ (r >> 31))));
+    }
+    else
+    {
+	SWITCH_N((rptr[pos] = r = convert_bps_sat(t), pl[pos / psize] += (uint)((r << 1) ^ (r >> 31))));
+    }
 }
 #else
 // get_group_id(0) == task index
 __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void clEncodeResidual(
+    __global int *partition_lengths,
     __global int *output,
     __global int *samples,
-    __global FLACCLSubframeTask *tasks
+    __global FLACCLSubframeTask *tasks,
+    int max_porder, // <= 8
+    int psize // == task.blocksize >> max_porder?
     )
 {
     __local FLACCLSubframeTask task;
@@ -1407,34 +1467,7 @@ void clEncodeResidual(
 }
 #endif
 
-#ifdef FLACCL_CPU
-__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
-void clCalcPartition(
-    __global ulong *partition_lengths,
-    __global int *residual,
-    __global FLACCLSubframeTask *tasks,
-    int max_porder, // <= 8
-    int psize // == task.blocksize >> max_porder?
-    )
-{
-    FLACCLSubframeTask task = tasks[get_group_id(1)];
-    int bs = task.data.blocksize;
-    int ro = task.data.residualOrder;
-    //int psize = bs >> max_porder;
-    __global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
-
-    for (int p = 0; p < (1 << max_porder); p++)
-	pl[p] = 0UL;
-
-    for (int pos = ro; pos < bs; pos ++)
-    {
-	int s = residual[task.data.residualOffs + pos];
-	// convert to unsigned
-	uint t = (s << 1) ^ (s >> 31);
-	pl[pos / psize] += t;
-    }
-}
-#else
+#ifndef FLACCL_CPU
 // get_group_id(0) == partition index / (GROUP_SIZE / 16)
 // get_group_id(1) == task index
 __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))