optimizations

2025-12-16 18:14:25 +00:00 · 2009-09-11 11:16:45 +00:00
parent 5fd6108c9d
commit d1005089c2
3 changed files with 182 additions and 142 deletions
--- a/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj
+++ b/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj
@@ -54,9 +54,9 @@
  </ItemGroup>
  <ItemGroup>
    <None Include="flacuda.cu">
-      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
-      <Generator>"%24%28CUDA_BIN_PATH%29\nvcc.exe" flacuda.cu --cubin -cbin "%24%28VCInstallDir%29bin"</Generator>
    </None>
+    <EmbeddedResource Include="flacuda.cubin">
+    </EmbeddedResource>
  </ItemGroup>
  <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
  <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
@@ -67,7 +67,8 @@
  </Target>
  -->
  <PropertyGroup>
-    <PostBuildEvent>nvcc flacuda.cu  --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include" 
+    <PostBuildEvent>
    </PostBuildEvent>
+    <PreBuildEvent>nvcc $(ProjectDir)flacuda.cu  -o $(ProjectDir)\flacuda.cubin --machine 32 --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"</PreBuildEvent>
  </PropertyGroup>
 </Project>
--- a/CUETools.FlaCuda/FlaCudaWriter.cs
+++ b/CUETools.FlaCuda/FlaCudaWriter.cs
@@ -75,7 +75,7 @@ namespace CUETools.Codecs.FlaCuda
 		float[] windowBuffer;
 		int samplesInBuffer = 0;

-		int _compressionLevel = 7;
+		int _compressionLevel = 5;
 		int _blocksize = 0;
 		int _totalSize = 0;
 		int _windowsize = 0, _windowcount = 0;
@@ -96,6 +96,7 @@ namespace CUETools.Codecs.FlaCuda
 		CUfunction cudaComputeAutocor;
 		CUfunction cudaComputeLPC;
 		CUfunction cudaEstimateResidual;
+		CUfunction cudaSumResidualChunks;
 		CUfunction cudaSumResidual;
 		CUfunction cudaEncodeResidual;
 		CUdeviceptr cudaSamples;
@@ -104,6 +105,7 @@ namespace CUETools.Codecs.FlaCuda
 		CUdeviceptr cudaAutocorOutput;
 		CUdeviceptr cudaResidualTasks;
 		CUdeviceptr cudaResidualOutput;
+		CUdeviceptr cudaResidualSums;
 		IntPtr samplesBufferPtr = IntPtr.Zero;
 		IntPtr autocorTasksPtr = IntPtr.Zero;
 		IntPtr residualTasksPtr = IntPtr.Zero;
@@ -114,7 +116,7 @@ namespace CUETools.Codecs.FlaCuda
 		int nAutocorTasks = 0;

 		const int MAX_BLOCKSIZE = 8192;
-		const int maxResidualParts = MAX_BLOCKSIZE / (256 - 32);
+		const int maxResidualParts = 64;
 		const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32);

 		public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
@@ -218,6 +220,7 @@ namespace CUETools.Codecs.FlaCuda
 				cuda.Free(cudaAutocorOutput);
 				cuda.Free(cudaResidualTasks);
 				cuda.Free(cudaResidualOutput);
+				cuda.Free(cudaResidualSums);
 				CUDADriver.cuMemFreeHost(samplesBufferPtr);
 				CUDADriver.cuMemFreeHost(residualTasksPtr);
 				CUDADriver.cuMemFreeHost(autocorTasksPtr);
@@ -250,6 +253,7 @@ namespace CUETools.Codecs.FlaCuda
 				cuda.Free(cudaAutocorOutput);
 				cuda.Free(cudaResidualTasks);
 				cuda.Free(cudaResidualOutput);
+				cuda.Free(cudaResidualSums);
 				CUDADriver.cuMemFreeHost(samplesBufferPtr);
 				CUDADriver.cuMemFreeHost(residualTasksPtr);
 				CUDADriver.cuMemFreeHost(autocorTasksPtr);
@@ -278,7 +282,11 @@ namespace CUETools.Codecs.FlaCuda

 		public long BlockSize
 		{
-			set { _blocksize = (int)value; }
+			set {
+				if (value < 256 || value > MAX_BLOCKSIZE )
+					throw new Exception("unsupported BlockSize value");
+				_blocksize = (int)value; 
+			}
 			get { return _blocksize == 0 ? eparams.block_size : _blocksize; }
 		}

@@ -911,9 +919,9 @@ namespace CUETools.Codecs.FlaCuda
 					autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE;
 					nAutocorTasks++;
 					// LPC tasks
-					for (int order = 1; order <= max_order; order++)
+					for (int order = 1; order <= ((max_order + 7) & ~7); order++)
 					{
-						residualTasks[nResidualTasks].residualOrder = order - 1;
+						residualTasks[nResidualTasks].residualOrder = order <= max_order ? order : 0;
 						residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
 						nResidualTasks++;
 					}
@@ -921,9 +929,9 @@ namespace CUETools.Codecs.FlaCuda
 			// Fixed prediction
 			for (int ch = 0; ch < channelsCount; ch++)
 			{
-				for (int order = 1; order <= 4; order++)
+				for (int order = 1; order <= 8; order++)
 				{
-					residualTasks[nResidualTasks].residualOrder = order - 1;
+					residualTasks[nResidualTasks].residualOrder = order <= 4 ? order : 0;
 					residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
 					residualTasks[nResidualTasks].shift = 0;
 					switch (order)
@@ -1025,9 +1033,11 @@ namespace CUETools.Codecs.FlaCuda
 				{
 					for (int order = 1; order <= max_order && order < frame.blocksize; order++)
 					{
-						int index = (order - 1) + max_order * (iWindow + _windowcount * ch);
+						int index = (order - 1) + ((max_order + 7) & ~7) * (iWindow + _windowcount * ch);
 						int cbits = residualTasks[index].cbits;
 						int nbits = order * (int)frame.subframes[ch].obits + 4 + 5 + order * cbits + 6 + residualTasks[index].size;
+						if (residualTasks[index].residualOrder != order)
+							throw new Exception("oops");
 						if (frame.subframes[ch].best.size > nbits)
 						{
 							frame.subframes[ch].best.type = SubframeType.LPC;
@@ -1048,8 +1058,10 @@ namespace CUETools.Codecs.FlaCuda
 			{
 				for (int order = 1; order <= 4 && order < frame.blocksize; order++)
 				{
-					int index = (order - 1) + 4 * ch;
-					int nbits = order * (int)frame.subframes[ch].obits + 6 + residualTasks[index + max_order * _windowcount * channelsCount].size;
+					int index = (order - 1) + 8 * ch + ((max_order + 7) & ~7) * _windowcount * channelsCount;
+					int nbits = order * (int)frame.subframes[ch].obits + 6 + residualTasks[index].size;
+					if (residualTasks[index].residualOrder != order)
+						throw new Exception("oops");
 					if (frame.subframes[ch].best.size > nbits)
 					{
 						frame.subframes[ch].best.type = SubframeType.Fixed;
@@ -1062,34 +1074,47 @@ namespace CUETools.Codecs.FlaCuda

 		unsafe void estimate_residual(FlacFrame frame, int channelsCount, int max_order, int autocorPartCount, out int partCount)
 		{
+			if (frame.blocksize <= 4)
+			{
+				partCount = 0;
+				return;
+			}
+
 			uint cbits = get_precision(frame.blocksize) + 1;
-			int residualThreads = 256;
-			int partSize = residualThreads - max_order;
-			partSize &= 0xffffff0;
+			int partSize = 256 - 32;

 			partCount = (frame.blocksize + partSize - 1) / partSize;
+
 			if (partCount > maxResidualParts)
-				throw new Exception("internal error");
+				throw new Exception("invalid combination of block size and LPC order");

-			if (frame.blocksize <= 4)
-				return;
+			cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 0, (uint)cudaResidualOutput.Pointer);
+			cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 1, (uint)cudaSamples.Pointer);
+			cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 2, (uint)cudaResidualTasks.Pointer);
+			cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 3, (uint)max_order);
+			cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 4, (uint)frame.blocksize);
+			cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 5, (uint)partSize);
+			cuda.SetParameterSize(cudaEstimateResidual, sizeof(uint) * 6);
+			cuda.SetFunctionBlockShape(cudaEstimateResidual, 64, 4, 1);

-			cuda.SetParameter(cudaEstimateResidual, 0, (uint)cudaResidualOutput.Pointer);
-			cuda.SetParameter(cudaEstimateResidual, IntPtr.Size, (uint)cudaSamples.Pointer);
-			cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 2, (uint)cudaResidualTasks.Pointer);
-			cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 3, (uint)frame.blocksize);
-			cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 3 + sizeof(uint), (uint)partSize);
-			cuda.SetParameterSize(cudaEstimateResidual, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2U);
-			cuda.SetFunctionBlockShape(cudaEstimateResidual, residualThreads, 1, 1);
+			//cuda.SetParameter(cudaSumResidualChunks, 0, (uint)cudaResidualSums.Pointer);
+			//cuda.SetParameter(cudaSumResidualChunks, sizeof(uint), (uint)cudaResidualTasks.Pointer);
+			//cuda.SetParameter(cudaSumResidualChunks, sizeof(uint) * 2, (uint)cudaResidualOutput.Pointer);
+			//cuda.SetParameter(cudaSumResidualChunks, sizeof(uint) * 3, (uint)frame.blocksize);
+			//cuda.SetParameter(cudaSumResidualChunks, sizeof(uint) * 4, (uint)partSize);
+			//cuda.SetParameterSize(cudaSumResidualChunks, sizeof(uint) * 5U);
+			//cuda.SetFunctionBlockShape(cudaSumResidualChunks, residualThreads, 1, 1);

 			cuda.SetParameter(cudaSumResidual, 0, (uint)cudaResidualTasks.Pointer);
-			cuda.SetParameter(cudaSumResidual, IntPtr.Size, (uint)cudaResidualOutput.Pointer);
-			cuda.SetParameter(cudaSumResidual, IntPtr.Size * 2, (uint)partCount);
-			cuda.SetParameterSize(cudaSumResidual, (uint)(IntPtr.Size * 2) + sizeof(uint) * 1U);
+			cuda.SetParameter(cudaSumResidual, sizeof(uint), (uint)cudaResidualOutput.Pointer);
+			cuda.SetParameter(cudaSumResidual, sizeof(uint) * 2, (uint)partSize);
+			cuda.SetParameter(cudaSumResidual, sizeof(uint) * 3, (uint)partCount);
+			cuda.SetParameterSize(cudaSumResidual, sizeof(uint) * 4U);
 			cuda.SetFunctionBlockShape(cudaSumResidual, 64, 1, 1);

 			// issue work to the GPU
-			cuda.LaunchAsync(cudaEstimateResidual, partCount, nResidualTasks, cudaStream);
+			cuda.LaunchAsync(cudaEstimateResidual, partCount, nResidualTasks / 4, cudaStream);
+			//cuda.LaunchAsync(cudaSumResidualChunks, partCount, nResidualTasks, cudaStream);
 			cuda.LaunchAsync(cudaSumResidual, 1, nResidualTasks, cudaStream);
 			cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
 			cuda.SynchronizeStream(cudaStream);
@@ -1109,21 +1134,21 @@ namespace CUETools.Codecs.FlaCuda
 				return;

 			cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocorOutput.Pointer);
-			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaSamples.Pointer);
-			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaWindow.Pointer);
-			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3, (uint)cudaAutocorTasks.Pointer);
-			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 4, (uint)max_order);
-			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 4 + sizeof(uint), (uint)frame.blocksize);
-			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 4 + sizeof(uint) * 2, (uint)partSize);
-			cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 4) + sizeof(uint) * 3);
+			cuda.SetParameter(cudaComputeAutocor, sizeof(uint), (uint)cudaSamples.Pointer);
+			cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer);
+			cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 3, (uint)cudaAutocorTasks.Pointer);
+			cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order);
+			cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)frame.blocksize);
+			cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)partSize);
+			cuda.SetParameterSize(cudaComputeAutocor, (uint)(sizeof(uint) * 4) + sizeof(uint) * 3);
 			cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1);

 			cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaResidualTasks.Pointer);
-			cuda.SetParameter(cudaComputeLPC, IntPtr.Size, (uint)cudaAutocorOutput.Pointer);
-			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 2, (uint)cudaAutocorTasks.Pointer);
-			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3, (uint)max_order);
-			cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3 + sizeof(uint), (uint)partCount);
-			cuda.SetParameterSize(cudaComputeLPC, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2);
+			cuda.SetParameter(cudaComputeLPC, sizeof(uint), (uint)cudaAutocorOutput.Pointer);
+			cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 2, (uint)cudaAutocorTasks.Pointer);
+			cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 3, (uint)max_order);
+			cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)partCount);
+			cuda.SetParameterSize(cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2);
 			cuda.SetFunctionBlockShape(cudaComputeLPC, 64, 1, 1);

 			// issue work to the GPU
@@ -1268,24 +1293,30 @@ namespace CUETools.Codecs.FlaCuda
 			if (!inited)
 			{
 				cuda = new CUDA(true, InitializationFlags.None);
-				cuda.CreateContext(0, CUCtxFlags.SchedSpin);
-				cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
+				cuda.CreateContext(0, CUCtxFlags.BlockingSync);
+				using (Stream cubin = GetType().Assembly.GetManifestResourceStream(GetType(), "flacuda.cubin"))
+				using (StreamReader sr = new StreamReader(cubin))
+					cuda.LoadModule(new ASCIIEncoding().GetBytes(sr.ReadToEnd()));
+				//cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
 				cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
 				cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
 				cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
 				cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
+				cudaSumResidualChunks = cuda.GetModuleFunction("cudaSumResidualChunks");
 				cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
 				cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
 				cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
 				cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 				cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
 				cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4)));
-				cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_WINDOWS * maxResidualParts));
+				cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4)));
+				cudaResidualSums = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4) * maxResidualParts));
+				//cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4) * maxResidualParts));
 				CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
 				if (cuErr == CUResult.Success)
 					cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 				if (cuErr == CUResult.Success)
-					cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 4)));
+					cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8)));
 				if (cuErr != CUResult.Success)
 				{
 					if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
@@ -1678,7 +1709,7 @@ namespace CUETools.Codecs.FlaCuda
 				case 0:
 					do_midside = false;
 					window_function = WindowFunction.Bartlett;
-					max_prediction_order = 7;
+					max_prediction_order = 8;
 					max_partition_order = 4;
 					break;
 				case 1:
@@ -1694,7 +1725,7 @@ namespace CUETools.Codecs.FlaCuda
 					break;
 				case 3:
 					window_function = WindowFunction.Bartlett;
-					max_prediction_order = 7;
+					max_prediction_order = 8;
 					break;
 				case 4:
 					window_function = WindowFunction.Bartlett;
@@ -1704,7 +1735,7 @@ namespace CUETools.Codecs.FlaCuda
 					window_function = WindowFunction.Bartlett;
 					break;
 				case 6:
-					max_prediction_order = 10;
+					//max_prediction_order = 10;
 					break;
 				case 7:
 					break;
--- a/CUETools.FlaCuda/flacuda.cu
+++ b/CUETools.FlaCuda/flacuda.cu
@@ -155,6 +155,7 @@ extern "C" __global__ void cudaComputeLPC(
 	if (tid < 32)
 	{
 	    int precision = 13;
+	    int taskNo = (blockIdx.x + blockIdx.y * gridDim.x) * ((max_order + 7) & ~7) + order;
 	    shared.bits[tid] = __mul24((33 - __clz(__float2int_rn(fabs(shared.tmp[tid]) * (1 << 15))) - precision), tid <= order);
 	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
 	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
@@ -164,9 +165,9 @@ extern "C" __global__ void cudaComputeLPC(
 	    int sh = max(0,min(15, 15 - shared.bits[0]));
 	    int coef = max(-(1 << precision),min((1 << precision)-1,__float2int_rn(-shared.tmp[tid] * (1 << sh))));
 	    if (tid <= order)
-		output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].coefs[tid] = coef;
+		output[taskNo].coefs[tid] = coef;
 	    if (tid == 0)
-		output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].shift = sh;
+		output[taskNo].shift = sh;
 	    shared.bits[tid] = 33 - max(__clz(coef),__clz(-1 ^ coef));
 	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
 	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
@@ -175,51 +176,111 @@ extern "C" __global__ void cudaComputeLPC(
 	    shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
 	    int cbits = shared.bits[0];
 	    if (tid == 0)
-		output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].cbits = cbits;
+		output[taskNo].cbits = cbits;
 	}
 	__syncthreads();
    }
 }

+// blockDim.x == 32
+// blockDim.y == 8
 extern "C" __global__ void cudaEstimateResidual(
    int*output,
    int*samples,
    encodeResidualTaskStruct *tasks,
+    int max_order,
    int frameSize,
-    int partSize // should be <= blockDim - max_order
+    int partSize // should be 224
    )
 {
    __shared__ struct {
 	int data[256];
 	int residual[256];
-	int rice[32];
-	encodeResidualTaskStruct task;
+	int rice[256];
+	int sums[8];
+	encodeResidualTaskStruct task[8];
    } shared;
-    const int tid = threadIdx.x;
-    // fetch task data
-    if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
-	((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+    const int tid = threadIdx.x + threadIdx.y * blockDim.x;
+    // fetch task data (8 * 64 == 512 elements);
+    ((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y * blockDim.y))[tid];
+    ((int*)&shared.task)[tid + 256] = ((int*)(tasks + blockIdx.y * blockDim.y))[tid + 256];
    __syncthreads();
-    const int pos = blockIdx.x * partSize;
-    const int residualOrder = shared.task.residualOrder + 1;
-    const int residualLen = min(frameSize - pos - residualOrder, partSize);
-    const int dataLen = residualLen + residualOrder;
+    const int residualOrder = shared.task[threadIdx.y].residualOrder;
+    const int partNumber = blockIdx.x;
+    const int pos = partNumber * partSize;
+    const int dataLen = min(frameSize - pos, partSize + max_order) * (residualOrder != 0);

    // fetch samples
-    shared.data[tid] = (tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0);
+    shared.data[tid] = (tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] : 0);
+    if (tid < blockDim.y) shared.sums[tid] = 0;
+
+    // set upper residuals to zero, in case blockDim < 256
+    //shared.residual[255 - tid] = 0;
+
+    const int residualLen = min(frameSize - pos - residualOrder, partSize) * (residualOrder != 0);

    // reverse coefs
-    if (tid < residualOrder) shared.task.coefs[tid] = shared.task.coefs[residualOrder - 1 - tid];
+    if (threadIdx.x < residualOrder) shared.task[threadIdx.y].coefs[threadIdx.x] = shared.task[threadIdx.y].coefs[residualOrder - 1 - threadIdx.x];
    
-    // compute residual
    __syncthreads();
+
+    for (int i = 0; i < residualLen; i += blockDim.x)
+    {
+	// compute residual
 	long sum = 0;
 	for (int c = 0; c < residualOrder; c++)
-	sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
-    int res = shared.data[tid + residualOrder] - (sum >> shared.task.shift);
-    shared.residual[tid] = __mul24(tid < residualLen, (2 * res) ^ (res >> 31));
+	    sum += __mul24(shared.data[i + threadIdx.x + c], shared.task[threadIdx.y].coefs[c]);
+	int res = shared.data[i + threadIdx.x + residualOrder] - (sum >> shared.task[threadIdx.y].shift);
+	shared.residual[tid] = __mul24(i + threadIdx.x < residualLen, (2 * res) ^ (res >> 31));
+	__syncthreads(); if (threadIdx.x < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
+	shared.residual[tid] += shared.residual[tid + 16];
+	shared.residual[tid] += shared.residual[tid + 8];
+	shared.residual[tid] += shared.residual[tid + 4];
+	shared.residual[tid] += shared.residual[tid + 2];
+	if (threadIdx.x == 0) shared.sums[threadIdx.y] += shared.residual[tid] + shared.residual[tid + 1];
+    }
   
+    // rice parameter search
+    shared.rice[tid] = __mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.sums[threadIdx.y] - (residualLen >> 1)) >> threadIdx.x);
+    shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]);
+    shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]);
+    shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
+    if (threadIdx.x == 0 && residualOrder != 0)
+	output[(blockIdx.y * blockDim.y + threadIdx.y) * gridDim.x + blockIdx.x] = min(shared.rice[tid], shared.rice[tid + 1]);
+}
+
+// blockDim.x == 256
+// gridDim.x = frameSize / chunkSize
+extern "C" __global__ void cudaSumResidualChunks(
+    int *output,
+    encodeResidualTaskStruct *tasks,
+    int *residual,
+    int frameSize,
+    int chunkSize // <= blockDim.x(256)
+    )
+{
+    __shared__ struct {
+	int residual[256];
+	int rice[32];
+    } shared;
+    
+    // fetch parameters
+    const int tid = threadIdx.x;
+    const int residualOrder = tasks[blockIdx.y].residualOrder;
+    const int chunkNumber = blockIdx.x;
+    const int pos = chunkNumber * chunkSize;
+    const int residualLen = min(frameSize - pos - residualOrder, chunkSize);
+
+    // set upper residuals to zero, in case blockDim < 256
+    shared.residual[255 - tid] = 0;
+
+    // read residual
+    int res = (tid < residualLen) ? residual[blockIdx.y * 8192 + pos + tid] : 0;
+
+    // convert to unsigned
+    shared.residual[tid] = (2 * res) ^ (res >> 31);
    __syncthreads();
+
    // residual sum: reduction in shared mem
    if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads();
    if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads();
@@ -229,7 +290,6 @@ extern "C" __global__ void cudaEstimateResidual(
    shared.residual[tid] += shared.residual[tid + 4];
    shared.residual[tid] += shared.residual[tid + 2];
    shared.residual[tid] += shared.residual[tid + 1];
-    __syncthreads();

    if (tid < 32)
    {
@@ -240,6 +300,8 @@ extern "C" __global__ void cudaEstimateResidual(
 	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
 	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]);
    }
+
+    // write output
    if (tid == 0)
 	output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
 }
@@ -247,34 +309,32 @@ extern "C" __global__ void cudaEstimateResidual(
 extern "C" __global__ void cudaSumResidual(
    encodeResidualTaskStruct *tasks,
    int *residual,
-    int partCount // <= blockDim.y (64)
+    int partSize,
+    int partCount // <= blockDim.y (256)
    )
 {
    __shared__ struct {
-	int partLen[64];
-	//encodeResidualTaskStruct task;
+	int partLen[256];
+	encodeResidualTaskStruct task;
    } shared;

    const int tid = threadIdx.x;
    // fetch task data
- //   if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
-	//((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
- //   __syncthreads();
+    if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
+	((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+    __syncthreads();

    shared.partLen[tid] = (tid < partCount) ? residual[tid + partCount * blockIdx.y] : 0;
-
    __syncthreads();
    // length sum: reduction in shared mem
+    //if (tid < 128) shared.partLen[tid] += shared.partLen[tid + 128]; __syncthreads();
+    //if (tid < 64) shared.partLen[tid] += shared.partLen[tid + 64]; __syncthreads();
    if (tid < 32) shared.partLen[tid] += shared.partLen[tid + 32]; __syncthreads();
    shared.partLen[tid] += shared.partLen[tid + 16];
    shared.partLen[tid] += shared.partLen[tid + 8];
    shared.partLen[tid] += shared.partLen[tid + 4];
    shared.partLen[tid] += shared.partLen[tid + 2];
    shared.partLen[tid] += shared.partLen[tid + 1];
-    __syncthreads();
-    
-    // FIXME: should process partition order here!!!
-
    // return sum
    if (tid == 0)
 	tasks[blockIdx.y].size = shared.partLen[0];
@@ -288,58 +348,6 @@ extern "C" __global__ void cudaEncodeResidual(
    int partSize // should be <= blockDim - max_order
    )
 {
-    __shared__ struct {
-	int data[256];
-	int residual[256];
-	int rice[32];
-	encodeResidualTaskStruct task;
-    } shared;
-    const int tid = threadIdx.x;
-    // fetch task data
-    if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
-	((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
    __syncthreads();
-    const int pos = blockIdx.x * partSize;
-    const int residualOrder = shared.task.residualOrder + 1;
-    const int residualLen = min(frameSize - pos - residualOrder, partSize);
-    const int dataLen = residualLen + residualOrder;
-
-    // fetch samples
-    shared.data[tid] = (tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0);
-
-    // reverse coefs
-    if (tid < residualOrder) shared.task.coefs[tid] = shared.task.coefs[residualOrder - 1 - tid];
-    
-    // compute residual
-    __syncthreads();
-    long sum = 0;
-    for (int c = 0; c < residualOrder; c++)
-	sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
-    int res = shared.data[tid + residualOrder] - (sum >> shared.task.shift);
-    shared.residual[tid] = __mul24(tid < residualLen, (2 * res) ^ (res >> 31));
-    
-    __syncthreads();
-    // residual sum: reduction in shared mem
-    if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads();
-    if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads();
-    if (tid < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
-    shared.residual[tid] += shared.residual[tid + 16];
-    shared.residual[tid] += shared.residual[tid + 8];
-    shared.residual[tid] += shared.residual[tid + 4];
-    shared.residual[tid] += shared.residual[tid + 2];
-    shared.residual[tid] += shared.residual[tid + 1];
-    __syncthreads();
-
-    if (tid < 32)
-    {
-	// rice parameter search
-	shared.rice[tid] = __mul24(tid >= 15, 0x7fffff) + residualLen * (tid + 1) + ((shared.residual[0] - (residualLen >> 1)) >> tid);
-	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]);
-	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]);
-	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
-	shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]);
-    }
-    if (tid == 0)
-	output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
 }
 #endif