diff --git a/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj b/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj new file mode 100644 index 0000000..db38a62 --- /dev/null +++ b/CUETools.FlaCuda/CUETools.Codecs.FlaCuda.csproj @@ -0,0 +1,73 @@ + + + Debug + AnyCPU + 8.0.50727 + 2.0 + {DFE50673-906C-4B8F-993B-A24CAD1CA17D} + Library + Properties + CUETools.Codecs.FlaCuda + CUETools.Codecs.FlaCuda + + + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + true + + + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + true + + + + False + ..\..\CUDANET\lib\CUDA.NET.dll + + + + + + + + + + + + {082D6B9E-326E-4D15-9798-EDAE9EDE70A6} + CUETools.Codecs.FLAKE + + + {6458A13A-30EF-45A9-9D58-E5031B17BEE2} + CUETools.Codecs + + + + + PreserveNewest + "%24%28CUDA_BIN_PATH%29\nvcc.exe" flacuda.cu --cubin -cbin "%24%28VCInstallDir%29bin" + + + + + + nvcc flacuda.cu --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include" + + + \ No newline at end of file diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs new file mode 100644 index 0000000..b6e5a14 --- /dev/null +++ b/CUETools.FlaCuda/FlaCudaWriter.cs @@ -0,0 +1,2003 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Security.Cryptography; +using System.Text; +using System.Runtime.InteropServices; +using CUETools.Codecs; +using CUETools.Codecs.FLAKE; +using GASS.CUDA; +using GASS.CUDA.Types; + +namespace CUETools.Codecs.FlaCuda +{ + public class FlaCudaWriter : IAudioDest + { + Stream _IO = null; + string _path; + long _position; + + // number of audio channels + // valid values are 1 to 8 + int channels, ch_code; + + // audio sample rate in Hz + int sample_rate, sr_code0, sr_code1; + + // sample size in bits + // only 16-bit is currently supported + uint bits_per_sample; + int bps_code; + + // total stream samples + // if 0, stream length is unknown + int sample_count; + + FlakeEncodeParams eparams; + + // maximum frame size in bytes + // this can be used to allocate memory for output + int max_frame_size; + + byte[] frame_buffer = null; + + int frame_count = 0; + + long first_frame_offset = 0; + + TimeSpan _userProcessorTime; + + // header bytes + // allocated by flake_encode_init and freed by flake_encode_close + byte[] header; + + int[] samplesBuffer; + int[] verifyBuffer; + int[] residualBuffer; + float[] windowBuffer; + int[] autocorBuffer3int; + float[] autocorBuffer; + int samplesInBuffer = 0; + + int _compressionLevel = 7; + int _blocksize = 0; + int _totalSize = 0; + int _windowsize = 0, _windowcount = 0; + + Crc8 crc8; + Crc16 crc16; + MD5 md5; + + FlacFrame frame; + FlakeReader verify; + + SeekPoint[] seek_table; + int seek_table_offset = -1; + + bool inited = false; + + CUDA cuda; + CUfunction cudaComputeAutocor; + CUfunction cudaEncodeResidual; + CUdeviceptr cudaSamples; + CUdeviceptr cudaWindow; + CUdeviceptr cudaAutocor; + CUdeviceptr cudaCoeffs; + CUdeviceptr cudaShifts; + + public const int MAX_BLOCKSIZE = 8192; + + public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO) + { + if (bitsPerSample != 16) + throw new Exception("Bits per sample must be 16."); + if (channelCount != 2) + throw new Exception("ChannelCount must be 2."); + + channels = channelCount; + sample_rate = sampleRate; + bits_per_sample = (uint) bitsPerSample; + + // flake_validate_params + + _path = path; + _IO = IO; + + samplesBuffer = 
new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)]; + residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)]; + windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS]; + autocorBuffer3int = new int[(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * 64]; + autocorBuffer = new float[(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * 22]; + + eparams.flake_set_defaults(_compressionLevel); + eparams.padding_size = 8192; + + crc8 = new Crc8(); + crc16 = new Crc16(); + frame = new FlacFrame(channels * 2); + } + + public int TotalSize + { + get + { + return _totalSize; + } + } + + public int PaddingLength + { + get + { + return eparams.padding_size; + } + set + { + eparams.padding_size = value; + } + } + + public int CompressionLevel + { + get + { + return _compressionLevel; + } + set + { + if (value < 0 || value > 11) + throw new Exception("unsupported compression level"); + _compressionLevel = value; + eparams.flake_set_defaults(_compressionLevel); + } + } + + //[DllImport("kernel32.dll")] + //static extern bool GetThreadTimes(IntPtr hThread, out long lpCreationTime, out long lpExitTime, out long lpKernelTime, out long lpUserTime); + //[DllImport("kernel32.dll")] + //static extern IntPtr GetCurrentThread(); + + void DoClose() + { + if (inited) + { + while (samplesInBuffer > 0) + { + eparams.block_size = samplesInBuffer; + output_frame(); + } + + if (_IO.CanSeek) + { + if (md5 != null) + { + md5.TransformFinalBlock(frame_buffer, 0, 0); + _IO.Position = 26; + _IO.Write(md5.Hash, 0, md5.Hash.Length); + } + + if (seek_table != null) + { + _IO.Position = seek_table_offset; + int len = write_seekpoints(header, 0, 0); + _IO.Write(header, 4, len - 4); + } + } + _IO.Close(); + + cuda.Free(cudaWindow); + cuda.Free(cudaSamples); + cuda.Free(cudaAutocor); + cuda.Free(cudaCoeffs); + cuda.Free(cudaShifts); + cuda.Dispose(); + inited = false; + } + + //long fake, KernelStart, UserStart; + //GetThreadTimes(GetCurrentThread(), out fake, out fake, out KernelStart, out UserStart); + //_userProcessorTime = new TimeSpan(UserStart); + } + + public void Close() + { + DoClose(); + if (sample_count != 0 && _position != sample_count) + throw new Exception("Samples written differs from the expected sample count."); + } + + public void Delete() + { + if (inited) + { + _IO.Close(); + cuda.Free(cudaWindow); + cuda.Free(cudaSamples); + cuda.Free(cudaAutocor); + cuda.Free(cudaCoeffs); + cuda.Free(cudaShifts); + cuda.Dispose(); + inited = false; + } + + if (_path != "") + File.Delete(_path); + } + + public long Position + { + get + { + return _position; + } + } + + public long FinalSampleCount + { + set { sample_count = (int)value; } + } + + public long BlockSize + { + set { _blocksize = (int)value; } + get { return _blocksize == 0 ? 
eparams.block_size : _blocksize; } + } + + public OrderMethod OrderMethod + { + get { return eparams.order_method; } + set { eparams.order_method = value; } + } + + public PredictionType PredictionType + { + get { return eparams.prediction_type; } + set { eparams.prediction_type = value; } + } + + public StereoMethod StereoMethod + { + get { return eparams.stereo_method; } + set { eparams.stereo_method = value; } + } + + public int MinPrecisionSearch + { + get { return eparams.lpc_min_precision_search; } + set + { + if (value < 0 || value > eparams.lpc_max_precision_search) + throw new Exception("unsupported MinPrecisionSearch value"); + eparams.lpc_min_precision_search = value; + } + } + + public int MaxPrecisionSearch + { + get { return eparams.lpc_max_precision_search; } + set + { + if (value < eparams.lpc_min_precision_search || value >= lpc.MAX_LPC_PRECISIONS) + throw new Exception("unsupported MaxPrecisionSearch value"); + eparams.lpc_max_precision_search = value; + } + } + + public WindowFunction WindowFunction + { + get { return eparams.window_function; } + set { eparams.window_function = value; } + } + + public bool DoMD5 + { + get { return eparams.do_md5; } + set { eparams.do_md5 = value; } + } + + public bool DoVerify + { + get { return eparams.do_verify; } + set { eparams.do_verify = value; } + } + + public bool DoSeekTable + { + get { return eparams.do_seektable; } + set { eparams.do_seektable = value; } + } + + public int VBRMode + { + get { return eparams.variable_block_size; } + set { eparams.variable_block_size = value; } + } + + public int MinPredictionOrder + { + get + { + return PredictionType == PredictionType.Fixed ? + MinFixedOrder : MinLPCOrder; + } + set + { + if (PredictionType == PredictionType.Fixed) + MinFixedOrder = value; + else + MinLPCOrder = value; + } + } + + public int MaxPredictionOrder + { + get + { + return PredictionType == PredictionType.Fixed ? 
+ MaxFixedOrder : MaxLPCOrder; + } + set + { + if (PredictionType == PredictionType.Fixed) + MaxFixedOrder = value; + else + MaxLPCOrder = value; + } + } + + public int MinLPCOrder + { + get + { + return eparams.min_prediction_order; + } + set + { + if (value < 1 || value > eparams.max_prediction_order) + throw new Exception("invalid MinLPCOrder " + value.ToString()); + eparams.min_prediction_order = value; + } + } + + public int MaxLPCOrder + { + get + { + return eparams.max_prediction_order; + } + set + { + if (value > lpc.MAX_LPC_ORDER || value < eparams.min_prediction_order) + throw new Exception("invalid MaxLPCOrder " + value.ToString()); + eparams.max_prediction_order = value; + } + } + + public int EstimationDepth + { + get + { + return eparams.estimation_depth; + } + set + { + if (value > 32 || value < 1) + throw new Exception("invalid estimation_depth " + value.ToString()); + eparams.estimation_depth = value; + } + } + + public int MinFixedOrder + { + get + { + return eparams.min_fixed_order; + } + set + { + if (value < 0 || value > eparams.max_fixed_order) + throw new Exception("invalid MinFixedOrder " + value.ToString()); + eparams.min_fixed_order = value; + } + } + + public int MaxFixedOrder + { + get + { + return eparams.max_fixed_order; + } + set + { + if (value > 4 || value < eparams.min_fixed_order) + throw new Exception("invalid MaxFixedOrder " + value.ToString()); + eparams.max_fixed_order = value; + } + } + + public int MinPartitionOrder + { + get { return eparams.min_partition_order; } + set + { + if (value < 0 || value > eparams.max_partition_order) + throw new Exception("invalid MinPartitionOrder " + value.ToString()); + eparams.min_partition_order = value; + } + } + + public int MaxPartitionOrder + { + get { return eparams.max_partition_order; } + set + { + if (value > 8 || value < eparams.min_partition_order) + throw new Exception("invalid MaxPartitionOrder " + value.ToString()); + eparams.max_partition_order = value; + } + } + + public TimeSpan UserProcessorTime + { + get { return _userProcessorTime; } + } + + public int BitsPerSample + { + get { return 16; } + } + + unsafe uint get_wasted_bits(int* signal, int samples) + { + int i, shift; + int x = 0; + + for (i = 0; i < samples && 0 == (x & 1); i++) + x |= signal[i]; + + if (x == 0) + { + shift = 0; + } + else + { + for (shift = 0; 0 == (x & 1); shift++) + x >>= 1; + } + + if (shift > 0) + { + for (i = 0; i < samples; i++) + signal[i] >>= shift; + } + + return (uint)shift; + } + + /// + /// Copy channel-interleaved input samples into separate subframes + /// + /// + /// + /// + unsafe void copy_samples(int[,] samples, int pos, int block) + { + fixed (int* fsamples = samplesBuffer, src = &samples[pos, 0]) + { + if (channels == 2) + AudioSamples.Deinterlace(fsamples + samplesInBuffer, fsamples + FlaCudaWriter.MAX_BLOCKSIZE + samplesInBuffer, src, block); + else + for (int ch = 0; ch < channels; ch++) + { + int* psamples = fsamples + ch * FlaCudaWriter.MAX_BLOCKSIZE + samplesInBuffer; + for (int i = 0; i < block; i++) + psamples[i] = src[i * channels + ch]; + } + } + samplesInBuffer += block; + } + + static uint rice_encode_count(uint sum, uint n, uint k) + { + return n*(k+1) + ((sum-(n>>1))>>(int)k); + } + + //static unsafe uint find_optimal_rice_param(uint sum, uint n) + //{ + // uint* nbits = stackalloc uint[Flake.MAX_RICE_PARAM + 1]; + // int k_opt = 0; + + // nbits[0] = UINT32_MAX; + // for (int k = 0; k <= Flake.MAX_RICE_PARAM; k++) + // { + // nbits[k] = rice_encode_count(sum, n, (uint)k); + // if (nbits[k] < 
nbits[k_opt]) + // k_opt = k; + // } + // return (uint)k_opt; + //} + + static unsafe int find_optimal_rice_param(uint sum, uint n, out uint nbits_best) + { + int k_opt = 0; + uint a = n; + uint b = sum - (n >> 1); + uint nbits = a + b; + for (int k = 1; k <= Flake.MAX_RICE_PARAM; k++) + { + a += n; + b >>= 1; + uint nbits_k = a + b; + if (nbits_k < nbits) + { + k_opt = k; + nbits = nbits_k; + } + } + nbits_best = nbits; + return k_opt; + } + + unsafe uint calc_decorr_score(FlacFrame frame, int ch) + { + int* s = frame.subframes[ch].samples; + int n = frame.blocksize; + ulong sum = 0; + for (int i = 2; i < n; i++) + sum += (ulong)Math.Abs(s[i] - 2 * s[i - 1] + s[i - 2]); + uint nbits; + find_optimal_rice_param((uint)(2 * sum), (uint)n, out nbits); + return nbits; + } + + unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int blocksize) + { + for (int i = 0; i < blocksize; i++) + { + leftM[i] = (leftS[i] + rightS[i]) >> 1; + rightM[i] = leftS[i] - rightS[i]; + } + } + + unsafe void encode_residual_verbatim(int* res, int* smp, uint n) + { + AudioSamples.MemCpy(res, smp, (int) n); + } + + unsafe void encode_residual_fixed(int* res, int* smp, int n, int order) + { + int i; + int s0, s1, s2; + switch (order) + { + case 0: + AudioSamples.MemCpy(res, smp, n); + return; + case 1: + *(res++) = s1 = *(smp++); + for (i = n - 1; i > 0; i--) + { + s0 = *(smp++); + *(res++) = s0 - s1; + s1 = s0; + } + return; + case 2: + *(res++) = s2 = *(smp++); + *(res++) = s1 = *(smp++); + for (i = n - 2; i > 0; i--) + { + s0 = *(smp++); + *(res++) = s0 - 2 * s1 + s2; + s2 = s1; + s1 = s0; + } + return; + case 3: + res[0] = smp[0]; + res[1] = smp[1]; + res[2] = smp[2]; + for (i = 3; i < n; i++) + { + res[i] = smp[i] - 3 * smp[i - 1] + 3 * smp[i - 2] - smp[i - 3]; + } + return; + case 4: + res[0] = smp[0]; + res[1] = smp[1]; + res[2] = smp[2]; + res[3] = smp[3]; + for (i = 4; i < n; i++) + { + res[i] = smp[i] - 4 * smp[i - 1] + 6 * smp[i - 2] - 4 * smp[i - 3] + smp[i - 4]; + } + return; + default: + return; + } + } + + static unsafe uint calc_optimal_rice_params(ref RiceContext rc, int porder, uint* sums, uint n, uint pred_order) + { + uint part = (1U << porder); + uint all_bits = 0; + rc.rparams[0] = find_optimal_rice_param(sums[0], (n >> porder) - pred_order, out all_bits); + uint cnt = (n >> porder); + for (uint i = 1; i < part; i++) + { + uint nbits; + rc.rparams[i] = find_optimal_rice_param(sums[i], cnt, out nbits); + all_bits += nbits; + } + all_bits += (4 * part); + rc.porder = porder; + return all_bits; + } + + static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) + { + // sums for highest level + int parts = (1 << pmax); + uint* res = data + pred_order; + uint cnt = (n >> pmax) - pred_order; + uint sum = 0; + for (uint j = cnt; j > 0; j--) + sum += *(res++); + sums[pmax * Flake.MAX_PARTITIONS + 0] = sum; + cnt = (n >> pmax); + for (int i = 1; i < parts; i++) + { + sum = 0; + for (uint j = cnt; j > 0; j--) + sum += *(res++); + sums[pmax * Flake.MAX_PARTITIONS + i] = sum; + } + // sums for lower levels + for (int i = pmax - 1; i >= pmin; i--) + { + parts = (1 << i); + for (int j = 0; j < parts; j++) + { + sums[i * Flake.MAX_PARTITIONS + j] = + sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j] + + sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j + 1]; + } + } + } + + static unsafe uint calc_rice_params(ref RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order) + { + RiceContext tmp_rc = new RiceContext(), tmp_rc2; + uint* 
udata = stackalloc uint[(int)n]; + uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS]; + //uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER]; + + //assert(pmin >= 0 && pmin <= Flake.MAX_PARTITION_ORDER); + //assert(pmax >= 0 && pmax <= Flake.MAX_PARTITION_ORDER); + //assert(pmin <= pmax); + + for (uint i = 0; i < n; i++) + udata[i] = (uint) ((2 * data[i]) ^ (data[i] >> 31)); + + calc_sums(pmin, pmax, udata, n, pred_order, sums); + + int opt_porder = pmin; + uint opt_bits = AudioSamples.UINT32_MAX; + for (int i = pmin; i <= pmax; i++) + { + uint bits = calc_optimal_rice_params(ref tmp_rc, i, sums + i * Flake.MAX_PARTITIONS, n, pred_order); + if (bits <= opt_bits) + { + opt_porder = i; + opt_bits = bits; + tmp_rc2 = rc; + rc = tmp_rc; + tmp_rc = tmp_rc2; + } + } + + return opt_bits; + } + + static int get_max_p_order(int max_porder, int n, int order) + { + int porder = Math.Min(max_porder, BitReader.log2i(n ^ (n - 1))); + if (order > 0) + porder = Math.Min(porder, BitReader.log2i(n / order)); + return porder; + } + + static unsafe uint calc_rice_params_fixed(ref RiceContext rc, int pmin, int pmax, + int* data, int n, int pred_order, uint bps) + { + pmin = get_max_p_order(pmin, n, pred_order); + pmax = get_max_p_order(pmax, n, pred_order); + uint bits = (uint)pred_order * bps + 6; + bits += calc_rice_params(ref rc, pmin, pmax, data, (uint)n, (uint)pred_order); + return bits; + } + + static unsafe uint calc_rice_params_lpc(ref RiceContext rc, int pmin, int pmax, + int* data, int n, int pred_order, uint bps, uint precision) + { + pmin = get_max_p_order(pmin, n, pred_order); + pmax = get_max_p_order(pmax, n, pred_order); + uint bits = (uint)pred_order * bps + 4 + 5 + (uint)pred_order * precision + 6; + bits += calc_rice_params(ref rc, pmin, pmax, data, (uint)n, (uint)pred_order); + return bits; + } + + // select LPC precision based on block size + static uint get_precision(int blocksize) + { + uint lpc_precision; + if (blocksize <= 192) lpc_precision = 7U; + else if (blocksize <= 384) lpc_precision = 8U; + else if (blocksize <= 576) lpc_precision = 9U; + else if (blocksize <= 1152) lpc_precision = 10U; + else if (blocksize <= 2304) lpc_precision = 11U; + else if (blocksize <= 4608) lpc_precision = 12U; + else if (blocksize <= 8192) lpc_precision = 13U; + else if (blocksize <= 16384) lpc_precision = 14U; + else lpc_precision = 15; + return lpc_precision; + } + + unsafe void encode_residual_lpc_sub(FlacFrame frame, float * lpcs, int iWindow, int order, int ch) + { + // select LPC precision based on block size + uint lpc_precision = get_precision(frame.blocksize); + + for (int i_precision = eparams.lpc_min_precision_search; i_precision <= eparams.lpc_max_precision_search && lpc_precision + i_precision < 16; i_precision++) + // check if we already calculated with this order, window and precision + if ((frame.subframes[ch].lpc_ctx[iWindow].done_lpcs[i_precision] & (1U << (order - 1))) == 0) + { + frame.subframes[ch].lpc_ctx[iWindow].done_lpcs[i_precision] |= (1U << (order - 1)); + + uint cbits = lpc_precision + (uint)i_precision; + + frame.current.type = SubframeType.LPC; + frame.current.order = order; + frame.current.window = iWindow; + + fixed (int* coefs = frame.current.coefs) + { + lpc.quantize_lpc_coefs(lpcs + (frame.current.order - 1) * lpc.MAX_LPC_ORDER, + frame.current.order, cbits, coefs, out frame.current.shift, 15, 0); + + if (frame.current.shift < 0 || frame.current.shift > 15) + throw new Exception("negative shift"); + + ulong csum = 0; + for (int i = 
frame.current.order; i > 0; i--) + csum += (ulong)Math.Abs(coefs[i - 1]); + + if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32) + lpc.encode_residual_long(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift); + else + lpc.encode_residual(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift); + + } + + frame.current.size = calc_rice_params_lpc(ref frame.current.rc, eparams.min_partition_order, eparams.max_partition_order, + frame.current.residual, frame.blocksize, frame.current.order, frame.subframes[ch].obits, cbits); + + frame.ChooseBestSubframe(ch); + } + } + + unsafe void encode_residual_fixed_sub(FlacFrame frame, int order, int ch) + { + if ((frame.subframes[ch].done_fixed & (1U << order)) != 0) + return; // already calculated; + + frame.current.order = order; + frame.current.type = SubframeType.Fixed; + + encode_residual_fixed(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order); + + frame.current.size = calc_rice_params_fixed(ref frame.current.rc, eparams.min_partition_order, eparams.max_partition_order, + frame.current.residual, frame.blocksize, frame.current.order, frame.subframes[ch].obits); + + frame.subframes[ch].done_fixed |= (1U << order); + + frame.ChooseBestSubframe(ch); + } + + unsafe void encode_residual(FlacFrame frame, int ch, PredictionType predict, OrderMethod omethod, int pass) + { + int* smp = frame.subframes[ch].samples; + int i, n = frame.blocksize; + // save best.window, because we can overwrite it later with fixed frame + int best_window = frame.subframes[ch].best.type == SubframeType.LPC ? frame.subframes[ch].best.window : -1; + + // CONSTANT + for (i = 1; i < n; i++) + { + if (smp[i] != smp[0]) break; + } + if (i == n) + { + frame.subframes[ch].best.type = SubframeType.Constant; + frame.subframes[ch].best.residual[0] = smp[0]; + frame.subframes[ch].best.size = frame.subframes[ch].obits; + return; + } + + // VERBATIM + frame.current.type = SubframeType.Verbatim; + frame.current.size = frame.subframes[ch].obits * (uint)frame.blocksize; + frame.ChooseBestSubframe(ch); + + if (n < 5 || predict == PredictionType.None) + return; + + // FIXED + if (predict == PredictionType.Fixed || + (predict == PredictionType.Search && pass != 1) || + //predict == PredictionType.Search || + //(pass == 2 && frame.subframes[ch].best.type == SubframeType.Fixed) || + n <= eparams.max_prediction_order) + { + int max_fixed_order = Math.Min(eparams.max_fixed_order, 4); + int min_fixed_order = Math.Min(eparams.min_fixed_order, max_fixed_order); + + for (i = min_fixed_order; i <= max_fixed_order; i++) + encode_residual_fixed_sub(frame, i, ch); + } + + // LPC + if (n > eparams.max_prediction_order && + (predict == PredictionType.Levinson || + predict == PredictionType.Search) + //predict == PredictionType.Search || + //(pass == 2 && frame.subframes[ch].best.type == SubframeType.LPC)) + ) + { + float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER]; + int min_order = eparams.min_prediction_order; + int max_order = eparams.max_prediction_order; + + for (int iWindow = 0; iWindow < _windowcount; iWindow++) + { + if (pass == 2 && iWindow != best_window) + continue; + + LpcContext lpc_ctx = frame.subframes[ch].lpc_ctx[iWindow]; + + lpc_ctx.GetReflection(max_order, smp, n, frame.window_buffer + iWindow * FlaCudaWriter.MAX_BLOCKSIZE * 2); + lpc_ctx.ComputeLPC(lpcs); + + switch (omethod) + { + case 
OrderMethod.Max: + // always use maximum order + encode_residual_lpc_sub(frame, lpcs, iWindow, max_order, ch); + break; + case OrderMethod.Estimate: + // estimated orders + // Search at reflection coeff thresholds (where they cross 0.10) + { + int found = 0; + for (i = max_order; i >= min_order && found < eparams.estimation_depth; i--) + if (lpc_ctx.IsInterestingOrder(i, max_order)) + { + encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch); + found++; + } + if (0 == found) + encode_residual_lpc_sub(frame, lpcs, iWindow, min_order, ch); + } + break; + case OrderMethod.EstSearch2: + // Search at reflection coeff thresholds (where they cross 0.10) + { + int found = 0; + for (i = min_order; i <= max_order && found < eparams.estimation_depth; i++) + if (lpc_ctx.IsInterestingOrder(i)) + { + encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch); + found++; + } + if (0 == found) + encode_residual_lpc_sub(frame, lpcs, iWindow, min_order, ch); + } + break; + case OrderMethod.Search: + // brute-force optimal order search + for (i = max_order; i >= min_order; i--) + encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch); + break; + case OrderMethod.LogFast: + // Try max, est, 32,16,8,4,2,1 + encode_residual_lpc_sub(frame, lpcs, iWindow, max_order, ch); + for (i = lpc.MAX_LPC_ORDER; i >= min_order; i >>= 1) + if (i < max_order) + encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch); + break; + case OrderMethod.LogSearch: + // do LogFast first + encode_residual_lpc_sub(frame, lpcs, iWindow, max_order, ch); + for (i = lpc.MAX_LPC_ORDER; i >= min_order; i >>= 1) + if (i < max_order) + encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch); + // if found a good order, try to search around it + if (frame.subframes[ch].best.type == SubframeType.LPC) + { + // log search (written by Michael Niedermayer for FFmpeg) + for (int step = lpc.MAX_LPC_ORDER; step > 0; step >>= 1) + { + int last = frame.subframes[ch].best.order; + if (step <= (last + 1) / 2) + for (i = last - step; i <= last + step; i += step) + if (i >= min_order && i <= max_order) + encode_residual_lpc_sub(frame, lpcs, iWindow, i, ch); + } + } + break; + default: + throw new Exception("unknown ordermethod"); + } + } + } + } + + unsafe void encode_selected_residual(FlacFrame frame, int ch, PredictionType predict, OrderMethod omethod, int best_window, int best_order) + { + int* smp = frame.subframes[ch].samples; + int i, n = frame.blocksize; + + // CONSTANT + for (i = 1; i < n; i++) + { + if (smp[i] != smp[0]) break; + } + if (i == n) + { + frame.subframes[ch].best.type = SubframeType.Constant; + frame.subframes[ch].best.residual[0] = smp[0]; + frame.subframes[ch].best.size = frame.subframes[ch].obits; + return; + } + + // VERBATIM + frame.current.type = SubframeType.Verbatim; + frame.current.size = frame.subframes[ch].obits * (uint)frame.blocksize; + frame.ChooseBestSubframe(ch); + + if (n < 5 || predict == PredictionType.None) + return; + + // FIXED + if (predict == PredictionType.Fixed || + (predict == PredictionType.Search) || + n <= eparams.max_prediction_order) + { + int max_fixed_order = Math.Min(eparams.max_fixed_order, 4); + int min_fixed_order = Math.Min(eparams.min_fixed_order, max_fixed_order); + + for (i = min_fixed_order; i <= max_fixed_order; i++) + encode_residual_fixed_sub(frame, i, ch); + } + + // LPC + if (n > eparams.max_prediction_order && + (predict == PredictionType.Levinson || + predict == PredictionType.Search) + ) + { + float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER]; + LpcContext lpc_ctx = 
frame.subframes[ch].lpc_ctx[best_window]; + + lpc_ctx.GetReflection(eparams.max_prediction_order, smp, n, frame.window_buffer + best_window * FlaCudaWriter.MAX_BLOCKSIZE * 2); + lpc_ctx.ComputeLPC(lpcs); + + encode_residual_lpc_sub(frame, lpcs, best_window, best_order, ch); + } + } + + unsafe void output_frame_header(FlacFrame frame, BitWriter bitwriter) + { + bitwriter.writebits(15, 0x7FFC); + bitwriter.writebits(1, eparams.variable_block_size > 0 ? 1 : 0); + bitwriter.writebits(4, frame.bs_code0); + bitwriter.writebits(4, sr_code0); + if (frame.ch_mode == ChannelMode.NotStereo) + bitwriter.writebits(4, ch_code); + else + bitwriter.writebits(4, (int) frame.ch_mode); + bitwriter.writebits(3, bps_code); + bitwriter.writebits(1, 0); + bitwriter.write_utf8(frame_count); + + // custom block size + if (frame.bs_code1 >= 0) + { + if (frame.bs_code1 < 256) + bitwriter.writebits(8, frame.bs_code1); + else + bitwriter.writebits(16, frame.bs_code1); + } + + // custom sample rate + if (sr_code1 > 0) + { + if (sr_code1 < 256) + bitwriter.writebits(8, sr_code1); + else + bitwriter.writebits(16, sr_code1); + } + + // CRC-8 of frame header + bitwriter.flush(); + byte crc = crc8.ComputeChecksum(frame_buffer, 0, bitwriter.Length); + bitwriter.writebits(8, crc); + } + + unsafe void output_residual(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) + { + // rice-encoded block + bitwriter.writebits(2, 0); + + // partition order + int porder = sub.best.rc.porder; + int psize = frame.blocksize >> porder; + //assert(porder >= 0); + bitwriter.writebits(4, porder); + int res_cnt = psize - sub.best.order; + + // residual + int j = sub.best.order; + for (int p = 0; p < (1 << porder); p++) + { + int k = sub.best.rc.rparams[p]; + bitwriter.writebits(4, k); + if (p == 1) res_cnt = psize; + int cnt = Math.Min(res_cnt, frame.blocksize - j); + bitwriter.write_rice_block_signed(k, sub.best.residual + j, cnt); + j += cnt; + } + } + + unsafe void + output_subframe_constant(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) + { + bitwriter.writebits_signed(sub.obits, sub.best.residual[0]); + } + + unsafe void + output_subframe_verbatim(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) + { + int n = frame.blocksize; + for (int i = 0; i < n; i++) + bitwriter.writebits_signed(sub.obits, sub.samples[i]); + // Don't use residual here, because we don't copy samples to residual for verbatim frames. 
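+			// A verbatim subframe stores every sample raw at sub.obits bits; that is why
+			// encode_residual estimates its cost as obits * blocksize before trying predictors.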
+ } + + unsafe void + output_subframe_fixed(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) + { + // warm-up samples + for (int i = 0; i < sub.best.order; i++) + bitwriter.writebits_signed(sub.obits, sub.samples[i]); + + // residual + output_residual(frame, bitwriter, sub); + } + + unsafe void + output_subframe_lpc(FlacFrame frame, BitWriter bitwriter, FlacSubframeInfo sub) + { + // warm-up samples + for (int i = 0; i < sub.best.order; i++) + bitwriter.writebits_signed(sub.obits, sub.samples[i]); + + // LPC coefficients + int cbits = 1; + for (int i = 0; i < sub.best.order; i++) + while (cbits < 16 && sub.best.coefs[i] != (sub.best.coefs[i] << (32 - cbits)) >> (32 - cbits)) + cbits++; + bitwriter.writebits(4, cbits - 1); + bitwriter.writebits_signed(5, sub.best.shift); + for (int i = 0; i < sub.best.order; i++) + bitwriter.writebits_signed(cbits, sub.best.coefs[i]); + + // residual + output_residual(frame, bitwriter, sub); + } + + unsafe void output_subframes(FlacFrame frame, BitWriter bitwriter) + { + for (int ch = 0; ch < channels; ch++) + { + FlacSubframeInfo sub = frame.subframes[ch]; + // subframe header + int type_code = (int) sub.best.type; + if (sub.best.type == SubframeType.Fixed) + type_code |= sub.best.order; + if (sub.best.type == SubframeType.LPC) + type_code |= sub.best.order - 1; + bitwriter.writebits(1, 0); + bitwriter.writebits(6, type_code); + bitwriter.writebits(1, sub.wbits != 0 ? 1 : 0); + if (sub.wbits > 0) + bitwriter.writebits((int)sub.wbits, 1); + + // subframe + switch (sub.best.type) + { + case SubframeType.Constant: + output_subframe_constant(frame, bitwriter, sub); + break; + case SubframeType.Verbatim: + output_subframe_verbatim(frame, bitwriter, sub); + break; + case SubframeType.Fixed: + output_subframe_fixed(frame, bitwriter, sub); + break; + case SubframeType.LPC: + output_subframe_lpc(frame, bitwriter, sub); + break; + } + } + } + + void output_frame_footer(BitWriter bitwriter) + { + bitwriter.flush(); + ushort crc = crc16.ComputeChecksum(frame_buffer, 0, bitwriter.Length); + bitwriter.writebits(16, crc); + bitwriter.flush(); + } + + unsafe void encode_residual_pass1(FlacFrame frame, int ch) + { + int max_prediction_order = eparams.max_prediction_order; + int max_fixed_order = eparams.max_fixed_order; + int min_fixed_order = eparams.min_fixed_order; + int lpc_min_precision_search = eparams.lpc_min_precision_search; + int lpc_max_precision_search = eparams.lpc_max_precision_search; + int max_partition_order = eparams.max_partition_order; + int estimation_depth = eparams.estimation_depth; + eparams.min_fixed_order = 2; + eparams.max_fixed_order = 2; + eparams.lpc_min_precision_search = eparams.lpc_max_precision_search; + eparams.max_prediction_order = 8; + eparams.estimation_depth = 1; + encode_residual(frame, ch, eparams.prediction_type, OrderMethod.Estimate, 1); + eparams.min_fixed_order = min_fixed_order; + eparams.max_fixed_order = max_fixed_order; + eparams.max_prediction_order = max_prediction_order; + eparams.lpc_min_precision_search = lpc_min_precision_search; + eparams.lpc_max_precision_search = lpc_max_precision_search; + eparams.max_partition_order = max_partition_order; + eparams.estimation_depth = estimation_depth; + } + + unsafe void encode_residual_pass2(FlacFrame frame, int ch) + { + encode_residual(frame, ch, eparams.prediction_type, eparams.order_method, 2); + } + + unsafe void encode_residual_onepass(FlacFrame frame, int ch) + { + if (_windowcount > 1) + { + encode_residual_pass1(frame, ch); + encode_residual_pass2(frame, ch); 
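+				// Pass 1 above runs a deliberately cheap search (fixed order 2 only, LPC order
+				// capped at 8, a single coefficient precision) just to pick the most promising
+				// window per channel; pass 2 then repeats the full search restricted to that window.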
+ } else + encode_residual(frame, ch, eparams.prediction_type, eparams.order_method, 0); + } + + unsafe uint measure_frame_size(FlacFrame frame, bool do_midside) + { + // crude estimation of header/footer size + uint total = (uint)(32 + ((BitReader.log2i(frame_count) + 4) / 5) * 8 + (eparams.variable_block_size != 0 ? 16 : 0) + 16); + + if (do_midside) + { + uint bitsBest = AudioSamples.UINT32_MAX; + ChannelMode modeBest = ChannelMode.LeftRight; + + if (bitsBest > frame.subframes[2].best.size + frame.subframes[3].best.size) + { + bitsBest = frame.subframes[2].best.size + frame.subframes[3].best.size; + modeBest = ChannelMode.MidSide; + } + if (bitsBest > frame.subframes[3].best.size + frame.subframes[1].best.size) + { + bitsBest = frame.subframes[3].best.size + frame.subframes[1].best.size; + modeBest = ChannelMode.RightSide; + } + if (bitsBest > frame.subframes[3].best.size + frame.subframes[0].best.size) + { + bitsBest = frame.subframes[3].best.size + frame.subframes[0].best.size; + modeBest = ChannelMode.LeftSide; + } + if (bitsBest > frame.subframes[0].best.size + frame.subframes[1].best.size) + { + bitsBest = frame.subframes[0].best.size + frame.subframes[1].best.size; + modeBest = ChannelMode.LeftRight; + } + frame.ch_mode = modeBest; + return total + bitsBest; + } + + for (int ch = 0; ch < channels; ch++) + total += frame.subframes[ch].best.size; + return total; + } + + unsafe delegate void window_function(float* window, int size); + + unsafe void calculate_window(float* window, window_function func, WindowFunction flag) + { + if ((eparams.window_function & flag) == 0 || _windowcount == lpc.MAX_LPC_WINDOWS) + return; + int sz = _windowsize; + float* pos = window + _windowcount * FlaCudaWriter.MAX_BLOCKSIZE * 2; + do + { + func(pos, sz); + if ((sz & 1) != 0) + break; + pos += sz; + sz >>= 1; + } while (sz >= 32); + _windowcount++; + } + + unsafe int encode_frame(out int size) + { + fixed (int* s = samplesBuffer, r = residualBuffer) + fixed (float* window = windowBuffer) + { + frame.InitSize(eparams.block_size, eparams.variable_block_size != 0); + + if (frame.blocksize != _windowsize && frame.blocksize > 4) + { + _windowsize = frame.blocksize; + _windowcount = 0; + calculate_window(window, lpc.window_welch, WindowFunction.Welch); + calculate_window(window, lpc.window_tukey, WindowFunction.Tukey); + calculate_window(window, lpc.window_hann, WindowFunction.Hann); + calculate_window(window, lpc.window_flattop, WindowFunction.Flattop); + calculate_window(window, lpc.window_bartlett, WindowFunction.Bartlett); + if (_windowcount == 0) + throw new Exception("invalid windowfunction"); + cuda.CopyHostToDevice(cudaWindow, windowBuffer); + } + + if (channels != 2 || frame.blocksize <= 32 || eparams.stereo_method == StereoMethod.Independent) + { + frame.window_buffer = window; + frame.current.residual = r + channels * FlaCudaWriter.MAX_BLOCKSIZE; + frame.ch_mode = channels != 2 ? 
ChannelMode.NotStereo : ChannelMode.LeftRight; + for (int ch = 0; ch < channels; ch++) + frame.subframes[ch].Init(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, + bits_per_sample, get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize)); + + for (int ch = 0; ch < channels; ch++) + encode_residual_onepass(frame, ch); + } + else + { + channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE, s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize); + frame.window_buffer = window; + frame.current.residual = r + 4 * FlaCudaWriter.MAX_BLOCKSIZE; + for (int ch = 0; ch < 4; ch++) + frame.subframes[ch].Init(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, + bits_per_sample + (ch == 3 ? 1U : 0U), get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize)); + + int orders = 8; + while (orders < eparams.max_prediction_order + 1 && orders < 32) + orders <<= 1; + int threads = 512; + int threads_x = orders; + int threads_y = threads / threads_x; + int blocks_y = ((threads_x - 1) * (threads_y - 1)) / threads_y; + int blocks = (frame.blocksize + blocks_y * threads_y - 1) / (blocks_y * threads_y); + cuda.CopyHostToDevice(cudaSamples, (IntPtr)s, (uint)FlaCudaWriter.MAX_BLOCKSIZE * 4 * sizeof(int)); + cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaSamples.Pointer); + cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaWindow.Pointer); + cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaAutocor.Pointer); + cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3, (uint)frame.blocksize); + cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint), (uint)FlaCudaWriter.MAX_BLOCKSIZE); + cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint) * 2, (uint)(blocks_y*threads_y)); + cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 3) + sizeof(uint) * 3); + cuda.SetFunctionBlockShape(cudaComputeAutocor, threads_x, threads_y, 1); + cuda.SetFunctionSharedSize(cudaComputeAutocor, (uint)(sizeof(float) * 1090));//545)); + cuda.Launch(cudaComputeAutocor, blocks, 4 * _windowcount); + cuda.CopyDeviceToHost(cudaAutocor, autocorBuffer); + //cuda.CopyDeviceToHost(cudaAutocor, autocorBuffer3int); + + int* shift = stackalloc int[lpc.MAX_LPC_ORDER * 4 * _windowcount]; + int* coefs = stackalloc int[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount]; + uint cbits = get_precision(frame.blocksize) + 1; + AudioSamples.MemSet(coefs, 0, lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount); + + for (int ch = 0; ch < 4; ch++) + for (int iWindow = 0; iWindow < _windowcount; iWindow++) + { + double* ac = stackalloc double[lpc.MAX_LPC_ORDER + 1]; + for (int i = 0; i < orders; i++) + { + ac[i] = 0; + for (int i_block = 0; i_block < blocks; i_block++) + ac[i] += autocorBuffer[orders * (i_block + blocks * (ch + 4 * iWindow)) + i]; + } + //double* ac = stackalloc double[33]; + //for (int i = 0; i < 33; i++) + // ac[i] = autocorBuffer3int[(33 * (ch + 4 * iWindow) + i) * 3] * (double)(1 << 18) + + // autocorBuffer3int[(33 * (ch + 4 * iWindow) + i) * 3 + 1] * (double)(1 << 9) + + // autocorBuffer3int[(33 * (ch + 4 * iWindow) + i) * 3 + 2]; + //fixed (float *ac = &autocorBuffer[orders * (ch + 4 * iWindow)]) + frame.subframes[ch].lpc_ctx[iWindow].ComputeReflection(orders - 1, ac); + float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER]; + frame.subframes[ch].lpc_ctx[iWindow].ComputeLPC(lpcs); + for (int order = 0; order < orders - 1; order++) + 
lpc.quantize_lpc_coefs(lpcs + order * lpc.MAX_LPC_ORDER, + order + 1, cbits, coefs + order * lpc.MAX_LPC_ORDER + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER, + out shift[order + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER], 15, 0); + } + + int max_order = Math.Min(orders - 1, eparams.max_prediction_order); + + cuda.CopyHostToDevice(cudaCoeffs, (IntPtr)coefs, (uint)(sizeof(int) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount)); + cuda.CopyHostToDevice(cudaShifts, (IntPtr)shift, (uint)(sizeof(int) * lpc.MAX_LPC_ORDER* 4 * _windowcount)); + //if (frame_count == 0) + { + cuda.SetParameter(cudaEncodeResidual, 0, (uint)cudaAutocor.Pointer); + cuda.SetParameter(cudaEncodeResidual, IntPtr.Size, (uint)cudaSamples.Pointer); + cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 2, (uint)cudaCoeffs.Pointer); + cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 3, (uint)cudaShifts.Pointer); + cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 4, (uint)frame.blocksize); + cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 4 + sizeof(uint), (uint)FlaCudaWriter.MAX_BLOCKSIZE); + cuda.SetParameterSize(cudaEncodeResidual, (uint)(IntPtr.Size * 4) + sizeof(uint) * 2); + cuda.SetFunctionBlockShape(cudaEncodeResidual, threads_x, 256 / threads_x, 1); + } + //cuda.SetFunctionSharedSize(cudaEncodeResidual, (uint)(sizeof(float) * 1090));//545)); + cuda.Launch(cudaEncodeResidual, max_order, 4 * _windowcount); + cuda.CopyDeviceToHost(cudaAutocor, autocorBuffer3int); + + for (int ch = 0; ch < 4; ch++) + { + frame.subframes[ch].best.size = AudioSamples.UINT32_MAX; + for (int iWindow = 0; iWindow < _windowcount; iWindow++) + { + for (int order = 1; order <= max_order; order++) + { + uint nbits = (uint)autocorBuffer3int[order - 1 + (ch + 4 * iWindow) * max_order]; + nbits += (uint)order * frame.subframes[ch].obits + 4 + 5 + (uint)order * cbits + 6; + if (frame.subframes[ch].best.size > nbits) + { + frame.subframes[ch].best.size = nbits; + frame.subframes[ch].best.order = order; + frame.subframes[ch].best.window = iWindow; + frame.subframes[ch].best.type = SubframeType.LPC; + } + } + //uint[] sums_buf = new uint[Flake.MAX_PARTITION_ORDER * Flake.MAX_PARTITIONS]; + //fixed (uint* sums = sums_buf) + // for (int order = 1; order <= max_order; order++) + //{ + // //uint nbits; + // //find_optimal_rice_param(2*(uint)autocorBuffer3int[order - 1 + (ch + 4 * iWindow) * max_order], (uint)frame.blocksize, out nbits); + // //uint nbits = (uint)autocorBuffer3int[order - 1 + (ch + 4 * iWindow) * max_order]; + // //nbits += (uint)order * frame.subframes[ch].obits + 4 + 5 + (uint)order * cbits + 6; + // for (int ip = 0; ip < 64; ip++) + // sums[6 * Flake.MAX_PARTITIONS + ip] = (uint)autocorBuffer3int[64 * (order - 1 + (ch + 4 * iWindow) * max_order) + ip]; + // for (int ip = 5; ip >= 0; ip--) + // { + // int parts = (1 << ip); + // for (int j = 0; j < parts; j++) + // { + // sums[ip * Flake.MAX_PARTITIONS + j] = + // sums[(ip + 1) * Flake.MAX_PARTITIONS + 2 * j] + + // sums[(ip + 1) * Flake.MAX_PARTITIONS + 2 * j + 1]; + // } + // } + // for (int ip = 0; ip <= get_max_p_order(Math.Min(6, eparams.max_partition_order), frame.blocksize, order); ip++) + // { + // uint nbits = calc_optimal_rice_params(ref frame.current.rc, ip, sums + ip * Flake.MAX_PARTITIONS, (uint)frame.blocksize, (uint)order); + // nbits += (uint)order * frame.subframes[ch].obits + 4 + 5 + (uint)order * cbits + 6; + // if (frame.subframes[ch].best.size > nbits) + // { + // frame.subframes[ch].best.size = nbits; + // frame.subframes[ch].best.order = 
order; + // frame.subframes[ch].best.window = iWindow; + // frame.subframes[ch].best.type = SubframeType.LPC; + // } + // } + //} + } + } + + uint fs = measure_frame_size(frame, true); + frame.ChooseSubframes(); + for (int ch = 0; ch < channels; ch++) + { + frame.subframes[ch].best.size = AudioSamples.UINT32_MAX; + encode_selected_residual(frame, ch, eparams.prediction_type, eparams.order_method, + frame.subframes[ch].best.window, frame.subframes[ch].best.order); + } + } + + BitWriter bitwriter = new BitWriter(frame_buffer, 0, max_frame_size); + + output_frame_header(frame, bitwriter); + output_subframes(frame, bitwriter); + output_frame_footer(bitwriter); + + if (frame_buffer != null) + { + if (eparams.variable_block_size > 0) + frame_count += frame.blocksize; + else + frame_count++; + } + size = frame.blocksize; + return bitwriter.Length; + } + } + + unsafe int output_frame() + { + if (verify != null) + { + fixed (int* s = verifyBuffer, r = samplesBuffer) + for (int ch = 0; ch < channels; ch++) + AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size); + } + + int fs, bs; + //if (0 != eparams.variable_block_size && 0 == (eparams.block_size & 7) && eparams.block_size >= 128) + // fs = encode_frame_vbs(); + //else + fs = encode_frame(out bs); + + if (seek_table != null && _IO.CanSeek) + { + for (int sp = 0; sp < seek_table.Length; sp++) + { + if (seek_table[sp].framesize != 0) + continue; + if (seek_table[sp].number > (ulong)_position + (ulong)bs) + break; + if (seek_table[sp].number >= (ulong)_position) + { + seek_table[sp].number = (ulong)_position; + seek_table[sp].offset = (ulong)(_IO.Position - first_frame_offset); + seek_table[sp].framesize = (uint)bs; + } + } + } + + _position += bs; + _IO.Write(frame_buffer, 0, fs); + _totalSize += fs; + + if (verify != null) + { + int decoded = verify.DecodeFrame(frame_buffer, 0, fs); + if (decoded != fs || verify.Remaining != (ulong)bs) + throw new Exception("validation failed!"); + fixed (int* s = verifyBuffer, r = verify.Samples) + { + for (int ch = 0; ch < channels; ch++) + if (AudioSamples.MemCmp(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, bs)) + throw new Exception("validation failed!"); + } + } + + if (bs < eparams.block_size) + { + fixed (int* s = samplesBuffer) + for (int ch = 0; ch < channels; ch++) + AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, s + bs + ch * FlaCudaWriter.MAX_BLOCKSIZE, eparams.block_size - bs); + } + + samplesInBuffer -= bs; + + return bs; + } + + public void Write(int[,] buff, int pos, int sampleCount) + { + if (!inited) + { + cuda = new CUDA(0, true); + cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin")); + cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor"); + cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual"); + //cudaAutocor = cuda.Allocate((uint)(sizeof(int) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 3); + cudaAutocor = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 64); + cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels))); + cudaCoeffs = cuda.Allocate((uint)(sizeof(int) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS)); + cudaShifts = cuda.Allocate((uint)(sizeof(int) * lpc.MAX_LPC_ORDER * (channels == 2 ? 
4 : channels) * lpc.MAX_LPC_WINDOWS)); + cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS); + if (_IO == null) + _IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read); + int header_size = flake_encode_init(); + _IO.Write(header, 0, header_size); + if (_IO.CanSeek) + first_frame_offset = _IO.Position; + inited = true; + } + + int len = sampleCount; + while (len > 0) + { + int block = Math.Min(len, eparams.block_size - samplesInBuffer); + + copy_samples(buff, pos, block); + + if (md5 != null) + { + AudioSamples.FLACSamplesToBytes(buff, pos, frame_buffer, 0, block, channels, (int)bits_per_sample); + md5.TransformBlock(frame_buffer, 0, block * channels * ((int)bits_per_sample >> 3), null, 0); + } + + len -= block; + pos += block; + + while (samplesInBuffer >= eparams.block_size) + output_frame(); + } + } + + public string Path { get { return _path; } } + + string vendor_string = "Flake#0.1"; + + int select_blocksize(int samplerate, int time_ms) + { + int blocksize = Flake.flac_blocksizes[1]; + int target = (samplerate * time_ms) / 1000; + if (eparams.variable_block_size > 0) + { + blocksize = 1024; + while (target >= blocksize) + blocksize <<= 1; + return blocksize >> 1; + } + + for (int i = 0; i < Flake.flac_blocksizes.Length; i++) + if (target >= Flake.flac_blocksizes[i] && Flake.flac_blocksizes[i] > blocksize) + { + blocksize = Flake.flac_blocksizes[i]; + } + return blocksize; + } + + void write_streaminfo(byte[] header, int pos, int last) + { + Array.Clear(header, pos, 38); + BitWriter bitwriter = new BitWriter(header, pos, 38); + + // metadata header + bitwriter.writebits(1, last); + bitwriter.writebits(7, (int)MetadataType.StreamInfo); + bitwriter.writebits(24, 34); + + if (eparams.variable_block_size > 0) + bitwriter.writebits(16, 0); + else + bitwriter.writebits(16, eparams.block_size); + + bitwriter.writebits(16, eparams.block_size); + bitwriter.writebits(24, 0); + bitwriter.writebits(24, max_frame_size); + bitwriter.writebits(20, sample_rate); + bitwriter.writebits(3, channels - 1); + bitwriter.writebits(5, bits_per_sample - 1); + + // total samples + if (sample_count > 0) + { + bitwriter.writebits(4, 0); + bitwriter.writebits(32, sample_count); + } + else + { + bitwriter.writebits(4, 0); + bitwriter.writebits(32, 0); + } + bitwriter.flush(); + } + + /** + * Write vorbis comment metadata block to byte array. + * Just writes the vendor string for now. 
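+	 * Layout: 4-byte metadata block header, 32-bit little-endian vendor string length,
+	 * the vendor string itself, then a 32-bit user comment count of zero.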
+ */ + int write_vorbis_comment(byte[] comment, int pos, int last) + { + BitWriter bitwriter = new BitWriter(comment, pos, 4); + Encoding enc = new ASCIIEncoding(); + int vendor_len = enc.GetBytes(vendor_string, 0, vendor_string.Length, comment, pos + 8); + + // metadata header + bitwriter.writebits(1, last); + bitwriter.writebits(7, (int)MetadataType.VorbisComment); + bitwriter.writebits(24, vendor_len + 8); + + comment[pos + 4] = (byte)(vendor_len & 0xFF); + comment[pos + 5] = (byte)((vendor_len >> 8) & 0xFF); + comment[pos + 6] = (byte)((vendor_len >> 16) & 0xFF); + comment[pos + 7] = (byte)((vendor_len >> 24) & 0xFF); + comment[pos + 8 + vendor_len] = 0; + comment[pos + 9 + vendor_len] = 0; + comment[pos + 10 + vendor_len] = 0; + comment[pos + 11 + vendor_len] = 0; + bitwriter.flush(); + return vendor_len + 12; + } + + int write_seekpoints(byte[] header, int pos, int last) + { + seek_table_offset = pos + 4; + + BitWriter bitwriter = new BitWriter(header, pos, 4 + 18 * seek_table.Length); + + // metadata header + bitwriter.writebits(1, last); + bitwriter.writebits(7, (int)MetadataType.Seektable); + bitwriter.writebits(24, 18 * seek_table.Length); + for (int i = 0; i < seek_table.Length; i++) + { + bitwriter.writebits64(Flake.FLAC__STREAM_METADATA_SEEKPOINT_SAMPLE_NUMBER_LEN, seek_table[i].number); + bitwriter.writebits64(Flake.FLAC__STREAM_METADATA_SEEKPOINT_STREAM_OFFSET_LEN, seek_table[i].offset); + bitwriter.writebits(Flake.FLAC__STREAM_METADATA_SEEKPOINT_FRAME_SAMPLES_LEN, seek_table[i].framesize); + } + bitwriter.flush(); + return 4 + 18 * seek_table.Length; + } + + /** + * Write padding metadata block to byte array. + */ + int + write_padding(byte[] padding, int pos, int last, int padlen) + { + BitWriter bitwriter = new BitWriter(padding, pos, 4); + + // metadata header + bitwriter.writebits(1, last); + bitwriter.writebits(7, (int)MetadataType.Padding); + bitwriter.writebits(24, padlen); + + return padlen + 4; + } + + int write_headers() + { + int header_size = 0; + int last = 0; + + // stream marker + header[0] = 0x66; + header[1] = 0x4C; + header[2] = 0x61; + header[3] = 0x43; + header_size += 4; + + // streaminfo + write_streaminfo(header, header_size, last); + header_size += 38; + + // seek table + if (_IO.CanSeek && seek_table != null) + header_size += write_seekpoints(header, header_size, last); + + // vorbis comment + if (eparams.padding_size == 0) last = 1; + header_size += write_vorbis_comment(header, header_size, last); + + // padding + if (eparams.padding_size > 0) + { + last = 1; + header_size += write_padding(header, header_size, last, eparams.padding_size); + } + + return header_size; + } + + int flake_encode_init() + { + int i, header_len; + + //if(flake_validate_params(s) < 0) + + ch_code = channels - 1; + + // find samplerate in table + for (i = 4; i < 12; i++) + { + if (sample_rate == Flake.flac_samplerates[i]) + { + sr_code0 = i; + break; + } + } + + // if not in table, samplerate is non-standard + if (i == 12) + throw new Exception("non-standard samplerate"); + + for (i = 1; i < 8; i++) + { + if (bits_per_sample == Flake.flac_bitdepths[i]) + { + bps_code = i; + break; + } + } + if (i == 8) + throw new Exception("non-standard bps"); + // FIXME: For now, only 16-bit encoding is supported + if (bits_per_sample != 16) + throw new Exception("non-standard bps"); + + if (_blocksize == 0) + { + if (eparams.block_size == 0) + eparams.block_size = select_blocksize(sample_rate, eparams.block_time_ms); + _blocksize = eparams.block_size; + } + else + eparams.block_size = 
_blocksize; + + // set maximum encoded frame size (if larger, re-encodes in verbatim mode) + if (channels == 2) + max_frame_size = 16 + ((eparams.block_size * (int)(bits_per_sample + bits_per_sample + 1) + 7) >> 3); + else + max_frame_size = 16 + ((eparams.block_size * channels * (int)bits_per_sample + 7) >> 3); + + if (_IO.CanSeek && eparams.do_seektable) + { + int seek_points_distance = sample_rate * 10; + int num_seek_points = 1 + sample_count / seek_points_distance; // 1 seek point per 10 seconds + if (sample_count % seek_points_distance == 0) + num_seek_points--; + seek_table = new SeekPoint[num_seek_points]; + for (int sp = 0; sp < num_seek_points; sp++) + { + seek_table[sp].framesize = 0; + seek_table[sp].offset = 0; + seek_table[sp].number = (ulong)(sp * seek_points_distance); + } + } + + // output header bytes + header = new byte[eparams.padding_size + 1024 + (seek_table == null ? 0 : seek_table.Length * 18)]; + header_len = write_headers(); + + // initialize CRC & MD5 + if (_IO.CanSeek && eparams.do_md5) + md5 = new MD5CryptoServiceProvider(); + + if (eparams.do_verify) + { + verify = new FlakeReader(channels, bits_per_sample); + verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channels]; + } + + frame_buffer = new byte[max_frame_size]; + + return header_len; + } + } + + struct FlakeEncodeParams + { + // compression quality + // set by user prior to calling flake_encode_init + // standard values are 0 to 8 + // 0 is lower compression, faster encoding + // 8 is higher compression, slower encoding + // extended values 9 to 12 are slower and/or use + // higher prediction orders + public int compression; + + // prediction order selection method + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 5 + // 0 = use maximum order only + // 1 = use estimation + // 2 = 2-level + // 3 = 4-level + // 4 = 8-level + // 5 = full search + // 6 = log search + public OrderMethod order_method; + + + // stereo decorrelation method + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 2 + // 0 = independent L+R channels + // 1 = mid-side encoding + public StereoMethod stereo_method; + + // block size in samples + // set by the user prior to calling flake_encode_init + // if set to 0, a block size is chosen based on block_time_ms + // can also be changed by user before encoding a frame + public int block_size; + + // block time in milliseconds + // set by the user prior to calling flake_encode_init + // used to calculate block_size based on sample rate + // can also be changed by user before encoding a frame + public int block_time_ms; + + // padding size in bytes + // set by the user prior to calling flake_encode_init + // if set to less than 0, defaults to 4096 + public int padding_size; + + // minimum LPC order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 1 to 32 + public int min_prediction_order; + + // maximum LPC order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 1 to 32 + public int max_prediction_order; + + // Number of LPC orders to try (for estimate mode) + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. 
+ // valid values are 1 to 32 + public int estimation_depth; + + // minimum fixed prediction order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 4 + public int min_fixed_order; + + // maximum fixed prediction order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 4 + public int max_fixed_order; + + // type of linear prediction + // set by user prior to calling flake_encode_init + public PredictionType prediction_type; + + // minimum partition order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 8 + public int min_partition_order; + + // maximum partition order + // set by user prior to calling flake_encode_init + // if set to less than 0, it is chosen based on compression. + // valid values are 0 to 8 + public int max_partition_order; + + // whether to use variable block sizes + // set by user prior to calling flake_encode_init + // 0 = fixed block size + // 1 = variable block size + public int variable_block_size; + + // whether to try various lpc_precisions + // 0 - use only one precision + // 1 - try two precisions + public int lpc_max_precision_search; + + public int lpc_min_precision_search; + + public WindowFunction window_function; + + public bool do_md5; + public bool do_verify; + public bool do_seektable; + + public int flake_set_defaults(int lvl) + { + compression = lvl; + + if ((lvl < 0 || lvl > 12) && (lvl != 99)) + { + return -1; + } + + // default to level 5 params + window_function = WindowFunction.Flattop | WindowFunction.Tukey; + order_method = OrderMethod.Estimate; + stereo_method = StereoMethod.Evaluate; + block_size = 0; + block_time_ms = 105; + prediction_type = PredictionType.Search; + min_prediction_order = 1; + max_prediction_order = 12; + estimation_depth = 1; + min_fixed_order = 2; + max_fixed_order = 2; + min_partition_order = 0; + max_partition_order = 6; + variable_block_size = 0; + lpc_min_precision_search = 1; + lpc_max_precision_search = 1; + do_md5 = true; + do_verify = false; + do_seektable = true; + + // differences from level 7 + switch (lvl) + { + case 0: + block_time_ms = 53; + prediction_type = PredictionType.Fixed; + stereo_method = StereoMethod.Independent; + max_partition_order = 4; + break; + case 1: + prediction_type = PredictionType.Levinson; + stereo_method = StereoMethod.Independent; + window_function = WindowFunction.Bartlett; + max_prediction_order = 8; + max_partition_order = 4; + break; + case 2: + stereo_method = StereoMethod.Independent; + window_function = WindowFunction.Bartlett; + max_partition_order = 4; + break; + case 3: + stereo_method = StereoMethod.Estimate; + window_function = WindowFunction.Bartlett; + max_prediction_order = 8; + break; + case 4: + stereo_method = StereoMethod.Estimate; + window_function = WindowFunction.Bartlett; + break; + case 5: + window_function = WindowFunction.Bartlett; + break; + case 6: + stereo_method = StereoMethod.Estimate; + break; + case 7: + break; + case 8: + estimation_depth = 3; + min_fixed_order = 0; + max_fixed_order = 4; + lpc_max_precision_search = 2; + break; + case 9: + window_function = WindowFunction.Bartlett; + max_prediction_order = 32; + break; + case 10: + min_fixed_order = 0; + max_fixed_order = 4; + max_prediction_order = 32; + //lpc_max_precision_search = 2; + break; + case 11: + min_fixed_order = 
0; + max_fixed_order = 4; + max_prediction_order = 32; + estimation_depth = 5; + //lpc_max_precision_search = 2; + variable_block_size = 4; + break; + } + + return 0; + } + }} diff --git a/CUETools.FlaCuda/Properties/AssemblyInfo.cs b/CUETools.FlaCuda/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..da44f74 --- /dev/null +++ b/CUETools.FlaCuda/Properties/AssemblyInfo.cs @@ -0,0 +1,35 @@ +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("CUETools.FlaCuda")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("Microsoft")] +[assembly: AssemblyProduct("CUETools.FlaCuda")] +[assembly: AssemblyCopyright("Copyright © Microsoft 2009")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. +[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("b28ffece-6c89-426b-b227-e647b435cc3d")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Revision and Build Numbers +// by using the '*' as shown below: +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu new file mode 100644 index 0000000..6cc5d87 --- /dev/null +++ b/CUETools.FlaCuda/flacuda.cu @@ -0,0 +1,369 @@ +/* + * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. 
Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +#ifndef _FLACUDA_KERNEL_H_ +#define _FLACUDA_KERNEL_H_ + +extern "C" __global__ void cudaComputeAutocor(const int * samples, const float * window, float* output, int frameSize, int frameOffset, int blocks) +{ + extern __shared__ float fshared[]; + float * const matrix = fshared + 513;//257; + const int iWin = blockIdx.y >> 2; + const int iCh = blockIdx.y & 3; + const int smpBase = iCh * frameOffset; + const int winBase = iWin * 2 * frameOffset; + const int pos = blockIdx.x * blocks; + + // fetch blockDim.x*blockDim.y samples + int tid = threadIdx.x + threadIdx.y * blockDim.x; + fshared[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] * window[winBase + pos + tid] : 0.0f; + __syncthreads(); + + float s = 0.0f; + for (int i = 0; i < blocks; i += blockDim.y) + s += fshared[i + threadIdx.y] * fshared[i + threadIdx.y + threadIdx.x]; + matrix[tid + threadIdx.y] = s; + __syncthreads(); + + // reduction in shared mem + for(unsigned int s=blockDim.y/2; s>1; s>>=1) + { + if (threadIdx.y < s) + matrix[tid + threadIdx.y] += matrix[threadIdx.x + (s + threadIdx.y) * (1 + blockDim.x)]; + __syncthreads(); + } + + // return results + if (threadIdx.y == 0) + output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x] = matrix[threadIdx.x] + matrix[threadIdx.x + 1 + blockDim.x]; +} + +#ifdef __DEVICE_EMULATION__ +#define EMUSYNC __syncthreads() +#else +#define EMUSYNC +#endif + +extern "C" __global__ void cudaEncodeResidual( + int*output, + int*samples, + int*allcoefs, + int*shifts, + int frameSize, + int frameOffset) +{ + __shared__ struct { + int data[256]; + int residual[256]; + int coefs[32]; + int shift; + } shared; + const int smpBase = (blockIdx.y & 3) * frameOffset; + const int residualOrder = blockIdx.x; + const int tid = threadIdx.x + __mul24(threadIdx.y, blockDim.x); + const int step = __mul24(blockDim.x, blockDim.y - 1); + int total = 0; + + shared.residual[tid] = 0; + if (threadIdx.y == 0) shared.coefs[threadIdx.x] = threadIdx.x <= residualOrder ? allcoefs[threadIdx.x + residualOrder * 32 + blockIdx.y * 32 * 32] : 0; + if (tid == 0) shared.shift = shifts[32 * blockIdx.y + residualOrder]; + __syncthreads(); + + for(int pos = 0; pos < frameSize - residualOrder - 1; pos += step) + { + // fetch blockDim.x * blockDim.y samples + shared.data[tid] = pos + tid < frameSize ? samples[smpBase + pos + tid] : 0; + __syncthreads(); + + long sum = 0; + for (int c = 0; c <= residualOrder; c++) + sum += shared.data[tid + c] * shared.coefs[residualOrder - c]; + int res = shared.data[tid + residualOrder + 1] - (sum >> shared.shift); + __syncthreads(); + + int limit = min(frameSize - pos - residualOrder - 1, step); + shared.residual[tid] = tid < limit ? 
(2 * res) ^ (res >> 31) : 0; + + __syncthreads(); + // reduction in shared mem + for(unsigned int s=blockDim.x/2; s >= blockDim.y; s>>=1) + { + if (threadIdx.x < s) + shared.residual[tid] += shared.residual[tid + s]; + __syncthreads(); + } + for(unsigned int s=blockDim.y/2; s >= blockDim.x; s>>=1) + { + if (threadIdx.y < s) + shared.residual[tid] += shared.residual[tid + s * blockDim.x]; + __syncthreads(); + } + for(unsigned int s=min(blockDim.x,blockDim.y)/2; s > 0; s>>=1) + { + if (threadIdx.x < s && threadIdx.y < s) + shared.residual[tid] += shared.residual[tid + s] + shared.residual[tid + s * blockDim.x] + shared.residual[tid + s + s * blockDim.x]; + EMUSYNC; + } + +#ifndef __DEVICE_EMULATION__ + if (tid < 16) // Max rice param is really 15 +#endif + { + shared.data[tid] = limit * (tid + 1) + ((shared.residual[0] - (limit >> 1)) >> tid); EMUSYNC; + //if (tid == 16) shared.rice[15] = 0x7fffffff; +#ifndef __DEVICE_EMULATION__ + if (threadIdx.x < 8) +#endif + { + shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 8]); EMUSYNC; + shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 4]); EMUSYNC; + shared.data[threadIdx.x] = min(shared.data[threadIdx.x], shared.data[threadIdx.x + 2]); EMUSYNC; + } + } + total += min(shared.data[0], shared.data[1]); + //total += shared.residual[0]; + } + + if (tid == 0) + output[blockIdx.x + blockIdx.y * gridDim.x] = total; +#ifdef __DEVICE_EMULATION__ + if (tid == 0) + printf("%d,%d:2:%d\n", blockIdx.x, blockIdx.y, total); +#endif +} + +#if 0 +extern "C" __global__ void cudaComputeAutocor3int(const int * samples, const float * window, int* output, int frameSize, int frameOffset, int channels) +{ + extern __shared__ short shared[]; + int *ishared = (int*)shared; + const int lag = blockIdx.x; + + // fetch signal, multiply by window and split high bits/low bits + const int iWin = blockIdx.y >> 2; // blockIdx.y/channels; + const int iCh = (blockIdx.y - iWin * channels); + const int smpBase = iCh * frameOffset; + const int winBase = iWin * 2 * frameOffset; + + for(int i = threadIdx.x; i < frameSize; i += blockDim.x) + { + float val = samples[smpBase + i] * window[winBase + i]; + int ival = __float2int_rz(fabs(val)); + int sg = (1 - 2 *signbit(val)); + //int ival = (int) val; + //int sg = ival < 0 ? -1 : 1; + ival = ival < 0 ? 
-ival : ival; + shared[i*2] = __mul24(sg, (ival >> 9)); + shared[i*2+1] = __mul24(sg, (ival & ((1 << 9) - 1))); + } + __syncthreads(); + + // correlation + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + for (int i = threadIdx.x; i < frameSize - lag; i += blockDim.x) + { + sum1 += __mul24(shared[2*i], shared[2*(i+lag)]); + sum2 += __mul24(shared[2*i+1], shared[2*(i+lag)]); + sum2 += __mul24(shared[2*i], shared[2*(i+lag)+1]); + sum3 += __mul24(shared[2*i+1], shared[2*(i+lag)+1]); + } + __syncthreads(); + ishared[threadIdx.x] = sum1; + ishared[threadIdx.x + blockDim.x] = sum2; + ishared[threadIdx.x + blockDim.x * 2] = sum3; + __syncthreads(); + + // reduction in shared mem + for(unsigned int s=blockDim.x/2; s>0; s>>=1) + { + if (threadIdx.x < s) + { + ishared[threadIdx.x] += ishared[threadIdx.x + s]; + ishared[threadIdx.x + blockDim.x] += ishared[threadIdx.x + s + blockDim.x]; + ishared[threadIdx.x + blockDim.x * 2] += ishared[threadIdx.x + s + blockDim.x * 2]; + } + __syncthreads(); + } + + if (threadIdx.x == 0) { + output[(blockIdx.x + blockIdx.y * gridDim.x) * 3] = ishared[threadIdx.x]; + output[(blockIdx.x + blockIdx.y * gridDim.x) * 3 + 1] = ishared[threadIdx.x + blockDim.x]; + output[(blockIdx.x + blockIdx.y * gridDim.x) * 3 + 2] = ishared[threadIdx.x + blockDim.x * 2]; + } +} + +__device__ float Bartlett(int i, int blockSize) +{ + float n = fminf(i, blockSize - i); + float k = 2.0f / blockSize * (blockSize / 2.0f - n); + k = 1.0f - k * k; + return k*k; +} + +extern "C" __global__ void cudaComputeAutocorPart(const int * samples, const float * window, float* output, int frameSize, int frameOffset, int iCh, int iWin) +{ + extern __shared__ float fshared[]; + // fetch signal, multiply by window + //const int iWin = blockIdx.y; + //const int iCh = blockIdx.x; + const int smpBase = iCh * frameOffset; + const int winBase = iWin * 2 * frameOffset; + float * matrix = fshared + 129; + + // initialize results matrix + matrix[threadIdx.x + threadIdx.y * (blockDim.x + 1)] = 0.0f; + + // prefetch blockDim.x + blockDim.y samples + int tid = threadIdx.x + threadIdx.y * blockDim.x; + if (tid < blockDim.x + blockDim.y) + { + if (blockIdx.x * blockDim.y + tid < frameSize) + fshared[tid] = samples[smpBase + blockIdx.x * blockDim.y + tid] * window[winBase + blockIdx.x * blockDim.y + tid]; + else + fshared[tid] = 0.0f; + } + + __syncthreads(); + + matrix[threadIdx.x + threadIdx.y * (1 + blockDim.x)] += fshared[threadIdx.y] * fshared[threadIdx.y + threadIdx.x]; + + __syncthreads(); + + // reduction in shared mem + for(unsigned int s=blockDim.y/2; s>0; s>>=1) + { + if (threadIdx.y < s) + matrix[threadIdx.x + threadIdx.y * (1 + blockDim.x)] += matrix[threadIdx.x + (s + threadIdx.y) * (1 + blockDim.x)]; + __syncthreads(); + } + + // return results + if (threadIdx.y == 0) + output[blockIdx.x * blockDim.x + threadIdx.x] = matrix[threadIdx.x]; +} + +extern "C" __global__ void cudaComputeAutocor2(const int * samples, const float * window, float* output, int frameSize, int frameOffset) +{ + extern __shared__ float fshared[]; + // fetch signal, multiply by window + const int iWin = blockIdx.y; + const int iCh = blockIdx.x; + const int smpBase = iCh * frameOffset; + const int winBase = iWin * 2 * frameOffset; + + for(int i = threadIdx.x + threadIdx.y * blockDim.x; i < frameSize; i += blockDim.x * blockDim.y) + fshared[i] = samples[smpBase + i] * window[winBase + i]; + + __syncthreads(); + + const int lag = threadIdx.y; + + // correlation + float sum = 0.0f; + for (int i = threadIdx.x; i < frameSize - lag; i += 
blockDim.x) + sum += fshared[i] * fshared[i+lag]; + __syncthreads(); + + fshared[threadIdx.x + threadIdx.y * blockDim.x] = sum; + + __syncthreads(); + + // reduction in shared mem + for(unsigned int s=blockDim.x/2; s>0; s>>=1) + { + if (threadIdx.x < s) + fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + s + threadIdx.y * blockDim.x]; + __syncthreads(); + } + + // if (threadIdx.x < 32) + // { + //if (blockDim.x >= 64) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 32 + threadIdx.y * blockDim.x]; + //if (blockDim.x >= 32) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 16 + threadIdx.y * blockDim.x]; + //if (blockDim.x >= 16) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 8 + threadIdx.y * blockDim.x]; + //if (blockDim.x >= 8) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 4 + threadIdx.y * blockDim.x]; + //if (blockDim.x >= 4) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 2 + threadIdx.y * blockDim.x]; + //if (blockDim.x >= 2) fshared[threadIdx.x + threadIdx.y * blockDim.x] += fshared[threadIdx.x + 1 + threadIdx.y * blockDim.x]; + // } + + if (threadIdx.x == 0) { + output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.y + threadIdx.y] + = fshared[threadIdx.x + threadIdx.y * blockDim.x]; + } +} + +extern "C" __global__ void cudaComputeAutocorOld(const int * samples, const float * window, float* output, int frameSize, int frameOffset, int channels) +{ + extern __shared__ float fshared[]; + const int lag = blockIdx.x; + + // fetch signal, multiply by window + const int iWin = blockIdx.y >> 2; // blockIdx.y/channels; + const int iCh = (blockIdx.y - iWin * channels); + const int smpBase = iCh * frameOffset; + const int winBase = iWin * 2 * frameOffset; + + for(int i = threadIdx.x; i < frameSize; i += blockDim.x) + fshared[i] = samples[smpBase + i] * window[winBase + i]; + + __syncthreads(); + + // correlation + float sum = 0.0f; + for (int i = threadIdx.x; i < frameSize - lag; i += blockDim.x) + sum += fshared[i] * fshared[i+lag]; + __syncthreads(); + + fshared[threadIdx.x] = sum; + + __syncthreads(); + + // reduction in shared mem + for(unsigned int s=blockDim.x/2; s>0; s>>=1) + { + if (threadIdx.x < s) + fshared[threadIdx.x] += fshared[threadIdx.x + s]; + __syncthreads(); + } + + if (threadIdx.x == 0) { + output[blockIdx.x + blockIdx.y * gridDim.x] = fshared[threadIdx.x]; + } +} +#endif + +#endif // _FLACUDA_KERNEL_H_
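
Editor's note (not part of the patch): the FlakeEncodeParams struct earlier in this patch is primed by calling flake_set_defaults with a compression level (standard levels 0 to 8, extended levels 9 to 12), after which individual fields can still be overridden before encoding begins. A minimal C# sketch of that flow; the surrounding usage is an assumption for illustration, not code from the patch:

    FlakeEncodeParams eparams = new FlakeEncodeParams();
    if (eparams.flake_set_defaults(7) != 0)       // returns -1 for an unsupported level
        throw new Exception("Invalid compression level.");
    // Fields documented in the struct may still be overridden before encoding starts:
    eparams.block_size = 4096;                    // 0 means: derive from block_time_ms
    eparams.do_verify = true;                     // decode each frame back and compare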
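
Editor's note (not part of the patch): cudaComputeAutocor accumulates, per channel and per window function, the autocorrelation of the windowed signal, which the host side later feeds to LPC analysis. A scalar C# reference of the quantity being computed, with a hypothetical helper name:

    // autoc[lag] = sum over i of (s[i]*w[i]) * (s[i+lag]*w[i+lag])
    static void ComputeAutocorrelation(int[] samples, float[] window, int frameSize, int maxLag, double[] autoc)
    {
        for (int lag = 0; lag <= maxLag; lag++)
        {
            double sum = 0.0;
            for (int i = 0; i < frameSize - lag; i++)
                sum += (double)samples[i] * window[i] * samples[i + lag] * window[i + lag];
            autoc[lag] = sum;
        }
    }

The kernel computes the same sums in float, split across thread blocks: each block appears to write partial sums per lag, which are then accumulated outside the kernel.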
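
Editor's note (not part of the patch): cudaEncodeResidual estimates, for each candidate predictor order, how many bits the Rice-coded residual would cost. Each residual is folded to a non-negative value with (2 * res) ^ (res >> 31), the folded values are summed by the shared-memory reduction, and the cheapest Rice parameter k in 0..15 is chosen from the estimate n*(k+1) + ((sum - n/2) >> k). A scalar C# sketch of that estimate, with hypothetical names and assuming this reading of the kernel:

    // Zigzag-fold a signed residual to a non-negative value, as in the kernel.
    static uint Fold(int res)
    {
        return (uint)((2 * res) ^ (res >> 31));
    }

    // Approximate Rice-coded size in bits of n folded residuals summing to 'sum',
    // taking the best parameter k in 0..15 (mirrors the kernel's min-reduction).
    static long EstimateRiceBits(long sum, int n)
    {
        long best = long.MaxValue;
        for (int k = 0; k < 16; k++)
            best = Math.Min(best, (long)n * (k + 1) + ((sum - (n >> 1)) >> k));
        return best;
    }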