optimizations

Author: chudov
Date:   2009-09-11 11:16:45 +00:00
Commit: d1005089c2 (parent 5fd6108c9d)

3 changed files with 182 additions and 142 deletions

File 1 of 3: the FlaCuda C# project file (.csproj)

@@ -54,9 +54,9 @@
   </ItemGroup>
   <ItemGroup>
     <None Include="flacuda.cu">
-      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
-      <Generator>"%24%28CUDA_BIN_PATH%29\nvcc.exe" flacuda.cu --cubin -cbin "%24%28VCInstallDir%29bin"</Generator>
     </None>
+    <EmbeddedResource Include="flacuda.cubin">
+    </EmbeddedResource>
   </ItemGroup>
   <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
   <!-- To modify your build process, add your task inside one of the targets below and uncomment it.
@@ -67,7 +67,8 @@
   </Target>
   -->
   <PropertyGroup>
-    <PostBuildEvent>nvcc flacuda.cu --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"
+    <PostBuildEvent>
     </PostBuildEvent>
+    <PreBuildEvent>nvcc $(ProjectDir)flacuda.cu -o $(ProjectDir)\flacuda.cubin --machine 32 --maxrregcount 10 --cubin --compiler-bindir "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" --system-include "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"</PreBuildEvent>
   </PropertyGroup>
 </Project>
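Note: the project change drops the old copy-cubin-to-output scheme and instead compiles flacuda.cu in a pre-build step, embedding the resulting flacuda.cubin in the assembly as a resource. The driver-API detail that makes this work is that a module can be loaded from an in-memory image rather than a file path. A minimal C sketch of that idea (not code from this commit):

/* Sketch: load a cubin from memory instead of from disk. */
#include <cuda.h>
#include <stddef.h>

CUmodule load_cubin_image(const void *image)   /* bytes of flacuda.cubin */
{
    CUmodule module = NULL;
    /* cuModuleLoad("flacuda.cubin") requires the file next to the
     * executable; cuModuleLoadData takes the image from memory, so the
     * cubin can ship inside the assembly as a resource. */
    if (cuModuleLoadData(&module, image) != CUDA_SUCCESS)
        return NULL;
    return module;
}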

File 2 of 3: the FlaCudaWriter C# source (namespace CUETools.Codecs.FlaCuda)

@@ -75,7 +75,7 @@ namespace CUETools.Codecs.FlaCuda
         float[] windowBuffer;
         int samplesInBuffer = 0;
-        int _compressionLevel = 7;
+        int _compressionLevel = 5;
         int _blocksize = 0;
         int _totalSize = 0;
         int _windowsize = 0, _windowcount = 0;
@@ -96,6 +96,7 @@ namespace CUETools.Codecs.FlaCuda
         CUfunction cudaComputeAutocor;
         CUfunction cudaComputeLPC;
         CUfunction cudaEstimateResidual;
+        CUfunction cudaSumResidualChunks;
         CUfunction cudaSumResidual;
         CUfunction cudaEncodeResidual;
         CUdeviceptr cudaSamples;
@@ -104,6 +105,7 @@ namespace CUETools.Codecs.FlaCuda
         CUdeviceptr cudaAutocorOutput;
         CUdeviceptr cudaResidualTasks;
         CUdeviceptr cudaResidualOutput;
+        CUdeviceptr cudaResidualSums;
         IntPtr samplesBufferPtr = IntPtr.Zero;
         IntPtr autocorTasksPtr = IntPtr.Zero;
         IntPtr residualTasksPtr = IntPtr.Zero;
@@ -114,7 +116,7 @@ namespace CUETools.Codecs.FlaCuda
         int nAutocorTasks = 0;
         const int MAX_BLOCKSIZE = 8192;
-        const int maxResidualParts = MAX_BLOCKSIZE / (256 - 32);
+        const int maxResidualParts = 64;
         const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32);
         public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
@@ -218,6 +220,7 @@ namespace CUETools.Codecs.FlaCuda
             cuda.Free(cudaAutocorOutput);
             cuda.Free(cudaResidualTasks);
             cuda.Free(cudaResidualOutput);
+            cuda.Free(cudaResidualSums);
             CUDADriver.cuMemFreeHost(samplesBufferPtr);
             CUDADriver.cuMemFreeHost(residualTasksPtr);
             CUDADriver.cuMemFreeHost(autocorTasksPtr);
@@ -250,6 +253,7 @@ namespace CUETools.Codecs.FlaCuda
             cuda.Free(cudaAutocorOutput);
             cuda.Free(cudaResidualTasks);
             cuda.Free(cudaResidualOutput);
+            cuda.Free(cudaResidualSums);
             CUDADriver.cuMemFreeHost(samplesBufferPtr);
             CUDADriver.cuMemFreeHost(residualTasksPtr);
             CUDADriver.cuMemFreeHost(autocorTasksPtr);
@@ -278,7 +282,11 @@ namespace CUETools.Codecs.FlaCuda
         public long BlockSize
         {
-            set { _blocksize = (int)value; }
+            set {
+                if (value < 256 || value > MAX_BLOCKSIZE)
+                    throw new Exception("unsupported BlockSize value");
+                _blocksize = (int)value;
+            }
             get { return _blocksize == 0 ? eparams.block_size : _blocksize; }
         }
@@ -911,9 +919,9 @@ namespace CUETools.Codecs.FlaCuda
autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE; autocorTasks[nAutocorTasks].windowOffs = iWindow * 2 * FlaCudaWriter.MAX_BLOCKSIZE;
nAutocorTasks++; nAutocorTasks++;
// LPC tasks // LPC tasks
for (int order = 1; order <= max_order; order++) for (int order = 1; order <= ((max_order + 7) & ~7); order++)
{ {
residualTasks[nResidualTasks].residualOrder = order - 1; residualTasks[nResidualTasks].residualOrder = order <= max_order ? order : 0;
residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE; residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
nResidualTasks++; nResidualTasks++;
} }
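Note: `(max_order + 7) & ~7` rounds the number of LPC tasks per window up to the next multiple of 8 so the reworked residual kernel can consume tasks in fixed-size groups; the padding slots carry residualOrder == 0 and produce no output. A one-line illustration (round_up8 is an illustrative name, not from the commit):

/* Round n up to the next multiple of 8, as (max_order + 7) & ~7 does. */
static inline int round_up8(int n) { return (n + 7) & ~7; }
/* round_up8(7) == 8, round_up8(8) == 8, round_up8(12) == 16 */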
@@ -921,9 +929,9 @@ namespace CUETools.Codecs.FlaCuda
             // Fixed prediction
             for (int ch = 0; ch < channelsCount; ch++)
             {
-                for (int order = 1; order <= 4; order++)
+                for (int order = 1; order <= 8; order++)
                 {
-                    residualTasks[nResidualTasks].residualOrder = order - 1;
+                    residualTasks[nResidualTasks].residualOrder = order <= 4 ? order : 0;
                     residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
                     residualTasks[nResidualTasks].shift = 0;
                     switch (order)
@@ -1025,9 +1033,11 @@ namespace CUETools.Codecs.FlaCuda
             {
                 for (int order = 1; order <= max_order && order < frame.blocksize; order++)
                 {
-                    int index = (order - 1) + max_order * (iWindow + _windowcount * ch);
+                    int index = (order - 1) + ((max_order + 7) & ~7) * (iWindow + _windowcount * ch);
                     int cbits = residualTasks[index].cbits;
                     int nbits = order * (int)frame.subframes[ch].obits + 4 + 5 + order * cbits + 6 + residualTasks[index].size;
+                    if (residualTasks[index].residualOrder != order)
+                        throw new Exception("oops");
                     if (frame.subframes[ch].best.size > nbits)
                     {
                         frame.subframes[ch].best.type = SubframeType.LPC;
@@ -1048,8 +1058,10 @@ namespace CUETools.Codecs.FlaCuda
             {
                 for (int order = 1; order <= 4 && order < frame.blocksize; order++)
                 {
-                    int index = (order - 1) + 4 * ch;
-                    int nbits = order * (int)frame.subframes[ch].obits + 6 + residualTasks[index + max_order * _windowcount * channelsCount].size;
+                    int index = (order - 1) + 8 * ch + ((max_order + 7) & ~7) * _windowcount * channelsCount;
+                    int nbits = order * (int)frame.subframes[ch].obits + 6 + residualTasks[index].size;
+                    if (residualTasks[index].residualOrder != order)
+                        throw new Exception("oops");
                     if (frame.subframes[ch].best.size > nbits)
                     {
                         frame.subframes[ch].best.type = SubframeType.Fixed;
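Note: with the padded layout, each channel owns round_up8(max_order) LPC slots per window, and the 8 fixed-prediction slots per channel sit after all LPC tasks. Illustrative helpers mirroring the two lookups above (the names are mine, not from the commit):

/* Index math implied by the padded task layout above. */
static int lpc_task_index(int order, int iWindow, int ch,
                          int max_order, int windowcount)
{
    int padded = (max_order + 7) & ~7;          /* LPC slots per window */
    return (order - 1) + padded * (iWindow + windowcount * ch);
}

static int fixed_task_index(int order, int ch,
                            int max_order, int windowcount, int channels)
{
    int padded = (max_order + 7) & ~7;
    /* fixed tasks live after all LPC tasks, 8 slots per channel */
    return (order - 1) + 8 * ch + padded * windowcount * channels;
}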
@@ -1062,34 +1074,47 @@ namespace CUETools.Codecs.FlaCuda
         unsafe void estimate_residual(FlacFrame frame, int channelsCount, int max_order, int autocorPartCount, out int partCount)
         {
+            if (frame.blocksize <= 4)
+            {
+                partCount = 0;
+                return;
+            }
             uint cbits = get_precision(frame.blocksize) + 1;
-            int residualThreads = 256;
-            int partSize = residualThreads - max_order;
-            partSize &= 0xffffff0;
+            int partSize = 256 - 32;
             partCount = (frame.blocksize + partSize - 1) / partSize;
             if (partCount > maxResidualParts)
-                throw new Exception("internal error");
-            if (frame.blocksize <= 4)
-                return;
-            cuda.SetParameter(cudaEstimateResidual, 0, (uint)cudaResidualOutput.Pointer);
-            cuda.SetParameter(cudaEstimateResidual, IntPtr.Size, (uint)cudaSamples.Pointer);
-            cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 2, (uint)cudaResidualTasks.Pointer);
-            cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 3, (uint)frame.blocksize);
-            cuda.SetParameter(cudaEstimateResidual, IntPtr.Size * 3 + sizeof(uint), (uint)partSize);
-            cuda.SetParameterSize(cudaEstimateResidual, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2U);
-            cuda.SetFunctionBlockShape(cudaEstimateResidual, residualThreads, 1, 1);
+                throw new Exception("invalid combination of block size and LPC order");
+            cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 0, (uint)cudaResidualOutput.Pointer);
+            cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 1, (uint)cudaSamples.Pointer);
+            cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 2, (uint)cudaResidualTasks.Pointer);
+            cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 3, (uint)max_order);
+            cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 4, (uint)frame.blocksize);
+            cuda.SetParameter(cudaEstimateResidual, sizeof(uint) * 5, (uint)partSize);
+            cuda.SetParameterSize(cudaEstimateResidual, sizeof(uint) * 6);
+            cuda.SetFunctionBlockShape(cudaEstimateResidual, 64, 4, 1);
+            //cuda.SetParameter(cudaSumResidualChunks, 0, (uint)cudaResidualSums.Pointer);
+            //cuda.SetParameter(cudaSumResidualChunks, sizeof(uint), (uint)cudaResidualTasks.Pointer);
+            //cuda.SetParameter(cudaSumResidualChunks, sizeof(uint) * 2, (uint)cudaResidualOutput.Pointer);
+            //cuda.SetParameter(cudaSumResidualChunks, sizeof(uint) * 3, (uint)frame.blocksize);
+            //cuda.SetParameter(cudaSumResidualChunks, sizeof(uint) * 4, (uint)partSize);
+            //cuda.SetParameterSize(cudaSumResidualChunks, sizeof(uint) * 5U);
+            //cuda.SetFunctionBlockShape(cudaSumResidualChunks, residualThreads, 1, 1);
             cuda.SetParameter(cudaSumResidual, 0, (uint)cudaResidualTasks.Pointer);
-            cuda.SetParameter(cudaSumResidual, IntPtr.Size, (uint)cudaResidualOutput.Pointer);
-            cuda.SetParameter(cudaSumResidual, IntPtr.Size * 2, (uint)partCount);
-            cuda.SetParameterSize(cudaSumResidual, (uint)(IntPtr.Size * 2) + sizeof(uint) * 1U);
+            cuda.SetParameter(cudaSumResidual, sizeof(uint), (uint)cudaResidualOutput.Pointer);
+            cuda.SetParameter(cudaSumResidual, sizeof(uint) * 2, (uint)partSize);
+            cuda.SetParameter(cudaSumResidual, sizeof(uint) * 3, (uint)partCount);
+            cuda.SetParameterSize(cudaSumResidual, sizeof(uint) * 4U);
             cuda.SetFunctionBlockShape(cudaSumResidual, 64, 1, 1);
             // issue work to the GPU
-            cuda.LaunchAsync(cudaEstimateResidual, partCount, nResidualTasks, cudaStream);
+            cuda.LaunchAsync(cudaEstimateResidual, partCount, nResidualTasks / 4, cudaStream);
+            //cuda.LaunchAsync(cudaSumResidualChunks, partCount, nResidualTasks, cudaStream);
             cuda.LaunchAsync(cudaSumResidual, 1, nResidualTasks, cudaStream);
             cuda.CopyDeviceToHostAsync(cudaResidualTasks, residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks), cudaStream);
             cuda.SynchronizeStream(cudaStream);
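Note: the parameter offsets switch from IntPtr.Size to sizeof(uint) here and in the autocorrelation/LPC setup below because the cubin is now built with --machine 32: every kernel argument, device pointers included, occupies a 4-byte slot, and on a 64-bit host IntPtr.Size (8) would misplace every argument after the first. A hedged C sketch of what these calls expand to in the raw driver API (error checks omitted):

/* Driver-API equivalent of the SetParameter/SetParameterSize/
 * SetFunctionBlockShape sequence above for cudaEstimateResidual. */
#include <cuda.h>

void setup_estimate_residual(CUfunction f, unsigned int out_ptr,
                             unsigned int samples_ptr, unsigned int tasks_ptr,
                             unsigned int max_order, unsigned int frame_size,
                             unsigned int part_size)
{
    int o = 0;
    cuParamSeti(f, o, out_ptr);     o += sizeof(unsigned int);
    cuParamSeti(f, o, samples_ptr); o += sizeof(unsigned int);
    cuParamSeti(f, o, tasks_ptr);   o += sizeof(unsigned int);
    cuParamSeti(f, o, max_order);   o += sizeof(unsigned int);
    cuParamSeti(f, o, frame_size);  o += sizeof(unsigned int);
    cuParamSeti(f, o, part_size);   o += sizeof(unsigned int);
    cuParamSetSize(f, o);                  /* 6 * 4 == 24 bytes total */
    cuFuncSetBlockShape(f, 64, 4, 1);      /* matches the call above */
}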
@@ -1109,21 +1134,21 @@ namespace CUETools.Codecs.FlaCuda
                 return;
             cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocorOutput.Pointer);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaSamples.Pointer);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaWindow.Pointer);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3, (uint)cudaAutocorTasks.Pointer);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 4, (uint)max_order);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 4 + sizeof(uint), (uint)frame.blocksize);
-            cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 4 + sizeof(uint) * 2, (uint)partSize);
-            cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 4) + sizeof(uint) * 3);
+            cuda.SetParameter(cudaComputeAutocor, sizeof(uint), (uint)cudaSamples.Pointer);
+            cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 2, (uint)cudaWindow.Pointer);
+            cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 3, (uint)cudaAutocorTasks.Pointer);
+            cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4, (uint)max_order);
+            cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint), (uint)frame.blocksize);
+            cuda.SetParameter(cudaComputeAutocor, sizeof(uint) * 4 + sizeof(uint) * 2, (uint)partSize);
+            cuda.SetParameterSize(cudaComputeAutocor, (uint)(sizeof(uint) * 4) + sizeof(uint) * 3);
             cuda.SetFunctionBlockShape(cudaComputeAutocor, autocorThreads, 1, 1);
             cuda.SetParameter(cudaComputeLPC, 0, (uint)cudaResidualTasks.Pointer);
-            cuda.SetParameter(cudaComputeLPC, IntPtr.Size, (uint)cudaAutocorOutput.Pointer);
-            cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 2, (uint)cudaAutocorTasks.Pointer);
-            cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3, (uint)max_order);
-            cuda.SetParameter(cudaComputeLPC, IntPtr.Size * 3 + sizeof(uint), (uint)partCount);
-            cuda.SetParameterSize(cudaComputeLPC, (uint)(IntPtr.Size * 3) + sizeof(uint) * 2);
+            cuda.SetParameter(cudaComputeLPC, sizeof(uint), (uint)cudaAutocorOutput.Pointer);
+            cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 2, (uint)cudaAutocorTasks.Pointer);
+            cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 3, (uint)max_order);
+            cuda.SetParameter(cudaComputeLPC, sizeof(uint) * 3 + sizeof(uint), (uint)partCount);
+            cuda.SetParameterSize(cudaComputeLPC, (uint)(sizeof(uint) * 3) + sizeof(uint) * 2);
             cuda.SetFunctionBlockShape(cudaComputeLPC, 64, 1, 1);
             // issue work to the GPU
@@ -1268,24 +1293,30 @@ namespace CUETools.Codecs.FlaCuda
             if (!inited)
             {
                 cuda = new CUDA(true, InitializationFlags.None);
-                cuda.CreateContext(0, CUCtxFlags.SchedSpin);
-                cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
+                cuda.CreateContext(0, CUCtxFlags.BlockingSync);
+                using (Stream cubin = GetType().Assembly.GetManifestResourceStream(GetType(), "flacuda.cubin"))
+                using (StreamReader sr = new StreamReader(cubin))
+                    cuda.LoadModule(new ASCIIEncoding().GetBytes(sr.ReadToEnd()));
+                //cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
                 cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
                 cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
                 cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
                 cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
+                cudaSumResidualChunks = cuda.GetModuleFunction("cudaSumResidualChunks");
                 cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
                 cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
                 cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
                 cudaAutocorTasks = cuda.Allocate((uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
                 cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * maxAutocorParts);
                 cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4)));
-                cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER + 1) * lpc.MAX_LPC_WINDOWS * maxResidualParts));
+                cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4)));
+                cudaResidualSums = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4) * maxResidualParts));
+                //cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 4) * maxResidualParts));
                 CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)(sizeof(int) * (channels == 2 ? 4 : channels) * FlaCudaWriter.MAX_BLOCKSIZE));
                 if (cuErr == CUResult.Success)
                     cuErr = CUDADriver.cuMemAllocHost(ref autocorTasksPtr, (uint)(sizeof(computeAutocorTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
                 if (cuErr == CUResult.Success)
-                    cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 4)));
+                    cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8)));
                 if (cuErr != CUResult.Success)
                 {
                     if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero;
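Note: the host buffers are allocated with cuMemAllocHost (page-locked memory) because the CopyDeviceToHostAsync in estimate_residual only overlaps with the stream when the destination is pinned. A minimal driver-API sketch of that pattern (names and sizes are illustrative, not from the commit):

/* Pinned host buffer + async device-to-host copy on a stream. */
#include <cuda.h>

int copy_tasks_back(CUdeviceptr d_tasks, size_t bytes, CUstream stream)
{
    void *h_tasks = NULL;
    if (cuMemAllocHost(&h_tasks, bytes) != CUDA_SUCCESS)   /* pinned */
        return -1;
    cuMemcpyDtoHAsync(h_tasks, d_tasks, bytes, stream);    /* queued */
    cuStreamSynchronize(stream);       /* wait before touching h_tasks */
    cuMemFreeHost(h_tasks);
    return 0;
}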
@@ -1678,7 +1709,7 @@ namespace CUETools.Codecs.FlaCuda
case 0: case 0:
do_midside = false; do_midside = false;
window_function = WindowFunction.Bartlett; window_function = WindowFunction.Bartlett;
max_prediction_order = 7; max_prediction_order = 8;
max_partition_order = 4; max_partition_order = 4;
break; break;
case 1: case 1:
@@ -1694,7 +1725,7 @@ namespace CUETools.Codecs.FlaCuda
                     break;
                 case 3:
                     window_function = WindowFunction.Bartlett;
-                    max_prediction_order = 7;
+                    max_prediction_order = 8;
                     break;
                 case 4:
                     window_function = WindowFunction.Bartlett;
@@ -1704,7 +1735,7 @@ namespace CUETools.Codecs.FlaCuda
                     window_function = WindowFunction.Bartlett;
                     break;
                 case 6:
-                    max_prediction_order = 10;
+                    //max_prediction_order = 10;
                     break;
                 case 7:
                     break;

File 3 of 3: flacuda.cu (CUDA kernels)

@@ -155,6 +155,7 @@ extern "C" __global__ void cudaComputeLPC(
     if (tid < 32)
     {
         int precision = 13;
+        int taskNo = (blockIdx.x + blockIdx.y * gridDim.x) * ((max_order + 7) & ~7) + order;
         shared.bits[tid] = __mul24((33 - __clz(__float2int_rn(fabs(shared.tmp[tid]) * (1 << 15))) - precision), tid <= order);
         shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
         shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
@@ -164,9 +165,9 @@ extern "C" __global__ void cudaComputeLPC(
         int sh = max(0,min(15, 15 - shared.bits[0]));
         int coef = max(-(1 << precision),min((1 << precision)-1,__float2int_rn(-shared.tmp[tid] * (1 << sh))));
         if (tid <= order)
-            output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].coefs[tid] = coef;
+            output[taskNo].coefs[tid] = coef;
         if (tid == 0)
-            output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].shift = sh;
+            output[taskNo].shift = sh;
         shared.bits[tid] = 33 - max(__clz(coef),__clz(-1 ^ coef));
         shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 16]);
         shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 8]);
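Note: this tail of cudaComputeLPC quantizes the float LPC coefficients to integers with a shared shift: it picks a shift in 0..15 sized to the largest coefficient, then rounds and clamps each one (precision is 13 here). A scalar sketch of the same idea (not the kernel itself):

/* Scalar version of the coefficient quantization above. */
#include <math.h>

void quantize_lpc(const float *lpc, int order, int precision,
                  int *coefs, int *shift)
{
    float cmax = 0.0f;
    for (int i = 0; i < order; i++)
        cmax = fmaxf(cmax, fabsf(lpc[i]));

    int sh = 15;
    while (sh > 0 && cmax * (float)(1 << sh) > (float)((1 << precision) - 1))
        sh--;                                   /* shrink until cmax fits */

    for (int i = 0; i < order; i++) {
        int c = (int)lrintf(-lpc[i] * (float)(1 << sh));
        if (c > (1 << precision) - 1) c = (1 << precision) - 1;  /* clamp */
        if (c < -(1 << precision))    c = -(1 << precision);
        coefs[i] = c;
    }
    *shift = sh;
}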
@@ -175,51 +176,111 @@ extern "C" __global__ void cudaComputeLPC(
         shared.bits[tid] = max(shared.bits[tid], shared.bits[tid + 1]);
         int cbits = shared.bits[0];
         if (tid == 0)
-            output[(blockIdx.x + blockIdx.y * gridDim.x) * max_order + order].cbits = cbits;
+            output[taskNo].cbits = cbits;
     }
     __syncthreads();
     }
 }
+// blockDim.x == 32
+// blockDim.y == 8
 extern "C" __global__ void cudaEstimateResidual(
     int*output,
     int*samples,
     encodeResidualTaskStruct *tasks,
+    int max_order,
     int frameSize,
-    int partSize // should be <= blockDim - max_order
+    int partSize // should be 224
     )
 {
     __shared__ struct {
         int data[256];
         int residual[256];
-        int rice[32];
-        encodeResidualTaskStruct task;
+        int rice[256];
+        int sums[8];
+        encodeResidualTaskStruct task[8];
     } shared;
-    const int tid = threadIdx.x;
-    // fetch task data
-    if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
-        ((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+    const int tid = threadIdx.x + threadIdx.y * blockDim.x;
+    // fetch task data (8 * 64 == 512 elements);
+    ((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y * blockDim.y))[tid];
+    ((int*)&shared.task)[tid + 256] = ((int*)(tasks + blockIdx.y * blockDim.y))[tid + 256];
     __syncthreads();
-    const int pos = blockIdx.x * partSize;
-    const int residualOrder = shared.task.residualOrder + 1;
-    const int residualLen = min(frameSize - pos - residualOrder, partSize);
-    const int dataLen = residualLen + residualOrder;
+    const int residualOrder = shared.task[threadIdx.y].residualOrder;
+    const int partNumber = blockIdx.x;
+    const int pos = partNumber * partSize;
+    const int dataLen = min(frameSize - pos, partSize + max_order) * (residualOrder != 0);
     // fetch samples
-    shared.data[tid] = (tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0);
+    shared.data[tid] = (tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] : 0);
+    if (tid < blockDim.y) shared.sums[tid] = 0;
+    // set upper residuals to zero, in case blockDim < 256
+    //shared.residual[255 - tid] = 0;
+    const int residualLen = min(frameSize - pos - residualOrder, partSize) * (residualOrder != 0);
     // reverse coefs
-    if (tid < residualOrder) shared.task.coefs[tid] = shared.task.coefs[residualOrder - 1 - tid];
-    // compute residual
+    if (threadIdx.x < residualOrder) shared.task[threadIdx.y].coefs[threadIdx.x] = shared.task[threadIdx.y].coefs[residualOrder - 1 - threadIdx.x];
     __syncthreads();
+    for (int i = 0; i < residualLen; i += blockDim.x)
+    {
+        // compute residual
         long sum = 0;
         for (int c = 0; c < residualOrder; c++)
-            sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
-        int res = shared.data[tid + residualOrder] - (sum >> shared.task.shift);
-        shared.residual[tid] = __mul24(tid < residualLen, (2 * res) ^ (res >> 31));
+            sum += __mul24(shared.data[i + threadIdx.x + c], shared.task[threadIdx.y].coefs[c]);
+        int res = shared.data[i + threadIdx.x + residualOrder] - (sum >> shared.task[threadIdx.y].shift);
+        shared.residual[tid] = __mul24(i + threadIdx.x < residualLen, (2 * res) ^ (res >> 31));
+        __syncthreads();
+        if (threadIdx.x < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
+        shared.residual[tid] += shared.residual[tid + 16];
+        shared.residual[tid] += shared.residual[tid + 8];
+        shared.residual[tid] += shared.residual[tid + 4];
+        shared.residual[tid] += shared.residual[tid + 2];
+        if (threadIdx.x == 0) shared.sums[threadIdx.y] += shared.residual[tid] + shared.residual[tid + 1];
+    }
+    // rice parameter search
+    shared.rice[tid] = __mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.sums[threadIdx.y] - (residualLen >> 1)) >> threadIdx.x);
+    shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]);
+    shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]);
+    shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
+    if (threadIdx.x == 0 && residualOrder != 0)
+        output[(blockIdx.y * blockDim.y + threadIdx.y) * gridDim.x + blockIdx.x] = min(shared.rice[tid], shared.rice[tid + 1]);
+}
+// blockDim.x == 256
+// gridDim.x = frameSize / chunkSize
+extern "C" __global__ void cudaSumResidualChunks(
+    int *output,
+    encodeResidualTaskStruct *tasks,
+    int *residual,
+    int frameSize,
+    int chunkSize // <= blockDim.x(256)
+    )
+{
+    __shared__ struct {
+        int residual[256];
+        int rice[32];
+    } shared;
+    // fetch parameters
+    const int tid = threadIdx.x;
+    const int residualOrder = tasks[blockIdx.y].residualOrder;
+    const int chunkNumber = blockIdx.x;
+    const int pos = chunkNumber * chunkSize;
+    const int residualLen = min(frameSize - pos - residualOrder, chunkSize);
+    // set upper residuals to zero, in case blockDim < 256
+    shared.residual[255 - tid] = 0;
+    // read residual
+    int res = (tid < residualLen) ? residual[blockIdx.y * 8192 + pos + tid] : 0;
+    // convert to unsigned
+    shared.residual[tid] = (2 * res) ^ (res >> 31);
     __syncthreads();
     // residual sum: reduction in shared mem
     if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads();
     if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads();
@@ -229,7 +290,6 @@ extern "C" __global__ void cudaEstimateResidual(
     shared.residual[tid] += shared.residual[tid + 4];
     shared.residual[tid] += shared.residual[tid + 2];
     shared.residual[tid] += shared.residual[tid + 1];
-    __syncthreads();
     if (tid < 32)
     {
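Note: all of these kernels end in the same shared-memory tree reduction; the hand-unrolled form drops __syncthreads() once only one 32-thread warp remains active, which is the 2009-era idiom this commit relies on (the removed barrier above was redundant for the same reason). A standalone sketch of the pattern:

/* Block-wide sum: each block reduces 256 ints to one partial sum. */
extern "C" __global__ void blockSum(const int *in, int *out)
{
    __shared__ int buf[256];                    // one slot per thread
    const int tid = threadIdx.x;                // blockDim.x == 256
    buf[tid] = in[blockIdx.x * blockDim.x + tid];
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
            buf[tid] += buf[tid + s];           // halve the active range
        __syncthreads();
    }
    if (tid == 0)
        out[blockIdx.x] = buf[0];               // one partial sum per block
}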
@@ -240,6 +300,8 @@ extern "C" __global__ void cudaEstimateResidual(
         shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
         shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]);
     }
+    // write output
     if (tid == 0)
         output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
 }
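Note: the rice parameter search above estimates, for each candidate k, the partition cost in bits: n residuals cost (k + 1) bits each for the low bits and terminator, plus their unary quotients, which are approximated from the sum of the zigzag-mapped residuals as (sum - n/2) >> k; the 0x7fffff term bars k >= 15. A scalar sketch of the same estimate:

/* Pick the Rice parameter minimizing the estimated cost, as above. */
#include <limits.h>

int best_rice_param(int sum /* of (2*r)^(r>>31) */, int n)
{
    int best_k = 0, best_bits = INT_MAX;
    for (int k = 0; k < 15; k++) {
        int bits = n * (k + 1) + ((sum - (n >> 1)) >> k);
        if (bits < best_bits) { best_bits = bits; best_k = k; }
    }
    return best_k;
}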
@@ -247,34 +309,32 @@ extern "C" __global__ void cudaEstimateResidual(
 extern "C" __global__ void cudaSumResidual(
     encodeResidualTaskStruct *tasks,
     int *residual,
-    int partCount // <= blockDim.y (64)
+    int partSize,
+    int partCount // <= blockDim.y (256)
     )
 {
     __shared__ struct {
-        int partLen[64];
-        //encodeResidualTaskStruct task;
+        int partLen[256];
+        encodeResidualTaskStruct task;
     } shared;
     const int tid = threadIdx.x;
     // fetch task data
-    // if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
-        //((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
-    // __syncthreads();
+    if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
+        ((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+    __syncthreads();
     shared.partLen[tid] = (tid < partCount) ? residual[tid + partCount * blockIdx.y] : 0;
     __syncthreads();
     // length sum: reduction in shared mem
-    //if (tid < 128) shared.partLen[tid] += shared.partLen[tid + 128]; __syncthreads();
-    //if (tid < 64) shared.partLen[tid] += shared.partLen[tid + 64]; __syncthreads();
     if (tid < 32) shared.partLen[tid] += shared.partLen[tid + 32]; __syncthreads();
     shared.partLen[tid] += shared.partLen[tid + 16];
     shared.partLen[tid] += shared.partLen[tid + 8];
     shared.partLen[tid] += shared.partLen[tid + 4];
     shared.partLen[tid] += shared.partLen[tid + 2];
     shared.partLen[tid] += shared.partLen[tid + 1];
-    __syncthreads();
-    // FIXME: should process partition order here!!!
     // return sum
     if (tid == 0)
         tasks[blockIdx.y].size = shared.partLen[0];
@@ -288,58 +348,6 @@ extern "C" __global__ void cudaEncodeResidual(
     int partSize // should be <= blockDim - max_order
     )
 {
-    __shared__ struct {
-        int data[256];
-        int residual[256];
-        int rice[32];
-        encodeResidualTaskStruct task;
-    } shared;
-    const int tid = threadIdx.x;
-    // fetch task data
-    if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
-        ((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
     __syncthreads();
-    const int pos = blockIdx.x * partSize;
-    const int residualOrder = shared.task.residualOrder + 1;
-    const int residualLen = min(frameSize - pos - residualOrder, partSize);
-    const int dataLen = residualLen + residualOrder;
-    // fetch samples
-    shared.data[tid] = (tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0);
-    // reverse coefs
-    if (tid < residualOrder) shared.task.coefs[tid] = shared.task.coefs[residualOrder - 1 - tid];
-    // compute residual
-    __syncthreads();
-    long sum = 0;
-    for (int c = 0; c < residualOrder; c++)
-        sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
-    int res = shared.data[tid + residualOrder] - (sum >> shared.task.shift);
-    shared.residual[tid] = __mul24(tid < residualLen, (2 * res) ^ (res >> 31));
-    __syncthreads();
-    // residual sum: reduction in shared mem
-    if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads();
-    if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads();
-    if (tid < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
-    shared.residual[tid] += shared.residual[tid + 16];
-    shared.residual[tid] += shared.residual[tid + 8];
-    shared.residual[tid] += shared.residual[tid + 4];
-    shared.residual[tid] += shared.residual[tid + 2];
-    shared.residual[tid] += shared.residual[tid + 1];
-    __syncthreads();
-    if (tid < 32)
-    {
-        // rice parameter search
-        shared.rice[tid] = __mul24(tid >= 15, 0x7fffff) + residualLen * (tid + 1) + ((shared.residual[0] - (residualLen >> 1)) >> tid);
-        shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]);
-        shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]);
-        shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
-        shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]);
-    }
-    if (tid == 0)
-        output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
 }
 #endif