tidying up

This commit is contained in:
chudov
2009-09-08 16:26:53 +00:00
parent e05fbc88f5
commit cc808938ab
2 changed files with 149 additions and 116 deletions

View File

@@ -55,10 +55,11 @@ namespace CUETools.Codecs.FlaCuda
int[] verifyBuffer; int[] verifyBuffer;
int[] residualBuffer; int[] residualBuffer;
float[] windowBuffer; float[] windowBuffer;
int[] autocorBuffer3int;
float[] autocorBuffer;
int samplesInBuffer = 0; int samplesInBuffer = 0;
encodeResidualTaskStruct[] residualTasks;
int[] residualOutput;
int _compressionLevel = 7; int _compressionLevel = 7;
int _blocksize = 0; int _blocksize = 0;
int _totalSize = 0; int _totalSize = 0;
@@ -82,10 +83,13 @@ namespace CUETools.Codecs.FlaCuda
CUdeviceptr cudaSamples; CUdeviceptr cudaSamples;
CUdeviceptr cudaWindow; CUdeviceptr cudaWindow;
CUdeviceptr cudaAutocor; CUdeviceptr cudaAutocor;
CUdeviceptr cudaResidualTasks;
CUdeviceptr cudaResidualOutput;
CUdeviceptr cudaCoeffs; CUdeviceptr cudaCoeffs;
CUdeviceptr cudaShifts; IntPtr autocorBufferPtr = IntPtr.Zero;
public const int MAX_BLOCKSIZE = 8192; const int MAX_BLOCKSIZE = 8192;
const int maxResidualTasks = MAX_BLOCKSIZE / (256 - 32);
public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO) public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
{ {
@@ -106,8 +110,8 @@ namespace CUETools.Codecs.FlaCuda
samplesBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)]; samplesBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)];
residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)]; residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)];
windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS]; windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS];
autocorBuffer3int = new int[(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS]; residualOutput = new int[(channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS * maxResidualTasks];
autocorBuffer = new float[(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * 22]; residualTasks = new encodeResidualTaskStruct[(channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS];
eparams.flake_set_defaults(_compressionLevel); eparams.flake_set_defaults(_compressionLevel);
eparams.padding_size = 8192; eparams.padding_size = 8192;
@@ -189,7 +193,9 @@ namespace CUETools.Codecs.FlaCuda
cuda.Free(cudaSamples); cuda.Free(cudaSamples);
cuda.Free(cudaAutocor); cuda.Free(cudaAutocor);
cuda.Free(cudaCoeffs); cuda.Free(cudaCoeffs);
cuda.Free(cudaShifts); cuda.Free(cudaResidualTasks);
cuda.Free(cudaResidualOutput);
CUDADriver.cuMemFreeHost(autocorBufferPtr);
cuda.Dispose(); cuda.Dispose();
inited = false; inited = false;
} }
@@ -215,7 +221,9 @@ namespace CUETools.Codecs.FlaCuda
cuda.Free(cudaSamples); cuda.Free(cudaSamples);
cuda.Free(cudaAutocor); cuda.Free(cudaAutocor);
cuda.Free(cudaCoeffs); cuda.Free(cudaCoeffs);
cuda.Free(cudaShifts); cuda.Free(cudaResidualTasks);
cuda.Free(cudaResidualOutput);
CUDADriver.cuMemFreeHost(autocorBufferPtr);
cuda.Dispose(); cuda.Dispose();
inited = false; inited = false;
} }
@@ -1017,15 +1025,9 @@ namespace CUETools.Codecs.FlaCuda
predict == PredictionType.Search) predict == PredictionType.Search)
) )
{ {
//float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER];
LpcContext lpc_ctx = frame.subframes[ch].lpc_ctx[best_window]; LpcContext lpc_ctx = frame.subframes[ch].lpc_ctx[best_window];
//lpc_ctx.GetReflection(eparams.max_prediction_order, smp, n, frame.window_buffer + best_window * FlaCudaWriter.MAX_BLOCKSIZE * 2);
//lpc_ctx.ComputeLPC(lpcs);
fixed (int *coefs = lpc_ctx.coefs) fixed (int *coefs = lpc_ctx.coefs)
encode_residual_lpc_sub(frame, coefs, lpc_ctx.shift, best_window, best_order, ch); encode_residual_lpc_sub(frame, coefs, lpc_ctx.shift, best_window, best_order, ch);
//encode_residual_lpc_sub(frame, lpcs, best_window, best_order, ch);
} }
} }
@@ -1330,7 +1332,6 @@ namespace CUETools.Codecs.FlaCuda
int threads_y = threads / threads_x; int threads_y = threads / threads_x;
int blocks_y = ((threads_x - 1) * (threads_y - 1)) / threads_y; int blocks_y = ((threads_x - 1) * (threads_y - 1)) / threads_y;
int blocks = (frame.blocksize + blocks_y * threads_y - 1) / (blocks_y * threads_y); int blocks = (frame.blocksize + blocks_y * threads_y - 1) / (blocks_y * threads_y);
cuda.CopyHostToDevice(cudaSamples, (IntPtr)s, (uint)FlaCudaWriter.MAX_BLOCKSIZE * 4 * sizeof(int));
cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocor.Pointer); cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocor.Pointer);
cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaSamples.Pointer); cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaSamples.Pointer);
cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaWindow.Pointer); cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaWindow.Pointer);
@@ -1339,16 +1340,31 @@ namespace CUETools.Codecs.FlaCuda
cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint) * 2, (uint)(blocks_y * threads_y)); cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint) * 2, (uint)(blocks_y * threads_y));
cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 3) + sizeof(uint) * 3); cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 3) + sizeof(uint) * 3);
cuda.SetFunctionBlockShape(cudaComputeAutocor, threads_x, threads_y, 1); cuda.SetFunctionBlockShape(cudaComputeAutocor, threads_x, threads_y, 1);
//cuda.SetFunctionSharedSize(cudaComputeAutocor, (uint)(sizeof(float) * 1090));//545));
cuda.Launch(cudaComputeAutocor, blocks, 4 * _windowcount); int autocorBufferSize = sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * 4 * _windowcount * blocks;
cuda.CopyDeviceToHost<float>(cudaAutocor, autocorBuffer);
//cuda.CopyDeviceToHost<int>(cudaAutocor, autocorBuffer3int); // create cuda event handles
CUevent start = cuda.CreateEvent();
CUevent stop = cuda.CreateEvent();
// asynchronously issue work to the GPU (all to stream 0)
CUstream stream = new CUstream();
cuda.RecordEvent(start);
cuda.CopyHostToDeviceAsync(cudaSamples, (IntPtr)s, (uint)FlaCudaWriter.MAX_BLOCKSIZE * 4 * sizeof(int), stream);
cuda.LaunchAsync(cudaComputeAutocor, blocks, 4 * _windowcount, stream);
cuda.CopyDeviceToHostAsync(cudaAutocor, autocorBufferPtr, (uint)autocorBufferSize, stream);
cuda.RecordEvent(stop);
cuda.DestroyEvent(start);
cuda.DestroyEvent(stop);
int* shift = stackalloc int[lpc.MAX_LPC_ORDER * 4 * _windowcount]; int* shift = stackalloc int[lpc.MAX_LPC_ORDER * 4 * _windowcount];
int* coefs = stackalloc int[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount]; int* coefs = stackalloc int[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount];
uint cbits = get_precision(frame.blocksize) + 1; uint cbits = get_precision(frame.blocksize) + 1;
AudioSamples.MemSet(coefs, 0, lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount); AudioSamples.MemSet(coefs, 0, lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount);
int nResidualTasks = 0;
int partSize = 256 - 32;
int partCount = (frame.blocksize + partSize - 1) / partSize;
for (int ch = 0; ch < 4; ch++) for (int ch = 0; ch < 4; ch++)
for (int iWindow = 0; iWindow < _windowcount; iWindow++) for (int iWindow = 0; iWindow < _windowcount; iWindow++)
{ {
@@ -1357,41 +1373,43 @@ namespace CUETools.Codecs.FlaCuda
{ {
ac[i] = 0; ac[i] = 0;
for (int i_block = 0; i_block < blocks; i_block++) for (int i_block = 0; i_block < blocks; i_block++)
ac[i] += autocorBuffer[orders * (i_block + blocks * (ch + 4 * iWindow)) + i]; ac[i] += ((float*)autocorBufferPtr)[orders * (i_block + blocks * (ch + 4 * iWindow)) + i];
} }
//double* ac = stackalloc double[33];
//for (int i = 0; i < 33; i++)
// ac[i] = autocorBuffer3int[(33 * (ch + 4 * iWindow) + i) * 3] * (double)(1 << 18) +
// autocorBuffer3int[(33 * (ch + 4 * iWindow) + i) * 3 + 1] * (double)(1 << 9) +
// autocorBuffer3int[(33 * (ch + 4 * iWindow) + i) * 3 + 2];
//fixed (float *ac = &autocorBuffer[orders * (ch + 4 * iWindow)])
frame.subframes[ch].lpc_ctx[iWindow].ComputeReflection(orders - 1, ac); frame.subframes[ch].lpc_ctx[iWindow].ComputeReflection(orders - 1, ac);
float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER]; float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER];
frame.subframes[ch].lpc_ctx[iWindow].ComputeLPC(lpcs); frame.subframes[ch].lpc_ctx[iWindow].ComputeLPC(lpcs);
for (int order = 0; order < orders - 1; order++) for (int order = 0; order < orders - 1; order++)
{
int index = order + (orders - 1) * (iWindow + _windowcount * ch);
lpc.quantize_lpc_coefs(lpcs + order * lpc.MAX_LPC_ORDER, lpc.quantize_lpc_coefs(lpcs + order * lpc.MAX_LPC_ORDER,
order + 1, cbits, coefs + order * lpc.MAX_LPC_ORDER + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER, order + 1, cbits, coefs + index * lpc.MAX_LPC_ORDER,
out shift[order + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER], 15, 0); out shift[order + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER], 15, 0);
residualTasks[nResidualTasks].residualOrder = order;
residualTasks[nResidualTasks].shift = shift[order + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER];
residualTasks[nResidualTasks].coefsOffs = index * lpc.MAX_LPC_ORDER;
residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
nResidualTasks++;
}
} }
int max_order = Math.Min(orders - 1, eparams.max_prediction_order); int max_order = Math.Min(orders - 1, eparams.max_prediction_order);
cuda.CopyHostToDevice(cudaCoeffs, (IntPtr)coefs, (uint)(sizeof(int) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount)); cuda.SetParameter(cudaEncodeResidual, 0, (uint)cudaResidualOutput.Pointer);
cuda.CopyHostToDevice(cudaShifts, (IntPtr)shift, (uint)(sizeof(int) * lpc.MAX_LPC_ORDER* 4 * _windowcount));
//if (frame_count == 0)
{
cuda.SetParameter(cudaEncodeResidual, 0, (uint)cudaAutocor.Pointer);
cuda.SetParameter(cudaEncodeResidual, IntPtr.Size, (uint)cudaSamples.Pointer); cuda.SetParameter(cudaEncodeResidual, IntPtr.Size, (uint)cudaSamples.Pointer);
cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 2, (uint)cudaCoeffs.Pointer); cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 2, (uint)cudaCoeffs.Pointer);
cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 3, (uint)cudaShifts.Pointer); cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 3, (uint)cudaResidualTasks.Pointer);
cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 4, (uint)frame.blocksize); cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 4, (uint)frame.blocksize);
cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 4 + sizeof(uint), (uint)FlaCudaWriter.MAX_BLOCKSIZE); cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 4 + sizeof(uint), (uint)partSize);
cuda.SetParameterSize(cudaEncodeResidual, (uint)(IntPtr.Size * 4) + sizeof(uint) * 2); cuda.SetParameterSize(cudaEncodeResidual, (uint)(IntPtr.Size * 4) + sizeof(uint) * 2U);
cuda.SetFunctionBlockShape(cudaEncodeResidual, threads_x, 256 / threads_x, 1); cuda.SetFunctionBlockShape(cudaEncodeResidual, 256, 1, 1);
}
//cuda.SetFunctionSharedSize(cudaEncodeResidual, (uint)(sizeof(float) * 1090));//545)); cuda.CopyHostToDevice(cudaCoeffs, (IntPtr)coefs, (uint)(sizeof(int) * lpc.MAX_LPC_ORDER * (orders - 1) * 4 * _windowcount));
cuda.Launch(cudaEncodeResidual, max_order, 4 * _windowcount); fixed (encodeResidualTaskStruct* ptr = residualTasks)
cuda.CopyDeviceToHost<int>(cudaAutocor, autocorBuffer3int); cuda.CopyHostToDevice(cudaResidualTasks, (IntPtr)ptr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks));
cuda.Launch(cudaEncodeResidual, partCount, nResidualTasks);
cuda.CopyDeviceToHost<int>(cudaResidualOutput, residualOutput);
for (int ch = 0; ch < 4; ch++) for (int ch = 0; ch < 4; ch++)
{ {
@@ -1400,17 +1418,20 @@ namespace CUETools.Codecs.FlaCuda
{ {
for (int order = 1; order <= max_order; order++) for (int order = 1; order <= max_order; order++)
{ {
uint nbits = (uint)autocorBuffer3int[order - 1 + (ch + 4 * iWindow) * max_order]; int nbits = 0;
nbits += (uint)order * frame.subframes[ch].obits + 4 + 5 + (uint)order * cbits + 6; int index = (order - 1) + (orders - 1) * (iWindow + _windowcount * ch);
for (int p = 0; p < partCount; p++)
nbits += residualOutput[p + partCount * index];
nbits += order * (int)frame.subframes[ch].obits + 4 + 5 + order * (int)cbits + 6;
if (frame.subframes[ch].best.size > nbits) if (frame.subframes[ch].best.size > nbits)
{ {
frame.subframes[ch].best.size = nbits; frame.subframes[ch].best.size = (uint)nbits;
frame.subframes[ch].best.order = order; frame.subframes[ch].best.order = order;
frame.subframes[ch].best.window = iWindow; frame.subframes[ch].best.window = iWindow;
frame.subframes[ch].best.type = SubframeType.LPC; frame.subframes[ch].best.type = SubframeType.LPC;
frame.subframes[ch].lpc_ctx[iWindow].shift = shift[order - 1 + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER]; frame.subframes[ch].lpc_ctx[iWindow].shift = residualTasks[index].shift;
fixed (int* lcoefs = frame.subframes[ch].lpc_ctx[iWindow].coefs) fixed (int* lcoefs = frame.subframes[ch].lpc_ctx[iWindow].coefs)
AudioSamples.MemCpy(lcoefs, coefs + (order - 1) * lpc.MAX_LPC_ORDER + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER, order); AudioSamples.MemCpy(lcoefs, coefs + residualTasks[index].coefsOffs, order);
} }
} }
//uint[] sums_buf = new uint[Flake.MAX_PARTITION_ORDER * Flake.MAX_PARTITIONS]; //uint[] sums_buf = new uint[Flake.MAX_PARTITION_ORDER * Flake.MAX_PARTITIONS];
@@ -1418,11 +1439,11 @@ namespace CUETools.Codecs.FlaCuda
// for (int order = 1; order <= max_order; order++) // for (int order = 1; order <= max_order; order++)
//{ //{
// //uint nbits; // //uint nbits;
// //find_optimal_rice_param(2*(uint)autocorBuffer3int[order - 1 + (ch + 4 * iWindow) * max_order], (uint)frame.blocksize, out nbits); // //find_optimal_rice_param(2*(uint)residualOutput[order - 1 + (ch + 4 * iWindow) * max_order], (uint)frame.blocksize, out nbits);
// //uint nbits = (uint)autocorBuffer3int[order - 1 + (ch + 4 * iWindow) * max_order]; // //uint nbits = (uint)residualOutput[order - 1 + (ch + 4 * iWindow) * max_order];
// //nbits += (uint)order * frame.subframes[ch].obits + 4 + 5 + (uint)order * cbits + 6; // //nbits += (uint)order * frame.subframes[ch].obits + 4 + 5 + (uint)order * cbits + 6;
// for (int ip = 0; ip < 64; ip++) // for (int ip = 0; ip < 64; ip++)
// sums[6 * Flake.MAX_PARTITIONS + ip] = (uint)autocorBuffer3int[64 * (order - 1 + (ch + 4 * iWindow) * max_order) + ip]; // sums[6 * Flake.MAX_PARTITIONS + ip] = (uint)residualOutput[64 * (order - 1 + (ch + 4 * iWindow) * max_order) + ip];
// for (int ip = 5; ip >= 0; ip--) // for (int ip = 5; ip >= 0; ip--)
// { // {
// int parts = (1 << ip); // int parts = (1 << ip);
@@ -1538,7 +1559,7 @@ namespace CUETools.Codecs.FlaCuda
return bs; return bs;
} }
public void Write(int[,] buff, int pos, int sampleCount) public unsafe void Write(int[,] buff, int pos, int sampleCount)
{ {
if (!inited) if (!inited)
{ {
@@ -1547,12 +1568,15 @@ namespace CUETools.Codecs.FlaCuda
cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin")); cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor"); cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual"); cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
//cudaAutocor = cuda.Allocate((uint)(sizeof(int) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 3);
cudaAutocor = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 22); cudaAutocor = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 22);
cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels))); cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
cudaCoeffs = cuda.Allocate((uint)(sizeof(int) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
cudaShifts = cuda.Allocate((uint)(sizeof(int) * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS); cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
cudaCoeffs = cuda.Allocate((uint)(sizeof(int) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS));
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS * maxResidualTasks));
CUResult cuErr = CUDADriver.cuMemAllocHost(ref autocorBufferPtr, (uint)(sizeof(float)*(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * 22));
if (cuErr != CUResult.Success)
throw new CUDAException(cuErr);
if (_IO == null) if (_IO == null)
_IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read); _IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
int header_size = flake_encode_init(); int header_size = flake_encode_init();
@@ -2037,4 +2061,13 @@ namespace CUETools.Codecs.FlaCuda
return 0; return 0;
} }
}} }
struct encodeResidualTaskStruct
{
public int residualOrder;
public int shift;
public int coefsOffs;
public int samplesOffs;
};
}

View File

@@ -78,49 +78,55 @@ extern "C" __global__ void cudaComputeAutocor(
output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x] = shared.matrix[threadIdx.x] + shared.matrix[threadIdx.x + blockDim.x]; output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x] = shared.matrix[threadIdx.x] + shared.matrix[threadIdx.x + blockDim.x];
} }
typedef struct
{
int residualOrder;
int shift;
int coefsOffs;
int samplesOffs;
} encodeResidualTaskStruct;
extern "C" __global__ void cudaEncodeResidual( extern "C" __global__ void cudaEncodeResidual(
int*output, int*output,
int*samples, int*samples,
int*allcoefs, int*allcoefs,
int*shifts, encodeResidualTaskStruct *tasks,
int frameSize, int frameSize,
int frameOffset) int partSize
)
{ {
__shared__ struct { __shared__ struct {
int data[256]; int data[256];
int residual[256]; int residual[256];
int coefs[32]; int coefs[32];
int rice[16]; int rice[32];
int shift; encodeResidualTaskStruct task;
} shared; } shared;
const int smpBase = (blockIdx.y & 3) * frameOffset; const int tid = threadIdx.x;
const int residualOrder = blockIdx.x; // fetch task data
const int tid = threadIdx.x + __mul24(threadIdx.y, blockDim.x); if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
const int step = __mul24(blockDim.x, blockDim.y - 1); ((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
int total = 0;
if (threadIdx.y == 0) shared.coefs[threadIdx.x] = threadIdx.x <= residualOrder ? allcoefs[threadIdx.x + residualOrder * 32 + blockIdx.y * 32 * 32] : 0;
if (tid == 0) shared.shift = shifts[32 * blockIdx.y + residualOrder];
if (tid == 0) shared.rice[15] = 0x1fffffff;
for(int pos = 0; pos < frameSize - residualOrder - 1; pos += step)
{
// fetch blockDim.x * blockDim.y samples
shared.data[tid] = (threadIdx.y == 0 && pos != 0) ? shared.data[tid + step]
: (pos + tid < frameSize ? samples[smpBase + pos + tid] : 0);
__syncthreads(); __syncthreads();
const int pos = blockIdx.x * partSize;
const int residualOrder = shared.task.residualOrder;
const int dataLen = min(frameSize - pos, partSize + residualOrder + 1);
const int residualLen = dataLen - residualOrder - 1;
// fetch coeffs, inverting their order
if (tid <= residualOrder) shared.coefs[residualOrder - tid] = allcoefs[shared.task.coefsOffs + tid];
// fetch samples
shared.data[tid] = (tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0);
// compute residual
__syncthreads();
long sum = 0; long sum = 0;
for (int c = 0; c <= residualOrder; c++) for (int c = 0; c <= residualOrder; c++)
sum += __mul24(shared.data[tid + c], shared.coefs[residualOrder - c]); sum += __mul24(shared.data[tid + c], shared.coefs[c]);
int res = shared.data[tid + residualOrder + 1] - (sum >> shared.shift); int res = shared.data[tid + residualOrder + 1] - (sum >> shared.task.shift);
//__syncthreads(); shared.residual[tid] = __mul24(tid < residualLen, (2 * res) ^ (res >> 31));
int limit = min(frameSize - pos - residualOrder - 1, step);
shared.residual[tid] = __mul24(tid < limit, (2 * res) ^ (res >> 31));
__syncthreads(); __syncthreads();
// reduction in shared mem // residual sum: reduction in shared mem
if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads(); if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads();
if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads(); if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads();
if (tid < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads(); if (tid < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
@@ -131,23 +137,17 @@ extern "C" __global__ void cudaEncodeResidual(
shared.residual[tid] += shared.residual[tid + 1]; shared.residual[tid] += shared.residual[tid + 1];
__syncthreads(); __syncthreads();
if (tid < 16) if (tid < 32)
{ {
shared.rice[tid] = __mul24(tid == 15, 0x7fffff) + limit * (tid + 1) + ((shared.residual[0] - (limit >> 1)) >> tid); // rice parameter search
shared.rice[tid] = __mul24(tid >= 15, 0x7fffff) + residualLen * (tid + 1) + ((shared.residual[0] - (residualLen >> 1)) >> tid);
shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]); shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]);
shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]); shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]);
shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]); shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
total += min(shared.rice[tid], shared.rice[tid + 1]); shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]);
} }
__syncthreads();
}
__syncthreads();
if (tid == 0) if (tid == 0)
output[blockIdx.x + blockIdx.y * gridDim.x] = total; output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
#ifdef __DEVICE_EMULATION__
if (tid == 0)
printf("%d,%d:%d\n", blockIdx.x, blockIdx.y, total);
#endif
} }
#if 0 #if 0