diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs
index cfcdb05..c93a668 100644
--- a/CUETools.FlaCuda/FlaCudaWriter.cs
+++ b/CUETools.FlaCuda/FlaCudaWriter.cs
@@ -55,10 +55,11 @@ namespace CUETools.Codecs.FlaCuda
 		int[] verifyBuffer;
 		int[] residualBuffer;
 		float[] windowBuffer;
-		int[] autocorBuffer3int;
-		float[] autocorBuffer;
 		int samplesInBuffer = 0;
+		encodeResidualTaskStruct[] residualTasks;
+		int[] residualOutput;
+
 		int _compressionLevel = 7;
 		int _blocksize = 0;
 		int _totalSize = 0;
@@ -82,10 +83,13 @@ namespace CUETools.Codecs.FlaCuda
 		CUdeviceptr cudaSamples;
 		CUdeviceptr cudaWindow;
 		CUdeviceptr cudaAutocor;
+		CUdeviceptr cudaResidualTasks;
+		CUdeviceptr cudaResidualOutput;
 		CUdeviceptr cudaCoeffs;
-		CUdeviceptr cudaShifts;
+		IntPtr autocorBufferPtr = IntPtr.Zero;
 
-		public const int MAX_BLOCKSIZE = 8192;
+		const int MAX_BLOCKSIZE = 8192;
+		const int maxResidualTasks = MAX_BLOCKSIZE / (256 - 32);
 
 		public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
 		{
@@ -106,8 +110,8 @@ namespace CUETools.Codecs.FlaCuda
 			samplesBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)];
 			residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)];
 			windowBuffer = new float[FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS];
-			autocorBuffer3int = new int[(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS];
-			autocorBuffer = new float[(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * 22];
+			residualOutput = new int[(channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS * maxResidualTasks];
+			residualTasks = new encodeResidualTaskStruct[(channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS];
 
 			eparams.flake_set_defaults(_compressionLevel);
 			eparams.padding_size = 8192;
@@ -189,7 +193,9 @@ namespace CUETools.Codecs.FlaCuda
 				cuda.Free(cudaSamples);
 				cuda.Free(cudaAutocor);
 				cuda.Free(cudaCoeffs);
-				cuda.Free(cudaShifts);
+				cuda.Free(cudaResidualTasks);
+				cuda.Free(cudaResidualOutput);
+				CUDADriver.cuMemFreeHost(autocorBufferPtr);
 				cuda.Dispose();
 				inited = false;
 			}
@@ -215,7 +221,9 @@ namespace CUETools.Codecs.FlaCuda
 				cuda.Free(cudaSamples);
 				cuda.Free(cudaAutocor);
 				cuda.Free(cudaCoeffs);
-				cuda.Free(cudaShifts);
+				cuda.Free(cudaResidualTasks);
+				cuda.Free(cudaResidualOutput);
+				CUDADriver.cuMemFreeHost(autocorBufferPtr);
 				cuda.Dispose();
 				inited = false;
 			}
@@ -1017,15 +1025,9 @@ namespace CUETools.Codecs.FlaCuda
 				predict == PredictionType.Search)
 				)
 			{
-				//float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER];
 				LpcContext lpc_ctx = frame.subframes[ch].lpc_ctx[best_window];
-
-				//lpc_ctx.GetReflection(eparams.max_prediction_order, smp, n, frame.window_buffer + best_window * FlaCudaWriter.MAX_BLOCKSIZE * 2);
-				//lpc_ctx.ComputeLPC(lpcs);
-
 				fixed (int *coefs = lpc_ctx.coefs)
 					encode_residual_lpc_sub(frame, coefs, lpc_ctx.shift, best_window, best_order, ch);
-				//encode_residual_lpc_sub(frame, lpcs, best_window, best_order, ch);
 			}
 		}
 
@@ -1319,7 +1321,7 @@ namespace CUETools.Codecs.FlaCuda
 			frame.window_buffer = window;
 			frame.current.residual = r + 4 * FlaCudaWriter.MAX_BLOCKSIZE;
 			for (int ch = 0; ch < 4; ch++)
-				frame.subframes[ch].Init(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE,
+				frame.subframes[ch].Init(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, 
 					bits_per_sample + (ch == 3 ? 1U : 0U), get_wasted_bits(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, frame.blocksize));
 
 			int orders = 8;
@@ -1330,25 +1332,39 @@ namespace CUETools.Codecs.FlaCuda
 			int threads_y = threads / threads_x;
 			int blocks_y = ((threads_x - 1) * (threads_y - 1)) / threads_y;
 			int blocks = (frame.blocksize + blocks_y * threads_y - 1) / (blocks_y * threads_y);
-			cuda.CopyHostToDevice(cudaSamples, (IntPtr)s, (uint)FlaCudaWriter.MAX_BLOCKSIZE * 4 * sizeof(int));
 			cuda.SetParameter(cudaComputeAutocor, 0, (uint)cudaAutocor.Pointer);
 			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size, (uint)cudaSamples.Pointer);
 			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 2, (uint)cudaWindow.Pointer);
 			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3, (uint)frame.blocksize);
 			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint), (uint)FlaCudaWriter.MAX_BLOCKSIZE);
-			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint) * 2, (uint)(blocks_y*threads_y));
+			cuda.SetParameter(cudaComputeAutocor, IntPtr.Size * 3 + sizeof(uint) * 2, (uint)(blocks_y * threads_y));
 			cuda.SetParameterSize(cudaComputeAutocor, (uint)(IntPtr.Size * 3) + sizeof(uint) * 3);
 			cuda.SetFunctionBlockShape(cudaComputeAutocor, threads_x, threads_y, 1);
-			//cuda.SetFunctionSharedSize(cudaComputeAutocor, (uint)(sizeof(float) * 1090));//545));
-			cuda.Launch(cudaComputeAutocor, blocks, 4 * _windowcount);
-			cuda.CopyDeviceToHost(cudaAutocor, autocorBuffer);
-			//cuda.CopyDeviceToHost(cudaAutocor, autocorBuffer3int);
+
+			int autocorBufferSize = sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * 4 * _windowcount * blocks;
+
+			// create cuda event handles
+			CUevent start = cuda.CreateEvent();
+			CUevent stop = cuda.CreateEvent();
+
+			// asynchronously issue work to the GPU (all to stream 0)
+			CUstream stream = new CUstream();
+			cuda.RecordEvent(start);
+			cuda.CopyHostToDeviceAsync(cudaSamples, (IntPtr)s, (uint)FlaCudaWriter.MAX_BLOCKSIZE * 4 * sizeof(int), stream);
+			cuda.LaunchAsync(cudaComputeAutocor, blocks, 4 * _windowcount, stream);
+			cuda.CopyDeviceToHostAsync(cudaAutocor, autocorBufferPtr, (uint)autocorBufferSize, stream);
+			cuda.RecordEvent(stop);
+			cuda.DestroyEvent(start);
+			cuda.DestroyEvent(stop);
 
 			int* shift = stackalloc int[lpc.MAX_LPC_ORDER * 4 * _windowcount];
 			int* coefs = stackalloc int[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount];
 			uint cbits = get_precision(frame.blocksize) + 1;
 			AudioSamples.MemSet(coefs, 0, lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount);
+			int nResidualTasks = 0;
+			int partSize = 256 - 32;
+			int partCount = (frame.blocksize + partSize - 1) / partSize;
 			for (int ch = 0; ch < 4; ch++)
 				for (int iWindow = 0; iWindow < _windowcount; iWindow++)
 				{
@@ -1357,41 +1373,43 @@ namespace CUETools.Codecs.FlaCuda
 					{
 						ac[i] = 0;
 						for (int i_block = 0; i_block < blocks; i_block++)
-							ac[i] += autocorBuffer[orders * (i_block + blocks * (ch + 4 * iWindow)) + i];
+							ac[i] += ((float*)autocorBufferPtr)[orders * (i_block + blocks * (ch + 4 * iWindow)) + i];
 					}
-					//double* ac = stackalloc double[33];
-					//for (int i = 0; i < 33; i++)
-					//	ac[i] = autocorBuffer3int[(33 * (ch + 4 * iWindow) + i) * 3] * (double)(1 << 18) +
-					//		autocorBuffer3int[(33 * (ch + 4 * iWindow) + i) * 3 + 1] * (double)(1 << 9) +
-					//		autocorBuffer3int[(33 * (ch + 4 * iWindow) + i) * 3 + 2];
-					//fixed (float *ac = &autocorBuffer[orders * (ch + 4 * iWindow)])
 					frame.subframes[ch].lpc_ctx[iWindow].ComputeReflection(orders - 1, ac);
 					float* lpcs = stackalloc float[lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER];
 					frame.subframes[ch].lpc_ctx[iWindow].ComputeLPC(lpcs);
 					for (int order = 0; order < orders - 1; order++)
+					{
+						int index = order + (orders - 1) * (iWindow + _windowcount * ch);
+
 						lpc.quantize_lpc_coefs(lpcs + order * lpc.MAX_LPC_ORDER,
-							order + 1, cbits, coefs + order * lpc.MAX_LPC_ORDER + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER,
+							order + 1, cbits, coefs + index * lpc.MAX_LPC_ORDER,
 							out shift[order + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER], 15, 0);
+
+						residualTasks[nResidualTasks].residualOrder = order;
+						residualTasks[nResidualTasks].shift = shift[order + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER];
+						residualTasks[nResidualTasks].coefsOffs = index * lpc.MAX_LPC_ORDER;
+						residualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE;
+						nResidualTasks++;
+					}
 				}
-			
+
 			int max_order = Math.Min(orders - 1, eparams.max_prediction_order);
-			cuda.CopyHostToDevice(cudaCoeffs, (IntPtr)coefs, (uint)(sizeof(int) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * 4 * _windowcount));
-			cuda.CopyHostToDevice(cudaShifts, (IntPtr)shift, (uint)(sizeof(int) * lpc.MAX_LPC_ORDER* 4 * _windowcount));
-			//if (frame_count == 0)
-			{
-				cuda.SetParameter(cudaEncodeResidual, 0, (uint)cudaAutocor.Pointer);
-				cuda.SetParameter(cudaEncodeResidual, IntPtr.Size, (uint)cudaSamples.Pointer);
-				cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 2, (uint)cudaCoeffs.Pointer);
-				cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 3, (uint)cudaShifts.Pointer);
-				cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 4, (uint)frame.blocksize);
-				cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 4 + sizeof(uint), (uint)FlaCudaWriter.MAX_BLOCKSIZE);
-				cuda.SetParameterSize(cudaEncodeResidual, (uint)(IntPtr.Size * 4) + sizeof(uint) * 2);
-				cuda.SetFunctionBlockShape(cudaEncodeResidual, threads_x, 256 / threads_x, 1);
-			}
-			//cuda.SetFunctionSharedSize(cudaEncodeResidual, (uint)(sizeof(float) * 1090));//545));
-			cuda.Launch(cudaEncodeResidual, max_order, 4 * _windowcount);
-			cuda.CopyDeviceToHost(cudaAutocor, autocorBuffer3int);
+			cuda.SetParameter(cudaEncodeResidual, 0, (uint)cudaResidualOutput.Pointer);
+			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size, (uint)cudaSamples.Pointer);
+			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 2, (uint)cudaCoeffs.Pointer);
+			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 3, (uint)cudaResidualTasks.Pointer);
+			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 4, (uint)frame.blocksize);
+			cuda.SetParameter(cudaEncodeResidual, IntPtr.Size * 4 + sizeof(uint), (uint)partSize);
+			cuda.SetParameterSize(cudaEncodeResidual, (uint)(IntPtr.Size * 4) + sizeof(uint) * 2U);
+			cuda.SetFunctionBlockShape(cudaEncodeResidual, 256, 1, 1);
+
+			cuda.CopyHostToDevice(cudaCoeffs, (IntPtr)coefs, (uint)(sizeof(int) * lpc.MAX_LPC_ORDER * (orders - 1) * 4 * _windowcount));
+			fixed (encodeResidualTaskStruct* ptr = residualTasks)
+				cuda.CopyHostToDevice(cudaResidualTasks, (IntPtr)ptr, (uint)(sizeof(encodeResidualTaskStruct) * nResidualTasks));
+			cuda.Launch(cudaEncodeResidual, partCount, nResidualTasks);
+			cuda.CopyDeviceToHost(cudaResidualOutput, residualOutput);
 
 			for (int ch = 0; ch < 4; ch++)
 			{
@@ -1400,17 +1418,20 @@ namespace CUETools.Codecs.FlaCuda
 				{
 					for (int order = 1; order <= max_order; order++)
 					{
-						uint nbits = (uint)autocorBuffer3int[order - 1 + (ch + 4 * iWindow) * max_order];
-						nbits += (uint)order * frame.subframes[ch].obits + 4 + 5 + (uint)order * cbits + 6;
+						int nbits = 0;
+						int index = (order - 1) + (orders - 1) * (iWindow + _windowcount * ch);
+						for (int p = 0; p < partCount; p++)
+							nbits += residualOutput[p + partCount * index];
+						nbits += order * (int)frame.subframes[ch].obits + 4 + 5 + order * (int)cbits + 6;
 						if (frame.subframes[ch].best.size > nbits)
 						{
-							frame.subframes[ch].best.size = nbits;
+							frame.subframes[ch].best.size = (uint)nbits;
 							frame.subframes[ch].best.order = order;
 							frame.subframes[ch].best.window = iWindow;
 							frame.subframes[ch].best.type = SubframeType.LPC;
-							frame.subframes[ch].lpc_ctx[iWindow].shift = shift[order - 1 + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER];
-							fixed(int *lcoefs = frame.subframes[ch].lpc_ctx[iWindow].coefs)
-								AudioSamples.MemCpy(lcoefs, coefs + (order - 1) * lpc.MAX_LPC_ORDER + (ch + 4 * iWindow) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER, order);
+							frame.subframes[ch].lpc_ctx[iWindow].shift = residualTasks[index].shift;
+							fixed (int* lcoefs = frame.subframes[ch].lpc_ctx[iWindow].coefs)
+								AudioSamples.MemCpy(lcoefs, coefs + residualTasks[index].coefsOffs, order);
 						}
 					}
 					//uint[] sums_buf = new uint[Flake.MAX_PARTITION_ORDER * Flake.MAX_PARTITIONS];
@@ -1418,11 +1439,11 @@ namespace CUETools.Codecs.FlaCuda
 					//	for (int order = 1; order <= max_order; order++)
 					//{
 					//	//uint nbits;
-					//	//find_optimal_rice_param(2*(uint)autocorBuffer3int[order - 1 + (ch + 4 * iWindow) * max_order], (uint)frame.blocksize, out nbits);
-					//	//uint nbits = (uint)autocorBuffer3int[order - 1 + (ch + 4 * iWindow) * max_order];
+					//	//find_optimal_rice_param(2*(uint)residualOutput[order - 1 + (ch + 4 * iWindow) * max_order], (uint)frame.blocksize, out nbits);
+					//	//uint nbits = (uint)residualOutput[order - 1 + (ch + 4 * iWindow) * max_order];
 					//	//nbits += (uint)order * frame.subframes[ch].obits + 4 + 5 + (uint)order * cbits + 6;
 					//	for (int ip = 0; ip < 64; ip++)
-					//		sums[6 * Flake.MAX_PARTITIONS + ip] = (uint)autocorBuffer3int[64 * (order - 1 + (ch + 4 * iWindow) * max_order) + ip];
+					//		sums[6 * Flake.MAX_PARTITIONS + ip] = (uint)residualOutput[64 * (order - 1 + (ch + 4 * iWindow) * max_order) + ip];
 					//	for (int ip = 5; ip >= 0; ip--)
 					//	{
 					//		int parts = (1 << ip);
@@ -1454,9 +1475,9 @@ namespace CUETools.Codecs.FlaCuda
 				for (int ch = 0; ch < channels; ch++)
 				{
 					frame.subframes[ch].best.size = AudioSamples.UINT32_MAX;
-					encode_selected_residual(frame, ch, eparams.prediction_type, eparams.order_method,
+					encode_selected_residual(frame, ch, eparams.prediction_type, eparams.order_method, 
 						frame.subframes[ch].best.window, frame.subframes[ch].best.order);
-				}
+				} 
 			}
 
 			BitWriter bitwriter = new BitWriter(frame_buffer, 0, max_frame_size);
@@ -1538,7 +1559,7 @@ namespace CUETools.Codecs.FlaCuda
 			return bs;
 		}
 
-		public void Write(int[,] buff, int pos, int sampleCount)
+		public unsafe void Write(int[,] buff, int pos, int sampleCount)
 		{
 			if (!inited)
 			{
@@ -1547,12 +1568,15 @@ namespace CUETools.Codecs.FlaCuda
 				cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));
 				cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
 				cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
-				//cudaAutocor = cuda.Allocate((uint)(sizeof(int) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 3);
 				cudaAutocor = cuda.Allocate((uint)(sizeof(float) * (lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS) * 22);
 				cudaSamples = cuda.Allocate((uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 4 : channels)));
-				cudaCoeffs = cuda.Allocate((uint)(sizeof(int) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
-				cudaShifts = cuda.Allocate((uint)(sizeof(int) * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
 				cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
+				cudaCoeffs = cuda.Allocate((uint)(sizeof(int) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_ORDER * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS));
+				cudaResidualTasks = cuda.Allocate((uint)(sizeof(encodeResidualTaskStruct) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS));
+				cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS * maxResidualTasks));
+				CUResult cuErr = CUDADriver.cuMemAllocHost(ref autocorBufferPtr, (uint)(sizeof(float)*(lpc.MAX_LPC_ORDER + 1) * (channels == 2 ? 4 : channels) * lpc.MAX_LPC_WINDOWS * 22));
+				if (cuErr != CUResult.Success)
+					throw new CUDAException(cuErr);
 				if (_IO == null)
 					_IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);
 				int header_size = flake_encode_init();
@@ -2037,4 +2061,13 @@ namespace CUETools.Codecs.FlaCuda
 			return 0;
 		}
 
-	}}
+	}
+
+	struct encodeResidualTaskStruct
+	{
+		public int residualOrder;
+		public int shift;
+		public int coefsOffs;
+		public int samplesOffs;
+	};
+}
diff --git a/CUETools.FlaCuda/flacuda.cu b/CUETools.FlaCuda/flacuda.cu
index e56391b..8ff4b11 100644
--- a/CUETools.FlaCuda/flacuda.cu
+++ b/CUETools.FlaCuda/flacuda.cu
@@ -78,76 +78,76 @@ extern "C" __global__ void cudaComputeAutocor(
 	output[(blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x] = shared.matrix[threadIdx.x] + shared.matrix[threadIdx.x + blockDim.x];
 }
 
+typedef struct
+{
+	int residualOrder;
+	int shift;
+	int coefsOffs;
+	int samplesOffs;
+} encodeResidualTaskStruct;
+
 extern "C" __global__ void cudaEncodeResidual(
 	int*output,
 	int*samples,
 	int*allcoefs,
-	int*shifts,
+	encodeResidualTaskStruct *tasks,
 	int frameSize,
-	int frameOffset)
+	int partSize
+	)
 {
 	__shared__ struct {
 		int data[256];
 		int residual[256];
 		int coefs[32];
-		int rice[16];
-		int shift;
+		int rice[32];
+		encodeResidualTaskStruct task;
 	} shared;
-	const int smpBase = (blockIdx.y & 3) * frameOffset;
-	const int residualOrder = blockIdx.x;
-	const int tid = threadIdx.x + __mul24(threadIdx.y, blockDim.x);
-	const int step = __mul24(blockDim.x, blockDim.y - 1);
-	int total = 0;
+	const int tid = threadIdx.x;
+	// fetch task data
+	if (tid < sizeof(encodeResidualTaskStruct) / sizeof(int))
+		((int*)&shared.task)[tid] = ((int*)(tasks + blockIdx.y))[tid];
+	__syncthreads();
+	const int pos = blockIdx.x * partSize;
+	const int residualOrder = shared.task.residualOrder;
+	const int dataLen = min(frameSize - pos, partSize + residualOrder + 1);
+	const int residualLen = dataLen - residualOrder - 1;
 
-	if (threadIdx.y == 0) shared.coefs[threadIdx.x] = threadIdx.x <= residualOrder ? allcoefs[threadIdx.x + residualOrder * 32 + blockIdx.y * 32 * 32] : 0;
-	if (tid == 0) shared.shift = shifts[32 * blockIdx.y + residualOrder];
-	if (tid == 0) shared.rice[15] = 0x1fffffff;
+	// fetch coeffs, inverting their order
+	if (tid <= residualOrder) shared.coefs[residualOrder - tid] = allcoefs[shared.task.coefsOffs + tid];
+	// fetch samples
+	shared.data[tid] = (tid < dataLen ? samples[shared.task.samplesOffs + pos + tid] : 0);
 
-	for(int pos = 0; pos < frameSize - residualOrder - 1; pos += step)
+	// compute residual
+	__syncthreads();
+	long sum = 0;
+	for (int c = 0; c <= residualOrder; c++)
+		sum += __mul24(shared.data[tid + c], shared.coefs[c]);
+	int res = shared.data[tid + residualOrder + 1] - (sum >> shared.task.shift);
+	shared.residual[tid] = __mul24(tid < residualLen, (2 * res) ^ (res >> 31));
+
+	__syncthreads();
+	// residual sum: reduction in shared mem
+	if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads();
+	if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads();
+	if (tid < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
+	shared.residual[tid] += shared.residual[tid + 16];
+	shared.residual[tid] += shared.residual[tid + 8];
+	shared.residual[tid] += shared.residual[tid + 4];
+	shared.residual[tid] += shared.residual[tid + 2];
+	shared.residual[tid] += shared.residual[tid + 1];
+	__syncthreads();
+
+	if (tid < 32)
 	{
-		// fetch blockDim.x * blockDim.y samples
-		shared.data[tid] = (threadIdx.y == 0 && pos != 0) ? shared.data[tid + step]
-			: (pos + tid < frameSize ? samples[smpBase + pos + tid] : 0);
-		__syncthreads();
-
-		long sum = 0;
-		for (int c = 0; c <= residualOrder; c++)
-			sum += __mul24(shared.data[tid + c], shared.coefs[residualOrder - c]);
-		int res = shared.data[tid + residualOrder + 1] - (sum >> shared.shift);
-		//__syncthreads();
-
-		int limit = min(frameSize - pos - residualOrder - 1, step);
-		shared.residual[tid] = __mul24(tid < limit, (2 * res) ^ (res >> 31));
-
-		__syncthreads();
-		// reduction in shared mem
-		if (tid < 128) shared.residual[tid] += shared.residual[tid + 128]; __syncthreads();
-		if (tid < 64) shared.residual[tid] += shared.residual[tid + 64]; __syncthreads();
-		if (tid < 32) shared.residual[tid] += shared.residual[tid + 32]; __syncthreads();
-		shared.residual[tid] += shared.residual[tid + 16];
-		shared.residual[tid] += shared.residual[tid + 8];
-		shared.residual[tid] += shared.residual[tid + 4];
-		shared.residual[tid] += shared.residual[tid + 2];
-		shared.residual[tid] += shared.residual[tid + 1];
-		__syncthreads();
-
-		if (tid < 16)
-		{
-			shared.rice[tid] = __mul24(tid == 15, 0x7fffff) + limit * (tid + 1) + ((shared.residual[0] - (limit >> 1)) >> tid);
-			shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]);
-			shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]);
-			shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
-			total += min(shared.rice[tid], shared.rice[tid + 1]);
-		}
-		__syncthreads();
+		// rice parameter search
+		shared.rice[tid] = __mul24(tid >= 15, 0x7fffff) + residualLen * (tid + 1) + ((shared.residual[0] - (residualLen >> 1)) >> tid);
+		shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 8]);
+		shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 4]);
+		shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 2]);
+		shared.rice[tid] = min(shared.rice[tid], shared.rice[tid + 1]);
 	}
-	__syncthreads();
 	if (tid == 0)
-		output[blockIdx.x + blockIdx.y * gridDim.x] = total;
-#ifdef __DEVICE_EMULATION__
-	if (tid == 0)
-		printf("%d,%d:%d\n", blockIdx.x, blockIdx.y, total);
-#endif
+		output[blockIdx.x + blockIdx.y * gridDim.x] = shared.rice[0];
 }
 
 #if 0
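Note (illustrative, not part of the patch): the value cudaEncodeResidual writes per partition is the usual Rice-coding size estimate. Each residual r is folded to the non-negative value (2*r)^(r>>31), the folded values are summed over the partition, and the cost for parameter k is estimated as len*(k+1) + ((sum - (len>>1)) >> k), with k >= 15 heavily penalized; the kernel evaluates all candidate k in parallel (threads 0..31) over the reduced sum in shared.residual[0]. A minimal host-side C# sketch of the same estimate, usable for sanity-checking residualOutput (the class and method names here are hypothetical, not from the patch):

	using System;

	// Illustrative sketch only; mirrors the cost model used in cudaEncodeResidual.
	static class RiceEstimate
	{
		// Estimated bits to Rice-code `len` residuals starting at `offset`.
		public static int PartitionBits(int[] residual, int offset, int len)
		{
			long sum = 0;
			for (int i = 0; i < len; i++)
			{
				int r = residual[offset + i];
				sum += (2 * r) ^ (r >> 31);   // same folding as the kernel
			}
			int best = int.MaxValue;
			for (int k = 0; k < 15; k++)      // kernel penalizes k >= 15
				best = Math.Min(best, len * (k + 1) + (int)((sum - (len >> 1)) >> k));
			return best;
		}
	}

The host loop in encode_residual adds the per-partition estimates for each (order, window, channel) task and then the fixed subframe overhead (warm-up samples, coefficient bits, shift and header fields) before comparing against frame.subframes[ch].best.size.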