From 492f5b845a43e1542b2d822946fd9a2ccb1ea58d Mon Sep 17 00:00:00 2001 From: chudov Date: Wed, 30 Dec 2009 03:39:47 +0000 Subject: [PATCH] optimizations --- CUETools.FlaCuda/FlaCudaWriter.cs | 255 ++++++++++++++++++------------ 1 file changed, 150 insertions(+), 105 deletions(-) diff --git a/CUETools.FlaCuda/FlaCudaWriter.cs b/CUETools.FlaCuda/FlaCudaWriter.cs index 8e6f2b5..e1e934e 100644 --- a/CUETools.FlaCuda/FlaCudaWriter.cs +++ b/CUETools.FlaCuda/FlaCudaWriter.cs @@ -498,46 +498,18 @@ namespace CUETools.Codecs.FlaCuda } /// - /// Copy channel-interleaved input samples into separate subframes + /// Copy input samples into task buffer /// /// /// /// unsafe void copy_samples(int[,] samples, int pos, int block, FlaCudaTask task) { - int* s = ((int*)task.samplesBufferPtr) + samplesInBuffer; - fixed (int* src = &samples[pos, 0]) - { - short* dst = ((short*)task.samplesBytesPtr) + samplesInBuffer * channels; - for (int i = 0; i < block * channels; i++) - dst[i] = (short)src[i]; - if (channels == 2 && eparams.do_midside) - channel_decorrelation(s, s + FlaCudaWriter.MAX_BLOCKSIZE, - s + 2 * FlaCudaWriter.MAX_BLOCKSIZE, s + 3 * FlaCudaWriter.MAX_BLOCKSIZE, src, block); - else - for (int ch = 0; ch < channels; ch++) - { - int* psamples = s + ch * FlaCudaWriter.MAX_BLOCKSIZE; - for (int i = 0; i < block; i++) - psamples[i] = src[i * channels + ch]; - } - } + AudioSamples.FLACSamplesToBytes(samples, pos, ((byte*)task.samplesBytesPtr) + samplesInBuffer * channels * 2, + block, channels, (int)bits_per_sample); samplesInBuffer += block; } - unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int* src, int blocksize) - { - for (int i = 0; i < blocksize; i++) - { - int l = *(src++); - int r = *(src++); - leftS[i] = l; - rightS[i] = r; - leftM[i] = (l + r) >> 1; - rightM[i] = l - r; - } - } - unsafe void encode_residual_fixed(int* res, int* smp, int n, int order) { int i; @@ -1027,81 +999,87 @@ namespace CUETools.Codecs.FlaCuda task.frameSize = blocksize; } - unsafe void encode_residual(FlacFrame frame) + unsafe void encode_residual(FlaCudaTask task) { + bool unpacked = false; + unpack_samples(task, Math.Min(32, task.frameSize)); for (int ch = 0; ch < channels; ch++) { - switch (frame.subframes[ch].best.type) + switch (task.frame.subframes[ch].best.type) { case SubframeType.Constant: break; case SubframeType.Verbatim: + if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; break; case SubframeType.Fixed: if (encode_on_cpu) { - encode_residual_fixed(frame.subframes[ch].best.residual, frame.subframes[ch].samples, - frame.blocksize, frame.subframes[ch].best.order); + if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; + encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, + task.frame.blocksize, task.frame.subframes[ch].best.order); - int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order); - int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order); - uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 6; - frame.subframes[ch].best.size = bits + calc_rice_params(frame.subframes[ch].best.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order); + int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); + int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); + uint bits = (uint)task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits + 6; + task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order); } break; case SubframeType.LPC: - fixed (int* coefs = frame.subframes[ch].best.coefs) + fixed (int* coefs = task.frame.subframes[ch].best.coefs) { ulong csum = 0; - for (int i = frame.subframes[ch].best.order; i > 0; i--) + for (int i = task.frame.subframes[ch].best.order; i > 0; i--) csum += (ulong)Math.Abs(coefs[i - 1]); - if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32) - lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift); - else if (encode_on_cpu) - lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift); - if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32 || encode_on_cpu) + if ((csum << (int)task.frame.subframes[ch].obits) >= 1UL << 32 || encode_on_cpu) { - int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order); - int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order); - uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits + 6; - //uint oldsize = frame.subframes[ch].best.size; - frame.subframes[ch].best.size = bits + calc_rice_params(frame.subframes[ch].best.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order); - //if (frame.subframes[ch].best.size > frame.subframes[ch].obits * (uint)frame.blocksize && - // oldsize <= frame.subframes[ch].obits * (uint)frame.blocksize) + if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; + if ((csum << (int)task.frame.subframes[ch].obits) >= 1UL << 32) + lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift); + else + lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift); + int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); + int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); + uint bits = (uint)task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits + 4 + 5 + (uint)task.frame.subframes[ch].best.order * (uint)task.frame.subframes[ch].best.cbits + 6; + //uint oldsize = task.frame.subframes[ch].best.size; + task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order); + //if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize && + // oldsize <= task.frame.subframes[ch].obits * (uint)task.frame.blocksize) // throw new Exception("oops"); } #if DEBUG else { // residual - int len = frame.subframes[ch].best.order * (int)frame.subframes[ch].obits + 6 + - 4 + 5 + frame.subframes[ch].best.order * frame.subframes[ch].best.cbits + - (4 << frame.subframes[ch].best.rc.porder); - int j = frame.subframes[ch].best.order; - int psize = frame.blocksize >> frame.subframes[ch].best.rc.porder; - for (int p = 0; p < (1 << frame.subframes[ch].best.rc.porder); p++) + int len = task.frame.subframes[ch].best.order * (int)task.frame.subframes[ch].obits + 6 + + 4 + 5 + task.frame.subframes[ch].best.order * task.frame.subframes[ch].best.cbits + + (4 << task.frame.subframes[ch].best.rc.porder); + int j = task.frame.subframes[ch].best.order; + int psize = task.frame.blocksize >> task.frame.subframes[ch].best.rc.porder; + for (int p = 0; p < (1 << task.frame.subframes[ch].best.rc.porder); p++) { - int k = frame.subframes[ch].best.rc.rparams[p]; - int cnt = p == 0 ? psize - frame.subframes[ch].best.order : psize; + int k = task.frame.subframes[ch].best.rc.rparams[p]; + int cnt = p == 0 ? psize - task.frame.subframes[ch].best.order : psize; len += (k + 1) * cnt; for (int i = j; i < j + cnt; i++) - len += (((frame.subframes[ch].best.residual[i] << 1) ^ (frame.subframes[ch].best.residual[i] >> 31)) >> k); + len += (((task.frame.subframes[ch].best.residual[i] << 1) ^ (task.frame.subframes[ch].best.residual[i] >> 31)) >> k); j += cnt; } - if (len != frame.subframes[ch].best.size) - throw new Exception(string.Format("length mismatch: {0} vs {1}", len, frame.subframes[ch].best.size)); + if (len != task.frame.subframes[ch].best.size) + throw new Exception(string.Format("length mismatch: {0} vs {1}", len, task.frame.subframes[ch].best.size)); } #endif } break; } - if (frame.subframes[ch].best.size > frame.subframes[ch].obits * (uint)frame.blocksize) + if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize) { #if DEBUG throw new Exception("larger than verbatim"); #endif - frame.subframes[ch].best.type = SubframeType.Verbatim; - frame.subframes[ch].best.size = frame.subframes[ch].obits * (uint)frame.blocksize; + task.frame.subframes[ch].best.type = SubframeType.Verbatim; + task.frame.subframes[ch].best.size = task.frame.subframes[ch].obits * (uint)task.frame.blocksize; + if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; } } } @@ -1128,11 +1106,12 @@ namespace CUETools.Codecs.FlaCuda for (int ch = 0; ch < channels; ch++) { + int index = ch + iFrame * channels; + frame.subframes[ch].best.residual = ((int*)task.residualBufferPtr) + task.BestResidualTasks[index].residualOffs; frame.subframes[ch].best.type = SubframeType.Verbatim; frame.subframes[ch].best.size = frame.subframes[ch].obits * (uint)frame.blocksize; frame.subframes[ch].wbits = 0; - int index = ch + iFrame * channels; if (task.BestResidualTasks[index].size < 0) throw new Exception("internal error"); if (frame.blocksize > Math.Max(4, eparams.max_prediction_order) && frame.subframes[ch].best.size > task.BestResidualTasks[index].size) @@ -1152,10 +1131,11 @@ namespace CUETools.Codecs.FlaCuda frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; if (!encode_on_cpu && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) { - AudioSamples.MemCpy(frame.subframes[ch].best.residual, (int*)task.residualBufferPtr + task.BestResidualTasks[index].residualOffs, frame.blocksize); int* riceParams = ((int*)task.bestRiceParamsPtr) + (index << task.max_porder); - for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++) - frame.subframes[ch].best.rc.rparams[i] = riceParams[i]; + fixed (int* dstParams = frame.subframes[ch].best.rc.rparams) + AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder)); + //for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++) + // frame.subframes[ch].best.rc.rparams[i] = riceParams[i]; } } } @@ -1358,21 +1338,98 @@ namespace CUETools.Codecs.FlaCuda cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(FlaCudaSubframeTask) * channels * task.frameCount), task.stream); } + /// + /// Copy channel-interleaved input samples into separate subframes + /// + /// + /// + unsafe void unpack_samples(FlaCudaTask task, int count) + { + int iFrame = task.frame.frame_number; + short* src = ((short*)task.samplesBytesPtr) + iFrame * channels * task.frameSize; + + switch (task.frame.ch_mode) + { + case ChannelMode.NotStereo: + for (int ch = 0; ch < channels; ch++) + { + int* s = task.frame.subframes[ch].samples; + for (int i = 0; i < count; i++) + s[i] = src[i * channels + ch]; + } + break; + case ChannelMode.LeftRight: + { + int* left = task.frame.subframes[0].samples; + int* right = task.frame.subframes[1].samples; + for (int i = 0; i < count; i++) + { + int l = *(src++); + int r = *(src++); + left[i] = l; + right[i] = r; + } + break; + } + case ChannelMode.LeftSide: + { + int* left = task.frame.subframes[0].samples; + int* right = task.frame.subframes[1].samples; + for (int i = 0; i < count; i++) + { + int l = *(src++); + int r = *(src++); + left[i] = l; + right[i] = l - r; + } + break; + } + case ChannelMode.RightSide: + { + int* left = task.frame.subframes[0].samples; + int* right = task.frame.subframes[1].samples; + for (int i = 0; i < count; i++) + { + int l = *(src++); + int r = *(src++); + left[i] = l - r; + right[i] = r; + } + break; + } + case ChannelMode.MidSide: + { + int* left = task.frame.subframes[0].samples; + int* right = task.frame.subframes[1].samples; + for (int i = 0; i < count; i++) + { + int l = *(src++); + int r = *(src++); + left[i] = (l + r) >> 1; + right[i] = l - r; + } + break; + } + } + } + unsafe int encode_frame(bool doMidside, int channelCount, int iFrame, FlaCudaTask task, int current_frame_number) { - fixed (int* r = task.residualBuffer) + task.frame.InitSize(task.frameSize, eparams.variable_block_size != 0); + task.frame.frame_number = iFrame; + task.frame.ch_mode = ChannelMode.NotStereo; + + fixed (int* smp = task.samplesBuffer) { - task.frame.InitSize(task.frameSize, eparams.variable_block_size != 0); for (int ch = 0; ch < channelCount; ch++) - { - int* s = ((int*)task.samplesBufferPtr) + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * task.frameSize; - task.frame.subframes[ch].Init(s, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, + task.frame.subframes[ch].Init( + smp + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * task.frameSize, + ((int*)task.residualBufferPtr) + ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * task.frameSize, bits_per_sample + (doMidside && ch == 3 ? 1U : 0U), 0); - } select_best_methods(task.frame, channelCount, iFrame, task); - - encode_residual(task.frame); + //unpack_samples(task); + encode_residual(task); //task.frame.writer.Reset(); task.frame.frame_number = current_frame_number; @@ -1401,13 +1458,6 @@ namespace CUETools.Codecs.FlaCuda frame_count += nFrames; frame_pos += nFrames * blocksize; cuda.CopyHostToDeviceAsync(task.cudaSamplesBytes, task.samplesBytesPtr, (uint)(sizeof(short) * channels * blocksize * nFrames), task.stream); - if (task.verify != null) - { - int* r = (int*)task.samplesBufferPtr; - fixed (int* s = task.verifyBuffer) - for (int ch = 0; ch < channels; ch++) - AudioSamples.MemCpy(s + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * FlaCudaWriter.MAX_BLOCKSIZE, task.frameSize * task.frameCount); - } } unsafe void run_GPU_task(FlaCudaTask task) @@ -1457,11 +1507,20 @@ namespace CUETools.Codecs.FlaCuda int decoded = task.verify.DecodeFrame(task.frame.writer.Buffer, task.frame.writer_offset, fs); if (decoded != fs || task.verify.Remaining != (ulong)task.frameSize) throw new Exception("validation failed! frame size mismatch"); - fixed (int* s = task.verifyBuffer, r = task.verify.Samples) + fixed (int* r = task.verify.Samples) { for (int ch = 0; ch < channels; ch++) - if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize)) - throw new Exception("validation failed!"); + { + short* res = ((short*)task.samplesBytesPtr) + iFrame * channels * task.frameSize + ch; + int* smp = r + ch * Flake.MAX_BLOCKSIZE; + for (int i = task.frameSize; i > 0; i--) + { + //if (AudioSamples.MemCmp(s + iFrame * task.frameSize + ch * FlaCudaWriter.MAX_BLOCKSIZE, r + ch * Flake.MAX_BLOCKSIZE, task.frameSize)) + if (*res != *(smp++)) + throw new Exception(string.Format("validation failed! iFrame={0}, ch={1}", iFrame, ch)); + res += channels; + } + } } } @@ -1649,13 +1708,7 @@ namespace CUETools.Codecs.FlaCuda } int bs = eparams.block_size * nFrames; if (bs < samplesInBuffer) - { - int* s1 = (int*)task1.samplesBufferPtr; - int* s2 = (int*)task2.samplesBufferPtr; - for (int ch = 0; ch < channelCount; ch++) - AudioSamples.MemCpy(s2 + ch * FlaCudaWriter.MAX_BLOCKSIZE, s1 + bs + ch * FlaCudaWriter.MAX_BLOCKSIZE, samplesInBuffer - bs); AudioSamples.MemCpy(((short*)task2.samplesBytesPtr), ((short*)task1.samplesBytesPtr) + bs * channels, (samplesInBuffer - bs) * channels); - } samplesInBuffer -= bs; FlaCudaTask tmp = task1; task1 = task2; @@ -2174,14 +2227,12 @@ namespace CUETools.Codecs.FlaCuda public CUdeviceptr cudaResidualOutput; public CUdeviceptr cudaBestResidualTasks; public IntPtr samplesBytesPtr = IntPtr.Zero; - public IntPtr samplesBufferPtr = IntPtr.Zero; public IntPtr residualBufferPtr = IntPtr.Zero; public IntPtr bestRiceParamsPtr = IntPtr.Zero; public IntPtr residualTasksPtr = IntPtr.Zero; public IntPtr bestResidualTasksPtr = IntPtr.Zero; public CUstream stream; - public int[] residualBuffer; - public int[] verifyBuffer; + public int[] samplesBuffer; public byte[] outputBuffer; public int outputSize = 0; public int frameSize = 0; @@ -2228,8 +2279,6 @@ namespace CUETools.Codecs.FlaCuda CUResult cuErr = CUResult.Success; if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, (uint)samplesBufferLen/2); - if (cuErr == CUResult.Success) - cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen); if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen); if (cuErr == CUResult.Success) @@ -2241,7 +2290,6 @@ namespace CUETools.Codecs.FlaCuda if (cuErr != CUResult.Success) { if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero; - if (samplesBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBufferPtr); samplesBufferPtr = IntPtr.Zero; if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero; if (bestRiceParamsPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestRiceParamsPtr); bestRiceParamsPtr = IntPtr.Zero; if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero; @@ -2273,9 +2321,7 @@ namespace CUETools.Codecs.FlaCuda cudaFindPartitionOrder = cuda.GetModuleFunction("cudaFindPartitionOrder"); stream = cuda.CreateStream(); - verifyBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; // should be channels, not channelCount. And should null if not doing verify! - //residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * Math.Max(10, channelCount + 1)]; - residualBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * (channels == 2 ? 10 : channels + 1)]; + samplesBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; outputBuffer = new byte[max_frame_size * FlaCudaWriter.maxFrames + 1]; frame = new FlacFrame(channelCount); frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length); @@ -2299,7 +2345,6 @@ namespace CUETools.Codecs.FlaCuda cuda.Free(cudaResidualOutput); cuda.Free(cudaBestResidualTasks); CUDADriver.cuMemFreeHost(samplesBytesPtr); - CUDADriver.cuMemFreeHost(samplesBufferPtr); CUDADriver.cuMemFreeHost(residualBufferPtr); CUDADriver.cuMemFreeHost(bestRiceParamsPtr); CUDADriver.cuMemFreeHost(residualTasksPtr);