From cab8e6da6b6ffe0bf50ec92ff5a759b51ae7fa46 Mon Sep 17 00:00:00 2001 From: chudov Date: Fri, 29 Oct 2010 16:51:11 +0000 Subject: [PATCH] testing on Fermi --- CUETools.Codecs.FLACCL/FLACCLWriter.cs | 276 +++++++++++++++++++------ CUETools.Codecs.FLACCL/flac.cl | 123 ++++++----- CUETools.Codecs.FLACCL/flaccpu.cl | 20 +- 3 files changed, 307 insertions(+), 112 deletions(-) diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index 0626210..03d96a6 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -36,7 +36,8 @@ namespace CUETools.Codecs.FLACCL public FLACCLWriterSettings() { this.DoVerify = false; - this.GPUOnly = true; + this.GPUOnly = true; + this.MappedMemory = false; this.DoMD5 = true; this.GroupSize = 64; this.DeviceType = OpenCLDeviceType.GPU; @@ -56,6 +57,10 @@ namespace CUETools.Codecs.FLACCL [SRDescription(typeof(Properties.Resources), "DescriptionGPUOnly")] public bool GPUOnly { get; set; } + [DefaultValue(false)] + [SRDescription(typeof(Properties.Resources), "DescriptionMappedMemory")] + public bool MappedMemory { get; set; } + [DefaultValue(64)] [SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")] public int GroupSize { get; set; } @@ -247,6 +252,7 @@ namespace CUETools.Codecs.FLACCL { _settings.GroupSize = 1; _settings.GPUOnly = false; + _settings.MappedMemory = true; } eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly); } @@ -956,7 +962,7 @@ namespace CUETools.Codecs.FLACCL if ((eparams.window_function & flag) == 0 || task.nWindowFunctions == lpc.MAX_LPC_WINDOWS) return; - func(((float*)task.clWindowFunctions.HostPtr) + task.nWindowFunctions * task.frameSize, task.frameSize); + func(((float*)task.clWindowFunctionsPtr) + task.nWindowFunctions * task.frameSize, task.frameSize); //int sz = _windowsize; //float* pos = window + _windowcount * FLACCLWriter.MAX_BLOCKSIZE * 2; //do @@ -983,12 +989,16 @@ namespace CUETools.Codecs.FLACCL calculate_window(task, lpc.window_bartlett, WindowFunction.Bartlett); if (task.nWindowFunctions == 0) throw new Exception("invalid windowfunction"); - task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, true, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctions.HostPtr); + if (!_settings.MappedMemory) + task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, true, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr); } task.nResidualTasks = 0; task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window); task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + (eparams.do_constant ? 1 : 0) + Math.Max(0, 1 + eparams.max_fixed_order - eparams.min_fixed_order); + if (task.nResidualTasksPerChannel > 32) + throw new Exception("too many tasks"); + //if (task.nResidualTasksPerChannel >= 4) // task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7; for (int iFrame = 0; iFrame < nFrames; iFrame++) @@ -1082,7 +1092,8 @@ namespace CUETools.Codecs.FLACCL if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen) throw new Exception("oops"); - task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasks.HostPtr); + if (!_settings.MappedMemory) + task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr); } unsafe void encode_residual(FLACCLTask task) @@ -1192,7 +1203,7 @@ namespace CUETools.Codecs.FLACCL for (int ch = 0; ch < channels; ch++) { int index = ch + iFrame * channels; - frame.subframes[ch].best.residual = ((int*)task.clResidual.HostPtr) + task.BestResidualTasks[index].residualOffs; + frame.subframes[ch].best.residual = ((int*)task.clResidualPtr) + task.BestResidualTasks[index].residualOffs; frame.subframes[ch].best.type = SubframeType.Verbatim; frame.subframes[ch].best.size = (uint)(frame.subframes[ch].obits * frame.blocksize); frame.subframes[ch].wbits = 0; @@ -1213,7 +1224,7 @@ namespace CUETools.Codecs.FLACCL frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) { - int* riceParams = ((int*)task.clBestRiceParams.HostPtr) + (index << task.max_porder); + int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder); fixed (int* dstParams = frame.subframes[ch].best.rc.rparams) AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder)); //for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++) @@ -1240,7 +1251,7 @@ namespace CUETools.Codecs.FLACCL unsafe void unpack_samples(FLACCLTask task, int count) { int iFrame = task.frame.frame_number; - short* src = ((short*)task.clSamplesBytes.HostPtr) + iFrame * channels * task.frameSize; + short* src = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize; switch (task.frame.ch_mode) { @@ -1327,7 +1338,7 @@ namespace CUETools.Codecs.FLACCL for (int ch = 0; ch < channelCount; ch++) task.frame.subframes[ch].Init( smp + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize, - ((int*)task.clResidual.HostPtr) + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize, + ((int*)task.clResidualPtr) + ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * task.frameSize, _pcm.BitsPerSample + (doMidside && ch == 3 ? 1 : 0), 0); select_best_methods(task.frame, channelCount, iFrame, task); @@ -1360,7 +1371,8 @@ namespace CUETools.Codecs.FLACCL task.framePos = frame_pos; frame_count += nFrames; frame_pos += nFrames * blocksize; - task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, sizeof(short) * channels * blocksize * nFrames, task.clSamplesBytes.HostPtr); + if (!_settings.MappedMemory) + task.openCLCQ.EnqueueWriteBuffer(task.clSamplesBytes, false, 0, sizeof(short) * channels * blocksize * nFrames, task.clSamplesBytesPtr); //task.openCLCQ.EnqueueUnmapMemObject(task.clSamplesBytes, task.clSamplesBytes.HostPtr); //task.openCLCQ.EnqueueMapBuffer(task.clSamplesBytes, true, MapFlags.WRITE, 0, task.samplesBufferLen / 2); } @@ -1402,7 +1414,7 @@ namespace CUETools.Codecs.FLACCL { for (int ch = 0; ch < channels; ch++) { - short* res = ((short*)task.clSamplesBytes.HostPtr) + iFrame * channels * task.frameSize + ch; + short* res = ((short*)task.clSamplesBytesPtr) + iFrame * channels * task.frameSize + ch; int* smp = r + ch * Flake.MAX_BLOCKSIZE; for (int i = task.frameSize; i > 0; i--) { @@ -1589,7 +1601,7 @@ namespace CUETools.Codecs.FLACCL int block = Math.Min(buff.Length - pos, eparams.block_size * max_frames - samplesInBuffer); fixed (byte* buf = buff.Bytes) - AudioSamples.MemCpy(((byte*)task1.clSamplesBytes.HostPtr) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign); + AudioSamples.MemCpy(((byte*)task1.clSamplesBytesPtr) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign); samplesInBuffer += block; pos += block; @@ -1703,8 +1715,8 @@ namespace CUETools.Codecs.FLACCL samplesInBuffer -= bs; if (samplesInBuffer > 0) AudioSamples.MemCpy( - ((byte*)task2.clSamplesBytes.HostPtr), - ((byte*)task1.clSamplesBytes.HostPtr) + bs * _pcm.BlockAlign, + ((byte*)task2.clSamplesBytesPtr), + ((byte*)task1.clSamplesBytesPtr) + bs * _pcm.BlockAlign, samplesInBuffer * _pcm.BlockAlign); FLACCLTask tmp = task1; task1 = task2; @@ -2226,7 +2238,22 @@ namespace CUETools.Codecs.FLACCL public Mem clResidualOutput; public Mem clBestResidualTasks; public Mem clWindowFunctions; - public int[] samplesBuffer; + + public Mem clSamplesBytesPinned; + public Mem clResidualPinned; + public Mem clBestRiceParamsPinned; + public Mem clResidualTasksPinned; + public Mem clBestResidualTasksPinned; + public Mem clWindowFunctionsPinned; + + public IntPtr clSamplesBytesPtr; + public IntPtr clResidualPtr; + public IntPtr clBestRiceParamsPtr; + public IntPtr clResidualTasksPtr; + public IntPtr clBestResidualTasksPtr; + public IntPtr clWindowFunctionsPtr; + + public int[] samplesBuffer; public byte[] outputBuffer; public int outputSize = 0; public int frameSize = 0; @@ -2268,29 +2295,77 @@ namespace CUETools.Codecs.FLACCL #endif openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLProgram.Context.Devices[0], prop); - residualTasksLen = sizeof(FLACCLSubframeTask) * channelsCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames; + int MAX_ORDER = this.writer.eparams.max_prediction_order; + residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * FLACCLWriter.maxFrames; bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * FLACCLWriter.maxFrames; samplesBufferLen = sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channelsCount; int partitionsLen = sizeof(int) * (30 << 8) * channels * FLACCLWriter.maxFrames; int riceParamsLen = sizeof(int) * (4 << 8) * channels * FLACCLWriter.maxFrames; - int lpcDataLen = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelsCount * FLACCLWriter.maxFrames; + int autocorLen = sizeof(float) * (MAX_ORDER + 1) * lpc.MAX_LPC_WINDOWS * channelsCount * FLACCLWriter.maxFrames; + int lpcDataLen = autocorLen * 32; + int resOutLen = sizeof(int) * channelsCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * FLACCLWriter.maxFrames; + int wndLen = sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE /** 2*/ * lpc.MAX_LPC_WINDOWS; - clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY | MemFlags.ALLOC_HOST_PTR, (uint)samplesBufferLen / 2); - clSamples = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen); - clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen); - clLPCData = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, lpcDataLen); - clPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen); - clRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen); - clBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4); - clAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelsCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FLACCLWriter.maxFrames); - clResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen); - clBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen); - clResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(int) * channelsCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FLACCLWriter.maxResidualParts*/ * FLACCLWriter.maxFrames); - clWindowFunctions = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY | MemFlags.ALLOC_HOST_PTR, sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE /** 2*/ * lpc.MAX_LPC_WINDOWS); + if (!writer._settings.MappedMemory) + { + clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen / 2); + clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen); + clBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen / 4); + clResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, residualTasksLen); + clBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, bestResidualTasksLen); + clWindowFunctions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, wndLen); - //openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.WRITE, 0, samplesBufferLen / 2); + clSamplesBytesPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen / 2); + clResidualPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen); + clBestRiceParamsPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4); + clResidualTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen); + clBestResidualTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen); + clWindowFunctionsPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, wndLen); - clComputeAutocor = openCLProgram.CreateKernel("clComputeAutocor"); + clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytesPinned, true, MapFlags.WRITE, 0, samplesBufferLen / 2); + clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidualPinned, true, MapFlags.WRITE, 0, samplesBufferLen); + clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParamsPinned, true, MapFlags.WRITE, 0, riceParamsLen / 4); + clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasksPinned, true, MapFlags.WRITE, 0, residualTasksLen); + clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasksPinned, true, MapFlags.WRITE, 0, bestResidualTasksLen); + clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctionsPinned, true, MapFlags.WRITE, 0, wndLen); + } + else + { + clSamplesBytes = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, (uint)samplesBufferLen / 2); + clResidual = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, samplesBufferLen); + clBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4); + clResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen); + clBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen); + clWindowFunctions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, wndLen); + + clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.WRITE, 0, samplesBufferLen / 2); + clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidual, true, MapFlags.WRITE, 0, samplesBufferLen); + clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParams, true, MapFlags.WRITE, 0, riceParamsLen / 4); + clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasks, true, MapFlags.WRITE, 0, residualTasksLen); + clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasks, true, MapFlags.WRITE, 0, bestResidualTasksLen); + clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctions, true, MapFlags.WRITE, 0, wndLen); + + //clSamplesBytesPtr = clSamplesBytes.HostPtr; + //clResidualPtr = clResidual.HostPtr; + //clBestRiceParamsPtr = clBestRiceParams.HostPtr; + //clResidualTasksPtr = clResidualTasks.HostPtr; + //clBestResidualTasksPtr = clBestResidualTasks.HostPtr; + //clWindowFunctionsPtr = clWindowFunctions.HostPtr; + } + + clSamples = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen); + clLPCData = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, lpcDataLen); + clAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, autocorLen); + clResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, resOutLen); + if (writer._settings.GPUOnly) + { + clPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen); + clRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen); + } + + //openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.WRITE, 0, samplesBufferLen / 2); + + clComputeAutocor = openCLProgram.CreateKernel("clComputeAutocor"); clStereoDecorr = openCLProgram.CreateKernel("clStereoDecorr"); //cudaChannelDecorr = openCLProgram.CreateKernel("clChannelDecorr"); clChannelDecorr2 = openCLProgram.CreateKernel("clChannelDecorr2"); @@ -2337,6 +2412,8 @@ namespace CUETools.Codecs.FLACCL workThread = null; } + openCLCQ.Finish(); + clComputeAutocor.Dispose(); clStereoDecorr.Dispose(); //cudaChannelDecorr.Dispose(); @@ -2357,13 +2434,53 @@ namespace CUETools.Codecs.FLACCL clSumPartition.Dispose(); clFindRiceParameter.Dispose(); clFindPartitionOrder.Dispose(); - } + + clPartitions.Dispose(); + clRiceParams.Dispose(); + } + + if (!writer._settings.MappedMemory) + { + if (clSamplesBytesPtr != IntPtr.Zero) + openCLCQ.EnqueueUnmapMemObject(clSamplesBytesPinned, clSamplesBytesPtr); + clSamplesBytesPtr = IntPtr.Zero; + if (clResidualPtr != IntPtr.Zero) + openCLCQ.EnqueueUnmapMemObject(clResidualPinned, clResidualPtr); + clResidualPtr = IntPtr.Zero; + if (clBestRiceParamsPtr != IntPtr.Zero) + openCLCQ.EnqueueUnmapMemObject(clBestRiceParamsPinned, clBestRiceParamsPtr); + clBestRiceParamsPtr = IntPtr.Zero; + if (clResidualTasksPtr != IntPtr.Zero) + openCLCQ.EnqueueUnmapMemObject(clResidualTasksPinned, clResidualTasksPtr); + clResidualTasksPtr = IntPtr.Zero; + if (clBestResidualTasksPtr != IntPtr.Zero) + openCLCQ.EnqueueUnmapMemObject(clBestResidualTasksPinned, clBestResidualTasksPtr); + clBestResidualTasksPtr = IntPtr.Zero; + if (clWindowFunctionsPtr != IntPtr.Zero) + openCLCQ.EnqueueUnmapMemObject(clWindowFunctionsPinned, clWindowFunctionsPtr); + clWindowFunctionsPtr = IntPtr.Zero; + + clSamplesBytesPinned.Dispose(); + clResidualPinned.Dispose(); + clBestRiceParamsPinned.Dispose(); + clResidualTasksPinned.Dispose(); + clBestResidualTasksPinned.Dispose(); + clWindowFunctionsPinned.Dispose(); + } + else + { + openCLCQ.EnqueueUnmapMemObject(clSamplesBytes, clSamplesBytesPtr); + openCLCQ.EnqueueUnmapMemObject(clResidual, clResidualPtr); + openCLCQ.EnqueueUnmapMemObject(clBestRiceParams, clBestRiceParamsPtr); + openCLCQ.EnqueueUnmapMemObject(clResidualTasks, clResidualTasksPtr); + openCLCQ.EnqueueUnmapMemObject(clBestResidualTasks, clBestResidualTasksPtr); + openCLCQ.EnqueueUnmapMemObject(clWindowFunctions, clWindowFunctionsPtr); + } clSamples.Dispose(); clSamplesBytes.Dispose(); clLPCData.Dispose(); clResidual.Dispose(); - clPartitions.Dispose(); clAutocorOutput.Dispose(); clResidualTasks.Dispose(); clResidualOutput.Dispose(); @@ -2371,13 +2488,15 @@ namespace CUETools.Codecs.FLACCL clWindowFunctions.Dispose(); openCLCQ.Dispose(); - } + + GC.SuppressFinalize(this); + } public unsafe FLACCLSubframeTask* ResidualTasks { get { - return (FLACCLSubframeTask*)clResidualTasks.HostPtr; + return (FLACCLSubframeTask*)clResidualTasksPtr; } } @@ -2385,7 +2504,7 @@ namespace CUETools.Codecs.FLACCL { get { - return (FLACCLSubframeTask*)clBestResidualTasks.HostPtr; + return (FLACCLSubframeTask*)clBestResidualTasksPtr; } } @@ -2398,20 +2517,20 @@ namespace CUETools.Codecs.FLACCL this.max_porder--; if (channels != 2) throw new Exception("channels != 2"); // need to Enqueue cudaChannelDecorr for each channel - Kernel cudaChannelDecorr = channels == 2 ? (channelsCount == 4 ? clStereoDecorr : clChannelDecorr2) : null;// cudaChannelDecorr; + Kernel clChannelDecorr = channels == 2 ? (channelsCount == 4 ? clStereoDecorr : clChannelDecorr2) : null;// cudaChannelDecorr; // openCLCQ.EnqueueMapBuffer(cudaSamplesBytes //openCLCQ.EnqueueUnmapMemObject(cudaSamplesBytes, cudaSamplesBytes.HostPtr); // issue work to the GPU - cudaChannelDecorr.SetArgs( + clChannelDecorr.SetArgs( clSamples, clSamplesBytes, FLACCLWriter.MAX_BLOCKSIZE); - openCLCQ.EnqueueNDRangeKernel(cudaChannelDecorr, 0, frameSize * frameCount); + openCLCQ.EnqueueNDRangeKernel(clChannelDecorr, 0, frameSize * frameCount); - if (eparams.do_wasted) + if (eparams.do_wasted) { clFindWastedBits.SetArgs( clResidualTasks, @@ -2424,8 +2543,8 @@ namespace CUETools.Codecs.FLACCL frameCount * channelsCount); } - clComputeAutocor.SetArgs( - clAutocorOutput, + clComputeAutocor.SetArgs( + clAutocorOutput, clSamples, clWindowFunctions, clResidualTasks, @@ -2437,18 +2556,23 @@ namespace CUETools.Codecs.FLACCL channelsCount * frameCount, nWindowFunctions); - clComputeLPC.SetArgs( + clComputeLPC.SetArgs( clAutocorOutput, clLPCData, nWindowFunctions); - openCLCQ.EnqueueNDRangeKernel( - clComputeLPC, - Math.Min(groupSize, 32), 1, - nWindowFunctions, - channelsCount * frameCount); + //openCLCQ.EnqueueNDRangeKernel( + // clComputeLPC, + // 64, 1, + // nWindowFunctions, + // frameCount); + openCLCQ.EnqueueNDRangeKernel( + clComputeLPC, + Math.Min(groupSize, 32), 1, + nWindowFunctions, + channelsCount * frameCount); - clQuantizeLPC.SetArgs( + clQuantizeLPC.SetArgs( clResidualTasks, clLPCData, nResidualTasksPerChannel, @@ -2462,7 +2586,7 @@ namespace CUETools.Codecs.FLACCL nWindowFunctions, channelsCount * frameCount); - clEstimateResidual.SetArgs( + clEstimateResidual.SetArgs( clResidualOutput, clSamples, clResidualTasks); @@ -2472,7 +2596,7 @@ namespace CUETools.Codecs.FLACCL groupSize, nResidualTasksPerChannel * channelsCount * frameCount); - clChooseBestMethod.SetArgs( + clChooseBestMethod.SetArgs( clResidualTasks, clResidualOutput, nResidualTasksPerChannel); @@ -2481,7 +2605,7 @@ namespace CUETools.Codecs.FLACCL clChooseBestMethod, Math.Min(groupSize, 32), channelsCount * frameCount); - if (channels == 2 && channelsCount == 4) + if (channels == 2 && channelsCount == 4) { clCopyBestMethodStereo.SetArgs( clBestResidualTasks, @@ -2580,18 +2704,44 @@ namespace CUETools.Codecs.FLACCL groupSize, channels * frameCount); - openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParams.HostPtr); - openCLCQ.EnqueueReadBuffer(clResidual, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clResidual.HostPtr); - //openCLCQ.EnqueueMapBuffer(clBestRiceParams, false, MapFlags.READ, 0, sizeof(int) * (1 << max_porder) * channels * frameCount); - //openCLCQ.EnqueueUnmapMemObject(clBestRiceParams, clBestRiceParams.HostPtr); - //openCLCQ.EnqueueMapBuffer(clResidual, false, MapFlags.READ, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels); - //openCLCQ.EnqueueUnmapMemObject(clResidual, clResidual.HostPtr); + if (!writer._settings.MappedMemory) + openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParamsPtr); + if (!writer._settings.MappedMemory) + openCLCQ.EnqueueReadBuffer(clResidual, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clResidualPtr); } - openCLCQ.EnqueueReadBuffer(clBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * frameCount, clBestResidualTasks.HostPtr); - //openCLCQ.EnqueueMapBuffer(clBestResidualTasks, false, MapFlags.READ, 0, sizeof(FLACCLSubframeTask) * channels * frameCount); - //openCLCQ.EnqueueUnmapMemObject(clBestResidualTasks, clBestResidualTasks.HostPtr); - - //openCLCQ.EnqueueMapBuffer(clSamplesBytes, false, MapFlags.WRITE, 0, samplesBufferLen / 2); - } + if (!writer._settings.MappedMemory) + openCLCQ.EnqueueReadBuffer(clBestResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * channels * frameCount, clBestResidualTasksPtr); + } } + + public static class OpenCLExtensions + { + public static void SetArgs(this Kernel kernel, params object[] args) + { + int i = 0; + foreach (object arg in args) + { + if (arg is int) + kernel.SetArg(i, (int)arg); + else if (arg is Mem) + kernel.SetArg(i, (Mem)arg); + else + throw new ArgumentException("Invalid argument type", arg.GetType().ToString()); + i++; + } + } + + public static void EnqueueNDRangeKernel(this CommandQueue queue, Kernel kernel, long localSize, long globalSize) + { + if (localSize == 0) + queue.EnqueueNDRangeKernel(kernel, 1, null, new long[] { globalSize }, null); + else + queue.EnqueueNDRangeKernel(kernel, 1, null, new long[] { localSize * globalSize }, new long[] { localSize }); + } + + public static void EnqueueNDRangeKernel(this CommandQueue queue, Kernel kernel, long localSizeX, long localSizeY, long globalSizeX, long globalSizeY) + { + queue.EnqueueNDRangeKernel(kernel, 2, null, new long[] { localSizeX * globalSizeX, localSizeY * globalSizeY }, new long[] { localSizeX, localSizeY }); + } + } } diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index 015761c..ea7e1a2 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -20,12 +20,19 @@ #ifndef _FLACCL_KERNEL_H_ #define _FLACCL_KERNEL_H_ -#ifdef DEBUG -#pragma OPENCL EXTENSION cl_amd_printf : enable -#endif +#undef DEBUG + +//#define AMD + +//#ifdef DEBUG +//#pragma OPENCL EXTENSION cl_amd_printf : enable +//#endif //#pragma OPENCL EXTENSION cl_amd_fp64 : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable +#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable + typedef enum { Constant = 0, @@ -59,6 +66,8 @@ typedef struct int coefs[32]; // fixme: should be short? } FLACCLSubframeTask; +#define iclamp(a,b,c) max(b,min(a,c)) + __kernel void clStereoDecorr( __global int *samples, __global short2 *src, @@ -181,8 +190,10 @@ void clComputeAutocor( int tid0 = tid % (GROUP_SIZE >> 2); int tid1 = tid / (GROUP_SIZE >> 2); +#ifdef ATI __local float4 * dptr = ((__local float4 *)&data[0]) + tid0; __local float4 * dptr1 = ((__local float4 *)&data[tid1]) + tid0; +#endif for (int pos = 0; pos < bs; pos += GROUP_SIZE) { @@ -192,8 +203,15 @@ void clComputeAutocor( barrier(CLK_LOCAL_MEM_FENCE); for (int ord4 = 0; ord4 < (MAX_ORDER / 4 + 1); ord4 ++) +#ifdef ATI product[ord4 * GROUP_SIZE + tid] += dot(dptr[0], dptr1[ord4]); - +#else + product[ord4 * GROUP_SIZE + tid] += + data[tid0*4 + 0] * data[tid0*4 + ord4*4 + tid1 + 0] + + data[tid0*4 + 1] * data[tid0*4 + ord4*4 + tid1 + 1] + + data[tid0*4 + 2] * data[tid0*4 + ord4*4 + tid1 + 2] + + data[tid0*4 + 3] * data[tid0*4 + ord4*4 + tid1 + 3]; +#endif barrier(CLK_LOCAL_MEM_FENCE); data[tid] = nextData; @@ -223,8 +241,8 @@ void clComputeLPC( volatile float autoc[33]; } shared; const int tid = get_local_id(0);// + get_local_id(1) * 32; - int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32; int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1); + int lpcOffs = autocOffs * 32; if (get_local_id(0) <= MAX_ORDER) shared.autoc[get_local_id(0)] = autoc[autocOffs + get_local_id(0)]; @@ -272,11 +290,12 @@ void clComputeLPC( shared.error[order] = error; // Levinson-Durbin recursion - float ldr = - select(0.0f, reff * shared.ldr[order - 1 - get_local_id(0)], get_local_id(0) < order) + - select(0.0f, reff, get_local_id(0) == order); + float ldr = shared.ldr[get_local_id(0)]; barrier(CLK_LOCAL_MEM_FENCE); - shared.ldr[get_local_id(0)] += ldr; + if (get_local_id(0) < order) + shared.ldr[order - 1 - get_local_id(0)] += reff * ldr; + if (get_local_id(0) == order) + shared.ldr[get_local_id(0)] += reff; barrier(CLK_LOCAL_MEM_FENCE); // Output coeffs @@ -329,7 +348,7 @@ void clQuantizeLPC( // Load prediction error estimates if (tid < MAX_ORDER) - shared.error[tid] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log(shared.task.blocksize); + shared.error[tid] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)shared.task.blocksize); //shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize); barrier(CLK_LOCAL_MEM_FENCE); @@ -387,7 +406,7 @@ void clQuantizeLPC( // get 15 bits of each coeff int coef = convert_int_rte(lpc * (1 << 15)); // remove sign bits - atomic_or(shared.maxcoef + i, coef ^ (coef >> 31)); + atom_or(shared.maxcoef + i, coef ^ (coef >> 31)); barrier(CLK_LOCAL_MEM_FENCE); //SUM32(shared.tmpi,tid,|=); // choose precision @@ -402,12 +421,12 @@ void clQuantizeLPC( //if (shared.task.abits + 32 - clz(order) < shift //int shift = max(0,min(15, (shared.task.abits >> 2) - 14 + clz(shared.tmpi[tid & ~31]) + ((32 - clz(order))>>1))); // quantize coeffs with given shift - coef = convert_int_rte(clamp(lpc * (1 << shift), -1 << (cbits - 1), 1 << (cbits - 1))); + coef = convert_int_rte(clamp(lpc * (1 << shift), (float)(-1 << (cbits - 1)), (float)(1 << (cbits - 1)))); // error correction //shared.tmp[tid] = (tid != 0) * (shared.arp[tid - 1]*(1 << shared.task.shift) - shared.task.coefs[tid - 1]); //shared.task.coefs[tid] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, convert_int_rte((shared.arp[tid]) * (1 << shared.task.shift) + shared.tmp[tid]))); // remove sign bits - atomic_or(shared.maxcoef2 + i, coef ^ (coef >> 31)); + atom_or(shared.maxcoef2 + i, coef ^ (coef >> 31)); barrier(CLK_LOCAL_MEM_FENCE); // calculate actual number of bits (+1 for sign) cbits = 1 + 32 - clz(shared.maxcoef2[i]); @@ -452,14 +471,16 @@ void clEstimateResidual( psum[tid] = 0; data[tid] = 0.0f; - int partOrder = clz(64) - clz(bs - 1) + 1; + int partOrder = max(1, clz(64) - clz(bs - 1) + 1); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef AMD float4 cptr0 = vload4(0, &fcoef[0]); float4 cptr1 = vload4(1, &fcoef[0]); #if MAX_ORDER > 8 float4 cptr2 = vload4(2, &fcoef[0]); +#endif #endif for (int pos = 0; pos < bs; pos += GROUP_SIZE) { @@ -471,6 +492,7 @@ void clEstimateResidual( // compute residual __local float* dptr = &data[tid + GROUP_SIZE - ro]; +#ifdef AMD float4 sum = cptr0 * vload4(0, dptr) + cptr1 * vload4(1, dptr) #if MAX_ORDER > 8 @@ -488,20 +510,28 @@ void clEstimateResidual( ; int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w); +#else + float sum = + fcoef[0] * dptr[0] + fcoef[1] * dptr[1] + fcoef[2] * dptr[2] + fcoef[3] * dptr[3] + + fcoef[4] * dptr[4] + fcoef[5] * dptr[5] + fcoef[6] * dptr[6] + fcoef[7] * dptr[7] + + fcoef[8] * dptr[8] + fcoef[9] * dptr[9] + fcoef[10] * dptr[10] + fcoef[11] * dptr[11] ; + int t = convert_int_rte(nextData + sum); +#endif barrier(CLK_LOCAL_MEM_FENCE); data[tid] = nextData; // ensure we're within frame bounds t = select(0, t, offs >= ro && offs < bs); // overflow protection - t = clamp(t, -0x7fffff, 0x7fffff); + t = iclamp(t, -0x7fffff, 0x7fffff); // convert to unsigned - atomic_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31)); + if (offs < bs) + atom_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31)); } // calculate rice partition bit length for every (1 << partOrder) samples if (tid < 64) { - int k = clamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16) + int k = iclamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16) psum[tid] = (k << partOrder) + (psum[tid] >> k); } barrier(CLK_LOCAL_MEM_FENCE); @@ -657,11 +687,11 @@ void clEncodeResidual( barrier(CLK_LOCAL_MEM_FENCE); __local int4 * cptr = (__local int4 *)&task.coefs[0]; - int4 cptr0 = cptr[0]; + int4 cptr0 = vload4(0, &task.coefs[0]); #if MAX_ORDER > 4 - int4 cptr1 = cptr[1]; + int4 cptr1 = vload4(1, &task.coefs[0]); #if MAX_ORDER > 8 - int4 cptr2 = cptr[2]; + int4 cptr2 = vload4(2, &task.coefs[0]); #endif #endif @@ -675,19 +705,19 @@ void clEncodeResidual( barrier(CLK_LOCAL_MEM_FENCE); // compute residual - __local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro]; - int4 sum = dptr[0] * cptr0 + __local int* dptr = &data[tid + GROUP_SIZE - ro]; + int4 sum = cptr0 * vload4(0, dptr) #if MAX_ORDER > 4 - + dptr[1] * cptr1 + + cptr1 * vload4(1, dptr) #if MAX_ORDER > 8 - + dptr[2] * cptr2 + + cptr2 * vload4(2, dptr) #if MAX_ORDER > 12 - + dptr[3] * cptr[3] + + vload4(3, &task.coefs[0]) * vload4(3, dptr) #if MAX_ORDER > 16 - + dptr[4] * cptr[4] - + dptr[5] * cptr[5] - + dptr[6] * cptr[6] - + dptr[7] * cptr[7] + + vload4(4, &task.coefs[0]) * vload4(4, dptr) + + vload4(5, &task.coefs[0]) * vload4(5, dptr) + + vload4(6, &task.coefs[0]) * vload4(6, dptr) + + vload4(7, &task.coefs[0]) * vload4(7, dptr) #endif #endif #endif @@ -732,13 +762,13 @@ void clCalcPartition( // fetch residual int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0; // overflow protection - s = clamp(s, -0x7fffff, 0x7fffff); + s = iclamp(s, -0x7fffff, 0x7fffff); // convert to unsigned s = (s << 1) ^ (s >> 31); // calc number of unary bits for each residual sample with each rice paramater int part = (offs - start) / psize + (tid & 1) * (GROUP_SIZE / 16); for (int k = 0; k <= 14; k++) - atomic_add(&pl[part][k], s >> k); + atom_add(&pl[part][k], s >> k); //pl[part][k] += s >> k; } barrier(CLK_LOCAL_MEM_FENCE); @@ -788,12 +818,11 @@ void clCalcPartition16( barrier(CLK_LOCAL_MEM_FENCE); - __local int4 * cptr = (__local int4 *)&task.coefs[0]; - int4 cptr0 = cptr[0]; + int4 cptr0 = vload4(0, &task.coefs[0]); #if MAX_ORDER > 4 - int4 cptr1 = cptr[1]; + int4 cptr1 = vload4(1, &task.coefs[0]); #if MAX_ORDER > 8 - int4 cptr2 = cptr[2]; + int4 cptr2 = vload4(2, &task.coefs[0]); #endif #endif @@ -807,19 +836,19 @@ void clCalcPartition16( barrier(CLK_LOCAL_MEM_FENCE); // compute residual - __local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro]; - int4 sum = dptr[0] * cptr0 + __local int* dptr = &data[tid + GROUP_SIZE - ro]; + int4 sum = cptr0 * vload4(0, dptr) #if MAX_ORDER > 4 - + dptr[1] * cptr1 + + cptr1 * vload4(1, dptr) #if MAX_ORDER > 8 - + dptr[2] * cptr2 + + cptr2 * vload4(2, dptr) #if MAX_ORDER > 12 - + dptr[3] * cptr[3] + + vload4(3, &task.coefs[0]) * vload4(3, dptr) #if MAX_ORDER > 16 - + dptr[4] * cptr[4] - + dptr[5] * cptr[5] - + dptr[6] * cptr[6] - + dptr[7] * cptr[7] + + vload4(4, &task.coefs[0]) * vload4(4, dptr) + + vload4(5, &task.coefs[0]) * vload4(5, dptr) + + vload4(6, &task.coefs[0]) * vload4(6, dptr) + + vload4(7, &task.coefs[0]) * vload4(7, dptr) #endif #endif #endif @@ -833,11 +862,11 @@ void clCalcPartition16( //int s = select(0, residual[task.data.residualOffs + offs], offs >= ro && offs < bs); - s = clamp(s, -0x7fffff, 0x7fffff); + s = iclamp(s, -0x7fffff, 0x7fffff); // convert to unsigned res[tid] = (s << 1) ^ (s >> 31); - // for (int k = 0; k < 15; k++) atomic_add(&pl[x][k], s >> k); + // for (int k = 0; k < 15; k++) atom_add(&pl[x][k], s >> k); barrier(CLK_LOCAL_MEM_FENCE); data[tid] = nextData; @@ -943,7 +972,7 @@ void clFindPartitionOrder( { int len = rice_parameters[pos + offs]; int porder = 31 - clz(lim - offs); - atomic_add(&partlen[porder], len); + atom_add(&partlen[porder], len); } barrier(CLK_LOCAL_MEM_FENCE); diff --git a/CUETools.Codecs.FLACCL/flaccpu.cl b/CUETools.Codecs.FLACCL/flaccpu.cl index a075299..5eaa5fd 100644 --- a/CUETools.Codecs.FLACCL/flaccpu.cl +++ b/CUETools.Codecs.FLACCL/flaccpu.cl @@ -152,7 +152,23 @@ void clComputeAutocor( for (int tid = 0; tid < len; tid++) data1[tid] = samples[task.samplesOffs + tid] * window[windowOffs + tid]; data1[len] = 0.0f; - + __global float * pout = &output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1)]; + for (int l = 1; l < MAX_ORDER; l++) + data1[len + l] = 0.0f; + + // double ac0 = 0.0, ac1 = 0.0, ac2 = 0.0, ac3 = 0.0; + // for (int j = 0; j < len; j++) + // { + //float dj = data1[j]; + //ac0 += dj * dj; + //ac1 += dj * data1[j + 1]; + //ac2 += dj * data1[j + 2]; + //ac3 += dj * data1[j + 3]; + // } + // pout[0] = ac0; + // pout[1] = ac1; + // pout[2] = ac2; + // pout[3] = ac3; for (int i = 0; i <= MAX_ORDER; ++i) { double temp = 1.0; @@ -164,7 +180,7 @@ void clComputeAutocor( temp += pdata[i] * pdata[0]; temp2 += pdata[i + 1] * pdata[1]; } - output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + i] = temp + temp2; + pout[i] = temp + temp2; } }