diff --git a/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj b/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj index 8e31c17..9fbebf1 100644 --- a/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj +++ b/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj @@ -65,8 +65,6 @@ - - PreserveNewest diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index c839e17..ec6d1a8 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -41,7 +41,7 @@ namespace CUETools.Codecs.FLACCL this.MappedMemory = false; this.DoMD5 = true; this.GroupSize = 128; - this.TaskSize = 32; + this.TaskSize = 8; this.DeviceType = OpenCLDeviceType.GPU; } @@ -67,17 +67,19 @@ namespace CUETools.Codecs.FLACCL [SRDescription(typeof(Properties.Resources), "DescriptionMappedMemory")] public bool MappedMemory { get; set; } + [TypeConverter(typeof(FLACCLWriterSettingsGroupSizeConverter))] [DefaultValue(128)] [SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")] public int GroupSize { get; set; } - [DefaultValue(32)] + [DefaultValue(8)] [SRDescription(typeof(Properties.Resources), "DescriptionTaskSize")] public int TaskSize { get; set; } [SRDescription(typeof(Properties.Resources), "DescriptionDefines")] public string Defines { get; set; } + [TypeConverter(typeof(FLACCLWriterSettingsPlatformConverter))] [SRDescription(typeof(Properties.Resources), "DescriptionPlatform")] public string Platform { get; set; } @@ -103,6 +105,35 @@ namespace CUETools.Codecs.FLACCL } } + public class FLACCLWriterSettingsPlatformConverter : TypeConverter + { + public override bool GetStandardValuesSupported(ITypeDescriptorContext context) + { + return true; + } + + public override StandardValuesCollection GetStandardValues(ITypeDescriptorContext context) + { + var res = new List(); + foreach (var p in OpenCL.GetPlatforms()) + res.Add(p.Name); + return new StandardValuesCollection(res); + } + } + + public class FLACCLWriterSettingsGroupSizeConverter : TypeConverter + { + public override bool GetStandardValuesSupported(ITypeDescriptorContext context) + { + return true; + } + + public override StandardValuesCollection GetStandardValues(ITypeDescriptorContext context) + { + return new StandardValuesCollection(new int[] { 64, 128, 256 }); + } + } + public enum OpenCLDeviceType : ulong { CPU = DeviceType.CPU, @@ -173,6 +204,8 @@ namespace CUETools.Codecs.FLACCL FLACCLTask[] cpu_tasks; int oldest_cpu_task = 0; + internal int framesPerTask; + AudioPCMConfig _pcm; public const int MAX_BLOCKSIZE = 65536; @@ -1037,7 +1070,7 @@ namespace CUETools.Codecs.FLACCL if (task.nWindowFunctions == 0) throw new Exception("invalid windowfunction"); if (!_settings.MappedMemory) - task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, true, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr); + task.openCLCQ.EnqueueWriteBuffer(task.clWindowFunctions, false, 0, sizeof(float) * task.nWindowFunctions * task.frameSize, task.clWindowFunctionsPtr); } task.nResidualTasks = 0; @@ -1163,9 +1196,9 @@ namespace CUETools.Codecs.FLACCL throw new Exception("oops"); if (!_settings.MappedMemory) - task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, true, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr); + task.openCLCQ.EnqueueWriteBuffer(task.clResidualTasks, false, 0, sizeof(FLACCLSubframeTask) * task.nResidualTasks, task.clResidualTasksPtr); if (!_settings.MappedMemory) - task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, true, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr); + task.openCLCQ.EnqueueWriteBuffer(task.clSelectedTasks, false, 0, sizeof(int) * (nFrames * channelsCount * task.nEstimateTasksPerChannel), task.clSelectedTasksPtr); } unsafe void encode_residual(FLACCLTask task) @@ -1471,7 +1504,7 @@ namespace CUETools.Codecs.FLACCL int channelsCount = doMidside ? 2 * channels : channels; if (task.nResidualTasks == 0) - initializeSubframeTasks(task.frameSize, channelsCount, _settings.TaskSize, task); + initializeSubframeTasks(task.frameSize, channelsCount, framesPerTask, task); estimate_residual(task, channelsCount); } @@ -1611,10 +1644,9 @@ namespace CUETools.Codecs.FLACCL } OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType); - bool haveAtom = false; - if (OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics")) - haveAtom = true; - else + this.framesPerTask = (int)OCLMan.Context.Devices[0].MaxComputeUnits * _settings.TaskSize; + + if (!OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics")) _settings.GPUOnly = false; // The Defines string gets prepended to any and all sources that are compiled @@ -1625,13 +1657,20 @@ namespace CUETools.Codecs.FLACCL "#define FLACCL_VERSION \"" + vendor_string + "\"\n" + (_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") + (_settings.DoRice ? "#define DO_RICE\n" : "") + - (haveAtom ? "#define HAVE_ATOM\n" : "") + #if DEBUG "#define DEBUG\n" + #endif (_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") + _settings.Defines + "\n"; + var exts = new string[] { "cl_khr_local_int32_base_atomics", "cl_khr_local_int32_extended_atomics", "cl_khr_fp64", "cl_amd_fp64" }; + foreach (string extension in exts) + if (OCLMan.Context.Devices[0].Extensions.Contains(extension)) + { + OCLMan.Defines += "#pragma OPENCL EXTENSION " + extension + ": enable\n"; + OCLMan.Defines += "#define HAVE_" + extension + "\n"; + } + try { openCLProgram = OCLMan.CompileFile("flac.cl"); @@ -1698,7 +1737,7 @@ namespace CUETools.Codecs.FLACCL int pos = 0; while (pos < buff.Length) { - int block = Math.Min(buff.Length - pos, eparams.block_size * _settings.TaskSize - samplesInBuffer); + int block = Math.Min(buff.Length - pos, eparams.block_size * framesPerTask - samplesInBuffer); fixed (byte* buf = buff.Bytes) AudioSamples.MemCpy(((byte*)task1.clSamplesBytesPtr) + samplesInBuffer * _pcm.BlockAlign, buf + pos * _pcm.BlockAlign, block * _pcm.BlockAlign); @@ -1707,7 +1746,7 @@ namespace CUETools.Codecs.FLACCL pos += block; int nFrames = samplesInBuffer / eparams.block_size; - if (nFrames >= _settings.TaskSize) + if (nFrames >= framesPerTask) do_output_frames(nFrames); } if (md5 != null) @@ -2405,7 +2444,7 @@ namespace CUETools.Codecs.FLACCL openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLProgram.Context.Devices[0], prop); int MAX_ORDER = this.writer.eparams.max_prediction_order; - int MAX_FRAMES = this.writer._settings.TaskSize; + int MAX_FRAMES = this.writer.framesPerTask; int MAX_CHANNELSIZE = MAX_FRAMES * writer.eparams.block_size; residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * MAX_FRAMES; bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * MAX_FRAMES; @@ -2440,14 +2479,14 @@ namespace CUETools.Codecs.FLACCL clSelectedTasksPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen); clRiceOutputPinned = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen); - clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytesPinned, true, MapFlags.WRITE, 0, samplesBufferLen / 2); - clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidualPinned, true, MapFlags.WRITE, 0, residualBufferLen); - clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParamsPinned, true, MapFlags.WRITE, 0, riceParamsLen / 4); - clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasksPinned, true, MapFlags.WRITE, 0, residualTasksLen); - clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasksPinned, true, MapFlags.WRITE, 0, bestResidualTasksLen); - clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctionsPinned, true, MapFlags.WRITE, 0, wndLen); - clSelectedTasksPtr = openCLCQ.EnqueueMapBuffer(clSelectedTasksPinned, true, MapFlags.WRITE, 0, selectedLen); - clRiceOutputPtr = openCLCQ.EnqueueMapBuffer(clRiceOutputPinned, true, MapFlags.WRITE, 0, riceLen); + clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytesPinned, true, MapFlags.READ_WRITE, 0, samplesBufferLen / 2); + clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidualPinned, true, MapFlags.READ_WRITE, 0, residualBufferLen); + clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParamsPinned, true, MapFlags.READ_WRITE, 0, riceParamsLen / 4); + clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasksPinned, true, MapFlags.READ_WRITE, 0, residualTasksLen); + clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasksPinned, true, MapFlags.READ_WRITE, 0, bestResidualTasksLen); + clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctionsPinned, true, MapFlags.READ_WRITE, 0, wndLen); + clSelectedTasksPtr = openCLCQ.EnqueueMapBuffer(clSelectedTasksPinned, true, MapFlags.READ_WRITE, 0, selectedLen); + clRiceOutputPtr = openCLCQ.EnqueueMapBuffer(clRiceOutputPinned, true, MapFlags.READ_WRITE, 0, riceLen); } else { @@ -2904,7 +2943,7 @@ namespace CUETools.Codecs.FLACCL openCLCQ.EnqueueNDRangeKernel( clCalcOutputOffsets, - groupSize, + openCLCQ.Device.DeviceType == DeviceType.CPU ? groupSize : 32, 1); clRiceEncoding.SetArgs( diff --git a/CUETools.Codecs.FLACCL/Properties/Resources.Designer.cs b/CUETools.Codecs.FLACCL/Properties/Resources.Designer.cs index 5b6aa51..3d76527 100644 --- a/CUETools.Codecs.FLACCL/Properties/Resources.Designer.cs +++ b/CUETools.Codecs.FLACCL/Properties/Resources.Designer.cs @@ -124,7 +124,7 @@ namespace CUETools.Codecs.FLACCL.Properties { } /// - /// Looks up a localized string similar to OpenCL platform to use (ATI Stream, NVIDIA OpenCL, Intel OpenCL, etc). + /// Looks up a localized string similar to OpenCL platform to use. /// internal static string DescriptionPlatform { get { @@ -133,7 +133,7 @@ namespace CUETools.Codecs.FLACCL.Properties { } /// - /// Looks up a localized string similar to Number of frames processed simultaniously (32, 64). + /// Looks up a localized string similar to Number of frames processed per one multiprocessor. /// internal static string DescriptionTaskSize { get { diff --git a/CUETools.Codecs.FLACCL/Properties/Resources.resx b/CUETools.Codecs.FLACCL/Properties/Resources.resx index e2cbd7f..7809d11 100644 --- a/CUETools.Codecs.FLACCL/Properties/Resources.resx +++ b/CUETools.Codecs.FLACCL/Properties/Resources.resx @@ -139,10 +139,10 @@ Device uses host memory (Don't use) - OpenCL platform to use (ATI Stream, NVIDIA OpenCL, Intel OpenCL, etc) + OpenCL platform to use - Number of frames processed simultaniously (32, 64) + Number of frames processed per one multiprocessor Calculate MD5 hash for audio stream diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index 2cc5a3b..8cd66c2 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -28,10 +28,30 @@ #pragma OPENCL EXTENSION cl_amd_printf : enable #endif -#ifdef __CPU__ -#pragma OPENCL EXTENSION cl_amd_fp64 : enable +#if defined(HAVE_cl_khr_local_int32_base_atomics) && defined(HAVE_cl_khr_local_int32_extended_atomics) +#define HAVE_ATOM #endif +#if defined(HAVE_cl_khr_fp64) || defined(HAVE_cl_amd_fp64) +#define HAVE_DOUBLE +#define ZEROD 0.0 +//#define FAST_DOUBLE +#else +#define double float +#define double4 float4 +#define ZEROD 0.0f +#endif +#if defined(HAVE_DOUBLE) && defined(FAST_DOUBLE) +#define fastdouble double +#define fastdouble4 double4 +#define ZEROFD 0.0 +#else +#define fastdouble float +#define fastdouble4 float4 +#define ZEROFD 0.0f +#endif + + //#if __OPENCL_VERSION__ == 110 #ifdef AMD #define iclamp(a,b,c) clamp(a,b,c) @@ -45,11 +65,6 @@ #define WARP_SIZE 32 -#ifdef HAVE_ATOM -#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable -#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable -#endif - typedef enum { Constant = 0, @@ -303,7 +318,6 @@ void clComputeAutocor( #else // get_num_groups(0) == number of tasks // get_num_groups(1) == number of windows -#if 0 __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) void clComputeAutocor( __global float *output, @@ -313,7 +327,7 @@ void clComputeAutocor( const int taskCount // tasks per block ) { - __local float data[GROUP_SIZE * 2]; + __local fastdouble data[GROUP_SIZE * 2]; __local FLACCLSubframeData task; const int tid = get_local_id(0); // fetch task data @@ -322,160 +336,50 @@ void clComputeAutocor( barrier(CLK_LOCAL_MEM_FENCE); int bs = task.blocksize; - int windowOffs = get_group_id(1) * bs; - - // if (tid < GROUP_SIZE / 4) - // { - //float4 dd = 0.0f; - //if (tid * 4 < bs) - // dd = vload4(tid, window + windowOffs) * convert_float4(vload4(tid, samples + task.samplesOffs)); - //vstore4(dd, tid, &data[0]); - // } - data[tid] = 0.0f; - // This simpler code doesn't work somehow!!! - //data[tid] = tid < bs ? samples[task.samplesOffs + tid] * window[windowOffs + tid] : 0.0f; - - const int THREADS_FOR_ORDERS = MAX_ORDER < 8 ? 8 : MAX_ORDER < 16 ? 16 : MAX_ORDER < 32 ? 32 : 64; - float corr = 0.0f; - float corr1 = 0.0f; - for (int pos = 0; pos < bs; pos += GROUP_SIZE) - { - // fetch samples - float nextData = pos + tid < bs ? samples[task.samplesOffs + pos + tid] * window[windowOffs + pos + tid] : 0.0f; - data[tid + GROUP_SIZE] = nextData; - barrier(CLK_LOCAL_MEM_FENCE); - - int lag = tid & (THREADS_FOR_ORDERS - 1); - int tid1 = tid + GROUP_SIZE - lag; -#ifdef AMD - float4 res = 0.0f; - for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++) - res += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]); - corr += res.x + res.y + res.w + res.z; -#else - float res = 0.0f; - for (int i = 0; i < THREADS_FOR_ORDERS; i++) - res += data[tid1 - lag + i] * data[tid1 + i]; - corr += res; -#endif - if ((pos & (GROUP_SIZE * 15)) == 0) - { - corr1 += corr; - corr = 0.0f; - } - - barrier(CLK_LOCAL_MEM_FENCE); - data[tid] = nextData; - } - - data[tid] = corr + corr1; - barrier(CLK_LOCAL_MEM_FENCE); - for (int i = GROUP_SIZE / 2; i >= THREADS_FOR_ORDERS; i >>= 1) - { - if (tid < i) - data[tid] += data[tid + i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (tid <= MAX_ORDER) - output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = data[tid]; -} -#else -__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) -void clComputeAutocor( - __global float *output, - __global const int *samples, - __global const float *window, - __global FLACCLSubframeTask *tasks, - const int taskCount // tasks per block -) -{ - __local float data[GROUP_SIZE * 2 + 32]; - __local FLACCLSubframeData task; - const int tid = get_local_id(0); - // fetch task data - if (tid < sizeof(task) / sizeof(int)) - ((__local int*)&task)[tid] = ((__global int*)(tasks + taskCount * get_group_id(0)))[tid]; - barrier(CLK_LOCAL_MEM_FENCE); - - int bs = task.blocksize; - data[tid] = 0.0f; - if (tid < 32) - data[GROUP_SIZE * 2 + tid] = 0.0f; + data[tid] = ZEROFD; const int THREADS_FOR_ORDERS = MAX_ORDER < 8 ? 8 : MAX_ORDER < 16 ? 16 : MAX_ORDER < 32 ? 32 : 64; int lag = tid & (THREADS_FOR_ORDERS - 1); int tid1 = tid + GROUP_SIZE - lag; int pos = 0; const __global float * wptr = &window[get_group_id(1) * bs]; -#ifdef AMD - float4 corr = 0.0f; -#else - float corr = 0.0f; -#endif - float corr1 = 0.0f; +// const __global int * sptr = &samples[task.samplesOffs]; + double corr = ZEROD; + for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE) { - // fetch samples int off = pos + tid; -// const __global int * sptr = &samples[task.samplesOffs]; - float nextData = samples[task.samplesOffs + off] * wptr[off]; + // fetch samples + fastdouble nextData = samples[task.samplesOffs + off] * wptr[off]; data[tid + GROUP_SIZE] = nextData; barrier(CLK_LOCAL_MEM_FENCE); -#ifdef AMD + fastdouble4 tmp = ZEROFD; for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++) - corr += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]); -#else - for (int i = 0; i < THREADS_FOR_ORDERS; i++) - corr += data[tid1 - lag + i] * data[tid1 + i]; -#endif - - if ((pos & (GROUP_SIZE * 15)) == 0) - { -#ifdef AMD - corr1 += (corr.x + corr.y) + (corr.w + corr.z); -#else - corr1 += corr; -#endif - corr = 0.0f; - } + tmp += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]); + corr += (tmp.x + tmp.y) + (tmp.w + tmp.z); barrier(CLK_LOCAL_MEM_FENCE); data[tid] = nextData; } if (pos < bs) { - // fetch samples int off = pos + tid; - float nextData = off < bs ? samples[task.samplesOffs + off] * wptr[off] : 0.0f; + // fetch samples + double nextData = off < bs ? samples[task.samplesOffs + off] * wptr[off] : ZEROD; data[tid + GROUP_SIZE] = nextData; barrier(CLK_LOCAL_MEM_FENCE); - int lag = tid & (THREADS_FOR_ORDERS - 1); - int tid1 = tid + GROUP_SIZE - lag; -//#if 1 -#ifdef AMD - float4 res = 0.0f; + fastdouble4 tmp = ZEROFD; for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++) - res += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]); - corr1 += res.x + res.y + res.w + res.z; -#else - for (int i = 0; i < THREADS_FOR_ORDERS; i++) - corr1 += data[tid1 - lag + i] * data[tid1 + i]; -#endif + tmp += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]); + corr += (tmp.x + tmp.y) + (tmp.w + tmp.z); barrier(CLK_LOCAL_MEM_FENCE); data[tid] = nextData; } -#ifdef AMD - corr1 += corr.x + corr.y + corr.w + corr.z; -#else - corr1 += corr; -#endif - - data[tid] = corr1; + data[tid] = corr; barrier(CLK_LOCAL_MEM_FENCE); for (int i = GROUP_SIZE / 2; i >= THREADS_FOR_ORDERS; i >>= 1) { @@ -488,7 +392,6 @@ void clComputeAutocor( output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = data[tid]; } #endif -#endif #ifdef FLACCL_CPU __kernel __attribute__((reqd_work_group_size(1, 1, 1))) @@ -558,8 +461,8 @@ void clComputeLPC( ) { __local struct { - volatile float ldr[32]; - volatile float gen1[32]; + volatile double ldr[32]; + volatile double gen1[32]; volatile float error[32]; volatile float autoc[33]; } shared; @@ -575,9 +478,9 @@ void clComputeLPC( barrier(CLK_LOCAL_MEM_FENCE); // Compute LPC using Schur and Levinson-Durbin recursion - float gen0 = shared.gen1[get_local_id(0)] = shared.autoc[get_local_id(0)+1]; - shared.ldr[get_local_id(0)] = 0.0f; - float error = shared.autoc[0]; + double gen0 = shared.gen1[get_local_id(0)] = shared.autoc[get_local_id(0)+1]; + shared.ldr[get_local_id(0)] = ZEROD; + double error = shared.autoc[0]; #ifdef DEBUGPRINT1 int magic = shared.autoc[0] == 177286873088.0f; @@ -589,10 +492,10 @@ void clComputeLPC( for (int order = 0; order < MAX_ORDER; order++) { // Schur recursion - float reff = -shared.gen1[0] / error; + double reff = -shared.gen1[0] / error; //error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff); error *= (1 - reff * reff); - float gen1; + double gen1; if (get_local_id(0) < MAX_ORDER - 1 - order) { gen1 = shared.gen1[get_local_id(0) + 1] + reff * gen0; @@ -613,7 +516,7 @@ void clComputeLPC( shared.error[order] = error; // Levinson-Durbin recursion - float ldr = shared.ldr[get_local_id(0)]; + double ldr = shared.ldr[get_local_id(0)]; barrier(CLK_LOCAL_MEM_FENCE); if (get_local_id(0) < order) shared.ldr[order - 1 - get_local_id(0)] += reff * ldr; @@ -651,12 +554,20 @@ void clQuantizeLPC( float error[MAX_ORDER]; int best_orders[MAX_ORDER]; + int best8 = 0; // Load prediction error estimates based on Akaike's Criteria for (int tid = 0; tid < MAX_ORDER; tid++) { - error[tid] = bs * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)bs); + error[tid] = bs * log(1.0f + lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)bs); best_orders[tid] = tid; + if (tid < 8 && error[tid] < error[best8]) + best8 = tid; } + +#if 0 + for (int i = best8 + 1; i < MAX_ORDER; i++) + error[i] += 20.5f * log((float)bs); +#endif // Select best orders for (int i = 0; i < MAX_ORDER && i < taskCountLPC; i++) @@ -730,6 +641,7 @@ void clQuantizeLPC( #ifndef HAVE_ATOM volatile int tmp[32]; #endif +// volatile int best8; } shared; const int tid = get_local_id(0); @@ -752,6 +664,17 @@ void clQuantizeLPC( if (tid < MAX_ORDER) shared.error[tid] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)shared.task.blocksize); //shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize); +#if 0 + if (tid == 0) + { + int b8 = 0; + for (int i = 1; i < 8; i++) + if (shared.error[i] < shared.error[b8]) + b8 = i; + shared.best8 = b8; + } + shared.error[tid] += select(0.0f, 20.5f * log((float)shared.task.blocksize), tid > shared.best8); +#endif barrier(CLK_LOCAL_MEM_FENCE); // Sort using bitonic sort @@ -1452,6 +1375,7 @@ void clCalcPartition16( __local FLACCLSubframeTask task; __local int data[GROUP_SIZE * 2]; __local int res[GROUP_SIZE]; + __local int pl[GROUP_SIZE >> 4][15]; const int tid = get_local_id(0); if (tid < sizeof(task) / sizeof(int)) @@ -1524,8 +1448,16 @@ void clCalcPartition16( sum = (vload4(0,chunk) >> k) + (vload4(1,chunk) >> k) + (vload4(2,chunk) >> k) + (vload4(3,chunk) >> k); s = sum.x + sum.y + sum.z + sum.w; +#if 0 if (k <= 14 && offs < bs) plptr[offs >> 4] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1); +#else + if (k <= 14) pl[x][k] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1); + barrier(CLK_LOCAL_MEM_FENCE); + int k1 = tid >> 3, x1 = tid & 7; + if (k1 <= 14 && (pos >> 4) + x1 < (1 << max_porder)) + partition_lengths[(15 << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1]; +#endif // if (task.data.blocksize == 16 && x == 0 && k <= 14) // printf("[%d] = %d = s:%d + %d * (k:%d + 1), ro=%d, offs=%d, lpos=%d\n", k, partition_lengths[lpos], s, (16 - select(0, ro, offs < 16)), k, ro, offs, lpos); @@ -1849,13 +1781,12 @@ inline int len_utf8(int n) #else int bts = 31 - clz(n); #endif - if (bts < 7) - return 8; - return 8 * ((bts + 4) / 5); + return select(8, 8 * ((bts + 4) / 5), bts > 6); } +#ifdef FLACCL_CPU // get_global_id(0) * channels == task index -__kernel +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) void clCalcOutputOffsets( __global int *residual, __global int *samples, @@ -1874,9 +1805,8 @@ void clCalcOutputOffsets( // + 8-16 // custom sample rate ; int bs = tasks[iFrame * channels].data.blocksize; - //public static readonly int[] flac_blocksizes = new int[15] { 0, 192, 576, 1152, 2304, 4608, 0, 0, 256, 512, 1024, 2048, 4096, 8192, 16384 }; - if (bs != 4096 && bs != 4608) // TODO: check all other standard sizes - offset += select(8, 16, bs >= 256); + //public static readonly int[] flac_blocksizes = new int[15] { 0, 192, 576, 1152, 2304, 4608, 0, 0, 256, 512, 1024, 2048, 4096, 8192, 16384 }; + offset += select(0, select(8, 16, bs >= 256), bs != 4096 && bs != 4608); // TODO: check all other standard sizes // assert (offset % 8) == 0 offset += 8; @@ -1893,6 +1823,56 @@ void clCalcOutputOffsets( offset += 16; } } +#else +// get_global_id(0) * channels == task index +__kernel __attribute__((reqd_work_group_size(32, 1, 1))) +void clCalcOutputOffsets( + __global int *residual, + __global int *samples, + __global FLACCLSubframeTask *tasks, + int channels1, + int frameCount, + int firstFrame + ) +{ + const int channels = 2; + __local FLACCLSubframeData ltasks[2]; + __local volatile int mypos[2]; + int offset = 0; + for (int iFrame = 0; iFrame < frameCount; iFrame++) + { + if (get_local_id(0) < sizeof(ltasks[0]) / sizeof(int)) + for (int ch = 0; ch < channels; ch++) + ((__local int*)<asks[ch])[get_local_id(0)] = ((__global int*)(&tasks[iFrame * channels + ch]))[get_local_id(0)]; + + //printf("len_utf8(%d) == %d\n", firstFrame + iFrame, len_utf8(firstFrame + iFrame)); + offset += 15 + 1 + 4 + 4 + 4 + 3 + 1 + len_utf8(firstFrame + iFrame) + // + 8-16 // custom block size + // + 8-16 // custom sample rate + ; + int bs = ltasks[0].blocksize; + //public static readonly int[] flac_blocksizes = new int[15] { 0, 192, 576, 1152, 2304, 4608, 0, 0, 256, 512, 1024, 2048, 4096, 8192, 16384 }; + offset += select(0, select(8, 16, bs >= 256), bs != 4096 && bs != 4608); // TODO: check all other standard sizes + + // assert (offset % 8) == 0 + offset += 8; + if (get_local_id(0) < channels) + { + int ch = get_local_id(0); + // Add 64 bits to separate frames if header is too small so they can intersect + int mylen = 8 + ltasks[ch].wbits + 64 + ltasks[ch].size; + mypos[ch] = mylen; + for (int offset = 1; offset < WARP_SIZE && offset < channels; offset <<= 1) + if (ch >= offset) mypos[ch] += mypos[ch - offset]; + mypos[ch] += offset; + tasks[iFrame * channels + ch].data.encodingOffset = mypos[ch] - ltasks[ch].size + ltasks[ch].headerLen; + } + offset = mypos[channels - 1]; + offset = (offset + 7) & ~7; + offset += 16; + } +} +#endif // get_group_id(0) == task index __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) @@ -2000,7 +1980,9 @@ void clRiceEncoding( #else __local unsigned int data[GROUP_SIZE]; __local volatile int mypos[GROUP_SIZE+1]; - //__local int brp[256]; +#if 0 + __local int brp[256]; +#endif __local volatile int warppos[WARP_SIZE]; __local FLACCLSubframeData task; @@ -2014,8 +1996,10 @@ void clRiceEncoding( mypos[GROUP_SIZE] = 0; if (tid < WARP_SIZE) warppos[tid] = 0; - // for (int offs = tid; offs < (1 << task.porder); offs ++) - //brp[offs] = best_rice_parameters[(get_group_id(0) << max_porder) + offs]; +#if 0 + for (int offs = tid; offs < (1 << task.porder); offs ++) + brp[offs] = best_rice_parameters[(get_group_id(0) << max_porder) + offs]; +#endif data[tid] = 0; barrier(CLK_LOCAL_MEM_FENCE); const int bs = task.blocksize; @@ -2023,19 +2007,23 @@ void clRiceEncoding( int plen = bs >> task.porder; //int plenoffs = 12 - task.porder; unsigned int remainder = 0U; - for (int pos = 0; pos < bs; pos += GROUP_SIZE) + int pos; + for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE) { int offs = pos + tid; - int v = offs < bs ? residual[task.residualOffs + offs] : 0; + int v = residual[task.residualOffs + offs]; int part = offs / plen; // >> plenoffs; - //int k = brp[min(255, part)]; - int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0; +#if 0 + int k = brp[part]; +#else + int k = best_rice_parameters[(get_group_id(0) << max_porder) + part]; +#endif int pstart = offs == task.residualOrder || offs == part * plen; v = (v << 1) ^ (v >> 31); int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs); mypos[tid] = mylen; + // Inclusive scan(+) -#if 1 int lane = (tid & (WARP_SIZE - 1)); for (int offset = 1; offset < WARP_SIZE; offset <<= 1) mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)]; @@ -2052,19 +2040,68 @@ void clRiceEncoding( mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0); int start32 = start >> 5; start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2]; -#else + + //if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32) + // printf("Oops: %d\n", mypos[tid]); + data[tid] = select(0U, remainder, tid == 0); barrier(CLK_LOCAL_MEM_FENCE); - for (int offset = 1; offset < GROUP_SIZE; offset <<= 1) + if (mylen) { - int t = tid >= offset ? mypos[tid - offset] : 0; - barrier(CLK_LOCAL_MEM_FENCE); - mypos[tid] += t; - barrier(CLK_LOCAL_MEM_FENCE); + if (pstart) + { + int kpos = mp - mylen; + int kpos0 = (kpos >> 5) - start32; + int kpos1 = kpos & 31; + unsigned int kval = (unsigned int)k << 28; + unsigned int kval0 = kval >> kpos1; + unsigned int kval1 = kval << (32 - kpos1); + if (kval0) atom_or(&data[kpos0], kval0); + if (kpos1 && kval1) atom_or(&data[kpos0 + 1], kval1); + } + int qpos = mp - k - 1; + int qpos0 = (qpos >> 5) - start32; + int qpos1 = qpos & 31; + unsigned int qval = (1U << 31) | ((unsigned int)v << (31 - k)); + unsigned int qval0 = qval >> qpos1; + unsigned int qval1= qval << (32 - qpos1); + if (qval0) atom_or(&data[qpos0], qval0); + if (qpos1 && qval1) atom_or(&data[qpos0 + 1], qval1); } - int mp = start + mypos[tid]; - int start32 = start / 32; - start += mypos[GROUP_SIZE - 1]; -#endif + barrier(CLK_LOCAL_MEM_FENCE); + if ((start32 + tid) * 32 <= start) + output[start32 + tid] = as_int(as_char4(data[tid]).wzyx); + remainder = data[start / 32 - start32]; + } + if (pos < bs) + { + int offs = pos + tid; + int v = offs < bs ? residual[task.residualOffs + offs] : 0; + int part = offs / plen; // >> plenoffs; + //int k = brp[min(255, part)]; + int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0; + int pstart = offs == task.residualOrder || offs == part * plen; + v = (v << 1) ^ (v >> 31); + int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs); + mypos[tid] = mylen; + + // Inclusive scan(+) + int lane = (tid & (WARP_SIZE - 1)); + for (int offset = 1; offset < WARP_SIZE; offset <<= 1) + mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)]; + int mp = mypos[tid]; + if ((tid & (WARP_SIZE - 1)) == WARP_SIZE - 1) + warppos[tid/WARP_SIZE] = mp; + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < GROUP_SIZE/WARP_SIZE) + { + for (int offset = 1; offset < GROUP_SIZE/WARP_SIZE; offset <<= 1) + warppos[tid] += warppos[select(GROUP_SIZE/WARP_SIZE, tid - offset, tid >= offset)]; + } + barrier(CLK_LOCAL_MEM_FENCE); + mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0); + int start32 = start >> 5; + start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2]; + //if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32) // printf("Oops: %d\n", mypos[tid]); data[tid] = select(0U, remainder, tid == 0); diff --git a/CUETools.FLACCL.cmd/CUETools.FLACL.cmd.csproj b/CUETools.FLACCL.cmd/CUETools.FLACL.cmd.csproj index 1d765a3..550563b 100644 --- a/CUETools.FLACCL.cmd/CUETools.FLACL.cmd.csproj +++ b/CUETools.FLACCL.cmd/CUETools.FLACL.cmd.csproj @@ -32,6 +32,7 @@ TRACE prompt 4 + AnyCPU diff --git a/CUETools.FLACCL.cmd/Program.cs b/CUETools.FLACCL.cmd/Program.cs index 16d7cd3..2a2f988 100644 --- a/CUETools.FLACCL.cmd/Program.cs +++ b/CUETools.FLACCL.cmd/Program.cs @@ -47,17 +47,16 @@ namespace CUETools.FLACCL.cmd Console.WriteLine("OpenCL Options:"); Console.WriteLine(); Console.WriteLine(" --opencl-type CPU or GPU, default GPU"); - Console.WriteLine(" --opencl-platform '' 'ATI Stream', 'NVIDIA Cuda', 'Intel OpenCL' etc"); + Console.WriteLine(" --opencl-platform 'ATI Stream', 'NVIDIA CUDA', 'Intel OpenCL' etc"); Console.WriteLine(" --group-size # Set GPU workgroup size (64,128,256)"); - Console.WriteLine(" --task-size # Set number of frames per GPU call, default 32"); + Console.WriteLine(" --task-size # Set number of frames per multiprocessor, default 8"); Console.WriteLine(" --slow-gpu Some encoding stages are done on CPU"); - Console.WriteLine(" --do-rice Experimental mode, not recommended"); + Console.WriteLine(" --fast-gpu Experimental mode, not recommended"); Console.WriteLine(" --define OpenCL preprocessor definition"); Console.WriteLine(); Console.WriteLine("Advanced Options:"); Console.WriteLine(); Console.WriteLine(" -b # Block size"); - Console.WriteLine(" -v # Variable block size mode (0,4)"); Console.WriteLine(" -s Stereo decorrelation (independent,search)"); Console.WriteLine(" -r #[,#] Rice partition order {max} or {min},{max} (0..8)"); Console.WriteLine(); @@ -88,9 +87,7 @@ namespace CUETools.FLACCL.cmd min_precision = -1, max_precision = -1, orders_per_window = -1, orders_per_channel = -1, blocksize = -1; -#if DEBUG int input_len = 4096, input_val = 0; -#endif int level = -1, padding = -1, vbr_mode = -1; bool do_seektable = true; bool buffered = false; @@ -111,7 +108,7 @@ namespace CUETools.FLACCL.cmd do_seektable = false; else if (args[arg] == "--slow-gpu") settings.GPUOnly = false; - else if (args[arg] == "--do-rice") + else if (args[arg] == "--fast-gpu") settings.DoRice = true; else if (args[arg] == "--no-md5") settings.DoMD5 = false; @@ -135,12 +132,10 @@ namespace CUETools.FLACCL.cmd settings.MappedMemory = true; else if (args[arg] == "--opencl-type" && ++arg < args.Length) device_type = args[arg]; -#if DEBUG else if (args[arg] == "--input-length" && ++arg < args.Length && int.TryParse(args[arg], out intarg)) input_len = intarg; else if (args[arg] == "--input-value" && ++arg < args.Length && int.TryParse(args[arg], out intarg)) input_val = intarg; -#endif else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length) output_file = args[arg]; else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length) @@ -218,10 +213,8 @@ namespace CUETools.FLACCL.cmd IAudioSource audioSource; if (input_file == "-") audioSource = new WAVReader("", Console.OpenStandardInput()); -#if DEBUG else if (input_file == "nul") audioSource = new SilenceGenerator(input_len, input_val); -#endif else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".wav") audioSource = new WAVReader(input_file, null); else if (File.Exists(input_file) && Path.GetExtension(input_file) == ".flac") @@ -326,6 +319,9 @@ namespace CUETools.FLACCL.cmd Console.Error.Write("\r \r"); Console.WriteLine("Error : {0}", ex.Message); Console.WriteLine("{0}", ex.BuildLogs[0]); + if (debug) + using (StreamWriter sw = new StreamWriter("debug.txt", true)) + sw.WriteLine("{0}\n{1}\n{2}", ex.Message, ex.StackTrace, ex.BuildLogs[0]); audioDest.Delete(); audioSource.Close(); return 4; @@ -335,6 +331,9 @@ namespace CUETools.FLACCL.cmd { Console.Error.Write("\r \r"); Console.WriteLine("Error : {0}", ex.Message); + if (debug) + using (StreamWriter sw = new StreamWriter("debug.txt", true)) + sw.WriteLine("{0}\n{1}", ex.Message, ex.StackTrace); audioDest.Delete(); audioSource.Close(); return 4;