diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index d55863f..9a2989e 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -195,7 +195,7 @@ namespace CUETools.Codecs.FLACCL _path = path; _IO = IO; - eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly); + eparams.flake_set_defaults(_compressionLevel); eparams.padding_size = 8192; crc8 = new Crc8(); @@ -238,7 +238,7 @@ namespace CUETools.Codecs.FLACCL if (value < 0 || value > 11) throw new Exception("unsupported compression level"); _compressionLevel = value; - eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly); + eparams.flake_set_defaults(_compressionLevel); } } @@ -261,7 +261,6 @@ namespace CUETools.Codecs.FLACCL //_settings.GPUOnly = true; _settings.MappedMemory = true; } - eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly); } } @@ -1582,19 +1581,6 @@ namespace CUETools.Codecs.FLACCL // If true, RequireImageSupport will filter out any devices without image support // In this project we don't need image support though, so we set it to false OCLMan.RequireImageSupport = false; - // The Defines string gets prepended to any and all sources that are compiled - // and serve as a convenient way to pass configuration information to the compilation process - OCLMan.Defines = - "#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" + - "#define GROUP_SIZE " + groupSize.ToString() + "\n" + - "#define FLACCL_VERSION \"" + vendor_string + "\"\n" + - (_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") + - (_settings.DoRice ? "#define DO_RICE\n" : "") + -#if DEBUG - "#define DEBUG\n" + -#endif - (_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") + - _settings.Defines + "\n"; // The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc OCLMan.BuildOptions = ""; OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location); @@ -1620,7 +1606,23 @@ namespace CUETools.Codecs.FLACCL OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType); if (OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics")) - OCLMan.Defines += "#define HAVE_ATOM\n"; + _settings.Defines += "#define HAVE_ATOM\n"; + else + _settings.GPUOnly = false; + + // The Defines string gets prepended to any and all sources that are compiled + // and serve as a convenient way to pass configuration information to the compilation process + OCLMan.Defines = + "#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" + + "#define GROUP_SIZE " + groupSize.ToString() + "\n" + + "#define FLACCL_VERSION \"" + vendor_string + "\"\n" + + (_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") + + (_settings.DoRice ? "#define DO_RICE\n" : "") + +#if DEBUG + "#define DEBUG\n" + +#endif + (_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") + + _settings.Defines + "\n"; try { @@ -2146,7 +2148,7 @@ namespace CUETools.Codecs.FLACCL public bool do_seektable; - public int flake_set_defaults(int lvl, bool encode_on_cpu) + public int flake_set_defaults(int lvl) { compression = lvl; @@ -2165,7 +2167,7 @@ namespace CUETools.Codecs.FLACCL min_prediction_order = 1; max_prediction_order = 12; min_partition_order = 0; - max_partition_order = 6; + max_partition_order = 8; variable_block_size = 0; lpc_min_precision_search = 0; lpc_max_precision_search = 0; @@ -2183,7 +2185,6 @@ namespace CUETools.Codecs.FLACCL do_midside = false; window_function = WindowFunction.Bartlett; orders_per_window = 1; - max_partition_order = 4; max_prediction_order = 7; min_fixed_order = 3; max_fixed_order = 2; @@ -2197,7 +2198,6 @@ namespace CUETools.Codecs.FLACCL min_fixed_order = 2; max_fixed_order = 2; max_prediction_order = 7; - max_partition_order = 4; break; case 2: do_constant = false; @@ -2207,7 +2207,6 @@ namespace CUETools.Codecs.FLACCL min_fixed_order = 2; max_fixed_order = 2; max_prediction_order = 8; - max_partition_order = 4; break; case 3: do_constant = false; @@ -2272,9 +2271,6 @@ namespace CUETools.Codecs.FLACCL break; } - if (!encode_on_cpu) - max_partition_order = 8; - return 0; } } @@ -2456,14 +2452,14 @@ namespace CUETools.Codecs.FLACCL clSelectedTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen); clRiceOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen); - clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.WRITE, 0, samplesBufferLen / 2); - clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidual, true, MapFlags.WRITE, 0, residualBufferLen); - clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParams, true, MapFlags.WRITE, 0, riceParamsLen / 4); - clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasks, true, MapFlags.WRITE, 0, residualTasksLen); - clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasks, true, MapFlags.WRITE, 0, bestResidualTasksLen); - clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctions, true, MapFlags.WRITE, 0, wndLen); - clSelectedTasksPtr = openCLCQ.EnqueueMapBuffer(clSelectedTasks, true, MapFlags.WRITE, 0, selectedLen); - clRiceOutputPtr = openCLCQ.EnqueueMapBuffer(clRiceOutput, true, MapFlags.WRITE, 0, riceLen); + clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.READ_WRITE, 0, samplesBufferLen / 2); + clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidual, true, MapFlags.READ_WRITE, 0, residualBufferLen); + clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParams, true, MapFlags.READ_WRITE, 0, riceParamsLen / 4); + clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasks, true, MapFlags.READ_WRITE, 0, residualTasksLen); + clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasks, true, MapFlags.READ_WRITE, 0, bestResidualTasksLen); + clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctions, true, MapFlags.READ_WRITE, 0, wndLen); + clSelectedTasksPtr = openCLCQ.EnqueueMapBuffer(clSelectedTasks, true, MapFlags.READ_WRITE, 0, selectedLen); + clRiceOutputPtr = openCLCQ.EnqueueMapBuffer(clRiceOutput, true, MapFlags.READ_WRITE, 0, riceLen); //clSamplesBytesPtr = clSamplesBytes.HostPtr; //clResidualPtr = clResidual.HostPtr; diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index cfea513..faa3d64 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -865,6 +865,10 @@ void clEstimateResidual( obits * bs); } #else + +#define MAX_BLOCKSIZE 4096 +#define ESTPARTLOG 5 + __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) void clEstimateResidual( __global int*samples, @@ -877,7 +881,7 @@ void clEstimateResidual( __local volatile int idata[GROUP_SIZE]; #endif __local FLACCLSubframeTask task; - __local int psum[64]; + __local int psum[MAX_BLOCKSIZE >> ESTPARTLOG]; __local float fcoef[32]; __local int selectedTask; @@ -896,15 +900,13 @@ void clEstimateResidual( if (tid < 32) //fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro); fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f; - if (tid < 64) - psum[tid] = 0; + for (int offs = tid; offs < (MAX_BLOCKSIZE >> ESTPARTLOG); offs += GROUP_SIZE) + psum[offs] = 0; data[tid] = 0.0f; // need to initialize "extra" data, because NaNs can produce wierd results even when multipled by zero extra coefs if (tid < 32) data[GROUP_SIZE * 2 + tid] = 0.0f; - int partOrder = max(6, clz(64) - clz(bs - 1) + 1); - barrier(CLK_LOCAL_MEM_FENCE); #ifdef AMD @@ -960,35 +962,35 @@ void clEstimateResidual( // convert to unsigned t = (t << 1) ^ (t >> 31); #if !defined(AMD) || !defined(HAVE_ATOM) - // convert to unsigned idata[tid] = t; - barrier(CLK_LOCAL_MEM_FENCE); - int ps = (1 << partOrder) - 1; - int lane = tid & ps; - for (int l = 1 << (partOrder - 1); l > WARP_SIZE; l >>= 1) - { - if (lane < l) idata[tid] += idata[tid + l]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if (lane < WARP_SIZE) - for (int l = WARP_SIZE; l > 0; l >>= 1) - idata[tid] += idata[tid + l]; - if (lane == 0) - psum[min(63,offs >> partOrder)] += idata[tid]; + for (int l = 16; l > 1; l >>= 1) + idata[tid] += idata[tid + l]; + if ((tid & 31) == 0) + psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1]; #else - atom_add(&psum[min(63,offs >> partOrder)], t); + atom_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t); #endif } - // calculate rice partition bit length for every (1 << partOrder) samples + // calculate rice partition bit length for every 32 samples barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 64) - { - int k = iclamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16) - psum[tid] = (k << partOrder) + (psum[tid] >> k); - } + // Bug: if (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) > GROUP_SIZE + int pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? pl = psum[tid * 2] + psum[tid * 2 + 1] : 0; barrier(CLK_LOCAL_MEM_FENCE); - for (int l = 32; l > 0; l >>= 1) + // for (int pos = 0; pos < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2; pos += GROUP_SIZE) + // { + //int offs = pos + tid; + //int pl = offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2 ? psum[offs * 2] + psum[offs * 2 + 1] : 0; + ////int pl = psum[offs * 2] + psum[offs * 2 + 1]; + //barrier(CLK_LOCAL_MEM_FENCE); + //if (offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2) + // psum[offs] = pl; + // } + int k = iclamp(31 - (ESTPARTLOG + 1) - clz(pl), 0, 14); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32) + if (tid < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2) + psum[tid] = (k << (ESTPARTLOG + 1)) + (pl >> k); + barrier(CLK_LOCAL_MEM_FENCE); + for (int l = MAX_BLOCKSIZE >> (ESTPARTLOG + 2); l > 0; l >>= 1) { if (tid < l) psum[tid] += psum[tid + l]; @@ -1796,7 +1798,11 @@ void clRiceEncoding( unsigned int bb = bw.bit_buf << bw.bit_left; bw.bit_buf = 0; bw.bit_left += (32 - b); +#ifdef AMD + bw.buffer[bw.buf_ptr++] = as_int(as_char4(bb).wzyx); +#else bw.buffer[bw.buf_ptr++] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000); +#endif } bits -= b; } @@ -1811,7 +1817,11 @@ void clRiceEncoding( unsigned int bb = (bw.bit_buf << bw.bit_left) | (val >> (bits - bw.bit_left)); bw.bit_buf = val; bw.bit_left += (32 - bits); +#ifdef AMD + bw.buffer[bw.buf_ptr++] = as_int(as_char4(bb).wzyx); +#else bw.buffer[bw.buf_ptr++] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000); +#endif } ////if (get_group_id(0) == 0) printf("%x ", v); //writebits(&bw, (v >> k) + 1, 1); @@ -1916,10 +1926,8 @@ void clRiceEncoding( atom_or(&data[qpos0 + 1], qval1); start = mypos[GROUP_SIZE - 1]; barrier(CLK_LOCAL_MEM_FENCE); - unsigned int bb = data[tid]; - bb = (bb >> 24) | ((bb >> 8) & 0xff00U) | ((bb << 8) & 0xff0000U) | (bb << 24); if ((start32 + tid) * 32 <= start) - output[start32 + tid] = bb; + output[start32 + tid] = as_int(as_char4(data[tid]).wzyx); unsigned int remainder = data[start / 32 - start32]; barrier(CLK_LOCAL_MEM_FENCE); data[tid] = select(0U, remainder, tid == 0);