diff --git a/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj b/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj index 8e31c17..ff9d027 100644 --- a/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj +++ b/CUETools.Codecs.FLACCL/CUETools.Codecs.FLACCL.csproj @@ -65,6 +65,9 @@ + + PreserveNewest + diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index 5b38d73..0626210 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -33,7 +33,14 @@ namespace CUETools.Codecs.FLACCL { public class FLACCLWriterSettings { - public FLACCLWriterSettings() { DoVerify = false; GPUOnly = true; DoMD5 = true; GroupSize = 64; } + public FLACCLWriterSettings() + { + this.DoVerify = false; + this.GPUOnly = true; + this.DoMD5 = true; + this.GroupSize = 64; + this.DeviceType = OpenCLDeviceType.GPU; + } [DefaultValue(false)] [DisplayName("Verify")] @@ -56,6 +63,13 @@ namespace CUETools.Codecs.FLACCL [SRDescription(typeof(Properties.Resources), "DescriptionDefines")] public string Defines { get; set; } + [SRDescription(typeof(Properties.Resources), "DescriptionPlatform")] + public string Platform { get; set; } + + [DefaultValue(OpenCLDeviceType.GPU)] + [SRDescription(typeof(Properties.Resources), "DescriptionDeviceType")] + public OpenCLDeviceType DeviceType { get; set; } + int cpu_threads = 1; [DefaultValue(1)] [SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")] @@ -74,6 +88,12 @@ namespace CUETools.Codecs.FLACCL } } + public enum OpenCLDeviceType : ulong + { + CPU = DeviceType.CPU, + GPU = DeviceType.GPU + } + [AudioEncoderClass("FLACCL", "flac", true, "0 1 2 3 4 5 6 7 8 9 10 11", "8", 2, typeof(FLACCLWriterSettings))] //[AudioEncoderClass("FLACCL nonsub", "flac", true, "9 10 11", "9", 1, typeof(FLACCLWriterSettings))] public class FLACCLWriter : IAudioDest @@ -223,6 +243,11 @@ namespace CUETools.Codecs.FLACCL if (value as FLACCLWriterSettings == null) throw new Exception("Unsupported options " + value); _settings = value as FLACCLWriterSettings; + if (_settings.DeviceType == OpenCLDeviceType.CPU) + { + _settings.GroupSize = 1; + _settings.GPUOnly = false; + } eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly); } } @@ -449,7 +474,7 @@ namespace CUETools.Codecs.FLACCL } set { - if (value < 0 || value > eparams.max_fixed_order) + if (value < 0 || value > 4) throw new Exception("invalid MinFixedOrder " + value.ToString()); eparams.min_fixed_order = value; } @@ -463,7 +488,7 @@ namespace CUETools.Codecs.FLACCL } set { - if (value > 4 || value < eparams.min_fixed_order) + if (value > 4 || value < 0) throw new Exception("invalid MaxFixedOrder " + value.ToString()); eparams.max_fixed_order = value; } @@ -963,7 +988,7 @@ namespace CUETools.Codecs.FLACCL task.nResidualTasks = 0; task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window); - task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order; + task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + (eparams.do_constant ? 1 : 0) + Math.Max(0, 1 + eparams.max_fixed_order - eparams.min_fixed_order); //if (task.nResidualTasksPerChannel >= 4) // task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7; for (int iFrame = 0; iFrame < nFrames; iFrame++) @@ -1444,7 +1469,7 @@ namespace CUETools.Codecs.FLACCL if (OpenCL.NumberOfPlatforms < 1) throw new Exception("no opencl platforms found"); - int groupSize = _settings.GroupSize; + int groupSize = _settings.DeviceType == OpenCLDeviceType.CPU ? 1 : _settings.GroupSize; OCLMan = new OpenCLManager(); // Attempt to save binaries after compilation, as well as load precompiled binaries // to avoid compilation. Usually you'll want this to be true. @@ -1475,12 +1500,30 @@ namespace CUETools.Codecs.FLACCL OCLMan.BuildOptions = ""; OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location); OCLMan.BinaryPath = System.IO.Path.Combine(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "CUE Tools"), "OpenCL"); - OCLMan.CreateDefaultContext(0, DeviceType.GPU); + int platformId = 0; + if (_settings.Platform != null) + { + platformId = -1; + string platforms = ""; + for (int i = 0; i < OpenCL.NumberOfPlatforms; i++) + { + var platform = OpenCL.GetPlatform(i); + platforms += " \"" + platform.Name + "\""; + if (platform.Name.Equals(_settings.Platform, StringComparison.InvariantCultureIgnoreCase)) + { + platformId = i; + break; + } + } + if (platformId < 0) + throw new Exception("unknown platform \"" + _settings.Platform + "\". Platforms available:" + platforms); + } + OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType); openCLContext = OCLMan.Context; try { - openCLProgram = OCLMan.CompileFile("flac.cl"); + openCLProgram = OCLMan.CompileFile(_settings.DeviceType == OpenCLDeviceType.CPU ? "flaccpu.cl" : "flac.cl"); } catch (OpenCLBuildException ex) { @@ -2218,13 +2261,12 @@ namespace CUETools.Codecs.FLACCL this.channelsCount = channelsCount; this.writer = writer; openCLProgram = _openCLProgram; - Device[] openCLDevices = openCLProgram.Context.Platform.QueryDevices(DeviceType.GPU); #if DEBUG var prop = CommandQueueProperties.PROFILING_ENABLE; #else var prop = CommandQueueProperties.NONE; #endif - openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], prop); + openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLProgram.Context.Devices[0], prop); residualTasksLen = sizeof(FLACCLSubframeTask) * channelsCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames; bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * FLACCLWriter.maxFrames; @@ -2260,12 +2302,15 @@ namespace CUETools.Codecs.FLACCL clChooseBestMethod = openCLProgram.CreateKernel("clChooseBestMethod"); clCopyBestMethod = openCLProgram.CreateKernel("clCopyBestMethod"); clCopyBestMethodStereo = openCLProgram.CreateKernel("clCopyBestMethodStereo"); - clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual"); - clCalcPartition = openCLProgram.CreateKernel("clCalcPartition"); - clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16"); - clSumPartition = openCLProgram.CreateKernel("clSumPartition"); - clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter"); - clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder"); + if (writer._settings.GPUOnly) + { + clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual"); + clCalcPartition = openCLProgram.CreateKernel("clCalcPartition"); + clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16"); + clSumPartition = openCLProgram.CreateKernel("clSumPartition"); + clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter"); + clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder"); + } samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelsCount]; outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1]; @@ -2304,12 +2349,15 @@ namespace CUETools.Codecs.FLACCL clChooseBestMethod.Dispose(); clCopyBestMethod.Dispose(); clCopyBestMethodStereo.Dispose(); - clEncodeResidual.Dispose(); - clCalcPartition.Dispose(); - clCalcPartition16.Dispose(); - clSumPartition.Dispose(); - clFindRiceParameter.Dispose(); - clFindPartitionOrder.Dispose(); + if (writer._settings.GPUOnly) + { + clEncodeResidual.Dispose(); + clCalcPartition.Dispose(); + clCalcPartition16.Dispose(); + clSumPartition.Dispose(); + clFindRiceParameter.Dispose(); + clFindPartitionOrder.Dispose(); + } clSamples.Dispose(); clSamplesBytes.Dispose(); @@ -2390,15 +2438,13 @@ namespace CUETools.Codecs.FLACCL nWindowFunctions); clComputeLPC.SetArgs( - clResidualTasks, clAutocorOutput, clLPCData, - nResidualTasksPerChannel, nWindowFunctions); openCLCQ.EnqueueNDRangeKernel( clComputeLPC, - 32, 1, + Math.Min(groupSize, 32), 1, nWindowFunctions, channelsCount * frameCount); @@ -2412,7 +2458,7 @@ namespace CUETools.Codecs.FLACCL openCLCQ.EnqueueNDRangeKernel( clQuantizeLPC, - 32, 1, + Math.Min(groupSize, 32), 1, nWindowFunctions, channelsCount * frameCount); @@ -2433,7 +2479,7 @@ namespace CUETools.Codecs.FLACCL openCLCQ.EnqueueNDRangeKernel( clChooseBestMethod, - 32, channelsCount * frameCount); + Math.Min(groupSize, 32), channelsCount * frameCount); if (channels == 2 && channelsCount == 4) { @@ -2444,7 +2490,7 @@ namespace CUETools.Codecs.FLACCL openCLCQ.EnqueueNDRangeKernel( clCopyBestMethodStereo, - 64, frameCount); + Math.Min(groupSize, 64), frameCount); } else { @@ -2455,7 +2501,7 @@ namespace CUETools.Codecs.FLACCL openCLCQ.EnqueueNDRangeKernel( clCopyBestMethod, - 64, channels * frameCount); + Math.Min(groupSize, 64), channels * frameCount); } if (writer._settings.GPUOnly) diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index 6d446ba..015761c 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -1,6 +1,6 @@ /** * CUETools.FLACCL: FLAC audio encoder using OpenCL - * Copyright (c) 2009 Gregory S. Chudov + * Copyright (c) 2010 Gregory S. Chudov * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -24,8 +24,6 @@ #pragma OPENCL EXTENSION cl_amd_printf : enable #endif -#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable - //#pragma OPENCL EXTENSION cl_amd_fp64 : enable typedef enum @@ -125,7 +123,7 @@ void clFindWastedBits( barrier(CLK_LOCAL_MEM_FENCE); int w = 0, a = 0; - for (int pos = tid; pos + tid < task.blocksize; pos += GROUP_SIZE) + for (int pos = tid; pos < task.blocksize; pos += GROUP_SIZE) { int smp = samples[task.samplesOffs + pos]; w |= smp; @@ -213,38 +211,25 @@ void clComputeAutocor( __kernel __attribute__((reqd_work_group_size(32, 1, 1))) void clComputeLPC( - __global FLACCLSubframeTask *tasks, __global float *autoc, __global float *lpcs, - int taskCount, // tasks per block int windowCount ) { __local struct { - FLACCLSubframeData task; volatile float ldr[32]; volatile float gen1[32]; volatile float error[32]; volatile float autoc[33]; - volatile int lpcOffs; - volatile int autocOffs; } shared; - const int tid = get_local_id(0);// + get_local_id(1) * 32; - - // fetch task data - if (tid < sizeof(shared.task) / sizeof(int)) - ((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1)))[tid]; - if (tid == 0) - { - shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32; - shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1); - } - barrier(CLK_LOCAL_MEM_FENCE); + const int tid = get_local_id(0);// + get_local_id(1) * 32; + int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32; + int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1); if (get_local_id(0) <= MAX_ORDER) - shared.autoc[get_local_id(0)] = autoc[shared.autocOffs + get_local_id(0)]; + shared.autoc[get_local_id(0)] = autoc[autocOffs + get_local_id(0)]; if (get_local_id(0) + get_local_size(0) <= MAX_ORDER) - shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[shared.autocOffs + get_local_id(0) + get_local_size(0)]; + shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[autocOffs + get_local_id(0) + get_local_size(0)]; barrier(CLK_LOCAL_MEM_FENCE); @@ -296,14 +281,14 @@ void clComputeLPC( // Output coeffs if (get_local_id(0) <= order) - lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)]; + lpcs[lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)]; //if (get_local_id(0) <= order + 1 && fabs(-shared.ldr[0]) > 3000) // printf("coef[%d] == %f, autoc == %f, error == %f\n", get_local_id(0), -shared.ldr[order - get_local_id(0)], shared.autoc[get_local_id(0)], shared.error[get_local_id(0)]); } barrier(CLK_LOCAL_MEM_FENCE); // Output prediction error estimates if (get_local_id(0) < MAX_ORDER) - lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)]; + lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)]; } __kernel __attribute__((reqd_work_group_size(32, 1, 1))) @@ -318,9 +303,10 @@ void clQuantizeLPC( { __local struct { FLACCLSubframeData task; - volatile int tmpi[32]; volatile int index[64]; volatile float error[64]; + volatile int maxcoef[32]; + volatile int maxcoef2[32]; volatile int lpcOffs; } shared; @@ -338,6 +324,8 @@ void clQuantizeLPC( shared.error[tid] = shared.task.blocksize * 64 + tid; shared.index[32 + tid] = MAX_ORDER - 1; shared.error[32 + tid] = shared.task.blocksize * 64 + tid + 32; + shared.maxcoef[tid] = 0; + shared.maxcoef2[tid] = 0; // Load prediction error estimates if (tid < MAX_ORDER) @@ -399,21 +387,14 @@ void clQuantizeLPC( // get 15 bits of each coeff int coef = convert_int_rte(lpc * (1 << 15)); // remove sign bits - shared.tmpi[tid] = coef ^ (coef >> 31); + atomic_or(shared.maxcoef + i, coef ^ (coef >> 31)); barrier(CLK_LOCAL_MEM_FENCE); - // OR reduction - for (int l = get_local_size(0) / 2; l > 1; l >>= 1) - { - if (tid < l) - shared.tmpi[tid] |= shared.tmpi[tid + l]; - barrier(CLK_LOCAL_MEM_FENCE); - } //SUM32(shared.tmpi,tid,|=); // choose precision //int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - convert_int_rte(shared.PE[order - 1]) int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits)); // calculate shift based on precision and number of leading zeroes in coeffs - int shift = max(0,min(15, clz(shared.tmpi[0] | shared.tmpi[1]) - 18 + cbits)); + int shift = max(0,min(15, clz(shared.maxcoef[i]) - 18 + cbits)); //cbits = 13; //shift = 15; @@ -426,18 +407,10 @@ void clQuantizeLPC( //shared.tmp[tid] = (tid != 0) * (shared.arp[tid - 1]*(1 << shared.task.shift) - shared.task.coefs[tid - 1]); //shared.task.coefs[tid] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, convert_int_rte((shared.arp[tid]) * (1 << shared.task.shift) + shared.tmp[tid]))); // remove sign bits - shared.tmpi[tid] = coef ^ (coef >> 31); + atomic_or(shared.maxcoef2 + i, coef ^ (coef >> 31)); barrier(CLK_LOCAL_MEM_FENCE); - // OR reduction - for (int l = get_local_size(0) / 2; l > 1; l >>= 1) - { - if (tid < l) - shared.tmpi[tid] |= shared.tmpi[tid + l]; - barrier(CLK_LOCAL_MEM_FENCE); - } - //SUM32(shared.tmpi,tid,|=); // calculate actual number of bits (+1 for sign) - cbits = 1 + 32 - clz(shared.tmpi[0] | shared.tmpi[1]); + cbits = 1 + 32 - clz(shared.maxcoef2[i]); // output shift, cbits and output coeffs int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i; @@ -452,10 +425,6 @@ void clQuantizeLPC( } } -#ifndef PARTORDER -#define PARTORDER 4 -#endif - __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) void clEstimateResidual( __global int*output, @@ -463,10 +432,10 @@ void clEstimateResidual( __global FLACCLSubframeTask *tasks ) { - __local int data[GROUP_SIZE * 2]; + __local float data[GROUP_SIZE * 2]; __local FLACCLSubframeTask task; - __local int residual[GROUP_SIZE]; - __local int len[GROUP_SIZE >> PARTORDER]; + __local int psum[64]; + __local float fcoef[32]; const int tid = get_local_id(0); if (tid < sizeof(task)/sizeof(int)) @@ -476,101 +445,74 @@ void clEstimateResidual( int ro = task.data.residualOrder; int bs = task.data.blocksize; - if (tid < 32 && tid >= ro) - task.coefs[tid] = 0; - if (tid < (GROUP_SIZE >> PARTORDER)) - len[tid] = 0; - data[tid] = 0; + if (tid < 32) + fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro); + //fcoef[tid] = select(0.0f, - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift), tid + ro >= MAX_ORDER && tid < MAX_ORDER); + if (tid < 64) + psum[tid] = 0; + data[tid] = 0.0f; + + int partOrder = clz(64) - clz(bs - 1) + 1; barrier(CLK_LOCAL_MEM_FENCE); - __local int4 * cptr = (__local int4 *)&task.coefs[0]; - int4 cptr0 = cptr[0]; -#if MAX_ORDER > 4 - int4 cptr1 = cptr[1]; + float4 cptr0 = vload4(0, &fcoef[0]); + float4 cptr1 = vload4(1, &fcoef[0]); #if MAX_ORDER > 8 - int4 cptr2 = cptr[2]; + float4 cptr2 = vload4(2, &fcoef[0]); #endif -#endif - for (int pos = 0; pos < bs; pos += GROUP_SIZE) { // fetch samples int offs = pos + tid; - int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0; + float nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0.0f; data[tid + GROUP_SIZE] = nextData; barrier(CLK_LOCAL_MEM_FENCE); // compute residual - __local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro]; - int4 sum = dptr[0] * cptr0 -#if MAX_ORDER > 4 - + dptr[1] * cptr1 + __local float* dptr = &data[tid + GROUP_SIZE - ro]; + float4 sum = cptr0 * vload4(0, dptr) + + cptr1 * vload4(1, dptr) #if MAX_ORDER > 8 - + dptr[2] * cptr2 -#if MAX_ORDER > 12 - + dptr[3] * cptr[3] -#if MAX_ORDER > 16 - + dptr[4] * cptr[4] - + dptr[5] * cptr[5] - + dptr[6] * cptr[6] - + dptr[7] * cptr[7] -#endif -#endif -#endif + + cptr2 * vload4(2, dptr) + #if MAX_ORDER > 12 + + vload4(3, &fcoef[0]) * vload4(3, dptr) + #if MAX_ORDER > 16 + + vload4(4, &fcoef[0]) * vload4(4, dptr) + + vload4(5, &fcoef[0]) * vload4(5, dptr) + + vload4(6, &fcoef[0]) * vload4(6, dptr) + + vload4(7, &fcoef[0]) * vload4(7, dptr) + #endif + #endif #endif ; - - int t = nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift); + + int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w); + barrier(CLK_LOCAL_MEM_FENCE); + data[tid] = nextData; // ensure we're within frame bounds t = select(0, t, offs >= ro && offs < bs); // overflow protection t = clamp(t, -0x7fffff, 0x7fffff); // convert to unsigned - residual[tid] = (t << 1) ^ (t >> 31); - barrier(CLK_LOCAL_MEM_FENCE); - data[tid] = nextData; - - // calculate rice partition bit length for every 16 samples - if (tid < (GROUP_SIZE >> PARTORDER)) - { - //__local int4 * chunk = (__local int4 *)&residual[tid << PARTORDER]; - __local int4 * chunk = ((__local int4 *)residual) + (tid << (PARTORDER - 2)); -#if PARTORDER == 3 - int4 sum = chunk[0] + chunk[1]; -#elif PARTORDER == 4 - int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3]; // [0 .. (1 << (PARTORDER - 2)) - 1] -#elif PARTORDER == 5 - int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3] + chunk[4] + chunk[5] + chunk[6] + chunk[7]; -#else -#error Invalid PARTORDER -#endif - int res = sum.x + sum.y + sum.z + sum.w; - int k = clamp(clz(1 << PARTORDER) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16) -#ifdef EXTRAMODE -#if PARTORDER == 3 - sum = (chunk[0] >> k) + (chunk[1] >> k); -#elif PARTORDER == 4 - sum = (chunk[0] >> k) + (chunk[1] >> k) + (chunk[2] >> k) + (chunk[3] >> k); -#else -#error Invalid PARTORDER -#endif - len[tid] += (k << PARTORDER) + sum.x + sum.y + sum.z + sum.w; -#else - len[tid] += (k << PARTORDER) + (res >> k); -#endif - } + atomic_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31)); } + // calculate rice partition bit length for every (1 << partOrder) samples + if (tid < 64) + { + int k = clamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16) + psum[tid] = (k << partOrder) + (psum[tid] >> k); + } barrier(CLK_LOCAL_MEM_FENCE); - for (int l = GROUP_SIZE >> (PARTORDER + 1); l > 0; l >>= 1) + for (int l = 32; l > 0; l >>= 1) { if (tid < l) - len[tid] += len[tid + l]; + psum[tid] += psum[tid + l]; barrier(CLK_LOCAL_MEM_FENCE); } if (tid == 0) - output[get_group_id(0)] = len[0] + (bs - ro); + output[get_group_id(0)] = psum[0] + (bs - ro); } __kernel __attribute__((reqd_work_group_size(32, 1, 1))) @@ -580,15 +522,16 @@ void clChooseBestMethod( int taskCount ) { - __local struct { - volatile int index[32]; - volatile int length[32]; - } shared; + int best_length = 0x7fffffff; + int best_index = 0; + __local int partLen[32]; __local FLACCLSubframeData task; const int tid = get_local_id(0); - shared.length[tid] = 0x7fffffff; - shared.index[tid] = tid; + // fetch part sum + if (tid < taskCount) + partLen[tid] = residual[tid + taskCount * get_group_id(0)]; + barrier(CLK_LOCAL_MEM_FENCE); for (int taskNo = 0; taskNo < taskCount; taskNo++) { // fetch task data @@ -599,46 +542,27 @@ void clChooseBestMethod( if (tid == 0) { - // fetch part sum - int partLen = residual[taskNo + taskCount * get_group_id(0)]; - //// calculate part size - //int residualLen = task[get_local_id(1)].data.blocksize - task[get_local_id(1)].data.residualOrder; - //residualLen = residualLen * (task[get_local_id(1)].data.type != Constant || psum != 0); - //// calculate rice parameter - //int k = max(0, min(14, convert_int_rtz(log2((psum + 0.000001f) / (residualLen + 0.000001f) + 0.5f)))); - //// calculate part bit length - //int partLen = residualLen * (k + 1) + (psum >> k); - + int pl = partLen[taskNo]; int obits = task.obits - task.wbits; - shared.length[taskNo] = - min(obits * task.blocksize, - task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + partLen : - task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + partLen : - task.type == Constant ? obits * select(1, task.blocksize, partLen != task.blocksize - task.residualOrder) : + int len = min(obits * task.blocksize, + task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + pl : + task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + pl : + task.type == Constant ? obits * select(1, task.blocksize, pl != task.blocksize - task.residualOrder) : obits * task.blocksize); + + tasks[taskNo + taskCount * get_group_id(0)].data.size = len; + if (len < best_length) + { + best_length = len; + best_index = taskNo; + } } barrier(CLK_LOCAL_MEM_FENCE); } - //shared.index[get_local_id(0)] = get_local_id(0); - //shared.length[get_local_id(0)] = (get_local_id(0) < taskCount) ? tasks[get_local_id(0) + taskCount * get_group_id(0)].size : 0x7fffffff; - if (tid < taskCount) - tasks[tid + taskCount * get_group_id(0)].data.size = shared.length[tid]; - - int l1 = shared.length[tid]; - for (int l = 16; l > 0; l >>= 1) - { - if (tid < l) - { - int l2 = shared.length[tid + l]; - shared.index[tid] = shared.index[tid + select(0, l, l2 < l1)]; - shared.length[tid] = l1 = min(l1, l2); - } - barrier(CLK_LOCAL_MEM_FENCE); - } if (tid == 0) - tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + shared.index[0]; + tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + best_index; } __kernel __attribute__((reqd_work_group_size(64, 1, 1))) @@ -788,13 +712,13 @@ void clCalcPartition( int psize // == task.blocksize >> max_porder? ) { - __local int pl[(GROUP_SIZE / 16)][15]; + __local int pl[(GROUP_SIZE / 8)][15]; __local FLACCLSubframeData task; const int tid = get_local_id(0); if (tid < sizeof(task) / sizeof(int)) ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid]; - if (tid < (GROUP_SIZE / 16)) + if (tid < (GROUP_SIZE / 8)) { for (int k = 0; k <= 14; k++) pl[tid][k] = 0; @@ -807,13 +731,14 @@ void clCalcPartition( { // fetch residual int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0; - // convert to unsigned + // overflow protection s = clamp(s, -0x7fffff, 0x7fffff); + // convert to unsigned s = (s << 1) ^ (s >> 31); // calc number of unary bits for each residual sample with each rice paramater - int part = (offs - start) / psize; + int part = (offs - start) / psize + (tid & 1) * (GROUP_SIZE / 16); for (int k = 0; k <= 14; k++) - atom_add(&pl[part][k], s >> k); + atomic_add(&pl[part][k], s >> k); //pl[part][k] += s >> k; } barrier(CLK_LOCAL_MEM_FENCE); @@ -825,7 +750,8 @@ void clCalcPartition( { // output length const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1)); - partition_lengths[pos + part] = min(0x7fffff, pl[tid][k]) + select(psize, psize - task.residualOrder, part == 0) * (k + 1); + int plen = pl[tid][k] + pl[tid + (GROUP_SIZE / 16)][k]; + partition_lengths[pos + part] = min(0x7fffff, plen) + (psize - select(0, task.residualOrder, part == 0)) * (k + 1); // if (get_group_id(1) == 0) //printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1)); } @@ -911,7 +837,7 @@ void clCalcPartition16( // convert to unsigned res[tid] = (s << 1) ^ (s >> 31); - // for (int k = 0; k < 15; k++) atom_add(&pl[x][k], s >> k); + // for (int k = 0; k < 15; k++) atomic_add(&pl[x][k], s >> k); barrier(CLK_LOCAL_MEM_FENCE); data[tid] = nextData; @@ -1001,25 +927,23 @@ void clFindPartitionOrder( int max_porder ) { - __local int partlen[9]; + __local int partlen[16]; __local FLACCLSubframeData task; const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder); if (get_local_id(0) < sizeof(task) / sizeof(int)) ((__local int*)&task)[get_local_id(0)] = ((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)]; - if (get_local_id(0) < 9) + if (get_local_id(0) < 16) partlen[get_local_id(0)] = 0; barrier(CLK_LOCAL_MEM_FENCE); // fetch partition lengths - for (int offs = 0; offs < (2 << max_porder); offs += GROUP_SIZE) + int lim = (2 << max_porder) - 1; + for (int offs = get_local_id(0); offs < lim; offs += GROUP_SIZE) { - if (offs + get_local_id(0) < (2 << max_porder) - 1) - { - int len = rice_parameters[pos + offs + get_local_id(0)]; - int porder = 31 - clz((2 << max_porder) - 1 - offs - get_local_id(0)); - atom_add(&partlen[porder], len); - } + int len = rice_parameters[pos + offs]; + int porder = 31 - clz(lim - offs); + atomic_add(&partlen[porder], len); } barrier(CLK_LOCAL_MEM_FENCE); diff --git a/CUETools.Codecs.FLACCL/flaccpu.cl b/CUETools.Codecs.FLACCL/flaccpu.cl new file mode 100644 index 0000000..a075299 --- /dev/null +++ b/CUETools.Codecs.FLACCL/flaccpu.cl @@ -0,0 +1,492 @@ +/** + * CUETools.FLACCL: FLAC audio encoder using OpenCL + * Copyright (c) 2010 Gregory S. Chudov + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _FLACCL_KERNEL_H_ +#define _FLACCL_KERNEL_H_ + +#ifdef DEBUG +#pragma OPENCL EXTENSION cl_amd_printf : enable +#endif + +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +#pragma OPENCL EXTENSION cl_amd_fp64 : enable + +typedef enum +{ + Constant = 0, + Verbatim = 1, + Fixed = 8, + LPC = 32 +} SubframeType; + +typedef struct +{ + int residualOrder; // <= 32 + int samplesOffs; + int shift; + int cbits; + int size; + int type; + int obits; + int blocksize; + int best_index; + int channel; + int residualOffs; + int wbits; + int abits; + int porder; + int reserved[2]; +} FLACCLSubframeData; + +typedef struct +{ + FLACCLSubframeData data; + int coefs[32]; // fixme: should be short? +} FLACCLSubframeTask; + +__kernel void clStereoDecorr( + __global int *samples, + __global short2 *src, + int offset +) +{ + int pos = get_global_id(0); + if (pos < offset) + { + short2 s = src[pos]; + samples[pos] = s.x; + samples[1 * offset + pos] = s.y; + samples[2 * offset + pos] = (s.x + s.y) >> 1; + samples[3 * offset + pos] = s.x - s.y; + } +} + +__kernel void clChannelDecorr2( + __global int *samples, + __global short2 *src, + int offset +) +{ + int pos = get_global_id(0); + if (pos < offset) + { + short2 s = src[pos]; + samples[pos] = s.x; + samples[1 * offset + pos] = s.y; + } +} + +//__kernel void clChannelDecorr( +// int *samples, +// short *src, +// int offset +//) +//{ +// int pos = get_global_id(0); +// if (pos < offset) +// samples[get_group_id(1) * offset + pos] = src[pos * get_num_groups(1) + get_group_id(1)]; +//} + +#define __ffs(a) (32 - clz(a & (-a))) +//#define __ffs(a) (33 - clz(~a & (a - 1))) + +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clFindWastedBits( + __global FLACCLSubframeTask *tasks, + __global int *samples, + int tasksPerChannel +) +{ + __global FLACCLSubframeTask* ptask = &tasks[get_group_id(0) * tasksPerChannel]; + int w = 0, a = 0; + for (int pos = 0; pos < ptask->data.blocksize; pos ++) + { + int smp = samples[ptask->data.samplesOffs + pos]; + w |= smp; + a |= smp ^ (smp >> 31); + } + w = max(0,__ffs(w) - 1); + a = 32 - clz(a) - w; + for (int i = 0; i < tasksPerChannel; i++) + { + ptask[i].data.wbits = w; + ptask[i].data.abits = a; + } +} + +// get_num_groups(0) == number of tasks +// get_num_groups(1) == number of windows +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clComputeAutocor( + __global float *output, + __global const int *samples, + __global const float *window, + __global FLACCLSubframeTask *tasks, + const int taskCount // tasks per block +) +{ + FLACCLSubframeData task = tasks[get_group_id(0) * taskCount].data; + int len = task.blocksize; + int windowOffs = get_group_id(1) * len; + float data1[4096 + 32]; + + // TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32] + + for (int tid = 0; tid < len; tid++) + data1[tid] = samples[task.samplesOffs + tid] * window[windowOffs + tid]; + data1[len] = 0.0f; + + for (int i = 0; i <= MAX_ORDER; ++i) + { + double temp = 1.0; + double temp2 = 1.0; + float* finish = data1 + len - i; + + for (float* pdata = data1; pdata < finish; pdata += 2) + { + temp += pdata[i] * pdata[0]; + temp2 += pdata[i + 1] * pdata[1]; + } + output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + i] = temp + temp2; + } +} + +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clComputeLPC( + __global float *pautoc, + __global float *lpcs, + int windowCount +) +{ + int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32; + int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1); + volatile double ldr[32]; + volatile double gen0[32]; + volatile double gen1[32]; + volatile double err[32]; + __global float* autoc = pautoc + autocOffs; + + for (int i = 0; i < MAX_ORDER; i++) + { + gen0[i] = gen1[i] = autoc[i + 1]; + ldr[i] = 0.0; + } + + // Compute LPC using Schur and Levinson-Durbin recursion + double error = autoc[0]; + for (int order = 0; order < MAX_ORDER; order++) + { + // Schur recursion + double reff = -gen1[0] / error; + //error += gen1[0] * reff; // Equivalent to error *= (1 - reff * reff); + error *= (1 - reff * reff); + + for (int j = 0; j < MAX_ORDER - 1 - order; j++) + { + gen1[j] = gen1[j + 1] + reff * gen0[j]; + gen0[j] = gen1[j + 1] * reff + gen0[j]; + } + + err[order] = error; + + // Levinson-Durbin recursion + + ldr[order] = reff; + for (int j = 0; j < order / 2; j++) + { + double tmp = ldr[j]; + ldr[j] += reff * ldr[order - 1 - j]; + ldr[order - 1 - j] += reff * tmp; + } + if (0 != (order & 1)) + ldr[order / 2] += ldr[order / 2] * reff; + + // Output coeffs + for (int j = 0; j <= order; j++) + lpcs[lpcOffs + order * 32 + j] = -ldr[order - j]; + } + // Output prediction error estimates + for (int j = 0; j < MAX_ORDER; j++) + lpcs[lpcOffs + MAX_ORDER * 32 + j] = err[j]; +} + +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clQuantizeLPC( + __global FLACCLSubframeTask *tasks, + __global float*lpcs, + int taskCount, // tasks per block + int taskCountLPC, // tasks per set of coeffs (<= 32) + int minprecision, + int precisions +) +{ + int bs = tasks[get_group_id(1) * taskCount].data.blocksize; + int abits = tasks[get_group_id(1) * taskCount].data.abits; + int lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32; + float error[MAX_ORDER]; + int best_orders[MAX_ORDER]; + + // Load prediction error estimates based on Akaike's Criteria + for (int tid = 0; tid < MAX_ORDER; tid++) + { + error[tid] = bs * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log(bs); + best_orders[tid] = tid; + } + + // Select best orders + for (int i = 0; i < MAX_ORDER && i < taskCountLPC; i++) + { + for (int j = i + 1; j < MAX_ORDER; j++) + { + if (error[best_orders[j]] < error[best_orders[i]]) + { + int tmp = best_orders[j]; + best_orders[j] = best_orders[i]; + best_orders[i] = tmp; + } + } + } + + // Quantization + for (int i = 0; i < taskCountLPC; i ++) + { + int order = best_orders[i >> precisions]; + int tmpi = 0; + for (int tid = 0; tid <= order; tid ++) + { + float lpc = lpcs[lpcOffs + order * 32 + tid]; + // get 15 bits of each coeff + int c = convert_int_rte(lpc * (1 << 15)); + // remove sign bits + tmpi |= c ^ (c >> 31); + } + // choose precision + //int cbits = max(3, min(10, 5 + (abits >> 1))); // - convert_int_rte(shared.PE[order - 1]) + int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - abits)); + // calculate shift based on precision and number of leading zeroes in coeffs + int shift = max(0,min(15, clz(tmpi) - 18 + cbits)); + + int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i; + tmpi = 0; + for (int tid = 0; tid <= order; tid ++) + { + float lpc = lpcs[lpcOffs + order * 32 + tid]; + // quantize coeffs with given shift + int c = convert_int_rte(clamp(lpc * (1 << shift), -1 << (cbits - 1), 1 << (cbits - 1))); + // remove sign bits + tmpi |= c ^ (c >> 31); + tasks[taskNo].coefs[tid] = c; + } + // calculate actual number of bits (+1 for sign) + cbits = 1 + 32 - clz(tmpi); + // output shift, cbits, ro + tasks[taskNo].data.shift = shift; + tasks[taskNo].data.cbits = cbits; + tasks[taskNo].data.residualOrder = order + 1; + } +} + +#define ESTIMATE_N(ro,sum) for (int pos = ro; pos < bs; pos ++) { \ + __global int *ptr = data + pos - ro; \ + int t = clamp((data[pos] - ((sum) >> task.data.shift)) >> task.data.wbits, -0x7fffff, 0x7fffff); \ + len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31); \ + } + +// int sum = 0; for (int i = 0; i < ro; i++) sum += *(ptr++) * task.coefs[i]; + +__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(1, 1, 1))) +void clEstimateResidual( + __global int*output, + __global int*samples, + __global FLACCLSubframeTask *tasks + ) +{ + FLACCLSubframeTask task = tasks[get_group_id(0)]; + int ro = task.data.residualOrder; + int bs = task.data.blocksize; +#define EPO 6 + int len[1 << EPO]; + +#if 0 + //float data[4096 + 32]; + //float fcoef[32]; + + // TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32] + + for (int tid = 0; tid < bs; tid++) + data[tid] = (float)samples[task.data.samplesOffs + tid] / (1 << task.data.wbits); + for (int tid = 0; tid < 32; tid++) + fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro); + float4 c0 = vload4(0, &fcoef[0]); + float4 c1 = vload4(1, &fcoef[0]); + float4 c2 = vload4(2, &fcoef[0]); +#else + __global int *data = &samples[task.data.samplesOffs]; + for (int i = ro; i < 32; i++) + task.coefs[i] = 0; +#endif + for (int i = 0; i < 1 << EPO; i++) + len[i] = 0; + + switch (ro) + { + case 0: ESTIMATE_N(0, 0) break; + case 1: ESTIMATE_N(1, *ptr * task.coefs[0]) break; + case 2: ESTIMATE_N(2, *(ptr++) * task.coefs[0] + *ptr * task.coefs[1]) break; + case 3: ESTIMATE_N(3, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *ptr * task.coefs[2]) break; + case 4: ESTIMATE_N(4, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *ptr * task.coefs[3]) break; + case 5: ESTIMATE_N(5, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *ptr * task.coefs[4]) break; + case 6: ESTIMATE_N(6, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *ptr * task.coefs[5]) break; + case 7: ESTIMATE_N(7, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *ptr * task.coefs[6]) break; + case 8: ESTIMATE_N(8, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *ptr * task.coefs[7]) break; + case 9: ESTIMATE_N(9, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *ptr * task.coefs[8]) break; + case 10: ESTIMATE_N(10, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *ptr * task.coefs[9]) break; + case 11: ESTIMATE_N(11, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *ptr * task.coefs[10]) break; + case 12: ESTIMATE_N(12, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *ptr * task.coefs[11]) break; + default: + for (int pos = ro; pos < bs; pos ++) + { + #if 0 + float sum = dot(vload4(0, data + pos - ro), c0) + + dot(vload4(1, data + pos - ro), c1) + + dot(vload4(2, data + pos - ro), c2) + ; + int t = convert_int_rte(data[pos] + sum); + #else + __global int *ptr = data + pos - ro; + int sum = + *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *(ptr++) * task.coefs[11] + ; + for (int i = 12; i < ro; i++) + sum += *(ptr++) * task.coefs[i]; + int t = (data[pos] - (sum >> task.data.shift)) >> task.data.wbits; + #endif + // overflow protection + t = clamp(t, -0x7fffff, 0x7fffff); + // convert to unsigned + t = (t << 1) ^ (t >> 31); + len[pos >> (12 - EPO)] += t; + } + break; + } + + int total = 0; + for (int i = 0; i < 1 << EPO; i++) + { + int res = min(0x7fffff,len[i]); + int k = clamp(clz(1 << (12 - EPO)) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16) + total += (k << (12 - EPO)) + (res >> k); + } + output[get_group_id(0)] = min(0x7ffffff, total) + (bs - ro); +} + +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clChooseBestMethod( + __global FLACCLSubframeTask *tasks, + __global int *residual, + int taskCount + ) +{ + int best_length = 0x7fffff; + int best_no = 0; + for (int taskNo = 0; taskNo < taskCount; taskNo++) + { + // fetch task data + __global FLACCLSubframeTask* ptask = tasks + taskNo + taskCount * get_group_id(0); + // fetch part sum + int partLen = residual[taskNo + taskCount * get_group_id(0)]; + int obits = ptask->data.obits - ptask->data.wbits; + int bs = ptask->data.blocksize; + int ro = ptask->data.residualOrder; + int len = min(obits * bs, + ptask->data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen : + ptask->data.type == LPC ? ro * obits + 4 + 5 + ro * ptask->data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen : + ptask->data.type == Constant ? obits * select(1, bs, partLen != bs - ro) : + obits * bs); + ptask->data.size = len; + if (len < best_length) + { + best_length = len; + best_no = taskNo; + } + } + + tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + best_no; +} + +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clCopyBestMethod( + __global FLACCLSubframeTask *tasks_out, + __global FLACCLSubframeTask *tasks, + int count + ) +{ + int best_index = tasks[count * get_group_id(0)].data.best_index; + tasks_out[get_group_id(0)] = tasks[best_index]; +} + +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clCopyBestMethodStereo( + __global FLACCLSubframeTask *tasks_out, + __global FLACCLSubframeTask *tasks, + int count + ) +{ + int best_index[4]; + int best_size[4]; + int lr_index[2]; + + for (int i = 0; i < 4; i++) + { + int best = tasks[count * (get_group_id(0) * 4 + i)].data.best_index; + best_index[i] = best; + best_size[i] = tasks[best].data.size; + } + + int bitsBest = best_size[2] + best_size[3]; // MidSide + lr_index[0] = best_index[2]; + lr_index[1] = best_index[3]; + if (bitsBest > best_size[3] + best_size[1]) // RightSide + { + bitsBest = best_size[3] + best_size[1]; + lr_index[0] = best_index[3]; + lr_index[1] = best_index[1]; + } + if (bitsBest > best_size[0] + best_size[3]) // LeftSide + { + bitsBest = best_size[0] + best_size[3]; + lr_index[0] = best_index[0]; + lr_index[1] = best_index[3]; + } + if (bitsBest > best_size[0] + best_size[1]) // LeftRight + { + bitsBest = best_size[0] + best_size[1]; + lr_index[0] = best_index[0]; + lr_index[1] = best_index[1]; + } + tasks_out[2 * get_group_id(0)] = tasks[lr_index[0]]; + tasks_out[2 * get_group_id(0)].data.residualOffs = tasks[best_index[0]].data.residualOffs; + tasks_out[2 * get_group_id(0) + 1] = tasks[lr_index[1]]; + tasks_out[2 * get_group_id(0) + 1].data.residualOffs = tasks[best_index[1]].data.residualOffs; +} + +#endif diff --git a/CUETools.FLACCL.cmd/Program.cs b/CUETools.FLACCL.cmd/Program.cs index 60719f7..377e2ef 100644 --- a/CUETools.FLACCL.cmd/Program.cs +++ b/CUETools.FLACCL.cmd/Program.cs @@ -72,6 +72,7 @@ namespace CUETools.FLACCL.cmd string window_function = null; string input_file = null; string output_file = null; + string device_type = null; int min_partition_order = -1, max_partition_order = -1, min_lpc_order = -1, max_lpc_order = -1, min_fixed_order = -1, max_fixed_order = -1, @@ -112,6 +113,10 @@ namespace CUETools.FLACCL.cmd settings.GroupSize = intarg; else if (args[arg] == "--define" && arg + 2 < args.Length) settings.Defines += "#define " + args[++arg] + " " + args[++arg] + "\n"; + else if (args[arg] == "--opencl-platform" && ++arg < args.Length) + settings.Platform = args[arg]; + else if (args[arg] == "--opencl-type" && ++arg < args.Length) + device_type = args[arg]; else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length) output_file = args[arg]; else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length) @@ -210,6 +215,8 @@ namespace CUETools.FLACCL.cmd try { + if (device_type != null) + settings.DeviceType = (OpenCLDeviceType)(Enum.Parse(typeof(OpenCLDeviceType), device_type, true)); encoder.Settings = settings; if (level >= 0) encoder.CompressionLevel = level;