From 0466eb57c53116f24c30c1b0240cb817959e19fa Mon Sep 17 00:00:00 2001
From: chudov
Date: Sun, 31 Oct 2010 07:42:09 +0000
Subject: [PATCH] optimizations

---
 CUETools.Codecs.FLACCL/FLACCLWriter.cs |  14 ++-
 CUETools.Codecs.FLACCL/flac.cl         | 144 +++++++++++----
 CUETools.Codecs.FLACCL/flaccpu.cl      |  24 ++---
 3 files changed, 75 insertions(+), 107 deletions(-)

diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
index 03d96a6..9f8dc87 100644
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -39,7 +39,7 @@ namespace CUETools.Codecs.FLACCL
             this.GPUOnly = true;
             this.MappedMemory = false;
             this.DoMD5 = true;
-            this.GroupSize = 64;
+            this.GroupSize = 128;
             this.DeviceType = OpenCLDeviceType.GPU;
         }
 
@@ -61,7 +61,7 @@ namespace CUETools.Codecs.FLACCL
         [SRDescription(typeof(Properties.Resources), "DescriptionMappedMemory")]
         public bool MappedMemory { get; set; }
 
-        [DefaultValue(64)]
+        [DefaultValue(128)]
         [SRDescription(typeof(Properties.Resources), "DescriptionGroupSize")]
         public int GroupSize { get; set; }
 
@@ -1504,6 +1504,7 @@ namespace CUETools.Codecs.FLACCL
             OCLMan.Defines =
                 "#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
                 "#define GROUP_SIZE " + groupSize.ToString() + "\n" +
+                "#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
 #if DEBUG
                 "#define DEBUG\n" +
 #endif
@@ -2112,9 +2113,9 @@ namespace CUETools.Codecs.FLACCL
                     do_constant = false;
                     do_midside = false;
                     window_function = WindowFunction.Bartlett;
+                    orders_per_window = 1;
                     min_fixed_order = 2;
                     max_fixed_order = 2;
-                    orders_per_window = 1;
                     max_prediction_order = 8;
                     max_partition_order = 4;
                     break;
@@ -2235,7 +2236,6 @@ namespace CUETools.Codecs.FLACCL
         public Mem clBestRiceParams;
         public Mem clAutocorOutput;
         public Mem clResidualTasks;
-        public Mem clResidualOutput;
         public Mem clBestResidualTasks;
         public Mem clWindowFunctions;
 
@@ -2356,7 +2356,6 @@ namespace CUETools.Codecs.FLACCL
             clSamples = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, samplesBufferLen);
             clLPCData = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, lpcDataLen);
             clAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, autocorLen);
-            clResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, resOutLen);
             if (writer._settings.GPUOnly)
             {
                 clPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen);
@@ -2483,7 +2482,6 @@ namespace CUETools.Codecs.FLACCL
             clResidual.Dispose();
             clAutocorOutput.Dispose();
             clResidualTasks.Dispose();
-            clResidualOutput.Dispose();
             clBestResidualTasks.Dispose();
             clWindowFunctions.Dispose();
 
@@ -2587,7 +2585,6 @@ namespace CUETools.Codecs.FLACCL
                 channelsCount * frameCount);
 
             clEstimateResidual.SetArgs(
-                clResidualOutput,
                 clSamples,
                 clResidualTasks);
 
@@ -2598,7 +2595,6 @@ namespace CUETools.Codecs.FLACCL
 
             clChooseBestMethod.SetArgs(
                 clResidualTasks,
-                clResidualOutput,
                 nResidualTasksPerChannel);
 
             openCLCQ.EnqueueNDRangeKernel(
@@ -2714,6 +2710,7 @@ namespace CUETools.Codecs.FLACCL
         }
     }
 
+#if HJHKHJ
     public static class OpenCLExtensions
     {
         public static void SetArgs(this Kernel kernel, params object[] args)
@@ -2744,4 +2741,5 @@ namespace CUETools.Codecs.FLACCL
             queue.EnqueueNDRangeKernel(kernel, 2, null, new long[] { localSizeX * globalSizeX, localSizeY * globalSizeY }, new long[] { localSizeX, localSizeY });
         }
     }
+#endif
 }
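The C# side of this change is mostly plumbing: the intermediate clResidualOutput buffer goes away because clEstimateResidual now writes its length estimate straight into each task's data.size (see the flac.cl hunks below), and the default work-group size doubles from 64 to 128. The OCLMan.Defines string is how these build-time constants reach the kernels; a minimal sketch of the same mechanism in raw OpenCL C API terms (this helper is illustrative, not the actual CUETools wrapper):

    #include <stdio.h>
    #include <CL/cl.h>

    /* Equivalent of prepending "#define MAX_ORDER ..." / "#define GROUP_SIZE ..."
     * to the kernel source: pass them as -D options at build time. */
    static cl_int build_flaccl(cl_program prog, cl_device_id dev,
                               int max_order, int group_size)
    {
        char options[128];
        snprintf(options, sizeof options,
                 "-D MAX_ORDER=%d -D GROUP_SIZE=%d", max_order, group_size);
        return clBuildProgram(prog, 1, &dev, options, NULL, NULL);
    }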
diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl
index ea7e1a2..f0e2fb2 100644
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -20,15 +20,16 @@
 #ifndef _FLACCL_KERNEL_H_
 #define _FLACCL_KERNEL_H_
 
-#undef DEBUG
-
-//#define AMD
-
-//#ifdef DEBUG
-//#pragma OPENCL EXTENSION cl_amd_printf : enable
-//#endif
-
+#if defined(__Cedar__) || defined(__Redwood__) || defined(__Juniper__) || defined(__Cypress__)
+#define AMD
+#ifdef DEBUG
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#endif
 //#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#define iclamp(a,b,c) clamp(a,b,c)
+#else
+#define iclamp(a,b,c) max(b,min(a,c))
+#endif
 
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
@@ -66,8 +67,6 @@ typedef struct
     int coefs[32]; // fixme: should be short?
 } FLACCLSubframeTask;
 
-#define iclamp(a,b,c) max(b,min(a,c))
-
 __kernel void clStereoDecorr(
     __global int *samples,
     __global short2 *src,
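On the Evergreen targets probed here (Cedar/Redwood/Juniper/Cypress), iclamp() now maps to the native clamp() builtin; everywhere else it expands to max(b, min(a, c)). Both branches pin the first argument to [b, c]; a scalar reference for what either expansion computes:

    /* Reference for iclamp(a, b, c): clamp a into [b, c]. */
    static int iclamp_ref(int a, int b, int c)
    {
        return a < b ? b : (a > c ? c : a); /* == max(b, min(a, c)) */
    }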
@@ -172,15 +171,11 @@ void clComputeAutocor(
 )
 {
     __local float data[GROUP_SIZE * 2];
-    __local float product[(MAX_ORDER / 4 + 1) * GROUP_SIZE];
     __local FLACCLSubframeData task;
     const int tid = get_local_id(0);
     // fetch task data
     if (tid < sizeof(task) / sizeof(int))
         ((__local int*)&task)[tid] = ((__global int*)(tasks + taskCount * get_group_id(0)))[tid];
-
-    for (int ord4 = 0; ord4 < (MAX_ORDER / 4 + 1); ord4 ++)
-        product[ord4 * GROUP_SIZE + tid] = 0.0f;
     barrier(CLK_LOCAL_MEM_FENCE);
 
     int bs = task.blocksize;
@@ -188,13 +183,9 @@
     data[tid] = tid < bs ? samples[task.samplesOffs + tid] * window[windowOffs + tid] : 0.0f;
 
-    int tid0 = tid % (GROUP_SIZE >> 2);
-    int tid1 = tid / (GROUP_SIZE >> 2);
-#ifdef ATI
-    __local float4 * dptr = ((__local float4 *)&data[0]) + tid0;
-    __local float4 * dptr1 = ((__local float4 *)&data[tid1]) + tid0;
-#endif
-
+    const int THREADS_FOR_ORDERS = MAX_ORDER < 8 ? 8 : MAX_ORDER < 16 ? 16 : MAX_ORDER < 32 ? 32 : 64;
+    float corr = 0.0f;
+    float corr1 = 0.0f;
     for (int pos = 0; pos < bs; pos += GROUP_SIZE)
     {
         // fetch samples
@@ -202,29 +193,40 @@
         data[tid + GROUP_SIZE] = nextData;
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        for (int ord4 = 0; ord4 < (MAX_ORDER / 4 + 1); ord4 ++)
-#ifdef ATI
-            product[ord4 * GROUP_SIZE + tid] += dot(dptr[0], dptr1[ord4]);
+#ifdef XXXAMD
+        __local float * dptr = &data[tid & ~(THREADS_FOR_ORDERS - 1)];
+        float4 res = 0.0f;
+        for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
+            res += vload4(i, dptr) * vload4(i, &data[tid]);
+        corr += res.x + res.y + res.w + res.z;
 #else
-            product[ord4 * GROUP_SIZE + tid] +=
-                data[tid0*4 + 0] * data[tid0*4 + ord4*4 + tid1 + 0] +
-                data[tid0*4 + 1] * data[tid0*4 + ord4*4 + tid1 + 1] +
-                data[tid0*4 + 2] * data[tid0*4 + ord4*4 + tid1 + 2] +
-                data[tid0*4 + 3] * data[tid0*4 + ord4*4 + tid1 + 3];
+        int tid1 = tid & ~(THREADS_FOR_ORDERS - 1);
+        float res = 0.0f;
+        for (int i = 0; i < THREADS_FOR_ORDERS; i++)
+            res += data[tid1 + i] * data[tid + i];
+        corr += res;
 #endif
-        barrier(CLK_LOCAL_MEM_FENCE);
+        if (THREADS_FOR_ORDERS > 8 && (pos & (GROUP_SIZE * 7)) == 0)
+        {
+            corr1 += corr;
+            corr = 0.0f;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
         data[tid] = nextData;
     }
-    for (int ord4 = 0; ord4 < (MAX_ORDER / 4 + 1); ord4 ++)
-        for (int l = (GROUP_SIZE >> 3); l > 0; l >>= 1)
-        {
-            if (tid0 < l)
-                product[ord4 * GROUP_SIZE + tid] += product[ord4 * GROUP_SIZE + tid + l];
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
+
+    data[tid] = corr + corr1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int i = GROUP_SIZE / 2; i >= THREADS_FOR_ORDERS; i >>= 1)
+    {
+        if (tid < i)
+            data[tid] += data[tid + i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
     if (tid <= MAX_ORDER)
-        output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = product[tid * (GROUP_SIZE >> 2)];
+        output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = data[tid];
 }
 
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
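The rewritten clComputeAutocor drops the per-order product[] scratch array. Each work-item now accumulates, in registers, the dot product between its THREADS_FOR_ORDERS-aligned segment of the windowed block and the segment starting at its own offset, so the lag it covers is tid modulo the alignment stride; corr spills into corr1 every eighth pass to limit float accumulation error, and one tree reduction over data[] then sums the partials that share a lag. A scalar reference for the quantity the kernel must reproduce:

    /* Windowed autocorrelation at lags 0..max_order; data[] is the
     * window-multiplied block of n samples, zero-padded as in the kernel. */
    static void autocor_ref(const float *data, int n, int max_order, float *out)
    {
        for (int lag = 0; lag <= max_order; lag++) {
            float sum = 0.0f;
            for (int i = 0; i + lag < n; i++)
                sum += data[i] * data[i + lag];
            out[lag] = sum;
        }
    }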
@@ -326,7 +328,6 @@ void clQuantizeLPC(
         volatile float error[64];
         volatile int maxcoef[32];
         volatile int maxcoef2[32];
-        volatile int lpcOffs;
     } shared;
     const int tid = get_local_id(0);
 
@@ -334,9 +335,8 @@ void clQuantizeLPC(
     // fetch task data
     if (tid < sizeof(shared.task) / sizeof(int))
         ((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
-    if (tid == 0)
-        shared.lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
     barrier(CLK_LOCAL_MEM_FENCE);
+    const int lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
 
     // Select best orders based on Akaike's Criteria
     shared.index[tid] = min(MAX_ORDER - 1, tid);
@@ -348,8 +348,8 @@ void clQuantizeLPC(
 
     // Load prediction error estimates
     if (tid < MAX_ORDER)
-        shared.error[tid] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)shared.task.blocksize);
-        //shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
+        shared.error[tid] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)shared.task.blocksize);
+        //shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
     barrier(CLK_LOCAL_MEM_FENCE);
 
     // Sort using bitonic sort
@@ -402,7 +402,7 @@ void clQuantizeLPC(
     for (int i = 0; i < taskCountLPC; i ++)
     {
         int order = shared.index[i >> precisions];
-        float lpc = tid <= order ? lpcs[shared.lpcOffs + order * 32 + tid] : 0.0f;
+        float lpc = tid <= order ? lpcs[lpcOffs + order * 32 + tid] : 0.0f;
         // get 15 bits of each coeff
         int coef = convert_int_rte(lpc * (1 << 15));
         // remove sign bits
@@ -446,7 +446,6 @@
 __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void clEstimateResidual(
-    __global int*output,
     __global int*samples,
     __global FLACCLSubframeTask *tasks
 )
@@ -524,7 +523,7 @@ void clEstimateResidual(
         // overflow protection
         t = iclamp(t, -0x7fffff, 0x7fffff);
         // convert to unsigned
-        if (offs < bs)
+        //if (offs < bs)
             atom_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31));
     }
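The atom_add above folds each clamped residual through (t << 1) ^ (t >> 31) before summing. That is the zigzag map commonly paired with Rice coding: it interleaves signed values onto unsigned codes (0, -1, 1, -2, 2, ... onto 0, 1, 2, 3, 4, ...) so residual magnitudes accumulate linearly into a length estimate. A self-contained reference pair, assuming arithmetic right shift:

    #include <assert.h>

    static unsigned zigzag(int t)        { return ((unsigned)t << 1) ^ (unsigned)(t >> 31); }
    static int      unzigzag(unsigned u) { return (int)(u >> 1) ^ -(int)(u & 1); }

    int main(void)
    {
        for (int t = -8; t <= 8; t++)
            assert(unzigzag(zigzag(t)) == t); /* round-trips for all residuals */
        return 0;
    }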
@@ -542,52 +541,39 @@
         barrier(CLK_LOCAL_MEM_FENCE);
     }
     if (tid == 0)
-        output[get_group_id(0)] = psum[0] + (bs - ro);
+    {
+        int pl = psum[0] + (bs - ro);
+        int obits = task.data.obits - task.data.wbits;
+        int len = min(obits * task.data.blocksize,
+            task.data.type == Fixed ? task.data.residualOrder * obits + 6 + (4 * 1/2) + pl :
+            task.data.type == LPC ? task.data.residualOrder * obits + 4 + 5 + task.data.residualOrder * task.data.cbits + 6 + (4 * 1/2)/* << porder */ + pl :
+            task.data.type == Constant ? obits * select(1, task.data.blocksize, pl != task.data.blocksize - task.data.residualOrder) :
+            obits * task.data.blocksize);
+        tasks[get_group_id(0)].data.size = len;
+    }
 }
 
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
 void clChooseBestMethod(
     __global FLACCLSubframeTask *tasks,
-    __global int *residual,
     int taskCount
 )
 {
     int best_length = 0x7fffffff;
     int best_index = 0;
-    __local int partLen[32];
-    __local FLACCLSubframeData task;
     const int tid = get_local_id(0);
-    // fetch part sum
-    if (tid < taskCount)
-        partLen[tid] = residual[tid + taskCount * get_group_id(0)];
-    barrier(CLK_LOCAL_MEM_FENCE);
     for (int taskNo = 0; taskNo < taskCount; taskNo++)
     {
-        // fetch task data
-        if (tid < sizeof(task) / sizeof(int))
-            ((__local int*)&task)[tid] = ((__global int*)(&tasks[taskNo + taskCount * get_group_id(0)].data))[tid];
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
         if (tid == 0)
         {
-            int pl = partLen[taskNo];
-            int obits = task.obits - task.wbits;
-            int len = min(obits * task.blocksize,
-                task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + pl :
-                task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + pl :
-                task.type == Constant ? obits * select(1, task.blocksize, pl != task.blocksize - task.residualOrder) :
-                obits * task.blocksize);
-
-            tasks[taskNo + taskCount * get_group_id(0)].data.size = len;
+            int len = tasks[taskNo + taskCount * get_group_id(0)].data.size;
             if (len < best_length)
             {
                 best_length = len;
                 best_index = taskNo;
             }
         }
-        barrier(CLK_LOCAL_MEM_FENCE);
     }
@@ -686,7 +672,6 @@ void clEncodeResidual(
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    __local int4 * cptr = (__local int4 *)&task.coefs[0];
     int4 cptr0 = vload4(0, &task.coefs[0]);
 #if MAX_ORDER > 4
     int4 cptr1 = vload4(1, &task.coefs[0]);
@@ -813,19 +798,12 @@ void clCalcPartition16(
     if (tid >= ro && tid < 32)
         task.coefs[tid] = 0;
 
-    int k = tid % 16;
+    int k = tid & 15;
     int x = tid / 16;
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
     int4 cptr0 = vload4(0, &task.coefs[0]);
-#if MAX_ORDER > 4
-    int4 cptr1 = vload4(1, &task.coefs[0]);
-#if MAX_ORDER > 8
-    int4 cptr2 = vload4(2, &task.coefs[0]);
-#endif
-#endif
-
     data[tid] = 0;
     for (int pos = 0; pos < bs; pos += GROUP_SIZE)
     {
@@ -839,9 +817,9 @@
         __local int* dptr = &data[tid + GROUP_SIZE - ro];
         int4 sum = cptr0 * vload4(0, dptr)
 #if MAX_ORDER > 4
-            + cptr1 * vload4(1, dptr)
+            + vload4(1, &task.coefs[0]) * vload4(1, dptr)
 #if MAX_ORDER > 8
-            + cptr2 * vload4(2, dptr)
+            + vload4(2, &task.coefs[0]) * vload4(2, dptr)
 #if MAX_ORDER > 12
             + vload4(3, &task.coefs[0]) * vload4(3, dptr)
 #if MAX_ORDER > 16
@@ -872,8 +850,8 @@
         data[tid] = nextData;
 
         // calc number of unary bits for each residual sample with each rice paramater
-        __local int4 * chunk = (__local int4 *)&res[x << 4];
-        sum = (chunk[0] >> k) + (chunk[1] >> k) + (chunk[2] >> k) + (chunk[3] >> k);
+        __local int * chunk = &res[x << 4];
+        sum = (vload4(0,chunk) >> k) + (vload4(1,chunk) >> k) + (vload4(2,chunk) >> k) + (vload4(3,chunk) >> k);
         s = sum.x + sum.y + sum.z + sum.w;
         const int lpos = (15 << (max_porder + 1)) * get_group_id(0) + (k << (max_porder + 1)) + offs / 16;
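With the estimate now written by clEstimateResidual, clChooseBestMethod is reduced to a minimum scan over task.data.size. For readability, the estimate restated as plain C; the header constants come from the hunks above, (4 * 1/2) evaluates to 2 in integer arithmetic, and the enum ordering is assumed for illustration only:

    enum SubframeType { Constant, Verbatim, Fixed, LPC }; /* assumed ordering */

    static int subframe_len_est(enum SubframeType type, int bs, int ro,
                                int obits, int cbits, int pl)
    {
        int len;
        switch (type) {
        case Fixed:    /* warm-up samples + header + rice bits + residual sum */
            len = ro * obits + 6 + 2 + pl; break;
        case LPC:      /* as Fixed, plus shift/precision fields and coefficients */
            len = ro * obits + 4 + 5 + ro * cbits + 6 + 2 + pl; break;
        case Constant: /* one sample unless the residual proves it non-constant */
            len = (pl != bs - ro) ? obits * bs : obits; break;
        default:       /* Verbatim */
            len = obits * bs; break;
        }
        return len < obits * bs ? len : obits * bs; /* the kernel's min() clamp */
    }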
diff --git a/CUETools.Codecs.FLACCL/flaccpu.cl b/CUETools.Codecs.FLACCL/flaccpu.cl
index 5eaa5fd..a5397fc 100644
--- a/CUETools.Codecs.FLACCL/flaccpu.cl
+++ b/CUETools.Codecs.FLACCL/flaccpu.cl
@@ -329,7 +329,6 @@ void clQuantizeLPC(
 __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(1, 1, 1)))
 void clEstimateResidual(
-    __global int*output,
     __global int*samples,
     __global FLACCLSubframeTask *tasks
 )
@@ -412,13 +411,18 @@ void clEstimateResidual(
         int k = clamp(clz(1 << (12 - EPO)) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
         total += (k << (12 - EPO)) + (res >> k);
     }
-    output[get_group_id(0)] = min(0x7ffffff, total) + (bs - ro);
+    int partLen = min(0x7ffffff, total) + (bs - ro);
+    int obits = task.data.obits - task.data.wbits;
+    tasks[get_group_id(0)].data.size = min(obits * bs,
+        task.data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen :
+        task.data.type == LPC ? ro * obits + 4 + 5 + ro * task.data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
+        task.data.type == Constant ? obits * select(1, bs, partLen != bs - ro) :
+        obits * bs);
 }
 
 __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
 void clChooseBestMethod(
     __global FLACCLSubframeTask *tasks,
-    __global int *residual,
     int taskCount
 )
 {
@@ -426,19 +430,7 @@ void clChooseBestMethod(
     int best_no = 0;
     for (int taskNo = 0; taskNo < taskCount; taskNo++)
     {
-        // fetch task data
-        __global FLACCLSubframeTask* ptask = tasks + taskNo + taskCount * get_group_id(0);
-        // fetch part sum
-        int partLen = residual[taskNo + taskCount * get_group_id(0)];
-        int obits = ptask->data.obits - ptask->data.wbits;
-        int bs = ptask->data.blocksize;
-        int ro = ptask->data.residualOrder;
-        int len = min(obits * bs,
-            ptask->data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen :
-            ptask->data.type == LPC ? ro * obits + 4 + 5 + ro * ptask->data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
-            ptask->data.type == Constant ? obits * select(1, bs, partLen != bs - ro) :
-            obits * bs);
-        ptask->data.size = len;
+        int len = tasks[taskNo + taskCount * get_group_id(0)].data.size;
         if (len < best_length)
         {
             best_length = len;
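flaccpu.cl mirrors the same restructuring for the CPU driver. Its Rice parameter guess is the clz() expression in the hunk above, which is floor(log2(res)) shifted by the partition scaling; a portable reference, assuming a gcc/clang-style builtin and res > 0:

    /* clz(1 << shift) - clz(res) == floor(log2(res)) - shift, clamped to [0, 14]. */
    static int rice_k_est(unsigned res, int shift /* = 12 - EPO */)
    {
        int log2res = res ? 31 - __builtin_clz(res) : 0;
        int k = log2res - shift;
        return k < 0 ? 0 : (k > 14 ? 14 : k);
    }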