From ac35093c52aecb42af4ae034dfb07d6e8926e2a1 Mon Sep 17 00:00:00 2001 From: chudov Date: Fri, 19 Nov 2010 07:35:43 +0000 Subject: [PATCH] Intel OpenCL --- CUETools.Codecs.FLACCL/FLACCLWriter.cs | 1 + CUETools.Codecs.FLACCL/flac.cl | 63 ++++++++++++++++---------- 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index fab3352..e330008 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -1593,6 +1593,7 @@ namespace CUETools.Codecs.FLACCL #if DEBUG "#define DEBUG\n" + #endif + (_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") + _settings.Defines + "\n"; // The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc OCLMan.BuildOptions = ""; diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index f479e00..c7edd91 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -20,7 +20,7 @@ #ifndef _FLACCL_KERNEL_H_ #define _FLACCL_KERNEL_H_ -#if defined(__Cedar__) || defined(__Redwood__) || defined(__Juniper__) || defined(__Cypress__) || defined(__ATI_RV770__) || defined(__ATI_RV730__) || defined(__ATI_RV710__) +#if defined(__Cedar__) || defined(__Redwood__) || defined(__Juniper__) || defined(__Cypress__) || defined(__ATI_RV770__) || defined(__ATI_RV730__) || defined(__ATI_RV710__) || defined(__CPU__) #define AMD #endif @@ -152,7 +152,7 @@ __kernel void clChannelDecorr2( #define __ffs(a) (32 - clz(a & (-a))) //#define __ffs(a) (33 - clz(~a & (a - 1))) -#ifdef __CPU__ +#ifdef FLACCL_CPU __kernel __attribute__((reqd_work_group_size(1, 1, 1))) void clFindWastedBits( __global FLACCLSubframeTask *tasks, @@ -228,7 +228,7 @@ void clFindWastedBits( } #endif -#ifdef __CPU__ +#ifdef FLACCL_CPU #define TEMPBLOCK 128 #define STORE_AC(ro, val) if (ro <= MAX_ORDER) pout[ro] = val; #define STORE_AC4(ro, val) STORE_AC(ro*4+0, val##ro.x) STORE_AC(ro*4+1, val##ro.y) STORE_AC(ro*4+2, val##ro.z) STORE_AC(ro*4+3, val##ro.w) @@ -381,7 +381,7 @@ void clComputeAutocor( } #endif -#ifdef __CPU__ +#ifdef FLACCL_CPU __kernel __attribute__((reqd_work_group_size(1, 1, 1))) void clComputeLPC( __global float *pautoc, @@ -525,7 +525,7 @@ void clComputeLPC( } #endif -#ifdef __CPU__ +#ifdef FLACCL_CPU __kernel __attribute__((reqd_work_group_size(1, 1, 1))) void clQuantizeLPC( __global FLACCLSubframeTask *tasks, @@ -545,7 +545,7 @@ void clQuantizeLPC( // Load prediction error estimates based on Akaike's Criteria for (int tid = 0; tid < MAX_ORDER; tid++) { - error[tid] = bs * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log(bs); + error[tid] = bs * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)bs); best_orders[tid] = tid; } @@ -588,7 +588,7 @@ void clQuantizeLPC( { float lpc = lpcs[lpcOffs + order * 32 + tid]; // quantize coeffs with given shift - int c = convert_int_rte(clamp(lpc * (1 << shift), -1 << (cbits - 1), 1 << (cbits - 1))); + int c = convert_int_rte(clamp(lpc * (1 << shift), (float)(-1 << (cbits - 1)), (float)(1 << (cbits - 1)))); // remove sign bits tmpi |= c ^ (c >> 31); tasks[taskNo].coefs[tid] = c; @@ -764,7 +764,7 @@ void clQuantizeLPC( } #endif -#ifdef __CPU__ +#ifdef FLACCL_CPU inline int calc_residual(__global int *ptr, int * coefs, int ro) { int sum = 0; @@ -796,7 +796,7 @@ inline int calc_residual(__global int *ptr, int * coefs, int ro) default: ENCODE_N(ro, action) \ } -__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(1, 1, 1))) +__kernel __attribute__(( vec_type_hint (int4))) __attribute__((reqd_work_group_size(1, 1, 1))) void clEstimateResidual( __global int*samples, __global int*selectedTasks, @@ -811,13 +811,30 @@ void clEstimateResidual( int len[1 << EPO]; // blocksize / 64!!!! __global int *data = &samples[task.data.samplesOffs]; - // for (int i = ro; i < 32; i++) - //task.coefs[i] = 0; + for (int i = ro; i < 32; i++) + task.coefs[i] = 0; for (int i = 0; i < 1 << EPO; i++) len[i] = 0; +#ifdef AMD SWITCH_N((t = clamp(t, -0x7fffff, 0x7fffff), len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31))) +#else + int4 c0 = vload4(0, &task.coefs[0]); + int4 c1 = vload4(1, &task.coefs[0]); + int4 c2 = vload4(2, &task.coefs[0]); + for (int pos = ro; pos < bs; pos ++) + { + __global int * dptr = data + pos - ro; + int4 sum + = c0 * vload4(0, dptr) + + c1 * vload4(1, dptr) + + c2 * vload4(2, dptr); + int t = (data[pos] - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift)) >> task.data.wbits; + t = iclamp(t, -0x7fffff, 0x7fffff); + len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31); + } +#endif int total = 0; for (int i = 0; i < 1 << EPO; i++) { @@ -1052,7 +1069,7 @@ void clChooseBestMethod( } #ifdef DO_PARTITIONS -#ifdef __CPU__ +#ifdef FLACCL_CPU // get_group_id(0) == task index __kernel __attribute__((reqd_work_group_size(1, 1, 1))) void clEncodeResidual( @@ -1144,7 +1161,7 @@ void clEncodeResidual( } #endif -#ifdef __CPU__ +#ifdef FLACCL_CPU __kernel __attribute__((reqd_work_group_size(1, 1, 1))) void clCalcPartition( __global int *partition_lengths, @@ -1232,7 +1249,7 @@ void clCalcPartition( } #endif -#ifdef __CPU__ +#ifdef FLACCL_CPU // get_group_id(0) == task index __kernel __attribute__((reqd_work_group_size(1, 1, 1))) void clCalcPartition16( @@ -1346,7 +1363,7 @@ void clCalcPartition16( } #endif -#ifdef __CPU__ +#ifdef FLACCL_CPU // Sums partition lengths for a certain k == get_group_id(0) // get_group_id(0) == k // get_group_id(1) == task index @@ -1405,7 +1422,7 @@ void clSumPartition( } #endif -#ifdef __CPU__ +#ifdef FLACCL_CPU // Finds optimal rice parameter for each partition. // get_group_id(0) == task index __kernel __attribute__((reqd_work_group_size(1, 1, 1))) @@ -1469,7 +1486,7 @@ void clFindRiceParameter( } #endif -#ifdef __CPU__ +#ifdef FLACCL_CPU // get_group_id(0) == task index __kernel __attribute__((reqd_work_group_size(1, 1, 1))) void clFindPartitionOrder( @@ -1603,10 +1620,10 @@ void clFindPartitionOrder( #endif #ifdef DO_RICE -#ifdef __CPU__ +#ifdef FLACCL_CPU typedef struct BitWriter_t { - __global int *buffer; + __global unsigned int *buffer; unsigned int bit_buf; int bit_left; int buf_ptr; @@ -1695,7 +1712,7 @@ void clRiceEncoding( int max_porder ) { -#ifdef __CPU__ +#ifdef FLACCL_CPU __global FLACCLSubframeTask* task = tasks + get_group_id(0); if (task->data.type == Fixed || task->data.type == LPC) { @@ -1804,7 +1821,7 @@ void clRiceEncoding( int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs); mypos[tid] = mylen; // Inclusive scan(+) -#if 0 +#if 1 int lane = (tid & (WARP_SIZE - 1)); for (int offset = 1; offset < WARP_SIZE; offset <<= 1) mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)]; @@ -1863,9 +1880,9 @@ void clRiceEncoding( start = mypos[GROUP_SIZE - 1]; barrier(CLK_LOCAL_MEM_FENCE); unsigned int bb = data[tid]; -// bb = (bb >> 24) | ((bb >> 8) & 0xff00U) | ((bb << 8) & 0xff0000U) | (bb << 24); + bb = (bb >> 24) | ((bb >> 8) & 0xff00U) | ((bb << 8) & 0xff0000U) | (bb << 24); if ((start32 + tid) * 32 <= start) - output[start32 + tid] = 0U; + output[start32 + tid] = bb; unsigned int remainder = data[start / 32 - start32]; barrier(CLK_LOCAL_MEM_FENCE); data[tid] = select(0U, remainder, tid == 0);