diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
index 65b5cf2..bbd537b 100644
--- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs
+++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs
@@ -141,7 +141,6 @@ namespace CUETools.Codecs.FLACCL
 		public const int MAX_BLOCKSIZE = 4096 * 16;
 		internal const int maxFrames = 128;
-		internal const int maxAutocorParts = (MAX_BLOCKSIZE + 255) / 256;
 
 		public FLACCLWriter(string path, Stream IO, AudioPCMConfig pcm)
 		{
@@ -1116,12 +1115,6 @@ namespace CUETools.Codecs.FLACCL
 			if (task.frameSize <= 4)
 				return;
 
-			//int autocorPartSize = (2 * 256 - eparams.max_prediction_order) & ~15;
-			int autocorPartSize = 32 * 7;
-			int autocorPartCount = (task.frameSize + autocorPartSize - 1) / autocorPartSize;
-			if (autocorPartCount > maxAutocorParts)
-				throw new Exception("internal error");
-
 			int max_porder = get_max_p_order(eparams.max_partition_order, task.frameSize, eparams.max_prediction_order);
 			int calcPartitionPartSize = task.frameSize >> max_porder;
 			while (calcPartitionPartSize < 16 && max_porder > 0)
@@ -1140,12 +1133,10 @@ namespace CUETools.Codecs.FLACCL
 			cudaChannelDecorr.SetArg(2, (uint)MAX_BLOCKSIZE);
 
 			task.cudaComputeLPC.SetArg(0, task.cudaResidualTasks);
-			task.cudaComputeLPC.SetArg(1, (uint)task.nResidualTasksPerChannel);
-			task.cudaComputeLPC.SetArg(2, task.cudaAutocorOutput);
-			task.cudaComputeLPC.SetArg(3, (uint)eparams.max_prediction_order);
-			task.cudaComputeLPC.SetArg(4, task.cudaLPCData);
-			task.cudaComputeLPC.SetArg(5, (uint)_windowcount);
-			task.cudaComputeLPC.SetArg(6, (uint)autocorPartCount);
+			task.cudaComputeLPC.SetArg(1, task.cudaAutocorOutput);
+			task.cudaComputeLPC.SetArg(2, task.cudaLPCData);
+			task.cudaComputeLPC.SetArg(3, (uint)task.nResidualTasksPerChannel);
+			task.cudaComputeLPC.SetArg(4, (uint)_windowcount);
 
 			//task.cudaComputeLPCLattice.SetArg(0, task.cudaResidualTasks);
 			//task.cudaComputeLPCLattice.SetArg(1, (uint)task.nResidualTasksPerChannel);
@@ -1156,12 +1147,11 @@
 			//cuda.SetFunctionBlockShape(task.cudaComputeLPCLattice, 256, 1, 1);
 
 			task.cudaQuantizeLPC.SetArg(0, task.cudaResidualTasks);
-			task.cudaQuantizeLPC.SetArg(1, (uint)task.nResidualTasksPerChannel);
-			task.cudaQuantizeLPC.SetArg(2, (uint)task.nTasksPerWindow);
-			task.cudaQuantizeLPC.SetArg(3, task.cudaLPCData);
-			task.cudaQuantizeLPC.SetArg(4, (uint)eparams.max_prediction_order);
-			task.cudaQuantizeLPC.SetArg(5, (uint)eparams.lpc_min_precision_search);
-			task.cudaQuantizeLPC.SetArg(6, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search));
+			task.cudaQuantizeLPC.SetArg(1, task.cudaLPCData);
+			task.cudaQuantizeLPC.SetArg(2, (uint)task.nResidualTasksPerChannel);
+			task.cudaQuantizeLPC.SetArg(3, (uint)task.nTasksPerWindow);
+			task.cudaQuantizeLPC.SetArg(4, (uint)eparams.lpc_min_precision_search);
+			task.cudaQuantizeLPC.SetArg(5, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search));
 
 			task.cudaCopyBestMethod.SetArg(0, task.cudaBestResidualTasks);
 			task.cudaCopyBestMethod.SetArg(1, task.cudaResidualTasks);
@@ -1216,7 +1206,7 @@ namespace CUETools.Codecs.FLACCL
 			// geometry???
 			task.openCLCQ.EnqueueBarrier();
-			task.EnqueueComputeAutocor(autocorPartCount, channelsCount, cudaWindow, eparams.max_prediction_order);
+			task.EnqueueComputeAutocor(channelsCount, cudaWindow, eparams.max_prediction_order);
 			//float* autoc = stackalloc float[1024];
 			//task.openCLCQ.EnqueueBarrier();
@@ -1524,6 +1514,7 @@ namespace CUETools.Codecs.FLACCL
 			if (OpenCL.NumberOfPlatforms < 1)
 				throw new Exception("no opencl platforms found");
 
+			int groupSize = 64;
 			OCLMan = new OpenCLManager();
 			// Attempt to save binaries after compilation, as well as load precompiled binaries
 			// to avoid compilation. Usually you'll want this to be true.
@@ -1543,7 +1534,9 @@ namespace CUETools.Codecs.FLACCL
 			OCLMan.RequireImageSupport = false;
 			// The Defines string gets prepended to any and all sources that are compiled
 			// and serve as a convenient way to pass configuration information to the compilation process
-			OCLMan.Defines = "#define MAX_ORDER " + eparams.max_prediction_order.ToString();
+			OCLMan.Defines =
+				"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
+				"#define GROUP_SIZE " + groupSize.ToString() + "\n";
 			// The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
 			OCLMan.BuildOptions = "";
@@ -1596,13 +1589,13 @@ namespace CUETools.Codecs.FLACCL
 			if (_IO.CanSeek)
 				first_frame_offset = _IO.Position;
 
-			task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
-			task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
+			task1 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize);
+			task2 = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize);
 			if (_settings.CPUThreads > 0)
 			{
 				cpu_tasks = new FLACCLTask[_settings.CPUThreads];
 				for (int i = 0; i < cpu_tasks.Length; i++)
-					cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
+					cpu_tasks[i] = new FLACCLTask(openCLProgram, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify, groupSize);
 			}
 			cudaWindow = openCLProgram.Context.CreateBuffer(MemFlags.READ_ONLY, sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);
@@ -2276,8 +2269,11 @@ namespace CUETools.Codecs.FLACCL
 		public bool done = false;
 		public bool exit = false;
 
-		unsafe public FLACCLTask(Program _openCLProgram, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify)
+		public int groupSize = 128;
+
+		unsafe public FLACCLTask(Program _openCLProgram, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify, int groupSize)
 		{
+			this.groupSize = groupSize;
 			openCLProgram = _openCLProgram;
 			Device[] openCLDevices = openCLProgram.Context.Platform.QueryDevices(DeviceType.GPU);
 			openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], CommandQueueProperties.PROFILING_ENABLE);
@@ -2296,7 +2292,7 @@ namespace CUETools.Codecs.FLACCL
 			cudaPartitions = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, partitionsLen);
 			cudaRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, riceParamsLen);
 			cudaBestRiceParams = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceParamsLen / 4);
-			cudaAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FLACCLWriter.maxAutocorParts + FLACCLWriter.maxFrames));
+			cudaAutocorOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FLACCLWriter.maxFrames);
 			cudaResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, residualTasksLen);
 			cudaBestResidualTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, bestResidualTasksLen);
 			cudaResidualOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE, sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FLACCLWriter.maxResidualParts*/ * FLACCLWriter.maxFrames);
@@ -2397,36 +2393,22 @@ namespace CUETools.Codecs.FLACCL
 			cudaFindWastedBits.SetArg(1, cudaSamples);
 			cudaFindWastedBits.SetArg(2, nResidualTasksPerChannel);
 
-			int workX = 128; // 256
 			int grpX = frameCount * channelsCount;
-			//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 128 }, new int[] { 128 });
-			//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 128 }, null);
-			openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * workX }, new int[] { workX });
-
-			//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { 256 * 128 }, new int[] { 128 });
-			//openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * workX }, null);
-			//cuda.SetFunctionBlockShape(task.cudaFindWastedBits, 256, 1, 1);
-			//cuda.LaunchAsync(task.cudaFindWastedBits, channelsCount * task.frameCount, 1, task.stream);
+			openCLCQ.EnqueueNDRangeKernel(cudaFindWastedBits, 1, null, new int[] { grpX * groupSize }, new int[] { groupSize });
 		}
 
-		public void EnqueueComputeAutocor(int autocorPartCount, int channelsCount, Mem cudaWindow, int max_prediction_order)
+		public void EnqueueComputeAutocor(int channelsCount, Mem cudaWindow, int max_prediction_order)
 		{
 			cudaComputeAutocor.SetArg(0, cudaAutocorOutput);
 			cudaComputeAutocor.SetArg(1, cudaSamples);
 			cudaComputeAutocor.SetArg(2, cudaWindow);
 			cudaComputeAutocor.SetArg(3, cudaResidualTasks);
-			cudaComputeAutocor.SetArg(4, max_prediction_order);
-			cudaComputeAutocor.SetArg(5, (uint)nAutocorTasksPerChannel - 1);
-			cudaComputeAutocor.SetArg(6, (uint)nResidualTasksPerChannel);
-			int workX = autocorPartCount;
+			cudaComputeAutocor.SetArg(4, (uint)nAutocorTasksPerChannel - 1);
+			cudaComputeAutocor.SetArg(5, (uint)nResidualTasksPerChannel);
+
+			int workX = max_prediction_order / 4 + 1;
 			int workY = nAutocorTasksPerChannel * channelsCount * frameCount;
-			int ws = 32;
-			int wy = 4;
-			openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * ws, workY * wy }, new int[] { ws, wy });
-			//openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 3, null, new int[] { workX * ws, workY, max_prediction_order + 1 }, new int[] { ws, 1, 1 });
-
-			//cuda.SetFunctionBlockShape(task.cudaComputeAutocor, 32, 8, 1);
-			//cuda.LaunchAsync(task.cudaComputeAutocor, autocorPartCount, task.nAutocorTasksPerChannel * channelsCount * task.frameCount, task.stream);
+			openCLCQ.EnqueueNDRangeKernel(cudaComputeAutocor, 2, null, new int[] { workX * groupSize, workY }, new int[] { groupSize, 1 });
 		}
 
 		public void EnqueueEstimateResidual(int channelsCount, int max_prediction_order)
@@ -2435,11 +2417,8 @@
 		{
 			cudaEstimateResidual.SetArg(0, cudaResidualOutput);
 			cudaEstimateResidual.SetArg(1, cudaSamples);
 			cudaEstimateResidual.SetArg(2, cudaResidualTasks);
 
-			int threads_x = 128;
-			int workX = threads_x;
-			int workY = nResidualTasksPerChannel * channelsCount * frameCount;
-
-			openCLCQ.EnqueueNDRangeKernel(cudaEstimateResidual, 2, null, new int[] { workX, workY }, new int[] { threads_x, 1 });
+			int work = nResidualTasksPerChannel * channelsCount * frameCount;
+			openCLCQ.EnqueueNDRangeKernel(cudaEstimateResidual, 1, null, new int[] { groupSize * work }, new int[] { groupSize });
 		}
 
 		public void EnqueueChooseBestMethod(int channelsCount)
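For reference, the host-side convention these changes follow, sketched below and not part of the patch itself: a single groupSize chosen at startup is compiled into flac.cl as GROUP_SIZE through OCLMan.Defines, and every NDRange launch then uses that same value as its local work size so it matches the kernels' reqd_work_group_size attribute. The class and method names here are illustrative only, assuming the OpenCLNet types already used in FLACCLWriter.cs.

// Illustrative sketch (hypothetical helper); the compile-time define and the
// runtime local work size must be the same number, or clEnqueueNDRangeKernel fails.
using OpenCLNet;

static class GroupSizeSketch
{
	public static string BuildDefines(int maxOrder, int groupSize)
	{
		// Prepended to flac.cl before compilation, mirroring OCLMan.Defines above.
		return "#define MAX_ORDER " + maxOrder.ToString() + "\n" +
			"#define GROUP_SIZE " + groupSize.ToString() + "\n";
	}

	public static void EnqueueOneGroupPerTask(CommandQueue queue, Kernel kernel, int taskCount, int groupSize)
	{
		// One work-group of groupSize work-items per task; the global size is a
		// multiple of the local size, exactly as in EnqueueEstimateResidual above.
		queue.EnqueueNDRangeKernel(kernel, 1, null, new int[] { taskCount * groupSize }, new int[] { groupSize });
	}
}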
diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl
index 6c30b50..47257c5 100644
--- a/CUETools.Codecs.FLACCL/flac.cl
+++ b/CUETools.Codecs.FLACCL/flac.cl
@@ -50,11 +50,7 @@ typedef struct
 typedef struct
 {
 	FLACCLSubframeData data;
-	union
-	{
-		int coefs[32]; // fixme: should be short?
-		int4 coefs4[8];
-	};
+	int coefs[32]; // fixme: should be short?
 } FLACCLSubframeTask;
 
 __kernel void cudaStereoDecorr(
@@ -103,15 +99,15 @@ __kernel void cudaChannelDecorr2(
 #define __ffs(a) (32 - clz(a & (-a)))
 //#define __ffs(a) (33 - clz(~a & (a - 1)))
 
-__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
+__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaFindWastedBits(
 	__global FLACCLSubframeTask *tasks,
 	__global int *samples,
 	int tasksPerChannel
 )
 {
-	__local volatile int wbits[128];
-	__local volatile int abits[128];
+	__local int abits[GROUP_SIZE];
+	__local int wbits[GROUP_SIZE];
 	__local FLACCLSubframeData task;
 
 	int tid = get_local_id(0);
@@ -129,12 +125,6 @@ void cudaFindWastedBits(
 	wbits[tid] = w;
 	abits[tid] = a;
 	barrier(CLK_LOCAL_MEM_FENCE);
-	//atom_or(shared.wbits, shared.wbits[tid]);
-	//atom_or(shared.abits, shared.abits[tid]);
-	//SUM256(shared.wbits, tid, |=);
-	//SUM256(shared.abits, tid, |=);
-	//SUM128(wbits, tid, |=);
-	//SUM128(abits, tid, |=);
 
 	for (int s = get_local_size(0) / 2; s > 0; s >>= 1)
 	{
@@ -146,160 +136,81 @@ void cudaFindWastedBits(
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
-	if (tid == 0)
-		task.wbits = max(0,__ffs(wbits[0]) - 1);
-	if (tid == 0)
-		task.abits = 32 - clz(abits[0]) - task.wbits;
-	// if (tid == 0)
-		//task.wbits = get_num_groups(0);
-	// if (tid == 0)
-		//task.abits = get_local_size(0);
-
-	barrier(CLK_LOCAL_MEM_FENCE);
-
+	w = max(0,__ffs(wbits[0]) - 1);
+	a = 32 - clz(abits[0]) - w;
 	if (tid < tasksPerChannel)
-		tasks[get_group_id(0) * tasksPerChannel + tid].data.wbits = task.wbits;
+		tasks[get_group_id(0) * tasksPerChannel + tid].data.wbits = w;
 	if (tid < tasksPerChannel)
-		tasks[get_group_id(0) * tasksPerChannel + tid].data.abits = task.abits;
+		tasks[get_group_id(0) * tasksPerChannel + tid].data.abits = a;
 }
 
-//__kernel __attribute__((reqd_work_group_size(32, 4, 1)))
-//void cudaComputeAutocor(
-//	__global float *output,
-//	__global const int *samples,
-//	__global const float *window,
-//	__global FLACCLSubframeTask *tasks,
-//	const int max_order, // should be <= 32
-//	const int windowCount, // windows (log2: 0,1)
-//	const int taskCount // tasks per block
-//)
-//{
-//	__local struct {
-//		float data[256];
-//		volatile float product[128];
-//		FLACCLSubframeData task;
-//		volatile int dataPos;
-//		volatile int dataLen;
-//	} shared;
-//	const int tid = get_local_id(0) + get_local_id(1) * 32;
-//	// fetch task data
-//	if (tid < sizeof(shared.task) / sizeof(int))
-//		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
-//	if (tid == 0)
-//	{
-//		shared.dataPos = get_group_id(0) * 7 * 32;
-//		shared.dataLen = min(shared.task.blocksize - shared.dataPos, 7 * 32 + max_order);
-//	}
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	// fetch samples
-//	shared.data[tid] = tid < shared.dataLen ? samples[tid] * window[tid]: 0.0f;
-//	int tid2 = tid + 128;
-//	shared.data[tid2] = tid2 < shared.dataLen ? samples[tid2] * window[tid2]: 0.0f;
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	for (int lag = 0; lag <= max_order; lag ++)
-//	{
-//		if (lag <= 12)
-//			shared.product[tid] = 0.0f;
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//	}
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	if (tid <= max_order)
-//		output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.product[tid];
-//}
-
-__kernel __attribute__((reqd_work_group_size(32, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaComputeAutocor(
 	__global float *output,
 	__global const int *samples,
 	__global const float *window,
 	__global FLACCLSubframeTask *tasks,
-	const int max_order, // should be <= 32
 	const int windowCount, // windows (log2: 0,1)
 	const int taskCount // tasks per block
 )
 {
-	__local struct {
-		float data[256];
-		volatile float product[128];
-		FLACCLSubframeData task;
-		volatile float result[33];
-		volatile int dataPos;
-		volatile int dataLen;
-		volatile int windowOffs;
-		volatile int samplesOffs;
-		//volatile int resultOffs;
-	} shared;
-	const int tid = get_local_id(0) + get_local_id(1) * 32;
+	__local float data[GROUP_SIZE * 2];
+	__local float product[GROUP_SIZE];
+	__local FLACCLSubframeData task;
+	const int tid = get_local_id(0);
 	// fetch task data
-	if (tid < sizeof(shared.task) / sizeof(int))
-		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
-	if (tid == 0)
-	{
-		shared.dataPos = get_group_id(0) * 7 * 32;
-		shared.windowOffs = (get_group_id(1) & ((1 << windowCount)-1)) * shared.task.blocksize + shared.dataPos;
-		shared.samplesOffs = shared.task.samplesOffs + shared.dataPos;
-		shared.dataLen = min(shared.task.blocksize - shared.dataPos, 7 * 32 + max_order);
-	}
-	//if (tid == 32)
-		//shared.resultOffs = __mul24(get_group_id(0) + __mul24(get_group_id(1), get_num_groups(0)), max_order + 1);
+	if (tid < sizeof(task) / sizeof(int))
+		((__local int*)&task)[tid] = ((__global int*)(tasks + taskCount * (get_group_id(1) >> windowCount)))[tid];
 	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int bs = task.blocksize;
+	int windowOffs = (get_group_id(1) & ((1 << windowCount)-1)) * bs;
 
-	// fetch samples
-	shared.data[tid] = tid < shared.dataLen ? samples[shared.samplesOffs + tid] * window[shared.windowOffs + tid]: 0.0f;
-	int tid2 = tid + 128;
-	shared.data[tid2] = tid2 < shared.dataLen ? samples[shared.samplesOffs + tid2] * window[shared.windowOffs + tid2]: 0.0f;
-	barrier(CLK_LOCAL_MEM_FENCE);
+	data[tid] = tid < bs ? samples[task.samplesOffs + tid] * window[windowOffs + tid] : 0.0f;
-	const int ptr = get_local_id(0) * 7;
-	//if (get_local_id(1) == 0) for (int lag = 0; lag <= max_order; lag ++)
-	//for (int lag = get_local_id(1); lag <= max_order; lag += get_local_size(1))
-	for (int lag0 = 0; lag0 <= max_order; lag0 += get_local_size(1))
+	int tid0 = tid % (GROUP_SIZE >> 2);
+	int tid1 = tid / (GROUP_SIZE >> 2);
+	int lag0 = get_group_id(0) * 4;
+	__local float4 * dptr = ((__local float4 *)&data[0]) + tid0;
+	__local float4 * dptr1 = ((__local float4 *)&data[lag0 + tid1]) + tid0;
+
+	float prod = 0.0f;
+	for (int pos = 0; pos < bs; pos += GROUP_SIZE)
 	{
-		////const int productLen = min(shared.task.blocksize - get_group_id(0) * partSize - lag, partSize);
-		const int lag = lag0 + get_local_id(1);
-		const int ptr2 = ptr + lag;
-		shared.product[tid] =
-			shared.data[ptr + 0] * shared.data[ptr2 + 0] +
-			shared.data[ptr + 1] * shared.data[ptr2 + 1] +
-			shared.data[ptr + 2] * shared.data[ptr2 + 2] +
-			shared.data[ptr + 3] * shared.data[ptr2 + 3] +
-			shared.data[ptr + 4] * shared.data[ptr2 + 4] +
-			shared.data[ptr + 5] * shared.data[ptr2 + 5] +
-			shared.data[ptr + 6] * shared.data[ptr2 + 6];
+		// fetch samples
+		float nextData = pos + tid + GROUP_SIZE < bs ? samples[task.samplesOffs + pos + tid + GROUP_SIZE] * window[windowOffs + pos + tid + GROUP_SIZE] : 0.0f;
+		data[tid + GROUP_SIZE] = nextData;
 		barrier(CLK_LOCAL_MEM_FENCE);
-		for (int l = 16; l > 1; l >>= 1)
-		{
-			if (get_local_id(0) < l)
-				shared.product[tid] = shared.product[tid] + shared.product[tid + l];
-			barrier(CLK_LOCAL_MEM_FENCE);
-		}
-		// return results
-		if (get_local_id(0) == 0 && lag <= max_order)
-			shared.result[lag] = shared.product[tid] + shared.product[tid + 1];
+
+		prod += dot(*dptr, *dptr1);
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		data[tid] = nextData;
+	}
+	product[tid] = prod;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	for (int l = (GROUP_SIZE >> 3); l > 0; l >>= 1)
+	{
+		if (tid0 < l)
+			product[tid] = product[tid] + product[tid + l];
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
-	if (tid <= max_order)
-		output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.result[tid];
-	//output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.product[tid];
-	//output[(get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) + tid] = shared.windowOffs;
+	if (tid < 4 && tid + lag0 <= MAX_ORDER)
+		output[get_group_id(1) * (MAX_ORDER + 1) + tid + lag0] = product[tid * (GROUP_SIZE >> 2)];
 }
 
 __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
 void cudaComputeLPC(
 	__global FLACCLSubframeTask *tasks,
-	int taskCount, // tasks per block
-	__global float*autoc,
-	int max_order, // should be <= 32
+	__global float *autoc,
 	__global float *lpcs,
-	int windowCount,
-	int partCount
+	int taskCount, // tasks per block
+	int windowCount
 )
 {
 	__local struct {
 		FLACCLSubframeData task;
-		volatile float parts[32];
 		volatile float ldr[32];
 		volatile float gen1[32];
 		volatile float error[32];
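For reference only, not part of the patch: each work-group of the rewritten cudaComputeAutocor now produces four lags of the plain windowed autocorrelation, that is, for lag = 0..MAX_ORDER it accumulates the sum over i of (window[i] * samples[i]) * (window[i + lag] * samples[i + lag]) across the whole block, using float4 dot products and a local reduction instead of the old per-part partial sums. A scalar C# equivalent, with hypothetical names, would be:

// Scalar reference for the values written to cudaAutocorOutput (illustrative only).
static float[] AutocorReference(int[] samples, float[] window, int blocksize, int maxOrder)
{
	// Windowed input, zero-padded past the end of the block,
	// matching the kernel's "< bs ? ... : 0.0f" loads.
	float[] d = new float[blocksize + maxOrder];
	for (int i = 0; i < blocksize; i++)
		d[i] = samples[i] * window[i];
	float[] autoc = new float[maxOrder + 1];
	for (int lag = 0; lag <= maxOrder; lag++)
		for (int i = 0; i < blocksize; i++)
			autoc[lag] += d[i] * d[i + lag];
	return autoc;
}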
@@ -311,40 +222,19 @@ void cudaComputeLPC(
 	// fetch task data
 	if (tid < sizeof(shared.task) / sizeof(int))
-		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
+		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1)))[tid];
 	if (tid == 0)
 	{
-		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (max_order + 1) * 32;
-		shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) * partCount;
+		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
+		shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 
-	// add up autocorrelation parts
+	if (get_local_id(0) <= MAX_ORDER)
+		shared.autoc[get_local_id(0)] = autoc[shared.autocOffs + get_local_id(0)];
+	if (get_local_id(0) + get_local_size(0) <= MAX_ORDER)
+		shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[shared.autocOffs + get_local_id(0) + get_local_size(0)];
 
-	// for (int order = get_local_id(0); order <= max_order; order += 32)
-	// {
-	//float sum = 0.0f;
-	//for (int pos = 0; pos < partCount; pos++)
-	//	sum += autoc[shared.autocOffs + pos * (max_order + 1) + order];
-	//shared.autoc[order] = sum;
-	// }
-
-	for (int order = 0; order <= max_order; order ++)
-	{
-		float part = 0.0f;
-		for (int pos = get_local_id(0); pos < partCount; pos += get_local_size(0))
-			part += autoc[shared.autocOffs + pos * (max_order + 1) + order];
-		shared.parts[tid] = part;
-		barrier(CLK_LOCAL_MEM_FENCE);
-		for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
-		{
-			if (get_local_id(0) < l)
-				shared.parts[tid] += shared.parts[tid + l];
-			barrier(CLK_LOCAL_MEM_FENCE);
-		}
-		if (get_local_id(0) == 0)
-			shared.autoc[order] = shared.parts[tid] + shared.parts[tid + 1];
-	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	// Compute LPC using Schur and Levinson-Durbin recursion
@@ -352,19 +242,19 @@
 	shared.ldr[get_local_id(0)] = 0.0f;
 	float error = shared.autoc[0];
 	barrier(CLK_LOCAL_MEM_FENCE);
-	for (int order = 0; order < max_order; order++)
+	for (int order = 0; order < MAX_ORDER; order++)
 	{
 		// Schur recursion
 		float reff = -shared.gen1[0] / error;
 		error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
 		float gen1;
-		if (get_local_id(0) < max_order - 1 - order)
+		if (get_local_id(0) < MAX_ORDER - 1 - order)
 		{
 			gen1 = shared.gen1[get_local_id(0) + 1] + reff * gen0;
 			gen0 += shared.gen1[get_local_id(0) + 1] * reff;
 		}
 		barrier(CLK_LOCAL_MEM_FENCE);
-		if (get_local_id(0) < max_order - 1 - order)
+		if (get_local_id(0) < MAX_ORDER - 1 - order)
 			shared.gen1[get_local_id(0)] = gen1;
 
 		// Store prediction error
@@ -385,118 +275,16 @@
 	}
 	barrier(CLK_LOCAL_MEM_FENCE);
 	// Output prediction error estimates
-	if (get_local_id(0) < max_order)
-		lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
+	if (get_local_id(0) < MAX_ORDER)
+		lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
 }
 
-//__kernel void cudaComputeLPCLattice(
-//	FLACCLSubframeTask *tasks,
-//	const int taskCount, // tasks per block
-//	const int *samples,
-//	const int windowCount,
-//	const int max_order, // should be <= 12
-//	float*lpcs
-//)
-//{
-//	__local struct {
-//		volatile FLACCLSubframeData task;
-//		volatile float F[512];
-//		volatile float arp[32];
-//		volatile float tmp[256];
-//		volatile float error[32];
-//		volatile int lpcOffs;
-//	} shared;
-//
-//	// fetch task data
-//	if (get_local_id(0) < sizeof(shared.task) / sizeof(int))
-//		((int*)&shared.task)[get_local_id(0)] = ((int*)(tasks + taskCount * get_group_id(1)))[get_local_id(0)];
-//	if (get_local_id(0) == 0)
-//		shared.lpcOffs = __mul24(__mul24(get_group_id(1) + 1, windowCount) - 1, max_order + 1) * 32;
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	// F = samples; B = samples
-//	float s1 = get_local_id(0) < shared.task.blocksize ? (samples[shared.task.samplesOffs + get_local_id(0)]) / 32768.0f : 0.0f;
-//	float s2 = get_local_id(0) + 256 < shared.task.blocksize ? (samples[shared.task.samplesOffs + get_local_id(0) + 256]) / 32768.0f : 0.0f;
-//	shared.F[get_local_id(0)] = s1;
-//	shared.F[get_local_id(0) + 256] = s2;
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	shared.tmp[get_local_id(0)] = FSQR(s1) + FSQR(s2);
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	SUM256(shared.tmp, get_local_id(0), +=);
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	float DEN = shared.tmp[0];
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//
-//	for (int order = 0; order < max_order; order++)
-//	{
-//		// reff = F(order+1:frameSize) * B(1:frameSize-order)' / DEN
-//		int idxF = get_local_id(0) + order + 1;
-//		int idxF2 = idxF + 256;
-//
-//		shared.tmp[get_local_id(0)] = idxF < shared.task.blocksize ? shared.F[idxF] * s1 : 0.0f;
-//		shared.tmp[get_local_id(0)] += idxF2 < shared.task.blocksize ? shared.F[idxF2] * s2 : 0.0f;
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//		SUM256(shared.tmp, get_local_id(0), +=);
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//		float reff = shared.tmp[0] / DEN;
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//
-//		// arp(order) = rc(order) = reff
-//		if (get_local_id(0) == 0)
-//			shared.arp[order] = reff;
-//			//shared.rc[order - 1] = shared.lpc[order - 1][order - 1] = reff;
-//
-//		// Levinson-Durbin recursion
-//		// arp(1:order-1) = arp(1:order-1) - reff * arp(order-1:-1:1)
-//		if (get_local_id(0) < order)
-//			shared.arp[get_local_id(0)] = shared.arp[get_local_id(0)] - reff * shared.arp[order - 1 - get_local_id(0)];
-//
-//		// Output coeffs
-//		if (get_local_id(0) <= order)
-//			lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = shared.arp[order - get_local_id(0)];
-//
-//		// F1 = F(order+1:frameSize) - reff * B(1:frameSize-order)
-//		// B(1:frameSize-order) = B(1:frameSize-order) - reff * F(order+1:frameSize)
-//		// F(order+1:frameSize) = F1
-//		if (idxF < shared.task.blocksize)
-//		{
-//			float f1 = shared.F[idxF];
-//			shared.F[idxF] -= reff * s1;
-//			s1 -= reff * f1;
-//		}
-//		if (idxF2 < shared.task.blocksize)
-//		{
-//			float f2 = shared.F[idxF2];
-//			shared.F[idxF2] -= reff * s2;
-//			s2 -= reff * f2;
-//		}
-//
-//		// DEN = F(order+1:frameSize) * F(order+1:frameSize)' + B(1:frameSize-order) * B(1:frameSize-order)' (BURG)
-//		shared.tmp[get_local_id(0)] = (idxF + 1 < shared.task.blocksize ? FSQR(shared.F[idxF]) + FSQR(s1) : 0);
-//		shared.tmp[get_local_id(0)] += (idxF2 + 1 < shared.task.blocksize ? FSQR(shared.F[idxF2]) + FSQR(s2) : 0);
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//		SUM256(shared.tmp, get_local_id(0), +=);
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//		DEN = shared.tmp[0] / 2;
-//		// shared.PE[order-1] = shared.tmp[0] / 2 / (frameSize - order + 1);
-//		if (get_local_id(0) == 0)
-//			shared.error[order] = DEN / (shared.task.blocksize - order);
-//		barrier(CLK_LOCAL_MEM_FENCE);
-//	}
-//
-//	// Output prediction error estimates
-//	if (get_local_id(0) < max_order)
-//		lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
-//}
-
 __kernel __attribute__((reqd_work_group_size(32, 4, 1)))
 void cudaQuantizeLPC(
 	__global FLACCLSubframeTask *tasks,
+	__global float*lpcs,
 	int taskCount, // tasks per block
 	int taskCountLPC, // tasks per set of coeffs (<= 32)
-	__global float*lpcs,
-	int max_order, // should be <= 32
 	int minprecision,
 	int precisions
 )
@@ -515,32 +303,30 @@
 	if (tid < sizeof(shared.task) / sizeof(int))
 		((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
 	if (tid == 0)
-		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (max_order + 1) * 32;
+		shared.lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
 	barrier(CLK_LOCAL_MEM_FENCE);
 
+	// Select best orders based on Akaike's Criteria
 	if (get_local_id(1) == 0)
 	{
-		shared.index[get_local_id(0)] = min(max_order - 1, get_local_id(0));
-		shared.error[get_local_id(0)] = shared.task.blocksize * 64 + get_local_id(0);
-		shared.index[32 + get_local_id(0)] = min(max_order - 1, get_local_id(0));
-		shared.error[32 + get_local_id(0)] = shared.task.blocksize * 64 + get_local_id(0);
-
-		// Select best orders based on Akaike's Criteria
+		shared.index[tid] = min(MAX_ORDER - 1, tid);
+		shared.error[tid] = shared.task.blocksize * 64 + tid;
+		shared.index[32 + tid] = min(MAX_ORDER - 1, tid);
+		shared.error[32 + tid] = shared.task.blocksize * 64 + tid;
 
 		// Load prediction error estimates
-		if (get_local_id(0) < max_order)
-			shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)]) + get_local_id(0) * 5.12f * log(shared.task.blocksize);
-			//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + max_order * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
+		if (tid < MAX_ORDER)
+			shared.error[tid] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + tid]) + tid * 5.12f * log(shared.task.blocksize);
+			//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
 	}
-
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	// Sort using bitonic sort
 	for(int size = 2; size < 64; size <<= 1){
 		//Bitonic merge
-		int ddd = (get_local_id(0) & (size / 2)) == 0;
+		int ddd = (tid & (size / 2)) == 0;
 		for(int stride = size / 2; stride > 0; stride >>= 1){
-			int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+			int pos = 2 * tid - (tid & (stride - 1));
 			float e0, e1;
 			int i0, i1;
 			if (get_local_id(1) == 0)
@@ -551,7 +337,7 @@
 			{
 				e0 = shared.error[pos];
 				e1 = shared.error[pos + stride];
 				i0 = shared.index[pos];
 				i1 = shared.index[pos + stride];
 			}
 			barrier(CLK_LOCAL_MEM_FENCE);
-			if ((e0 >= e1) == ddd && get_local_id(1) == 0)
+			if (get_local_id(1) == 0 && (e0 >= e1) == ddd)
 			{
 				shared.error[pos] = e1;
 				shared.error[pos + stride] = e0;
@@ -566,7 +352,7 @@
 	{
 		for(int stride = 32; stride > 0; stride >>= 1){
 			//barrier(CLK_LOCAL_MEM_FENCE);
-			int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
+			int pos = 2 * tid - (tid & (stride - 1));
 			float e0, e1;
 			int i0, i1;
 			if (get_local_id(1) == 0)
@@ -577,7 +363,7 @@
 			{
 				e0 = shared.error[pos];
 				e1 = shared.error[pos + stride];
 				i0 = shared.index[pos];
 				i1 = shared.index[pos + stride];
 			}
 			barrier(CLK_LOCAL_MEM_FENCE);
-			if (e0 >= e1 && get_local_id(1) == 0)
+			if (get_local_id(1) == 0 && e0 >= e1)
 			{
 				shared.error[pos] = e1;
 				shared.error[pos + stride] = e0;
@@ -653,21 +439,21 @@
 	}
 }
 
-__kernel __attribute__(( vec_type_hint (int4)))
+__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
 void cudaEstimateResidual(
 	__global int*output,
 	__global int*samples,
 	__global FLACCLSubframeTask *tasks
 )
 {
-	__local float data[128 * 2];
-	__local int residual[128];
+	__local float data[GROUP_SIZE * 2];
+	__local int residual[GROUP_SIZE];
 	__local FLACCLSubframeTask task;
 	__local float4 coefsf4[8];
 	const int tid = get_local_id(0);
 	if (tid < sizeof(task)/sizeof(int))
-		((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
+		((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
 	barrier(CLK_GLOBAL_MEM_FENCE);
 
 	int ro = task.data.residualOrder;
@@ -677,11 +463,11 @@
 	if (tid < 32)
 		((__local float *)&coefsf4[0])[tid] = select(0.0f, ((float)task.coefs[tid]) / (1 << task.data.shift), tid < ro);
 	data[tid] = tid < bs ? (float)(samples[task.data.samplesOffs + tid] >> task.data.wbits) : 0.0f;
-	for (int pos = 0; pos < bs; pos += get_local_size(0))
+	for (int pos = 0; pos < bs; pos += GROUP_SIZE)
 	{
 		// fetch samples
-		float nextData = pos + tid + get_local_size(0) < bs ? (float)(samples[task.data.samplesOffs + pos + tid + get_local_size(0)] >> task.data.wbits) : 0.0f;
-		data[tid + get_local_size(0)] = nextData;
+		float nextData = pos + tid + GROUP_SIZE < bs ? (float)(samples[task.data.samplesOffs + pos + tid + GROUP_SIZE] >> task.data.wbits) : 0.0f;
+		data[tid + GROUP_SIZE] = nextData;
 		barrier(CLK_LOCAL_MEM_FENCE);
 
 		// compute residual
@@ -718,19 +504,19 @@
 		data[tid] = nextData;
 	}
 
-	int residualLen = (bs - ro) / get_local_size(0) + select(0, 1, tid < (bs - ro) % get_local_size(0));
+	int residualLen = (bs - ro) / GROUP_SIZE + select(0, 1, tid < (bs - ro) % GROUP_SIZE);
 	int k = clamp(convert_int_rtn(log2((res + 0.000001f) / (residualLen + 0.000001f))), 0, 14);
 	residual[tid] = residualLen * (k + 1) + (convert_int_rtz(res) >> k);
 	barrier(CLK_LOCAL_MEM_FENCE);
-	for (int l = get_local_size(0) / 2; l > 0; l >>= 1)
+	for (int l = GROUP_SIZE / 2; l > 0; l >>= 1)
 	{
 		if (tid < l)
 			residual[tid] += residual[tid + l];
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
 	if (tid == 0)
-		output[get_group_id(1)] = residual[0];
+		output[get_group_id(0)] = residual[0];
 }
 
 __kernel void cudaChooseBestMethod(