CPU mode + GPU optimizations

This commit is contained in:
chudov
2010-10-25 04:50:36 +00:00
parent c10a62b17f
commit f619c82ef3
5 changed files with 671 additions and 199 deletions

View File

@@ -65,6 +65,9 @@
</ProjectReference> </ProjectReference>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Content Include="flaccpu.cl">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<None Include="flac.cu"> <None Include="flac.cu">
</None> </None>
<Content Include="flac.cl"> <Content Include="flac.cl">

View File

@@ -33,7 +33,14 @@ namespace CUETools.Codecs.FLACCL
{ {
public class FLACCLWriterSettings public class FLACCLWriterSettings
{ {
public FLACCLWriterSettings() { DoVerify = false; GPUOnly = true; DoMD5 = true; GroupSize = 64; } public FLACCLWriterSettings()
{
this.DoVerify = false;
this.GPUOnly = true;
this.DoMD5 = true;
this.GroupSize = 64;
this.DeviceType = OpenCLDeviceType.GPU;
}
[DefaultValue(false)] [DefaultValue(false)]
[DisplayName("Verify")] [DisplayName("Verify")]
@@ -56,6 +63,13 @@ namespace CUETools.Codecs.FLACCL
[SRDescription(typeof(Properties.Resources), "DescriptionDefines")] [SRDescription(typeof(Properties.Resources), "DescriptionDefines")]
public string Defines { get; set; } public string Defines { get; set; }
[SRDescription(typeof(Properties.Resources), "DescriptionPlatform")]
public string Platform { get; set; }
[DefaultValue(OpenCLDeviceType.GPU)]
[SRDescription(typeof(Properties.Resources), "DescriptionDeviceType")]
public OpenCLDeviceType DeviceType { get; set; }
int cpu_threads = 1; int cpu_threads = 1;
[DefaultValue(1)] [DefaultValue(1)]
[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")] [SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
@@ -74,6 +88,12 @@ namespace CUETools.Codecs.FLACCL
} }
} }
public enum OpenCLDeviceType : ulong
{
CPU = DeviceType.CPU,
GPU = DeviceType.GPU
}
[AudioEncoderClass("FLACCL", "flac", true, "0 1 2 3 4 5 6 7 8 9 10 11", "8", 2, typeof(FLACCLWriterSettings))] [AudioEncoderClass("FLACCL", "flac", true, "0 1 2 3 4 5 6 7 8 9 10 11", "8", 2, typeof(FLACCLWriterSettings))]
//[AudioEncoderClass("FLACCL nonsub", "flac", true, "9 10 11", "9", 1, typeof(FLACCLWriterSettings))] //[AudioEncoderClass("FLACCL nonsub", "flac", true, "9 10 11", "9", 1, typeof(FLACCLWriterSettings))]
public class FLACCLWriter : IAudioDest public class FLACCLWriter : IAudioDest
@@ -223,6 +243,11 @@ namespace CUETools.Codecs.FLACCL
if (value as FLACCLWriterSettings == null) if (value as FLACCLWriterSettings == null)
throw new Exception("Unsupported options " + value); throw new Exception("Unsupported options " + value);
_settings = value as FLACCLWriterSettings; _settings = value as FLACCLWriterSettings;
if (_settings.DeviceType == OpenCLDeviceType.CPU)
{
_settings.GroupSize = 1;
_settings.GPUOnly = false;
}
eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly); eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
} }
} }
@@ -449,7 +474,7 @@ namespace CUETools.Codecs.FLACCL
} }
set set
{ {
if (value < 0 || value > eparams.max_fixed_order) if (value < 0 || value > 4)
throw new Exception("invalid MinFixedOrder " + value.ToString()); throw new Exception("invalid MinFixedOrder " + value.ToString());
eparams.min_fixed_order = value; eparams.min_fixed_order = value;
} }
@@ -463,7 +488,7 @@ namespace CUETools.Codecs.FLACCL
} }
set set
{ {
if (value > 4 || value < eparams.min_fixed_order) if (value > 4 || value < 0)
throw new Exception("invalid MaxFixedOrder " + value.ToString()); throw new Exception("invalid MaxFixedOrder " + value.ToString());
eparams.max_fixed_order = value; eparams.max_fixed_order = value;
} }
@@ -963,7 +988,7 @@ namespace CUETools.Codecs.FLACCL
task.nResidualTasks = 0; task.nResidualTasks = 0;
task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window); task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window);
task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order; task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + (eparams.do_constant ? 1 : 0) + Math.Max(0, 1 + eparams.max_fixed_order - eparams.min_fixed_order);
//if (task.nResidualTasksPerChannel >= 4) //if (task.nResidualTasksPerChannel >= 4)
// task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7; // task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7;
for (int iFrame = 0; iFrame < nFrames; iFrame++) for (int iFrame = 0; iFrame < nFrames; iFrame++)
@@ -1444,7 +1469,7 @@ namespace CUETools.Codecs.FLACCL
if (OpenCL.NumberOfPlatforms < 1) if (OpenCL.NumberOfPlatforms < 1)
throw new Exception("no opencl platforms found"); throw new Exception("no opencl platforms found");
int groupSize = _settings.GroupSize; int groupSize = _settings.DeviceType == OpenCLDeviceType.CPU ? 1 : _settings.GroupSize;
OCLMan = new OpenCLManager(); OCLMan = new OpenCLManager();
// Attempt to save binaries after compilation, as well as load precompiled binaries // Attempt to save binaries after compilation, as well as load precompiled binaries
// to avoid compilation. Usually you'll want this to be true. // to avoid compilation. Usually you'll want this to be true.
@@ -1475,12 +1500,30 @@ namespace CUETools.Codecs.FLACCL
OCLMan.BuildOptions = ""; OCLMan.BuildOptions = "";
OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location); OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location);
OCLMan.BinaryPath = System.IO.Path.Combine(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "CUE Tools"), "OpenCL"); OCLMan.BinaryPath = System.IO.Path.Combine(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "CUE Tools"), "OpenCL");
OCLMan.CreateDefaultContext(0, DeviceType.GPU); int platformId = 0;
if (_settings.Platform != null)
{
platformId = -1;
string platforms = "";
for (int i = 0; i < OpenCL.NumberOfPlatforms; i++)
{
var platform = OpenCL.GetPlatform(i);
platforms += " \"" + platform.Name + "\"";
if (platform.Name.Equals(_settings.Platform, StringComparison.InvariantCultureIgnoreCase))
{
platformId = i;
break;
}
}
if (platformId < 0)
throw new Exception("unknown platform \"" + _settings.Platform + "\". Platforms available:" + platforms);
}
OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
openCLContext = OCLMan.Context; openCLContext = OCLMan.Context;
try try
{ {
openCLProgram = OCLMan.CompileFile("flac.cl"); openCLProgram = OCLMan.CompileFile(_settings.DeviceType == OpenCLDeviceType.CPU ? "flaccpu.cl" : "flac.cl");
} }
catch (OpenCLBuildException ex) catch (OpenCLBuildException ex)
{ {
@@ -2218,13 +2261,12 @@ namespace CUETools.Codecs.FLACCL
this.channelsCount = channelsCount; this.channelsCount = channelsCount;
this.writer = writer; this.writer = writer;
openCLProgram = _openCLProgram; openCLProgram = _openCLProgram;
Device[] openCLDevices = openCLProgram.Context.Platform.QueryDevices(DeviceType.GPU);
#if DEBUG #if DEBUG
var prop = CommandQueueProperties.PROFILING_ENABLE; var prop = CommandQueueProperties.PROFILING_ENABLE;
#else #else
var prop = CommandQueueProperties.NONE; var prop = CommandQueueProperties.NONE;
#endif #endif
openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], prop); openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLProgram.Context.Devices[0], prop);
residualTasksLen = sizeof(FLACCLSubframeTask) * channelsCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames; residualTasksLen = sizeof(FLACCLSubframeTask) * channelsCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames;
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * FLACCLWriter.maxFrames; bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * FLACCLWriter.maxFrames;
@@ -2260,12 +2302,15 @@ namespace CUETools.Codecs.FLACCL
clChooseBestMethod = openCLProgram.CreateKernel("clChooseBestMethod"); clChooseBestMethod = openCLProgram.CreateKernel("clChooseBestMethod");
clCopyBestMethod = openCLProgram.CreateKernel("clCopyBestMethod"); clCopyBestMethod = openCLProgram.CreateKernel("clCopyBestMethod");
clCopyBestMethodStereo = openCLProgram.CreateKernel("clCopyBestMethodStereo"); clCopyBestMethodStereo = openCLProgram.CreateKernel("clCopyBestMethodStereo");
if (writer._settings.GPUOnly)
{
clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual"); clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition"); clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16"); clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
clSumPartition = openCLProgram.CreateKernel("clSumPartition"); clSumPartition = openCLProgram.CreateKernel("clSumPartition");
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter"); clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder"); clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
}
samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelsCount]; samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelsCount];
outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1]; outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1];
@@ -2304,12 +2349,15 @@ namespace CUETools.Codecs.FLACCL
clChooseBestMethod.Dispose(); clChooseBestMethod.Dispose();
clCopyBestMethod.Dispose(); clCopyBestMethod.Dispose();
clCopyBestMethodStereo.Dispose(); clCopyBestMethodStereo.Dispose();
if (writer._settings.GPUOnly)
{
clEncodeResidual.Dispose(); clEncodeResidual.Dispose();
clCalcPartition.Dispose(); clCalcPartition.Dispose();
clCalcPartition16.Dispose(); clCalcPartition16.Dispose();
clSumPartition.Dispose(); clSumPartition.Dispose();
clFindRiceParameter.Dispose(); clFindRiceParameter.Dispose();
clFindPartitionOrder.Dispose(); clFindPartitionOrder.Dispose();
}
clSamples.Dispose(); clSamples.Dispose();
clSamplesBytes.Dispose(); clSamplesBytes.Dispose();
@@ -2390,15 +2438,13 @@ namespace CUETools.Codecs.FLACCL
nWindowFunctions); nWindowFunctions);
clComputeLPC.SetArgs( clComputeLPC.SetArgs(
clResidualTasks,
clAutocorOutput, clAutocorOutput,
clLPCData, clLPCData,
nResidualTasksPerChannel,
nWindowFunctions); nWindowFunctions);
openCLCQ.EnqueueNDRangeKernel( openCLCQ.EnqueueNDRangeKernel(
clComputeLPC, clComputeLPC,
32, 1, Math.Min(groupSize, 32), 1,
nWindowFunctions, nWindowFunctions,
channelsCount * frameCount); channelsCount * frameCount);
@@ -2412,7 +2458,7 @@ namespace CUETools.Codecs.FLACCL
openCLCQ.EnqueueNDRangeKernel( openCLCQ.EnqueueNDRangeKernel(
clQuantizeLPC, clQuantizeLPC,
32, 1, Math.Min(groupSize, 32), 1,
nWindowFunctions, nWindowFunctions,
channelsCount * frameCount); channelsCount * frameCount);
@@ -2433,7 +2479,7 @@ namespace CUETools.Codecs.FLACCL
openCLCQ.EnqueueNDRangeKernel( openCLCQ.EnqueueNDRangeKernel(
clChooseBestMethod, clChooseBestMethod,
32, channelsCount * frameCount); Math.Min(groupSize, 32), channelsCount * frameCount);
if (channels == 2 && channelsCount == 4) if (channels == 2 && channelsCount == 4)
{ {
@@ -2444,7 +2490,7 @@ namespace CUETools.Codecs.FLACCL
openCLCQ.EnqueueNDRangeKernel( openCLCQ.EnqueueNDRangeKernel(
clCopyBestMethodStereo, clCopyBestMethodStereo,
64, frameCount); Math.Min(groupSize, 64), frameCount);
} }
else else
{ {
@@ -2455,7 +2501,7 @@ namespace CUETools.Codecs.FLACCL
openCLCQ.EnqueueNDRangeKernel( openCLCQ.EnqueueNDRangeKernel(
clCopyBestMethod, clCopyBestMethod,
64, channels * frameCount); Math.Min(groupSize, 64), channels * frameCount);
} }
if (writer._settings.GPUOnly) if (writer._settings.GPUOnly)

View File

@@ -1,6 +1,6 @@
/** /**
* CUETools.FLACCL: FLAC audio encoder using OpenCL * CUETools.FLACCL: FLAC audio encoder using OpenCL
* Copyright (c) 2009 Gregory S. Chudov * Copyright (c) 2010 Gregory S. Chudov
* *
* This library is free software; you can redistribute it and/or * This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public * modify it under the terms of the GNU Lesser General Public
@@ -24,8 +24,6 @@
#pragma OPENCL EXTENSION cl_amd_printf : enable #pragma OPENCL EXTENSION cl_amd_printf : enable
#endif #endif
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
//#pragma OPENCL EXTENSION cl_amd_fp64 : enable //#pragma OPENCL EXTENSION cl_amd_fp64 : enable
typedef enum typedef enum
@@ -125,7 +123,7 @@ void clFindWastedBits(
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int w = 0, a = 0; int w = 0, a = 0;
for (int pos = tid; pos + tid < task.blocksize; pos += GROUP_SIZE) for (int pos = tid; pos < task.blocksize; pos += GROUP_SIZE)
{ {
int smp = samples[task.samplesOffs + pos]; int smp = samples[task.samplesOffs + pos];
w |= smp; w |= smp;
@@ -213,38 +211,25 @@ void clComputeAutocor(
__kernel __attribute__((reqd_work_group_size(32, 1, 1))) __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
void clComputeLPC( void clComputeLPC(
__global FLACCLSubframeTask *tasks,
__global float *autoc, __global float *autoc,
__global float *lpcs, __global float *lpcs,
int taskCount, // tasks per block
int windowCount int windowCount
) )
{ {
__local struct { __local struct {
FLACCLSubframeData task;
volatile float ldr[32]; volatile float ldr[32];
volatile float gen1[32]; volatile float gen1[32];
volatile float error[32]; volatile float error[32];
volatile float autoc[33]; volatile float autoc[33];
volatile int lpcOffs;
volatile int autocOffs;
} shared; } shared;
const int tid = get_local_id(0);// + get_local_id(1) * 32; const int tid = get_local_id(0);// + get_local_id(1) * 32;
int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
// fetch task data int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
if (tid < sizeof(shared.task) / sizeof(int))
((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1)))[tid];
if (tid == 0)
{
shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) <= MAX_ORDER) if (get_local_id(0) <= MAX_ORDER)
shared.autoc[get_local_id(0)] = autoc[shared.autocOffs + get_local_id(0)]; shared.autoc[get_local_id(0)] = autoc[autocOffs + get_local_id(0)];
if (get_local_id(0) + get_local_size(0) <= MAX_ORDER) if (get_local_id(0) + get_local_size(0) <= MAX_ORDER)
shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[shared.autocOffs + get_local_id(0) + get_local_size(0)]; shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[autocOffs + get_local_id(0) + get_local_size(0)];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@@ -296,14 +281,14 @@ void clComputeLPC(
// Output coeffs // Output coeffs
if (get_local_id(0) <= order) if (get_local_id(0) <= order)
lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)]; lpcs[lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)];
//if (get_local_id(0) <= order + 1 && fabs(-shared.ldr[0]) > 3000) //if (get_local_id(0) <= order + 1 && fabs(-shared.ldr[0]) > 3000)
// printf("coef[%d] == %f, autoc == %f, error == %f\n", get_local_id(0), -shared.ldr[order - get_local_id(0)], shared.autoc[get_local_id(0)], shared.error[get_local_id(0)]); // printf("coef[%d] == %f, autoc == %f, error == %f\n", get_local_id(0), -shared.ldr[order - get_local_id(0)], shared.autoc[get_local_id(0)], shared.error[get_local_id(0)]);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Output prediction error estimates // Output prediction error estimates
if (get_local_id(0) < MAX_ORDER) if (get_local_id(0) < MAX_ORDER)
lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)]; lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
} }
__kernel __attribute__((reqd_work_group_size(32, 1, 1))) __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
@@ -318,9 +303,10 @@ void clQuantizeLPC(
{ {
__local struct { __local struct {
FLACCLSubframeData task; FLACCLSubframeData task;
volatile int tmpi[32];
volatile int index[64]; volatile int index[64];
volatile float error[64]; volatile float error[64];
volatile int maxcoef[32];
volatile int maxcoef2[32];
volatile int lpcOffs; volatile int lpcOffs;
} shared; } shared;
@@ -338,6 +324,8 @@ void clQuantizeLPC(
shared.error[tid] = shared.task.blocksize * 64 + tid; shared.error[tid] = shared.task.blocksize * 64 + tid;
shared.index[32 + tid] = MAX_ORDER - 1; shared.index[32 + tid] = MAX_ORDER - 1;
shared.error[32 + tid] = shared.task.blocksize * 64 + tid + 32; shared.error[32 + tid] = shared.task.blocksize * 64 + tid + 32;
shared.maxcoef[tid] = 0;
shared.maxcoef2[tid] = 0;
// Load prediction error estimates // Load prediction error estimates
if (tid < MAX_ORDER) if (tid < MAX_ORDER)
@@ -399,21 +387,14 @@ void clQuantizeLPC(
// get 15 bits of each coeff // get 15 bits of each coeff
int coef = convert_int_rte(lpc * (1 << 15)); int coef = convert_int_rte(lpc * (1 << 15));
// remove sign bits // remove sign bits
shared.tmpi[tid] = coef ^ (coef >> 31); atomic_or(shared.maxcoef + i, coef ^ (coef >> 31));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// OR reduction
for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
{
if (tid < l)
shared.tmpi[tid] |= shared.tmpi[tid + l];
barrier(CLK_LOCAL_MEM_FENCE);
}
//SUM32(shared.tmpi,tid,|=); //SUM32(shared.tmpi,tid,|=);
// choose precision // choose precision
//int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - convert_int_rte(shared.PE[order - 1]) //int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits)); int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits));
// calculate shift based on precision and number of leading zeroes in coeffs // calculate shift based on precision and number of leading zeroes in coeffs
int shift = max(0,min(15, clz(shared.tmpi[0] | shared.tmpi[1]) - 18 + cbits)); int shift = max(0,min(15, clz(shared.maxcoef[i]) - 18 + cbits));
//cbits = 13; //cbits = 13;
//shift = 15; //shift = 15;
@@ -426,18 +407,10 @@ void clQuantizeLPC(
//shared.tmp[tid] = (tid != 0) * (shared.arp[tid - 1]*(1 << shared.task.shift) - shared.task.coefs[tid - 1]); //shared.tmp[tid] = (tid != 0) * (shared.arp[tid - 1]*(1 << shared.task.shift) - shared.task.coefs[tid - 1]);
//shared.task.coefs[tid] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, convert_int_rte((shared.arp[tid]) * (1 << shared.task.shift) + shared.tmp[tid]))); //shared.task.coefs[tid] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, convert_int_rte((shared.arp[tid]) * (1 << shared.task.shift) + shared.tmp[tid])));
// remove sign bits // remove sign bits
shared.tmpi[tid] = coef ^ (coef >> 31); atomic_or(shared.maxcoef2 + i, coef ^ (coef >> 31));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// OR reduction
for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
{
if (tid < l)
shared.tmpi[tid] |= shared.tmpi[tid + l];
barrier(CLK_LOCAL_MEM_FENCE);
}
//SUM32(shared.tmpi,tid,|=);
// calculate actual number of bits (+1 for sign) // calculate actual number of bits (+1 for sign)
cbits = 1 + 32 - clz(shared.tmpi[0] | shared.tmpi[1]); cbits = 1 + 32 - clz(shared.maxcoef2[i]);
// output shift, cbits and output coeffs // output shift, cbits and output coeffs
int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i; int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
@@ -452,10 +425,6 @@ void clQuantizeLPC(
} }
} }
#ifndef PARTORDER
#define PARTORDER 4
#endif
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clEstimateResidual( void clEstimateResidual(
__global int*output, __global int*output,
@@ -463,10 +432,10 @@ void clEstimateResidual(
__global FLACCLSubframeTask *tasks __global FLACCLSubframeTask *tasks
) )
{ {
__local int data[GROUP_SIZE * 2]; __local float data[GROUP_SIZE * 2];
__local FLACCLSubframeTask task; __local FLACCLSubframeTask task;
__local int residual[GROUP_SIZE]; __local int psum[64];
__local int len[GROUP_SIZE >> PARTORDER]; __local float fcoef[32];
const int tid = get_local_id(0); const int tid = get_local_id(0);
if (tid < sizeof(task)/sizeof(int)) if (tid < sizeof(task)/sizeof(int))
@@ -476,101 +445,74 @@ void clEstimateResidual(
int ro = task.data.residualOrder; int ro = task.data.residualOrder;
int bs = task.data.blocksize; int bs = task.data.blocksize;
if (tid < 32 && tid >= ro) if (tid < 32)
task.coefs[tid] = 0; fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
if (tid < (GROUP_SIZE >> PARTORDER)) //fcoef[tid] = select(0.0f, - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift), tid + ro >= MAX_ORDER && tid < MAX_ORDER);
len[tid] = 0; if (tid < 64)
data[tid] = 0; psum[tid] = 0;
data[tid] = 0.0f;
int partOrder = clz(64) - clz(bs - 1) + 1;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
__local int4 * cptr = (__local int4 *)&task.coefs[0]; float4 cptr0 = vload4(0, &fcoef[0]);
int4 cptr0 = cptr[0]; float4 cptr1 = vload4(1, &fcoef[0]);
#if MAX_ORDER > 4
int4 cptr1 = cptr[1];
#if MAX_ORDER > 8 #if MAX_ORDER > 8
int4 cptr2 = cptr[2]; float4 cptr2 = vload4(2, &fcoef[0]);
#endif #endif
#endif
for (int pos = 0; pos < bs; pos += GROUP_SIZE) for (int pos = 0; pos < bs; pos += GROUP_SIZE)
{ {
// fetch samples // fetch samples
int offs = pos + tid; int offs = pos + tid;
int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0; float nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0.0f;
data[tid + GROUP_SIZE] = nextData; data[tid + GROUP_SIZE] = nextData;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// compute residual // compute residual
__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro]; __local float* dptr = &data[tid + GROUP_SIZE - ro];
int4 sum = dptr[0] * cptr0 float4 sum = cptr0 * vload4(0, dptr)
#if MAX_ORDER > 4 + cptr1 * vload4(1, dptr)
+ dptr[1] * cptr1
#if MAX_ORDER > 8 #if MAX_ORDER > 8
+ dptr[2] * cptr2 + cptr2 * vload4(2, dptr)
#if MAX_ORDER > 12 #if MAX_ORDER > 12
+ dptr[3] * cptr[3] + vload4(3, &fcoef[0]) * vload4(3, dptr)
#if MAX_ORDER > 16 #if MAX_ORDER > 16
+ dptr[4] * cptr[4] + vload4(4, &fcoef[0]) * vload4(4, dptr)
+ dptr[5] * cptr[5] + vload4(5, &fcoef[0]) * vload4(5, dptr)
+ dptr[6] * cptr[6] + vload4(6, &fcoef[0]) * vload4(6, dptr)
+ dptr[7] * cptr[7] + vload4(7, &fcoef[0]) * vload4(7, dptr)
#endif #endif
#endif #endif
#endif
#endif #endif
; ;
int t = nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift); int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w);
barrier(CLK_LOCAL_MEM_FENCE);
data[tid] = nextData;
// ensure we're within frame bounds // ensure we're within frame bounds
t = select(0, t, offs >= ro && offs < bs); t = select(0, t, offs >= ro && offs < bs);
// overflow protection // overflow protection
t = clamp(t, -0x7fffff, 0x7fffff); t = clamp(t, -0x7fffff, 0x7fffff);
// convert to unsigned // convert to unsigned
residual[tid] = (t << 1) ^ (t >> 31); atomic_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31));
barrier(CLK_LOCAL_MEM_FENCE); }
data[tid] = nextData;
// calculate rice partition bit length for every 16 samples // calculate rice partition bit length for every (1 << partOrder) samples
if (tid < (GROUP_SIZE >> PARTORDER)) if (tid < 64)
{ {
//__local int4 * chunk = (__local int4 *)&residual[tid << PARTORDER]; int k = clamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
__local int4 * chunk = ((__local int4 *)residual) + (tid << (PARTORDER - 2)); psum[tid] = (k << partOrder) + (psum[tid] >> k);
#if PARTORDER == 3
int4 sum = chunk[0] + chunk[1];
#elif PARTORDER == 4
int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3]; // [0 .. (1 << (PARTORDER - 2)) - 1]
#elif PARTORDER == 5
int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3] + chunk[4] + chunk[5] + chunk[6] + chunk[7];
#else
#error Invalid PARTORDER
#endif
int res = sum.x + sum.y + sum.z + sum.w;
int k = clamp(clz(1 << PARTORDER) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
#ifdef EXTRAMODE
#if PARTORDER == 3
sum = (chunk[0] >> k) + (chunk[1] >> k);
#elif PARTORDER == 4
sum = (chunk[0] >> k) + (chunk[1] >> k) + (chunk[2] >> k) + (chunk[3] >> k);
#else
#error Invalid PARTORDER
#endif
len[tid] += (k << PARTORDER) + sum.x + sum.y + sum.z + sum.w;
#else
len[tid] += (k << PARTORDER) + (res >> k);
#endif
} }
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int l = GROUP_SIZE >> (PARTORDER + 1); l > 0; l >>= 1) for (int l = 32; l > 0; l >>= 1)
{ {
if (tid < l) if (tid < l)
len[tid] += len[tid + l]; psum[tid] += psum[tid + l];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if (tid == 0) if (tid == 0)
output[get_group_id(0)] = len[0] + (bs - ro); output[get_group_id(0)] = psum[0] + (bs - ro);
} }
__kernel __attribute__((reqd_work_group_size(32, 1, 1))) __kernel __attribute__((reqd_work_group_size(32, 1, 1)))
@@ -580,15 +522,16 @@ void clChooseBestMethod(
int taskCount int taskCount
) )
{ {
__local struct { int best_length = 0x7fffffff;
volatile int index[32]; int best_index = 0;
volatile int length[32]; __local int partLen[32];
} shared;
__local FLACCLSubframeData task; __local FLACCLSubframeData task;
const int tid = get_local_id(0); const int tid = get_local_id(0);
shared.length[tid] = 0x7fffffff; // fetch part sum
shared.index[tid] = tid; if (tid < taskCount)
partLen[tid] = residual[tid + taskCount * get_group_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
for (int taskNo = 0; taskNo < taskCount; taskNo++) for (int taskNo = 0; taskNo < taskCount; taskNo++)
{ {
// fetch task data // fetch task data
@@ -599,46 +542,27 @@ void clChooseBestMethod(
if (tid == 0) if (tid == 0)
{ {
// fetch part sum int pl = partLen[taskNo];
int partLen = residual[taskNo + taskCount * get_group_id(0)];
//// calculate part size
//int residualLen = task[get_local_id(1)].data.blocksize - task[get_local_id(1)].data.residualOrder;
//residualLen = residualLen * (task[get_local_id(1)].data.type != Constant || psum != 0);
//// calculate rice parameter
//int k = max(0, min(14, convert_int_rtz(log2((psum + 0.000001f) / (residualLen + 0.000001f) + 0.5f))));
//// calculate part bit length
//int partLen = residualLen * (k + 1) + (psum >> k);
int obits = task.obits - task.wbits; int obits = task.obits - task.wbits;
shared.length[taskNo] = int len = min(obits * task.blocksize,
min(obits * task.blocksize, task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + pl :
task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + partLen : task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + pl :
task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + partLen : task.type == Constant ? obits * select(1, task.blocksize, pl != task.blocksize - task.residualOrder) :
task.type == Constant ? obits * select(1, task.blocksize, partLen != task.blocksize - task.residualOrder) :
obits * task.blocksize); obits * task.blocksize);
tasks[taskNo + taskCount * get_group_id(0)].data.size = len;
if (len < best_length)
{
best_length = len;
best_index = taskNo;
}
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
//shared.index[get_local_id(0)] = get_local_id(0);
//shared.length[get_local_id(0)] = (get_local_id(0) < taskCount) ? tasks[get_local_id(0) + taskCount * get_group_id(0)].size : 0x7fffffff;
if (tid < taskCount)
tasks[tid + taskCount * get_group_id(0)].data.size = shared.length[tid];
int l1 = shared.length[tid];
for (int l = 16; l > 0; l >>= 1)
{
if (tid < l)
{
int l2 = shared.length[tid + l];
shared.index[tid] = shared.index[tid + select(0, l, l2 < l1)];
shared.length[tid] = l1 = min(l1, l2);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) if (tid == 0)
tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + shared.index[0]; tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + best_index;
} }
__kernel __attribute__((reqd_work_group_size(64, 1, 1))) __kernel __attribute__((reqd_work_group_size(64, 1, 1)))
@@ -788,13 +712,13 @@ void clCalcPartition(
int psize // == task.blocksize >> max_porder? int psize // == task.blocksize >> max_porder?
) )
{ {
__local int pl[(GROUP_SIZE / 16)][15]; __local int pl[(GROUP_SIZE / 8)][15];
__local FLACCLSubframeData task; __local FLACCLSubframeData task;
const int tid = get_local_id(0); const int tid = get_local_id(0);
if (tid < sizeof(task) / sizeof(int)) if (tid < sizeof(task) / sizeof(int))
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid]; ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
if (tid < (GROUP_SIZE / 16)) if (tid < (GROUP_SIZE / 8))
{ {
for (int k = 0; k <= 14; k++) for (int k = 0; k <= 14; k++)
pl[tid][k] = 0; pl[tid][k] = 0;
@@ -807,13 +731,14 @@ void clCalcPartition(
{ {
// fetch residual // fetch residual
int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0; int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0;
// convert to unsigned // overflow protection
s = clamp(s, -0x7fffff, 0x7fffff); s = clamp(s, -0x7fffff, 0x7fffff);
// convert to unsigned
s = (s << 1) ^ (s >> 31); s = (s << 1) ^ (s >> 31);
// calc number of unary bits for each residual sample with each rice paramater // calc number of unary bits for each residual sample with each rice paramater
int part = (offs - start) / psize; int part = (offs - start) / psize + (tid & 1) * (GROUP_SIZE / 16);
for (int k = 0; k <= 14; k++) for (int k = 0; k <= 14; k++)
atom_add(&pl[part][k], s >> k); atomic_add(&pl[part][k], s >> k);
//pl[part][k] += s >> k; //pl[part][k] += s >> k;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@@ -825,7 +750,8 @@ void clCalcPartition(
{ {
// output length // output length
const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1)); const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
partition_lengths[pos + part] = min(0x7fffff, pl[tid][k]) + select(psize, psize - task.residualOrder, part == 0) * (k + 1); int plen = pl[tid][k] + pl[tid + (GROUP_SIZE / 16)][k];
partition_lengths[pos + part] = min(0x7fffff, plen) + (psize - select(0, task.residualOrder, part == 0)) * (k + 1);
// if (get_group_id(1) == 0) // if (get_group_id(1) == 0)
//printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1)); //printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1));
} }
@@ -911,7 +837,7 @@ void clCalcPartition16(
// convert to unsigned // convert to unsigned
res[tid] = (s << 1) ^ (s >> 31); res[tid] = (s << 1) ^ (s >> 31);
// for (int k = 0; k < 15; k++) atom_add(&pl[x][k], s >> k); // for (int k = 0; k < 15; k++) atomic_add(&pl[x][k], s >> k);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
data[tid] = nextData; data[tid] = nextData;
@@ -1001,25 +927,23 @@ void clFindPartitionOrder(
int max_porder int max_porder
) )
{ {
__local int partlen[9]; __local int partlen[16];
__local FLACCLSubframeData task; __local FLACCLSubframeData task;
const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder); const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder);
if (get_local_id(0) < sizeof(task) / sizeof(int)) if (get_local_id(0) < sizeof(task) / sizeof(int))
((__local int*)&task)[get_local_id(0)] = ((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)]; ((__local int*)&task)[get_local_id(0)] = ((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)];
if (get_local_id(0) < 9) if (get_local_id(0) < 16)
partlen[get_local_id(0)] = 0; partlen[get_local_id(0)] = 0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// fetch partition lengths // fetch partition lengths
for (int offs = 0; offs < (2 << max_porder); offs += GROUP_SIZE) int lim = (2 << max_porder) - 1;
for (int offs = get_local_id(0); offs < lim; offs += GROUP_SIZE)
{ {
if (offs + get_local_id(0) < (2 << max_porder) - 1) int len = rice_parameters[pos + offs];
{ int porder = 31 - clz(lim - offs);
int len = rice_parameters[pos + offs + get_local_id(0)]; atomic_add(&partlen[porder], len);
int porder = 31 - clz((2 << max_porder) - 1 - offs - get_local_id(0));
atom_add(&partlen[porder], len);
}
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);

View File

@@ -0,0 +1,492 @@
/**
* CUETools.FLACCL: FLAC audio encoder using OpenCL
* Copyright (c) 2010 Gregory S. Chudov
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef _FLACCL_KERNEL_H_
#define _FLACCL_KERNEL_H_
#ifdef DEBUG
#pragma OPENCL EXTENSION cl_amd_printf : enable
#endif
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
typedef enum
{
Constant = 0,
Verbatim = 1,
Fixed = 8,
LPC = 32
} SubframeType;
typedef struct
{
int residualOrder; // <= 32
int samplesOffs;
int shift;
int cbits;
int size;
int type;
int obits;
int blocksize;
int best_index;
int channel;
int residualOffs;
int wbits;
int abits;
int porder;
int reserved[2];
} FLACCLSubframeData;
typedef struct
{
FLACCLSubframeData data;
int coefs[32]; // fixme: should be short?
} FLACCLSubframeTask;
__kernel void clStereoDecorr(
__global int *samples,
__global short2 *src,
int offset
)
{
int pos = get_global_id(0);
if (pos < offset)
{
short2 s = src[pos];
samples[pos] = s.x;
samples[1 * offset + pos] = s.y;
samples[2 * offset + pos] = (s.x + s.y) >> 1;
samples[3 * offset + pos] = s.x - s.y;
}
}
__kernel void clChannelDecorr2(
__global int *samples,
__global short2 *src,
int offset
)
{
int pos = get_global_id(0);
if (pos < offset)
{
short2 s = src[pos];
samples[pos] = s.x;
samples[1 * offset + pos] = s.y;
}
}
//__kernel void clChannelDecorr(
// int *samples,
// short *src,
// int offset
//)
//{
// int pos = get_global_id(0);
// if (pos < offset)
// samples[get_group_id(1) * offset + pos] = src[pos * get_num_groups(1) + get_group_id(1)];
//}
#define __ffs(a) (32 - clz(a & (-a)))
//#define __ffs(a) (33 - clz(~a & (a - 1)))
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clFindWastedBits(
__global FLACCLSubframeTask *tasks,
__global int *samples,
int tasksPerChannel
)
{
__global FLACCLSubframeTask* ptask = &tasks[get_group_id(0) * tasksPerChannel];
int w = 0, a = 0;
for (int pos = 0; pos < ptask->data.blocksize; pos ++)
{
int smp = samples[ptask->data.samplesOffs + pos];
w |= smp;
a |= smp ^ (smp >> 31);
}
w = max(0,__ffs(w) - 1);
a = 32 - clz(a) - w;
for (int i = 0; i < tasksPerChannel; i++)
{
ptask[i].data.wbits = w;
ptask[i].data.abits = a;
}
}
// get_num_groups(0) == number of tasks
// get_num_groups(1) == number of windows
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clComputeAutocor(
__global float *output,
__global const int *samples,
__global const float *window,
__global FLACCLSubframeTask *tasks,
const int taskCount // tasks per block
)
{
FLACCLSubframeData task = tasks[get_group_id(0) * taskCount].data;
int len = task.blocksize;
int windowOffs = get_group_id(1) * len;
float data1[4096 + 32];
// TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32]
for (int tid = 0; tid < len; tid++)
data1[tid] = samples[task.samplesOffs + tid] * window[windowOffs + tid];
data1[len] = 0.0f;
for (int i = 0; i <= MAX_ORDER; ++i)
{
double temp = 1.0;
double temp2 = 1.0;
float* finish = data1 + len - i;
for (float* pdata = data1; pdata < finish; pdata += 2)
{
temp += pdata[i] * pdata[0];
temp2 += pdata[i + 1] * pdata[1];
}
output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + i] = temp + temp2;
}
}
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clComputeLPC(
__global float *pautoc,
__global float *lpcs,
int windowCount
)
{
int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
volatile double ldr[32];
volatile double gen0[32];
volatile double gen1[32];
volatile double err[32];
__global float* autoc = pautoc + autocOffs;
for (int i = 0; i < MAX_ORDER; i++)
{
gen0[i] = gen1[i] = autoc[i + 1];
ldr[i] = 0.0;
}
// Compute LPC using Schur and Levinson-Durbin recursion
double error = autoc[0];
for (int order = 0; order < MAX_ORDER; order++)
{
// Schur recursion
double reff = -gen1[0] / error;
//error += gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
error *= (1 - reff * reff);
for (int j = 0; j < MAX_ORDER - 1 - order; j++)
{
gen1[j] = gen1[j + 1] + reff * gen0[j];
gen0[j] = gen1[j + 1] * reff + gen0[j];
}
err[order] = error;
// Levinson-Durbin recursion
ldr[order] = reff;
for (int j = 0; j < order / 2; j++)
{
double tmp = ldr[j];
ldr[j] += reff * ldr[order - 1 - j];
ldr[order - 1 - j] += reff * tmp;
}
if (0 != (order & 1))
ldr[order / 2] += ldr[order / 2] * reff;
// Output coeffs
for (int j = 0; j <= order; j++)
lpcs[lpcOffs + order * 32 + j] = -ldr[order - j];
}
// Output prediction error estimates
for (int j = 0; j < MAX_ORDER; j++)
lpcs[lpcOffs + MAX_ORDER * 32 + j] = err[j];
}
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clQuantizeLPC(
__global FLACCLSubframeTask *tasks,
__global float*lpcs,
int taskCount, // tasks per block
int taskCountLPC, // tasks per set of coeffs (<= 32)
int minprecision,
int precisions
)
{
int bs = tasks[get_group_id(1) * taskCount].data.blocksize;
int abits = tasks[get_group_id(1) * taskCount].data.abits;
int lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
float error[MAX_ORDER];
int best_orders[MAX_ORDER];
// Load prediction error estimates based on Akaike's Criteria
for (int tid = 0; tid < MAX_ORDER; tid++)
{
error[tid] = bs * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log(bs);
best_orders[tid] = tid;
}
// Select best orders
for (int i = 0; i < MAX_ORDER && i < taskCountLPC; i++)
{
for (int j = i + 1; j < MAX_ORDER; j++)
{
if (error[best_orders[j]] < error[best_orders[i]])
{
int tmp = best_orders[j];
best_orders[j] = best_orders[i];
best_orders[i] = tmp;
}
}
}
// Quantization
for (int i = 0; i < taskCountLPC; i ++)
{
int order = best_orders[i >> precisions];
int tmpi = 0;
for (int tid = 0; tid <= order; tid ++)
{
float lpc = lpcs[lpcOffs + order * 32 + tid];
// get 15 bits of each coeff
int c = convert_int_rte(lpc * (1 << 15));
// remove sign bits
tmpi |= c ^ (c >> 31);
}
// choose precision
//int cbits = max(3, min(10, 5 + (abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - abits));
// calculate shift based on precision and number of leading zeroes in coeffs
int shift = max(0,min(15, clz(tmpi) - 18 + cbits));
int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
tmpi = 0;
for (int tid = 0; tid <= order; tid ++)
{
float lpc = lpcs[lpcOffs + order * 32 + tid];
// quantize coeffs with given shift
int c = convert_int_rte(clamp(lpc * (1 << shift), -1 << (cbits - 1), 1 << (cbits - 1)));
// remove sign bits
tmpi |= c ^ (c >> 31);
tasks[taskNo].coefs[tid] = c;
}
// calculate actual number of bits (+1 for sign)
cbits = 1 + 32 - clz(tmpi);
// output shift, cbits, ro
tasks[taskNo].data.shift = shift;
tasks[taskNo].data.cbits = cbits;
tasks[taskNo].data.residualOrder = order + 1;
}
}
#define ESTIMATE_N(ro,sum) for (int pos = ro; pos < bs; pos ++) { \
__global int *ptr = data + pos - ro; \
int t = clamp((data[pos] - ((sum) >> task.data.shift)) >> task.data.wbits, -0x7fffff, 0x7fffff); \
len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31); \
}
// int sum = 0; for (int i = 0; i < ro; i++) sum += *(ptr++) * task.coefs[i];
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(1, 1, 1)))
void clEstimateResidual(
__global int*output,
__global int*samples,
__global FLACCLSubframeTask *tasks
)
{
FLACCLSubframeTask task = tasks[get_group_id(0)];
int ro = task.data.residualOrder;
int bs = task.data.blocksize;
#define EPO 6
int len[1 << EPO];
#if 0
//float data[4096 + 32];
//float fcoef[32];
// TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32]
for (int tid = 0; tid < bs; tid++)
data[tid] = (float)samples[task.data.samplesOffs + tid] / (1 << task.data.wbits);
for (int tid = 0; tid < 32; tid++)
fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
float4 c0 = vload4(0, &fcoef[0]);
float4 c1 = vload4(1, &fcoef[0]);
float4 c2 = vload4(2, &fcoef[0]);
#else
__global int *data = &samples[task.data.samplesOffs];
for (int i = ro; i < 32; i++)
task.coefs[i] = 0;
#endif
for (int i = 0; i < 1 << EPO; i++)
len[i] = 0;
switch (ro)
{
case 0: ESTIMATE_N(0, 0) break;
case 1: ESTIMATE_N(1, *ptr * task.coefs[0]) break;
case 2: ESTIMATE_N(2, *(ptr++) * task.coefs[0] + *ptr * task.coefs[1]) break;
case 3: ESTIMATE_N(3, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *ptr * task.coefs[2]) break;
case 4: ESTIMATE_N(4, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *ptr * task.coefs[3]) break;
case 5: ESTIMATE_N(5, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *ptr * task.coefs[4]) break;
case 6: ESTIMATE_N(6, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *ptr * task.coefs[5]) break;
case 7: ESTIMATE_N(7, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *ptr * task.coefs[6]) break;
case 8: ESTIMATE_N(8, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *ptr * task.coefs[7]) break;
case 9: ESTIMATE_N(9, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *ptr * task.coefs[8]) break;
case 10: ESTIMATE_N(10, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *ptr * task.coefs[9]) break;
case 11: ESTIMATE_N(11, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *ptr * task.coefs[10]) break;
case 12: ESTIMATE_N(12, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *ptr * task.coefs[11]) break;
default:
for (int pos = ro; pos < bs; pos ++)
{
#if 0
float sum = dot(vload4(0, data + pos - ro), c0)
+ dot(vload4(1, data + pos - ro), c1)
+ dot(vload4(2, data + pos - ro), c2)
;
int t = convert_int_rte(data[pos] + sum);
#else
__global int *ptr = data + pos - ro;
int sum =
*(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3]
+ *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7]
+ *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *(ptr++) * task.coefs[11]
;
for (int i = 12; i < ro; i++)
sum += *(ptr++) * task.coefs[i];
int t = (data[pos] - (sum >> task.data.shift)) >> task.data.wbits;
#endif
// overflow protection
t = clamp(t, -0x7fffff, 0x7fffff);
// convert to unsigned
t = (t << 1) ^ (t >> 31);
len[pos >> (12 - EPO)] += t;
}
break;
}
int total = 0;
for (int i = 0; i < 1 << EPO; i++)
{
int res = min(0x7fffff,len[i]);
int k = clamp(clz(1 << (12 - EPO)) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
total += (k << (12 - EPO)) + (res >> k);
}
output[get_group_id(0)] = min(0x7ffffff, total) + (bs - ro);
}
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clChooseBestMethod(
__global FLACCLSubframeTask *tasks,
__global int *residual,
int taskCount
)
{
int best_length = 0x7fffff;
int best_no = 0;
for (int taskNo = 0; taskNo < taskCount; taskNo++)
{
// fetch task data
__global FLACCLSubframeTask* ptask = tasks + taskNo + taskCount * get_group_id(0);
// fetch part sum
int partLen = residual[taskNo + taskCount * get_group_id(0)];
int obits = ptask->data.obits - ptask->data.wbits;
int bs = ptask->data.blocksize;
int ro = ptask->data.residualOrder;
int len = min(obits * bs,
ptask->data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen :
ptask->data.type == LPC ? ro * obits + 4 + 5 + ro * ptask->data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
ptask->data.type == Constant ? obits * select(1, bs, partLen != bs - ro) :
obits * bs);
ptask->data.size = len;
if (len < best_length)
{
best_length = len;
best_no = taskNo;
}
}
tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + best_no;
}
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clCopyBestMethod(
__global FLACCLSubframeTask *tasks_out,
__global FLACCLSubframeTask *tasks,
int count
)
{
int best_index = tasks[count * get_group_id(0)].data.best_index;
tasks_out[get_group_id(0)] = tasks[best_index];
}
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clCopyBestMethodStereo(
__global FLACCLSubframeTask *tasks_out,
__global FLACCLSubframeTask *tasks,
int count
)
{
int best_index[4];
int best_size[4];
int lr_index[2];
for (int i = 0; i < 4; i++)
{
int best = tasks[count * (get_group_id(0) * 4 + i)].data.best_index;
best_index[i] = best;
best_size[i] = tasks[best].data.size;
}
int bitsBest = best_size[2] + best_size[3]; // MidSide
lr_index[0] = best_index[2];
lr_index[1] = best_index[3];
if (bitsBest > best_size[3] + best_size[1]) // RightSide
{
bitsBest = best_size[3] + best_size[1];
lr_index[0] = best_index[3];
lr_index[1] = best_index[1];
}
if (bitsBest > best_size[0] + best_size[3]) // LeftSide
{
bitsBest = best_size[0] + best_size[3];
lr_index[0] = best_index[0];
lr_index[1] = best_index[3];
}
if (bitsBest > best_size[0] + best_size[1]) // LeftRight
{
bitsBest = best_size[0] + best_size[1];
lr_index[0] = best_index[0];
lr_index[1] = best_index[1];
}
tasks_out[2 * get_group_id(0)] = tasks[lr_index[0]];
tasks_out[2 * get_group_id(0)].data.residualOffs = tasks[best_index[0]].data.residualOffs;
tasks_out[2 * get_group_id(0) + 1] = tasks[lr_index[1]];
tasks_out[2 * get_group_id(0) + 1].data.residualOffs = tasks[best_index[1]].data.residualOffs;
}
#endif

View File

@@ -72,6 +72,7 @@ namespace CUETools.FLACCL.cmd
string window_function = null; string window_function = null;
string input_file = null; string input_file = null;
string output_file = null; string output_file = null;
string device_type = null;
int min_partition_order = -1, max_partition_order = -1, int min_partition_order = -1, max_partition_order = -1,
min_lpc_order = -1, max_lpc_order = -1, min_lpc_order = -1, max_lpc_order = -1,
min_fixed_order = -1, max_fixed_order = -1, min_fixed_order = -1, max_fixed_order = -1,
@@ -112,6 +113,10 @@ namespace CUETools.FLACCL.cmd
settings.GroupSize = intarg; settings.GroupSize = intarg;
else if (args[arg] == "--define" && arg + 2 < args.Length) else if (args[arg] == "--define" && arg + 2 < args.Length)
settings.Defines += "#define " + args[++arg] + " " + args[++arg] + "\n"; settings.Defines += "#define " + args[++arg] + " " + args[++arg] + "\n";
else if (args[arg] == "--opencl-platform" && ++arg < args.Length)
settings.Platform = args[arg];
else if (args[arg] == "--opencl-type" && ++arg < args.Length)
device_type = args[arg];
else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length) else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
output_file = args[arg]; output_file = args[arg];
else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length) else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
@@ -210,6 +215,8 @@ namespace CUETools.FLACCL.cmd
try try
{ {
if (device_type != null)
settings.DeviceType = (OpenCLDeviceType)(Enum.Parse(typeof(OpenCLDeviceType), device_type, true));
encoder.Settings = settings; encoder.Settings = settings;
if (level >= 0) if (level >= 0)
encoder.CompressionLevel = level; encoder.CompressionLevel = level;