mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
CPU mode + GPU optimizations
This commit is contained in:
@@ -65,6 +65,9 @@
|
|||||||
</ProjectReference>
|
</ProjectReference>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
<Content Include="flaccpu.cl">
|
||||||
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
|
</Content>
|
||||||
<None Include="flac.cu">
|
<None Include="flac.cu">
|
||||||
</None>
|
</None>
|
||||||
<Content Include="flac.cl">
|
<Content Include="flac.cl">
|
||||||
|
|||||||
@@ -33,7 +33,14 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
{
|
{
|
||||||
public class FLACCLWriterSettings
|
public class FLACCLWriterSettings
|
||||||
{
|
{
|
||||||
public FLACCLWriterSettings() { DoVerify = false; GPUOnly = true; DoMD5 = true; GroupSize = 64; }
|
public FLACCLWriterSettings()
|
||||||
|
{
|
||||||
|
this.DoVerify = false;
|
||||||
|
this.GPUOnly = true;
|
||||||
|
this.DoMD5 = true;
|
||||||
|
this.GroupSize = 64;
|
||||||
|
this.DeviceType = OpenCLDeviceType.GPU;
|
||||||
|
}
|
||||||
|
|
||||||
[DefaultValue(false)]
|
[DefaultValue(false)]
|
||||||
[DisplayName("Verify")]
|
[DisplayName("Verify")]
|
||||||
@@ -56,6 +63,13 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
[SRDescription(typeof(Properties.Resources), "DescriptionDefines")]
|
[SRDescription(typeof(Properties.Resources), "DescriptionDefines")]
|
||||||
public string Defines { get; set; }
|
public string Defines { get; set; }
|
||||||
|
|
||||||
|
[SRDescription(typeof(Properties.Resources), "DescriptionPlatform")]
|
||||||
|
public string Platform { get; set; }
|
||||||
|
|
||||||
|
[DefaultValue(OpenCLDeviceType.GPU)]
|
||||||
|
[SRDescription(typeof(Properties.Resources), "DescriptionDeviceType")]
|
||||||
|
public OpenCLDeviceType DeviceType { get; set; }
|
||||||
|
|
||||||
int cpu_threads = 1;
|
int cpu_threads = 1;
|
||||||
[DefaultValue(1)]
|
[DefaultValue(1)]
|
||||||
[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
|
[SRDescription(typeof(Properties.Resources), "DescriptionCPUThreads")]
|
||||||
@@ -74,6 +88,12 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public enum OpenCLDeviceType : ulong
|
||||||
|
{
|
||||||
|
CPU = DeviceType.CPU,
|
||||||
|
GPU = DeviceType.GPU
|
||||||
|
}
|
||||||
|
|
||||||
[AudioEncoderClass("FLACCL", "flac", true, "0 1 2 3 4 5 6 7 8 9 10 11", "8", 2, typeof(FLACCLWriterSettings))]
|
[AudioEncoderClass("FLACCL", "flac", true, "0 1 2 3 4 5 6 7 8 9 10 11", "8", 2, typeof(FLACCLWriterSettings))]
|
||||||
//[AudioEncoderClass("FLACCL nonsub", "flac", true, "9 10 11", "9", 1, typeof(FLACCLWriterSettings))]
|
//[AudioEncoderClass("FLACCL nonsub", "flac", true, "9 10 11", "9", 1, typeof(FLACCLWriterSettings))]
|
||||||
public class FLACCLWriter : IAudioDest
|
public class FLACCLWriter : IAudioDest
|
||||||
@@ -223,6 +243,11 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
if (value as FLACCLWriterSettings == null)
|
if (value as FLACCLWriterSettings == null)
|
||||||
throw new Exception("Unsupported options " + value);
|
throw new Exception("Unsupported options " + value);
|
||||||
_settings = value as FLACCLWriterSettings;
|
_settings = value as FLACCLWriterSettings;
|
||||||
|
if (_settings.DeviceType == OpenCLDeviceType.CPU)
|
||||||
|
{
|
||||||
|
_settings.GroupSize = 1;
|
||||||
|
_settings.GPUOnly = false;
|
||||||
|
}
|
||||||
eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
|
eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -449,7 +474,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
set
|
set
|
||||||
{
|
{
|
||||||
if (value < 0 || value > eparams.max_fixed_order)
|
if (value < 0 || value > 4)
|
||||||
throw new Exception("invalid MinFixedOrder " + value.ToString());
|
throw new Exception("invalid MinFixedOrder " + value.ToString());
|
||||||
eparams.min_fixed_order = value;
|
eparams.min_fixed_order = value;
|
||||||
}
|
}
|
||||||
@@ -463,7 +488,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
set
|
set
|
||||||
{
|
{
|
||||||
if (value > 4 || value < eparams.min_fixed_order)
|
if (value > 4 || value < 0)
|
||||||
throw new Exception("invalid MaxFixedOrder " + value.ToString());
|
throw new Exception("invalid MaxFixedOrder " + value.ToString());
|
||||||
eparams.max_fixed_order = value;
|
eparams.max_fixed_order = value;
|
||||||
}
|
}
|
||||||
@@ -963,7 +988,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
|
|
||||||
task.nResidualTasks = 0;
|
task.nResidualTasks = 0;
|
||||||
task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window);
|
task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window);
|
||||||
task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order;
|
task.nResidualTasksPerChannel = task.nWindowFunctions * task.nTasksPerWindow + (eparams.do_constant ? 1 : 0) + Math.Max(0, 1 + eparams.max_fixed_order - eparams.min_fixed_order);
|
||||||
//if (task.nResidualTasksPerChannel >= 4)
|
//if (task.nResidualTasksPerChannel >= 4)
|
||||||
// task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7;
|
// task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7;
|
||||||
for (int iFrame = 0; iFrame < nFrames; iFrame++)
|
for (int iFrame = 0; iFrame < nFrames; iFrame++)
|
||||||
@@ -1444,7 +1469,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
if (OpenCL.NumberOfPlatforms < 1)
|
if (OpenCL.NumberOfPlatforms < 1)
|
||||||
throw new Exception("no opencl platforms found");
|
throw new Exception("no opencl platforms found");
|
||||||
|
|
||||||
int groupSize = _settings.GroupSize;
|
int groupSize = _settings.DeviceType == OpenCLDeviceType.CPU ? 1 : _settings.GroupSize;
|
||||||
OCLMan = new OpenCLManager();
|
OCLMan = new OpenCLManager();
|
||||||
// Attempt to save binaries after compilation, as well as load precompiled binaries
|
// Attempt to save binaries after compilation, as well as load precompiled binaries
|
||||||
// to avoid compilation. Usually you'll want this to be true.
|
// to avoid compilation. Usually you'll want this to be true.
|
||||||
@@ -1475,12 +1500,30 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
OCLMan.BuildOptions = "";
|
OCLMan.BuildOptions = "";
|
||||||
OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location);
|
OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location);
|
||||||
OCLMan.BinaryPath = System.IO.Path.Combine(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "CUE Tools"), "OpenCL");
|
OCLMan.BinaryPath = System.IO.Path.Combine(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "CUE Tools"), "OpenCL");
|
||||||
OCLMan.CreateDefaultContext(0, DeviceType.GPU);
|
int platformId = 0;
|
||||||
|
if (_settings.Platform != null)
|
||||||
|
{
|
||||||
|
platformId = -1;
|
||||||
|
string platforms = "";
|
||||||
|
for (int i = 0; i < OpenCL.NumberOfPlatforms; i++)
|
||||||
|
{
|
||||||
|
var platform = OpenCL.GetPlatform(i);
|
||||||
|
platforms += " \"" + platform.Name + "\"";
|
||||||
|
if (platform.Name.Equals(_settings.Platform, StringComparison.InvariantCultureIgnoreCase))
|
||||||
|
{
|
||||||
|
platformId = i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (platformId < 0)
|
||||||
|
throw new Exception("unknown platform \"" + _settings.Platform + "\". Platforms available:" + platforms);
|
||||||
|
}
|
||||||
|
OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
|
||||||
|
|
||||||
openCLContext = OCLMan.Context;
|
openCLContext = OCLMan.Context;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
openCLProgram = OCLMan.CompileFile("flac.cl");
|
openCLProgram = OCLMan.CompileFile(_settings.DeviceType == OpenCLDeviceType.CPU ? "flaccpu.cl" : "flac.cl");
|
||||||
}
|
}
|
||||||
catch (OpenCLBuildException ex)
|
catch (OpenCLBuildException ex)
|
||||||
{
|
{
|
||||||
@@ -2218,13 +2261,12 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
this.channelsCount = channelsCount;
|
this.channelsCount = channelsCount;
|
||||||
this.writer = writer;
|
this.writer = writer;
|
||||||
openCLProgram = _openCLProgram;
|
openCLProgram = _openCLProgram;
|
||||||
Device[] openCLDevices = openCLProgram.Context.Platform.QueryDevices(DeviceType.GPU);
|
|
||||||
#if DEBUG
|
#if DEBUG
|
||||||
var prop = CommandQueueProperties.PROFILING_ENABLE;
|
var prop = CommandQueueProperties.PROFILING_ENABLE;
|
||||||
#else
|
#else
|
||||||
var prop = CommandQueueProperties.NONE;
|
var prop = CommandQueueProperties.NONE;
|
||||||
#endif
|
#endif
|
||||||
openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLDevices[0], prop);
|
openCLCQ = openCLProgram.Context.CreateCommandQueue(openCLProgram.Context.Devices[0], prop);
|
||||||
|
|
||||||
residualTasksLen = sizeof(FLACCLSubframeTask) * channelsCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames;
|
residualTasksLen = sizeof(FLACCLSubframeTask) * channelsCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FLACCLWriter.maxFrames;
|
||||||
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * FLACCLWriter.maxFrames;
|
bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * FLACCLWriter.maxFrames;
|
||||||
@@ -2260,12 +2302,15 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
clChooseBestMethod = openCLProgram.CreateKernel("clChooseBestMethod");
|
clChooseBestMethod = openCLProgram.CreateKernel("clChooseBestMethod");
|
||||||
clCopyBestMethod = openCLProgram.CreateKernel("clCopyBestMethod");
|
clCopyBestMethod = openCLProgram.CreateKernel("clCopyBestMethod");
|
||||||
clCopyBestMethodStereo = openCLProgram.CreateKernel("clCopyBestMethodStereo");
|
clCopyBestMethodStereo = openCLProgram.CreateKernel("clCopyBestMethodStereo");
|
||||||
clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
|
if (writer._settings.GPUOnly)
|
||||||
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
|
{
|
||||||
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
|
clEncodeResidual = openCLProgram.CreateKernel("clEncodeResidual");
|
||||||
clSumPartition = openCLProgram.CreateKernel("clSumPartition");
|
clCalcPartition = openCLProgram.CreateKernel("clCalcPartition");
|
||||||
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
|
clCalcPartition16 = openCLProgram.CreateKernel("clCalcPartition16");
|
||||||
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
|
clSumPartition = openCLProgram.CreateKernel("clSumPartition");
|
||||||
|
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
|
||||||
|
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
|
||||||
|
}
|
||||||
|
|
||||||
samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelsCount];
|
samplesBuffer = new int[FLACCLWriter.MAX_BLOCKSIZE * channelsCount];
|
||||||
outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1];
|
outputBuffer = new byte[max_frame_size * FLACCLWriter.maxFrames + 1];
|
||||||
@@ -2304,12 +2349,15 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
clChooseBestMethod.Dispose();
|
clChooseBestMethod.Dispose();
|
||||||
clCopyBestMethod.Dispose();
|
clCopyBestMethod.Dispose();
|
||||||
clCopyBestMethodStereo.Dispose();
|
clCopyBestMethodStereo.Dispose();
|
||||||
clEncodeResidual.Dispose();
|
if (writer._settings.GPUOnly)
|
||||||
clCalcPartition.Dispose();
|
{
|
||||||
clCalcPartition16.Dispose();
|
clEncodeResidual.Dispose();
|
||||||
clSumPartition.Dispose();
|
clCalcPartition.Dispose();
|
||||||
clFindRiceParameter.Dispose();
|
clCalcPartition16.Dispose();
|
||||||
clFindPartitionOrder.Dispose();
|
clSumPartition.Dispose();
|
||||||
|
clFindRiceParameter.Dispose();
|
||||||
|
clFindPartitionOrder.Dispose();
|
||||||
|
}
|
||||||
|
|
||||||
clSamples.Dispose();
|
clSamples.Dispose();
|
||||||
clSamplesBytes.Dispose();
|
clSamplesBytes.Dispose();
|
||||||
@@ -2390,15 +2438,13 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
nWindowFunctions);
|
nWindowFunctions);
|
||||||
|
|
||||||
clComputeLPC.SetArgs(
|
clComputeLPC.SetArgs(
|
||||||
clResidualTasks,
|
|
||||||
clAutocorOutput,
|
clAutocorOutput,
|
||||||
clLPCData,
|
clLPCData,
|
||||||
nResidualTasksPerChannel,
|
|
||||||
nWindowFunctions);
|
nWindowFunctions);
|
||||||
|
|
||||||
openCLCQ.EnqueueNDRangeKernel(
|
openCLCQ.EnqueueNDRangeKernel(
|
||||||
clComputeLPC,
|
clComputeLPC,
|
||||||
32, 1,
|
Math.Min(groupSize, 32), 1,
|
||||||
nWindowFunctions,
|
nWindowFunctions,
|
||||||
channelsCount * frameCount);
|
channelsCount * frameCount);
|
||||||
|
|
||||||
@@ -2412,7 +2458,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
|
|
||||||
openCLCQ.EnqueueNDRangeKernel(
|
openCLCQ.EnqueueNDRangeKernel(
|
||||||
clQuantizeLPC,
|
clQuantizeLPC,
|
||||||
32, 1,
|
Math.Min(groupSize, 32), 1,
|
||||||
nWindowFunctions,
|
nWindowFunctions,
|
||||||
channelsCount * frameCount);
|
channelsCount * frameCount);
|
||||||
|
|
||||||
@@ -2433,7 +2479,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
|
|
||||||
openCLCQ.EnqueueNDRangeKernel(
|
openCLCQ.EnqueueNDRangeKernel(
|
||||||
clChooseBestMethod,
|
clChooseBestMethod,
|
||||||
32, channelsCount * frameCount);
|
Math.Min(groupSize, 32), channelsCount * frameCount);
|
||||||
|
|
||||||
if (channels == 2 && channelsCount == 4)
|
if (channels == 2 && channelsCount == 4)
|
||||||
{
|
{
|
||||||
@@ -2444,7 +2490,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
|
|
||||||
openCLCQ.EnqueueNDRangeKernel(
|
openCLCQ.EnqueueNDRangeKernel(
|
||||||
clCopyBestMethodStereo,
|
clCopyBestMethodStereo,
|
||||||
64, frameCount);
|
Math.Min(groupSize, 64), frameCount);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -2455,7 +2501,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
|
|
||||||
openCLCQ.EnqueueNDRangeKernel(
|
openCLCQ.EnqueueNDRangeKernel(
|
||||||
clCopyBestMethod,
|
clCopyBestMethod,
|
||||||
64, channels * frameCount);
|
Math.Min(groupSize, 64), channels * frameCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (writer._settings.GPUOnly)
|
if (writer._settings.GPUOnly)
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
/**
|
/**
|
||||||
* CUETools.FLACCL: FLAC audio encoder using OpenCL
|
* CUETools.FLACCL: FLAC audio encoder using OpenCL
|
||||||
* Copyright (c) 2009 Gregory S. Chudov
|
* Copyright (c) 2010 Gregory S. Chudov
|
||||||
*
|
*
|
||||||
* This library is free software; you can redistribute it and/or
|
* This library is free software; you can redistribute it and/or
|
||||||
* modify it under the terms of the GNU Lesser General Public
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
@@ -24,8 +24,6 @@
|
|||||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
|
||||||
|
|
||||||
//#pragma OPENCL EXTENSION cl_amd_fp64 : enable
|
//#pragma OPENCL EXTENSION cl_amd_fp64 : enable
|
||||||
|
|
||||||
typedef enum
|
typedef enum
|
||||||
@@ -125,7 +123,7 @@ void clFindWastedBits(
|
|||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
int w = 0, a = 0;
|
int w = 0, a = 0;
|
||||||
for (int pos = tid; pos + tid < task.blocksize; pos += GROUP_SIZE)
|
for (int pos = tid; pos < task.blocksize; pos += GROUP_SIZE)
|
||||||
{
|
{
|
||||||
int smp = samples[task.samplesOffs + pos];
|
int smp = samples[task.samplesOffs + pos];
|
||||||
w |= smp;
|
w |= smp;
|
||||||
@@ -213,38 +211,25 @@ void clComputeAutocor(
|
|||||||
|
|
||||||
__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
|
||||||
void clComputeLPC(
|
void clComputeLPC(
|
||||||
__global FLACCLSubframeTask *tasks,
|
|
||||||
__global float *autoc,
|
__global float *autoc,
|
||||||
__global float *lpcs,
|
__global float *lpcs,
|
||||||
int taskCount, // tasks per block
|
|
||||||
int windowCount
|
int windowCount
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
__local struct {
|
__local struct {
|
||||||
FLACCLSubframeData task;
|
|
||||||
volatile float ldr[32];
|
volatile float ldr[32];
|
||||||
volatile float gen1[32];
|
volatile float gen1[32];
|
||||||
volatile float error[32];
|
volatile float error[32];
|
||||||
volatile float autoc[33];
|
volatile float autoc[33];
|
||||||
volatile int lpcOffs;
|
|
||||||
volatile int autocOffs;
|
|
||||||
} shared;
|
} shared;
|
||||||
const int tid = get_local_id(0);// + get_local_id(1) * 32;
|
const int tid = get_local_id(0);// + get_local_id(1) * 32;
|
||||||
|
int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
|
||||||
// fetch task data
|
int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
|
||||||
if (tid < sizeof(shared.task) / sizeof(int))
|
|
||||||
((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1)))[tid];
|
|
||||||
if (tid == 0)
|
|
||||||
{
|
|
||||||
shared.lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
|
|
||||||
shared.autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
if (get_local_id(0) <= MAX_ORDER)
|
if (get_local_id(0) <= MAX_ORDER)
|
||||||
shared.autoc[get_local_id(0)] = autoc[shared.autocOffs + get_local_id(0)];
|
shared.autoc[get_local_id(0)] = autoc[autocOffs + get_local_id(0)];
|
||||||
if (get_local_id(0) + get_local_size(0) <= MAX_ORDER)
|
if (get_local_id(0) + get_local_size(0) <= MAX_ORDER)
|
||||||
shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[shared.autocOffs + get_local_id(0) + get_local_size(0)];
|
shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[autocOffs + get_local_id(0) + get_local_size(0)];
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
@@ -296,14 +281,14 @@ void clComputeLPC(
|
|||||||
|
|
||||||
// Output coeffs
|
// Output coeffs
|
||||||
if (get_local_id(0) <= order)
|
if (get_local_id(0) <= order)
|
||||||
lpcs[shared.lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)];
|
lpcs[lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)];
|
||||||
//if (get_local_id(0) <= order + 1 && fabs(-shared.ldr[0]) > 3000)
|
//if (get_local_id(0) <= order + 1 && fabs(-shared.ldr[0]) > 3000)
|
||||||
// printf("coef[%d] == %f, autoc == %f, error == %f\n", get_local_id(0), -shared.ldr[order - get_local_id(0)], shared.autoc[get_local_id(0)], shared.error[get_local_id(0)]);
|
// printf("coef[%d] == %f, autoc == %f, error == %f\n", get_local_id(0), -shared.ldr[order - get_local_id(0)], shared.autoc[get_local_id(0)], shared.error[get_local_id(0)]);
|
||||||
}
|
}
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
// Output prediction error estimates
|
// Output prediction error estimates
|
||||||
if (get_local_id(0) < MAX_ORDER)
|
if (get_local_id(0) < MAX_ORDER)
|
||||||
lpcs[shared.lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
|
lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
|
||||||
@@ -318,9 +303,10 @@ void clQuantizeLPC(
|
|||||||
{
|
{
|
||||||
__local struct {
|
__local struct {
|
||||||
FLACCLSubframeData task;
|
FLACCLSubframeData task;
|
||||||
volatile int tmpi[32];
|
|
||||||
volatile int index[64];
|
volatile int index[64];
|
||||||
volatile float error[64];
|
volatile float error[64];
|
||||||
|
volatile int maxcoef[32];
|
||||||
|
volatile int maxcoef2[32];
|
||||||
volatile int lpcOffs;
|
volatile int lpcOffs;
|
||||||
} shared;
|
} shared;
|
||||||
|
|
||||||
@@ -338,6 +324,8 @@ void clQuantizeLPC(
|
|||||||
shared.error[tid] = shared.task.blocksize * 64 + tid;
|
shared.error[tid] = shared.task.blocksize * 64 + tid;
|
||||||
shared.index[32 + tid] = MAX_ORDER - 1;
|
shared.index[32 + tid] = MAX_ORDER - 1;
|
||||||
shared.error[32 + tid] = shared.task.blocksize * 64 + tid + 32;
|
shared.error[32 + tid] = shared.task.blocksize * 64 + tid + 32;
|
||||||
|
shared.maxcoef[tid] = 0;
|
||||||
|
shared.maxcoef2[tid] = 0;
|
||||||
|
|
||||||
// Load prediction error estimates
|
// Load prediction error estimates
|
||||||
if (tid < MAX_ORDER)
|
if (tid < MAX_ORDER)
|
||||||
@@ -399,21 +387,14 @@ void clQuantizeLPC(
|
|||||||
// get 15 bits of each coeff
|
// get 15 bits of each coeff
|
||||||
int coef = convert_int_rte(lpc * (1 << 15));
|
int coef = convert_int_rte(lpc * (1 << 15));
|
||||||
// remove sign bits
|
// remove sign bits
|
||||||
shared.tmpi[tid] = coef ^ (coef >> 31);
|
atomic_or(shared.maxcoef + i, coef ^ (coef >> 31));
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
// OR reduction
|
|
||||||
for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
|
|
||||||
{
|
|
||||||
if (tid < l)
|
|
||||||
shared.tmpi[tid] |= shared.tmpi[tid + l];
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
//SUM32(shared.tmpi,tid,|=);
|
//SUM32(shared.tmpi,tid,|=);
|
||||||
// choose precision
|
// choose precision
|
||||||
//int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
|
//int cbits = max(3, min(10, 5 + (shared.task.abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
|
||||||
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits));
|
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits));
|
||||||
// calculate shift based on precision and number of leading zeroes in coeffs
|
// calculate shift based on precision and number of leading zeroes in coeffs
|
||||||
int shift = max(0,min(15, clz(shared.tmpi[0] | shared.tmpi[1]) - 18 + cbits));
|
int shift = max(0,min(15, clz(shared.maxcoef[i]) - 18 + cbits));
|
||||||
|
|
||||||
//cbits = 13;
|
//cbits = 13;
|
||||||
//shift = 15;
|
//shift = 15;
|
||||||
@@ -426,18 +407,10 @@ void clQuantizeLPC(
|
|||||||
//shared.tmp[tid] = (tid != 0) * (shared.arp[tid - 1]*(1 << shared.task.shift) - shared.task.coefs[tid - 1]);
|
//shared.tmp[tid] = (tid != 0) * (shared.arp[tid - 1]*(1 << shared.task.shift) - shared.task.coefs[tid - 1]);
|
||||||
//shared.task.coefs[tid] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, convert_int_rte((shared.arp[tid]) * (1 << shared.task.shift) + shared.tmp[tid])));
|
//shared.task.coefs[tid] = max(-(1 << (shared.task.cbits - 1)), min((1 << (shared.task.cbits - 1))-1, convert_int_rte((shared.arp[tid]) * (1 << shared.task.shift) + shared.tmp[tid])));
|
||||||
// remove sign bits
|
// remove sign bits
|
||||||
shared.tmpi[tid] = coef ^ (coef >> 31);
|
atomic_or(shared.maxcoef2 + i, coef ^ (coef >> 31));
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
// OR reduction
|
|
||||||
for (int l = get_local_size(0) / 2; l > 1; l >>= 1)
|
|
||||||
{
|
|
||||||
if (tid < l)
|
|
||||||
shared.tmpi[tid] |= shared.tmpi[tid + l];
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
//SUM32(shared.tmpi,tid,|=);
|
|
||||||
// calculate actual number of bits (+1 for sign)
|
// calculate actual number of bits (+1 for sign)
|
||||||
cbits = 1 + 32 - clz(shared.tmpi[0] | shared.tmpi[1]);
|
cbits = 1 + 32 - clz(shared.maxcoef2[i]);
|
||||||
|
|
||||||
// output shift, cbits and output coeffs
|
// output shift, cbits and output coeffs
|
||||||
int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
|
int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
|
||||||
@@ -452,10 +425,6 @@ void clQuantizeLPC(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef PARTORDER
|
|
||||||
#define PARTORDER 4
|
|
||||||
#endif
|
|
||||||
|
|
||||||
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||||
void clEstimateResidual(
|
void clEstimateResidual(
|
||||||
__global int*output,
|
__global int*output,
|
||||||
@@ -463,10 +432,10 @@ void clEstimateResidual(
|
|||||||
__global FLACCLSubframeTask *tasks
|
__global FLACCLSubframeTask *tasks
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
__local int data[GROUP_SIZE * 2];
|
__local float data[GROUP_SIZE * 2];
|
||||||
__local FLACCLSubframeTask task;
|
__local FLACCLSubframeTask task;
|
||||||
__local int residual[GROUP_SIZE];
|
__local int psum[64];
|
||||||
__local int len[GROUP_SIZE >> PARTORDER];
|
__local float fcoef[32];
|
||||||
|
|
||||||
const int tid = get_local_id(0);
|
const int tid = get_local_id(0);
|
||||||
if (tid < sizeof(task)/sizeof(int))
|
if (tid < sizeof(task)/sizeof(int))
|
||||||
@@ -476,101 +445,74 @@ void clEstimateResidual(
|
|||||||
int ro = task.data.residualOrder;
|
int ro = task.data.residualOrder;
|
||||||
int bs = task.data.blocksize;
|
int bs = task.data.blocksize;
|
||||||
|
|
||||||
if (tid < 32 && tid >= ro)
|
if (tid < 32)
|
||||||
task.coefs[tid] = 0;
|
fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
|
||||||
if (tid < (GROUP_SIZE >> PARTORDER))
|
//fcoef[tid] = select(0.0f, - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift), tid + ro >= MAX_ORDER && tid < MAX_ORDER);
|
||||||
len[tid] = 0;
|
if (tid < 64)
|
||||||
data[tid] = 0;
|
psum[tid] = 0;
|
||||||
|
data[tid] = 0.0f;
|
||||||
|
|
||||||
|
int partOrder = clz(64) - clz(bs - 1) + 1;
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
__local int4 * cptr = (__local int4 *)&task.coefs[0];
|
float4 cptr0 = vload4(0, &fcoef[0]);
|
||||||
int4 cptr0 = cptr[0];
|
float4 cptr1 = vload4(1, &fcoef[0]);
|
||||||
#if MAX_ORDER > 4
|
|
||||||
int4 cptr1 = cptr[1];
|
|
||||||
#if MAX_ORDER > 8
|
#if MAX_ORDER > 8
|
||||||
int4 cptr2 = cptr[2];
|
float4 cptr2 = vload4(2, &fcoef[0]);
|
||||||
#endif
|
#endif
|
||||||
#endif
|
|
||||||
|
|
||||||
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
||||||
{
|
{
|
||||||
// fetch samples
|
// fetch samples
|
||||||
int offs = pos + tid;
|
int offs = pos + tid;
|
||||||
int nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0;
|
float nextData = offs < bs ? samples[task.data.samplesOffs + offs] >> task.data.wbits : 0.0f;
|
||||||
data[tid + GROUP_SIZE] = nextData;
|
data[tid + GROUP_SIZE] = nextData;
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
// compute residual
|
// compute residual
|
||||||
__local int4 * dptr = (__local int4 *)&data[tid + GROUP_SIZE - ro];
|
__local float* dptr = &data[tid + GROUP_SIZE - ro];
|
||||||
int4 sum = dptr[0] * cptr0
|
float4 sum = cptr0 * vload4(0, dptr)
|
||||||
#if MAX_ORDER > 4
|
+ cptr1 * vload4(1, dptr)
|
||||||
+ dptr[1] * cptr1
|
|
||||||
#if MAX_ORDER > 8
|
#if MAX_ORDER > 8
|
||||||
+ dptr[2] * cptr2
|
+ cptr2 * vload4(2, dptr)
|
||||||
#if MAX_ORDER > 12
|
#if MAX_ORDER > 12
|
||||||
+ dptr[3] * cptr[3]
|
+ vload4(3, &fcoef[0]) * vload4(3, dptr)
|
||||||
#if MAX_ORDER > 16
|
#if MAX_ORDER > 16
|
||||||
+ dptr[4] * cptr[4]
|
+ vload4(4, &fcoef[0]) * vload4(4, dptr)
|
||||||
+ dptr[5] * cptr[5]
|
+ vload4(5, &fcoef[0]) * vload4(5, dptr)
|
||||||
+ dptr[6] * cptr[6]
|
+ vload4(6, &fcoef[0]) * vload4(6, dptr)
|
||||||
+ dptr[7] * cptr[7]
|
+ vload4(7, &fcoef[0]) * vload4(7, dptr)
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
;
|
;
|
||||||
|
|
||||||
int t = nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift);
|
int t = convert_int_rte(nextData + sum.x + sum.y + sum.z + sum.w);
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
data[tid] = nextData;
|
||||||
// ensure we're within frame bounds
|
// ensure we're within frame bounds
|
||||||
t = select(0, t, offs >= ro && offs < bs);
|
t = select(0, t, offs >= ro && offs < bs);
|
||||||
// overflow protection
|
// overflow protection
|
||||||
t = clamp(t, -0x7fffff, 0x7fffff);
|
t = clamp(t, -0x7fffff, 0x7fffff);
|
||||||
// convert to unsigned
|
// convert to unsigned
|
||||||
residual[tid] = (t << 1) ^ (t >> 31);
|
atomic_add(&psum[offs >> partOrder], (t << 1) ^ (t >> 31));
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
data[tid] = nextData;
|
|
||||||
|
|
||||||
// calculate rice partition bit length for every 16 samples
|
|
||||||
if (tid < (GROUP_SIZE >> PARTORDER))
|
|
||||||
{
|
|
||||||
//__local int4 * chunk = (__local int4 *)&residual[tid << PARTORDER];
|
|
||||||
__local int4 * chunk = ((__local int4 *)residual) + (tid << (PARTORDER - 2));
|
|
||||||
#if PARTORDER == 3
|
|
||||||
int4 sum = chunk[0] + chunk[1];
|
|
||||||
#elif PARTORDER == 4
|
|
||||||
int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3]; // [0 .. (1 << (PARTORDER - 2)) - 1]
|
|
||||||
#elif PARTORDER == 5
|
|
||||||
int4 sum = chunk[0] + chunk[1] + chunk[2] + chunk[3] + chunk[4] + chunk[5] + chunk[6] + chunk[7];
|
|
||||||
#else
|
|
||||||
#error Invalid PARTORDER
|
|
||||||
#endif
|
|
||||||
int res = sum.x + sum.y + sum.z + sum.w;
|
|
||||||
int k = clamp(clz(1 << PARTORDER) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
|
|
||||||
#ifdef EXTRAMODE
|
|
||||||
#if PARTORDER == 3
|
|
||||||
sum = (chunk[0] >> k) + (chunk[1] >> k);
|
|
||||||
#elif PARTORDER == 4
|
|
||||||
sum = (chunk[0] >> k) + (chunk[1] >> k) + (chunk[2] >> k) + (chunk[3] >> k);
|
|
||||||
#else
|
|
||||||
#error Invalid PARTORDER
|
|
||||||
#endif
|
|
||||||
len[tid] += (k << PARTORDER) + sum.x + sum.y + sum.z + sum.w;
|
|
||||||
#else
|
|
||||||
len[tid] += (k << PARTORDER) + (res >> k);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// calculate rice partition bit length for every (1 << partOrder) samples
|
||||||
|
if (tid < 64)
|
||||||
|
{
|
||||||
|
int k = clamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
|
||||||
|
psum[tid] = (k << partOrder) + (psum[tid] >> k);
|
||||||
|
}
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
for (int l = GROUP_SIZE >> (PARTORDER + 1); l > 0; l >>= 1)
|
for (int l = 32; l > 0; l >>= 1)
|
||||||
{
|
{
|
||||||
if (tid < l)
|
if (tid < l)
|
||||||
len[tid] += len[tid + l];
|
psum[tid] += psum[tid + l];
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
if (tid == 0)
|
if (tid == 0)
|
||||||
output[get_group_id(0)] = len[0] + (bs - ro);
|
output[get_group_id(0)] = psum[0] + (bs - ro);
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
|
||||||
@@ -580,15 +522,16 @@ void clChooseBestMethod(
|
|||||||
int taskCount
|
int taskCount
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
__local struct {
|
int best_length = 0x7fffffff;
|
||||||
volatile int index[32];
|
int best_index = 0;
|
||||||
volatile int length[32];
|
__local int partLen[32];
|
||||||
} shared;
|
|
||||||
__local FLACCLSubframeData task;
|
__local FLACCLSubframeData task;
|
||||||
const int tid = get_local_id(0);
|
const int tid = get_local_id(0);
|
||||||
|
|
||||||
shared.length[tid] = 0x7fffffff;
|
// fetch part sum
|
||||||
shared.index[tid] = tid;
|
if (tid < taskCount)
|
||||||
|
partLen[tid] = residual[tid + taskCount * get_group_id(0)];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
for (int taskNo = 0; taskNo < taskCount; taskNo++)
|
for (int taskNo = 0; taskNo < taskCount; taskNo++)
|
||||||
{
|
{
|
||||||
// fetch task data
|
// fetch task data
|
||||||
@@ -599,46 +542,27 @@ void clChooseBestMethod(
|
|||||||
|
|
||||||
if (tid == 0)
|
if (tid == 0)
|
||||||
{
|
{
|
||||||
// fetch part sum
|
int pl = partLen[taskNo];
|
||||||
int partLen = residual[taskNo + taskCount * get_group_id(0)];
|
|
||||||
//// calculate part size
|
|
||||||
//int residualLen = task[get_local_id(1)].data.blocksize - task[get_local_id(1)].data.residualOrder;
|
|
||||||
//residualLen = residualLen * (task[get_local_id(1)].data.type != Constant || psum != 0);
|
|
||||||
//// calculate rice parameter
|
|
||||||
//int k = max(0, min(14, convert_int_rtz(log2((psum + 0.000001f) / (residualLen + 0.000001f) + 0.5f))));
|
|
||||||
//// calculate part bit length
|
|
||||||
//int partLen = residualLen * (k + 1) + (psum >> k);
|
|
||||||
|
|
||||||
int obits = task.obits - task.wbits;
|
int obits = task.obits - task.wbits;
|
||||||
shared.length[taskNo] =
|
int len = min(obits * task.blocksize,
|
||||||
min(obits * task.blocksize,
|
task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + pl :
|
||||||
task.type == Fixed ? task.residualOrder * obits + 6 + (4 * 1/2) + partLen :
|
task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + pl :
|
||||||
task.type == LPC ? task.residualOrder * obits + 4 + 5 + task.residualOrder * task.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
|
task.type == Constant ? obits * select(1, task.blocksize, pl != task.blocksize - task.residualOrder) :
|
||||||
task.type == Constant ? obits * select(1, task.blocksize, partLen != task.blocksize - task.residualOrder) :
|
|
||||||
obits * task.blocksize);
|
obits * task.blocksize);
|
||||||
|
|
||||||
|
tasks[taskNo + taskCount * get_group_id(0)].data.size = len;
|
||||||
|
if (len < best_length)
|
||||||
|
{
|
||||||
|
best_length = len;
|
||||||
|
best_index = taskNo;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
//shared.index[get_local_id(0)] = get_local_id(0);
|
|
||||||
//shared.length[get_local_id(0)] = (get_local_id(0) < taskCount) ? tasks[get_local_id(0) + taskCount * get_group_id(0)].size : 0x7fffffff;
|
|
||||||
|
|
||||||
if (tid < taskCount)
|
|
||||||
tasks[tid + taskCount * get_group_id(0)].data.size = shared.length[tid];
|
|
||||||
|
|
||||||
int l1 = shared.length[tid];
|
|
||||||
for (int l = 16; l > 0; l >>= 1)
|
|
||||||
{
|
|
||||||
if (tid < l)
|
|
||||||
{
|
|
||||||
int l2 = shared.length[tid + l];
|
|
||||||
shared.index[tid] = shared.index[tid + select(0, l, l2 < l1)];
|
|
||||||
shared.length[tid] = l1 = min(l1, l2);
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
if (tid == 0)
|
if (tid == 0)
|
||||||
tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + shared.index[0];
|
tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + best_index;
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
|
||||||
@@ -788,13 +712,13 @@ void clCalcPartition(
|
|||||||
int psize // == task.blocksize >> max_porder?
|
int psize // == task.blocksize >> max_porder?
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
__local int pl[(GROUP_SIZE / 16)][15];
|
__local int pl[(GROUP_SIZE / 8)][15];
|
||||||
__local FLACCLSubframeData task;
|
__local FLACCLSubframeData task;
|
||||||
|
|
||||||
const int tid = get_local_id(0);
|
const int tid = get_local_id(0);
|
||||||
if (tid < sizeof(task) / sizeof(int))
|
if (tid < sizeof(task) / sizeof(int))
|
||||||
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
|
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
|
||||||
if (tid < (GROUP_SIZE / 16))
|
if (tid < (GROUP_SIZE / 8))
|
||||||
{
|
{
|
||||||
for (int k = 0; k <= 14; k++)
|
for (int k = 0; k <= 14; k++)
|
||||||
pl[tid][k] = 0;
|
pl[tid][k] = 0;
|
||||||
@@ -807,13 +731,14 @@ void clCalcPartition(
|
|||||||
{
|
{
|
||||||
// fetch residual
|
// fetch residual
|
||||||
int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0;
|
int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0;
|
||||||
// convert to unsigned
|
// overflow protection
|
||||||
s = clamp(s, -0x7fffff, 0x7fffff);
|
s = clamp(s, -0x7fffff, 0x7fffff);
|
||||||
|
// convert to unsigned
|
||||||
s = (s << 1) ^ (s >> 31);
|
s = (s << 1) ^ (s >> 31);
|
||||||
// calc number of unary bits for each residual sample with each rice paramater
|
// calc number of unary bits for each residual sample with each rice paramater
|
||||||
int part = (offs - start) / psize;
|
int part = (offs - start) / psize + (tid & 1) * (GROUP_SIZE / 16);
|
||||||
for (int k = 0; k <= 14; k++)
|
for (int k = 0; k <= 14; k++)
|
||||||
atom_add(&pl[part][k], s >> k);
|
atomic_add(&pl[part][k], s >> k);
|
||||||
//pl[part][k] += s >> k;
|
//pl[part][k] += s >> k;
|
||||||
}
|
}
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
@@ -825,7 +750,8 @@ void clCalcPartition(
|
|||||||
{
|
{
|
||||||
// output length
|
// output length
|
||||||
const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
|
const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
|
||||||
partition_lengths[pos + part] = min(0x7fffff, pl[tid][k]) + select(psize, psize - task.residualOrder, part == 0) * (k + 1);
|
int plen = pl[tid][k] + pl[tid + (GROUP_SIZE / 16)][k];
|
||||||
|
partition_lengths[pos + part] = min(0x7fffff, plen) + (psize - select(0, task.residualOrder, part == 0)) * (k + 1);
|
||||||
// if (get_group_id(1) == 0)
|
// if (get_group_id(1) == 0)
|
||||||
//printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1));
|
//printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1));
|
||||||
}
|
}
|
||||||
@@ -911,7 +837,7 @@ void clCalcPartition16(
|
|||||||
// convert to unsigned
|
// convert to unsigned
|
||||||
res[tid] = (s << 1) ^ (s >> 31);
|
res[tid] = (s << 1) ^ (s >> 31);
|
||||||
|
|
||||||
// for (int k = 0; k < 15; k++) atom_add(&pl[x][k], s >> k);
|
// for (int k = 0; k < 15; k++) atomic_add(&pl[x][k], s >> k);
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
data[tid] = nextData;
|
data[tid] = nextData;
|
||||||
@@ -1001,25 +927,23 @@ void clFindPartitionOrder(
|
|||||||
int max_porder
|
int max_porder
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
__local int partlen[9];
|
__local int partlen[16];
|
||||||
__local FLACCLSubframeData task;
|
__local FLACCLSubframeData task;
|
||||||
|
|
||||||
const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder);
|
const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder);
|
||||||
if (get_local_id(0) < sizeof(task) / sizeof(int))
|
if (get_local_id(0) < sizeof(task) / sizeof(int))
|
||||||
((__local int*)&task)[get_local_id(0)] = ((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)];
|
((__local int*)&task)[get_local_id(0)] = ((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)];
|
||||||
if (get_local_id(0) < 9)
|
if (get_local_id(0) < 16)
|
||||||
partlen[get_local_id(0)] = 0;
|
partlen[get_local_id(0)] = 0;
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
// fetch partition lengths
|
// fetch partition lengths
|
||||||
for (int offs = 0; offs < (2 << max_porder); offs += GROUP_SIZE)
|
int lim = (2 << max_porder) - 1;
|
||||||
|
for (int offs = get_local_id(0); offs < lim; offs += GROUP_SIZE)
|
||||||
{
|
{
|
||||||
if (offs + get_local_id(0) < (2 << max_porder) - 1)
|
int len = rice_parameters[pos + offs];
|
||||||
{
|
int porder = 31 - clz(lim - offs);
|
||||||
int len = rice_parameters[pos + offs + get_local_id(0)];
|
atomic_add(&partlen[porder], len);
|
||||||
int porder = 31 - clz((2 << max_porder) - 1 - offs - get_local_id(0));
|
|
||||||
atom_add(&partlen[porder], len);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
|||||||
492
CUETools.Codecs.FLACCL/flaccpu.cl
Normal file
492
CUETools.Codecs.FLACCL/flaccpu.cl
Normal file
@@ -0,0 +1,492 @@
|
|||||||
|
/**
|
||||||
|
* CUETools.FLACCL: FLAC audio encoder using OpenCL
|
||||||
|
* Copyright (c) 2010 Gregory S. Chudov
|
||||||
|
*
|
||||||
|
* This library is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This library is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with this library; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _FLACCL_KERNEL_H_
|
||||||
|
#define _FLACCL_KERNEL_H_
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
||||||
|
|
||||||
|
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
|
||||||
|
|
||||||
|
typedef enum
|
||||||
|
{
|
||||||
|
Constant = 0,
|
||||||
|
Verbatim = 1,
|
||||||
|
Fixed = 8,
|
||||||
|
LPC = 32
|
||||||
|
} SubframeType;
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
int residualOrder; // <= 32
|
||||||
|
int samplesOffs;
|
||||||
|
int shift;
|
||||||
|
int cbits;
|
||||||
|
int size;
|
||||||
|
int type;
|
||||||
|
int obits;
|
||||||
|
int blocksize;
|
||||||
|
int best_index;
|
||||||
|
int channel;
|
||||||
|
int residualOffs;
|
||||||
|
int wbits;
|
||||||
|
int abits;
|
||||||
|
int porder;
|
||||||
|
int reserved[2];
|
||||||
|
} FLACCLSubframeData;
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
FLACCLSubframeData data;
|
||||||
|
int coefs[32]; // fixme: should be short?
|
||||||
|
} FLACCLSubframeTask;
|
||||||
|
|
||||||
|
__kernel void clStereoDecorr(
|
||||||
|
__global int *samples,
|
||||||
|
__global short2 *src,
|
||||||
|
int offset
|
||||||
|
)
|
||||||
|
{
|
||||||
|
int pos = get_global_id(0);
|
||||||
|
if (pos < offset)
|
||||||
|
{
|
||||||
|
short2 s = src[pos];
|
||||||
|
samples[pos] = s.x;
|
||||||
|
samples[1 * offset + pos] = s.y;
|
||||||
|
samples[2 * offset + pos] = (s.x + s.y) >> 1;
|
||||||
|
samples[3 * offset + pos] = s.x - s.y;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel void clChannelDecorr2(
|
||||||
|
__global int *samples,
|
||||||
|
__global short2 *src,
|
||||||
|
int offset
|
||||||
|
)
|
||||||
|
{
|
||||||
|
int pos = get_global_id(0);
|
||||||
|
if (pos < offset)
|
||||||
|
{
|
||||||
|
short2 s = src[pos];
|
||||||
|
samples[pos] = s.x;
|
||||||
|
samples[1 * offset + pos] = s.y;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//__kernel void clChannelDecorr(
|
||||||
|
// int *samples,
|
||||||
|
// short *src,
|
||||||
|
// int offset
|
||||||
|
//)
|
||||||
|
//{
|
||||||
|
// int pos = get_global_id(0);
|
||||||
|
// if (pos < offset)
|
||||||
|
// samples[get_group_id(1) * offset + pos] = src[pos * get_num_groups(1) + get_group_id(1)];
|
||||||
|
//}
|
||||||
|
|
||||||
|
#define __ffs(a) (32 - clz(a & (-a)))
|
||||||
|
//#define __ffs(a) (33 - clz(~a & (a - 1)))
|
||||||
|
|
||||||
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
|
void clFindWastedBits(
|
||||||
|
__global FLACCLSubframeTask *tasks,
|
||||||
|
__global int *samples,
|
||||||
|
int tasksPerChannel
|
||||||
|
)
|
||||||
|
{
|
||||||
|
__global FLACCLSubframeTask* ptask = &tasks[get_group_id(0) * tasksPerChannel];
|
||||||
|
int w = 0, a = 0;
|
||||||
|
for (int pos = 0; pos < ptask->data.blocksize; pos ++)
|
||||||
|
{
|
||||||
|
int smp = samples[ptask->data.samplesOffs + pos];
|
||||||
|
w |= smp;
|
||||||
|
a |= smp ^ (smp >> 31);
|
||||||
|
}
|
||||||
|
w = max(0,__ffs(w) - 1);
|
||||||
|
a = 32 - clz(a) - w;
|
||||||
|
for (int i = 0; i < tasksPerChannel; i++)
|
||||||
|
{
|
||||||
|
ptask[i].data.wbits = w;
|
||||||
|
ptask[i].data.abits = a;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// get_num_groups(0) == number of tasks
|
||||||
|
// get_num_groups(1) == number of windows
|
||||||
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
|
void clComputeAutocor(
|
||||||
|
__global float *output,
|
||||||
|
__global const int *samples,
|
||||||
|
__global const float *window,
|
||||||
|
__global FLACCLSubframeTask *tasks,
|
||||||
|
const int taskCount // tasks per block
|
||||||
|
)
|
||||||
|
{
|
||||||
|
FLACCLSubframeData task = tasks[get_group_id(0) * taskCount].data;
|
||||||
|
int len = task.blocksize;
|
||||||
|
int windowOffs = get_group_id(1) * len;
|
||||||
|
float data1[4096 + 32];
|
||||||
|
|
||||||
|
// TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32]
|
||||||
|
|
||||||
|
for (int tid = 0; tid < len; tid++)
|
||||||
|
data1[tid] = samples[task.samplesOffs + tid] * window[windowOffs + tid];
|
||||||
|
data1[len] = 0.0f;
|
||||||
|
|
||||||
|
for (int i = 0; i <= MAX_ORDER; ++i)
|
||||||
|
{
|
||||||
|
double temp = 1.0;
|
||||||
|
double temp2 = 1.0;
|
||||||
|
float* finish = data1 + len - i;
|
||||||
|
|
||||||
|
for (float* pdata = data1; pdata < finish; pdata += 2)
|
||||||
|
{
|
||||||
|
temp += pdata[i] * pdata[0];
|
||||||
|
temp2 += pdata[i + 1] * pdata[1];
|
||||||
|
}
|
||||||
|
output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + i] = temp + temp2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
|
void clComputeLPC(
|
||||||
|
__global float *pautoc,
|
||||||
|
__global float *lpcs,
|
||||||
|
int windowCount
|
||||||
|
)
|
||||||
|
{
|
||||||
|
int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
|
||||||
|
int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
|
||||||
|
volatile double ldr[32];
|
||||||
|
volatile double gen0[32];
|
||||||
|
volatile double gen1[32];
|
||||||
|
volatile double err[32];
|
||||||
|
__global float* autoc = pautoc + autocOffs;
|
||||||
|
|
||||||
|
for (int i = 0; i < MAX_ORDER; i++)
|
||||||
|
{
|
||||||
|
gen0[i] = gen1[i] = autoc[i + 1];
|
||||||
|
ldr[i] = 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute LPC using Schur and Levinson-Durbin recursion
|
||||||
|
double error = autoc[0];
|
||||||
|
for (int order = 0; order < MAX_ORDER; order++)
|
||||||
|
{
|
||||||
|
// Schur recursion
|
||||||
|
double reff = -gen1[0] / error;
|
||||||
|
//error += gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
|
||||||
|
error *= (1 - reff * reff);
|
||||||
|
|
||||||
|
for (int j = 0; j < MAX_ORDER - 1 - order; j++)
|
||||||
|
{
|
||||||
|
gen1[j] = gen1[j + 1] + reff * gen0[j];
|
||||||
|
gen0[j] = gen1[j + 1] * reff + gen0[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
err[order] = error;
|
||||||
|
|
||||||
|
// Levinson-Durbin recursion
|
||||||
|
|
||||||
|
ldr[order] = reff;
|
||||||
|
for (int j = 0; j < order / 2; j++)
|
||||||
|
{
|
||||||
|
double tmp = ldr[j];
|
||||||
|
ldr[j] += reff * ldr[order - 1 - j];
|
||||||
|
ldr[order - 1 - j] += reff * tmp;
|
||||||
|
}
|
||||||
|
if (0 != (order & 1))
|
||||||
|
ldr[order / 2] += ldr[order / 2] * reff;
|
||||||
|
|
||||||
|
// Output coeffs
|
||||||
|
for (int j = 0; j <= order; j++)
|
||||||
|
lpcs[lpcOffs + order * 32 + j] = -ldr[order - j];
|
||||||
|
}
|
||||||
|
// Output prediction error estimates
|
||||||
|
for (int j = 0; j < MAX_ORDER; j++)
|
||||||
|
lpcs[lpcOffs + MAX_ORDER * 32 + j] = err[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
|
void clQuantizeLPC(
|
||||||
|
__global FLACCLSubframeTask *tasks,
|
||||||
|
__global float*lpcs,
|
||||||
|
int taskCount, // tasks per block
|
||||||
|
int taskCountLPC, // tasks per set of coeffs (<= 32)
|
||||||
|
int minprecision,
|
||||||
|
int precisions
|
||||||
|
)
|
||||||
|
{
|
||||||
|
int bs = tasks[get_group_id(1) * taskCount].data.blocksize;
|
||||||
|
int abits = tasks[get_group_id(1) * taskCount].data.abits;
|
||||||
|
int lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
|
||||||
|
float error[MAX_ORDER];
|
||||||
|
int best_orders[MAX_ORDER];
|
||||||
|
|
||||||
|
// Load prediction error estimates based on Akaike's Criteria
|
||||||
|
for (int tid = 0; tid < MAX_ORDER; tid++)
|
||||||
|
{
|
||||||
|
error[tid] = bs * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log(bs);
|
||||||
|
best_orders[tid] = tid;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Select best orders
|
||||||
|
for (int i = 0; i < MAX_ORDER && i < taskCountLPC; i++)
|
||||||
|
{
|
||||||
|
for (int j = i + 1; j < MAX_ORDER; j++)
|
||||||
|
{
|
||||||
|
if (error[best_orders[j]] < error[best_orders[i]])
|
||||||
|
{
|
||||||
|
int tmp = best_orders[j];
|
||||||
|
best_orders[j] = best_orders[i];
|
||||||
|
best_orders[i] = tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Quantization
|
||||||
|
for (int i = 0; i < taskCountLPC; i ++)
|
||||||
|
{
|
||||||
|
int order = best_orders[i >> precisions];
|
||||||
|
int tmpi = 0;
|
||||||
|
for (int tid = 0; tid <= order; tid ++)
|
||||||
|
{
|
||||||
|
float lpc = lpcs[lpcOffs + order * 32 + tid];
|
||||||
|
// get 15 bits of each coeff
|
||||||
|
int c = convert_int_rte(lpc * (1 << 15));
|
||||||
|
// remove sign bits
|
||||||
|
tmpi |= c ^ (c >> 31);
|
||||||
|
}
|
||||||
|
// choose precision
|
||||||
|
//int cbits = max(3, min(10, 5 + (abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
|
||||||
|
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - abits));
|
||||||
|
// calculate shift based on precision and number of leading zeroes in coeffs
|
||||||
|
int shift = max(0,min(15, clz(tmpi) - 18 + cbits));
|
||||||
|
|
||||||
|
int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
|
||||||
|
tmpi = 0;
|
||||||
|
for (int tid = 0; tid <= order; tid ++)
|
||||||
|
{
|
||||||
|
float lpc = lpcs[lpcOffs + order * 32 + tid];
|
||||||
|
// quantize coeffs with given shift
|
||||||
|
int c = convert_int_rte(clamp(lpc * (1 << shift), -1 << (cbits - 1), 1 << (cbits - 1)));
|
||||||
|
// remove sign bits
|
||||||
|
tmpi |= c ^ (c >> 31);
|
||||||
|
tasks[taskNo].coefs[tid] = c;
|
||||||
|
}
|
||||||
|
// calculate actual number of bits (+1 for sign)
|
||||||
|
cbits = 1 + 32 - clz(tmpi);
|
||||||
|
// output shift, cbits, ro
|
||||||
|
tasks[taskNo].data.shift = shift;
|
||||||
|
tasks[taskNo].data.cbits = cbits;
|
||||||
|
tasks[taskNo].data.residualOrder = order + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define ESTIMATE_N(ro,sum) for (int pos = ro; pos < bs; pos ++) { \
|
||||||
|
__global int *ptr = data + pos - ro; \
|
||||||
|
int t = clamp((data[pos] - ((sum) >> task.data.shift)) >> task.data.wbits, -0x7fffff, 0x7fffff); \
|
||||||
|
len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31); \
|
||||||
|
}
|
||||||
|
|
||||||
|
// int sum = 0; for (int i = 0; i < ro; i++) sum += *(ptr++) * task.coefs[i];
|
||||||
|
|
||||||
|
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
|
void clEstimateResidual(
|
||||||
|
__global int*output,
|
||||||
|
__global int*samples,
|
||||||
|
__global FLACCLSubframeTask *tasks
|
||||||
|
)
|
||||||
|
{
|
||||||
|
FLACCLSubframeTask task = tasks[get_group_id(0)];
|
||||||
|
int ro = task.data.residualOrder;
|
||||||
|
int bs = task.data.blocksize;
|
||||||
|
#define EPO 6
|
||||||
|
int len[1 << EPO];
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
//float data[4096 + 32];
|
||||||
|
//float fcoef[32];
|
||||||
|
|
||||||
|
// TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32]
|
||||||
|
|
||||||
|
for (int tid = 0; tid < bs; tid++)
|
||||||
|
data[tid] = (float)samples[task.data.samplesOffs + tid] / (1 << task.data.wbits);
|
||||||
|
for (int tid = 0; tid < 32; tid++)
|
||||||
|
fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
|
||||||
|
float4 c0 = vload4(0, &fcoef[0]);
|
||||||
|
float4 c1 = vload4(1, &fcoef[0]);
|
||||||
|
float4 c2 = vload4(2, &fcoef[0]);
|
||||||
|
#else
|
||||||
|
__global int *data = &samples[task.data.samplesOffs];
|
||||||
|
for (int i = ro; i < 32; i++)
|
||||||
|
task.coefs[i] = 0;
|
||||||
|
#endif
|
||||||
|
for (int i = 0; i < 1 << EPO; i++)
|
||||||
|
len[i] = 0;
|
||||||
|
|
||||||
|
switch (ro)
|
||||||
|
{
|
||||||
|
case 0: ESTIMATE_N(0, 0) break;
|
||||||
|
case 1: ESTIMATE_N(1, *ptr * task.coefs[0]) break;
|
||||||
|
case 2: ESTIMATE_N(2, *(ptr++) * task.coefs[0] + *ptr * task.coefs[1]) break;
|
||||||
|
case 3: ESTIMATE_N(3, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *ptr * task.coefs[2]) break;
|
||||||
|
case 4: ESTIMATE_N(4, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *ptr * task.coefs[3]) break;
|
||||||
|
case 5: ESTIMATE_N(5, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *ptr * task.coefs[4]) break;
|
||||||
|
case 6: ESTIMATE_N(6, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *ptr * task.coefs[5]) break;
|
||||||
|
case 7: ESTIMATE_N(7, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *ptr * task.coefs[6]) break;
|
||||||
|
case 8: ESTIMATE_N(8, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *ptr * task.coefs[7]) break;
|
||||||
|
case 9: ESTIMATE_N(9, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *ptr * task.coefs[8]) break;
|
||||||
|
case 10: ESTIMATE_N(10, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *ptr * task.coefs[9]) break;
|
||||||
|
case 11: ESTIMATE_N(11, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *ptr * task.coefs[10]) break;
|
||||||
|
case 12: ESTIMATE_N(12, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *ptr * task.coefs[11]) break;
|
||||||
|
default:
|
||||||
|
for (int pos = ro; pos < bs; pos ++)
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
float sum = dot(vload4(0, data + pos - ro), c0)
|
||||||
|
+ dot(vload4(1, data + pos - ro), c1)
|
||||||
|
+ dot(vload4(2, data + pos - ro), c2)
|
||||||
|
;
|
||||||
|
int t = convert_int_rte(data[pos] + sum);
|
||||||
|
#else
|
||||||
|
__global int *ptr = data + pos - ro;
|
||||||
|
int sum =
|
||||||
|
*(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3]
|
||||||
|
+ *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7]
|
||||||
|
+ *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *(ptr++) * task.coefs[11]
|
||||||
|
;
|
||||||
|
for (int i = 12; i < ro; i++)
|
||||||
|
sum += *(ptr++) * task.coefs[i];
|
||||||
|
int t = (data[pos] - (sum >> task.data.shift)) >> task.data.wbits;
|
||||||
|
#endif
|
||||||
|
// overflow protection
|
||||||
|
t = clamp(t, -0x7fffff, 0x7fffff);
|
||||||
|
// convert to unsigned
|
||||||
|
t = (t << 1) ^ (t >> 31);
|
||||||
|
len[pos >> (12 - EPO)] += t;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int total = 0;
|
||||||
|
for (int i = 0; i < 1 << EPO; i++)
|
||||||
|
{
|
||||||
|
int res = min(0x7fffff,len[i]);
|
||||||
|
int k = clamp(clz(1 << (12 - EPO)) - clz(res), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
|
||||||
|
total += (k << (12 - EPO)) + (res >> k);
|
||||||
|
}
|
||||||
|
output[get_group_id(0)] = min(0x7ffffff, total) + (bs - ro);
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
|
void clChooseBestMethod(
|
||||||
|
__global FLACCLSubframeTask *tasks,
|
||||||
|
__global int *residual,
|
||||||
|
int taskCount
|
||||||
|
)
|
||||||
|
{
|
||||||
|
int best_length = 0x7fffff;
|
||||||
|
int best_no = 0;
|
||||||
|
for (int taskNo = 0; taskNo < taskCount; taskNo++)
|
||||||
|
{
|
||||||
|
// fetch task data
|
||||||
|
__global FLACCLSubframeTask* ptask = tasks + taskNo + taskCount * get_group_id(0);
|
||||||
|
// fetch part sum
|
||||||
|
int partLen = residual[taskNo + taskCount * get_group_id(0)];
|
||||||
|
int obits = ptask->data.obits - ptask->data.wbits;
|
||||||
|
int bs = ptask->data.blocksize;
|
||||||
|
int ro = ptask->data.residualOrder;
|
||||||
|
int len = min(obits * bs,
|
||||||
|
ptask->data.type == Fixed ? ro * obits + 6 + (4 * 1/2) + partLen :
|
||||||
|
ptask->data.type == LPC ? ro * obits + 4 + 5 + ro * ptask->data.cbits + 6 + (4 * 1/2)/* << porder */ + partLen :
|
||||||
|
ptask->data.type == Constant ? obits * select(1, bs, partLen != bs - ro) :
|
||||||
|
obits * bs);
|
||||||
|
ptask->data.size = len;
|
||||||
|
if (len < best_length)
|
||||||
|
{
|
||||||
|
best_length = len;
|
||||||
|
best_no = taskNo;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks[taskCount * get_group_id(0)].data.best_index = taskCount * get_group_id(0) + best_no;
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
|
void clCopyBestMethod(
|
||||||
|
__global FLACCLSubframeTask *tasks_out,
|
||||||
|
__global FLACCLSubframeTask *tasks,
|
||||||
|
int count
|
||||||
|
)
|
||||||
|
{
|
||||||
|
int best_index = tasks[count * get_group_id(0)].data.best_index;
|
||||||
|
tasks_out[get_group_id(0)] = tasks[best_index];
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
|
void clCopyBestMethodStereo(
|
||||||
|
__global FLACCLSubframeTask *tasks_out,
|
||||||
|
__global FLACCLSubframeTask *tasks,
|
||||||
|
int count
|
||||||
|
)
|
||||||
|
{
|
||||||
|
int best_index[4];
|
||||||
|
int best_size[4];
|
||||||
|
int lr_index[2];
|
||||||
|
|
||||||
|
for (int i = 0; i < 4; i++)
|
||||||
|
{
|
||||||
|
int best = tasks[count * (get_group_id(0) * 4 + i)].data.best_index;
|
||||||
|
best_index[i] = best;
|
||||||
|
best_size[i] = tasks[best].data.size;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bitsBest = best_size[2] + best_size[3]; // MidSide
|
||||||
|
lr_index[0] = best_index[2];
|
||||||
|
lr_index[1] = best_index[3];
|
||||||
|
if (bitsBest > best_size[3] + best_size[1]) // RightSide
|
||||||
|
{
|
||||||
|
bitsBest = best_size[3] + best_size[1];
|
||||||
|
lr_index[0] = best_index[3];
|
||||||
|
lr_index[1] = best_index[1];
|
||||||
|
}
|
||||||
|
if (bitsBest > best_size[0] + best_size[3]) // LeftSide
|
||||||
|
{
|
||||||
|
bitsBest = best_size[0] + best_size[3];
|
||||||
|
lr_index[0] = best_index[0];
|
||||||
|
lr_index[1] = best_index[3];
|
||||||
|
}
|
||||||
|
if (bitsBest > best_size[0] + best_size[1]) // LeftRight
|
||||||
|
{
|
||||||
|
bitsBest = best_size[0] + best_size[1];
|
||||||
|
lr_index[0] = best_index[0];
|
||||||
|
lr_index[1] = best_index[1];
|
||||||
|
}
|
||||||
|
tasks_out[2 * get_group_id(0)] = tasks[lr_index[0]];
|
||||||
|
tasks_out[2 * get_group_id(0)].data.residualOffs = tasks[best_index[0]].data.residualOffs;
|
||||||
|
tasks_out[2 * get_group_id(0) + 1] = tasks[lr_index[1]];
|
||||||
|
tasks_out[2 * get_group_id(0) + 1].data.residualOffs = tasks[best_index[1]].data.residualOffs;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -72,6 +72,7 @@ namespace CUETools.FLACCL.cmd
|
|||||||
string window_function = null;
|
string window_function = null;
|
||||||
string input_file = null;
|
string input_file = null;
|
||||||
string output_file = null;
|
string output_file = null;
|
||||||
|
string device_type = null;
|
||||||
int min_partition_order = -1, max_partition_order = -1,
|
int min_partition_order = -1, max_partition_order = -1,
|
||||||
min_lpc_order = -1, max_lpc_order = -1,
|
min_lpc_order = -1, max_lpc_order = -1,
|
||||||
min_fixed_order = -1, max_fixed_order = -1,
|
min_fixed_order = -1, max_fixed_order = -1,
|
||||||
@@ -112,6 +113,10 @@ namespace CUETools.FLACCL.cmd
|
|||||||
settings.GroupSize = intarg;
|
settings.GroupSize = intarg;
|
||||||
else if (args[arg] == "--define" && arg + 2 < args.Length)
|
else if (args[arg] == "--define" && arg + 2 < args.Length)
|
||||||
settings.Defines += "#define " + args[++arg] + " " + args[++arg] + "\n";
|
settings.Defines += "#define " + args[++arg] + " " + args[++arg] + "\n";
|
||||||
|
else if (args[arg] == "--opencl-platform" && ++arg < args.Length)
|
||||||
|
settings.Platform = args[arg];
|
||||||
|
else if (args[arg] == "--opencl-type" && ++arg < args.Length)
|
||||||
|
device_type = args[arg];
|
||||||
else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
|
else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length)
|
||||||
output_file = args[arg];
|
output_file = args[arg];
|
||||||
else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
|
else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length)
|
||||||
@@ -210,6 +215,8 @@ namespace CUETools.FLACCL.cmd
|
|||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
|
if (device_type != null)
|
||||||
|
settings.DeviceType = (OpenCLDeviceType)(Enum.Parse(typeof(OpenCLDeviceType), device_type, true));
|
||||||
encoder.Settings = settings;
|
encoder.Settings = settings;
|
||||||
if (level >= 0)
|
if (level >= 0)
|
||||||
encoder.CompressionLevel = level;
|
encoder.CompressionLevel = level;
|
||||||
|
|||||||
Reference in New Issue
Block a user