mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
Compatibility issues with HD4XXX, Fermi
This commit is contained in:
@@ -195,7 +195,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
_path = path;
|
_path = path;
|
||||||
_IO = IO;
|
_IO = IO;
|
||||||
|
|
||||||
eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
|
eparams.flake_set_defaults(_compressionLevel);
|
||||||
eparams.padding_size = 8192;
|
eparams.padding_size = 8192;
|
||||||
|
|
||||||
crc8 = new Crc8();
|
crc8 = new Crc8();
|
||||||
@@ -238,7 +238,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
if (value < 0 || value > 11)
|
if (value < 0 || value > 11)
|
||||||
throw new Exception("unsupported compression level");
|
throw new Exception("unsupported compression level");
|
||||||
_compressionLevel = value;
|
_compressionLevel = value;
|
||||||
eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
|
eparams.flake_set_defaults(_compressionLevel);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -261,7 +261,6 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
//_settings.GPUOnly = true;
|
//_settings.GPUOnly = true;
|
||||||
_settings.MappedMemory = true;
|
_settings.MappedMemory = true;
|
||||||
}
|
}
|
||||||
eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1582,19 +1581,6 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
// If true, RequireImageSupport will filter out any devices without image support
|
// If true, RequireImageSupport will filter out any devices without image support
|
||||||
// In this project we don't need image support though, so we set it to false
|
// In this project we don't need image support though, so we set it to false
|
||||||
OCLMan.RequireImageSupport = false;
|
OCLMan.RequireImageSupport = false;
|
||||||
// The Defines string gets prepended to any and all sources that are compiled
|
|
||||||
// and serve as a convenient way to pass configuration information to the compilation process
|
|
||||||
OCLMan.Defines =
|
|
||||||
"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
|
|
||||||
"#define GROUP_SIZE " + groupSize.ToString() + "\n" +
|
|
||||||
"#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
|
|
||||||
(_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") +
|
|
||||||
(_settings.DoRice ? "#define DO_RICE\n" : "") +
|
|
||||||
#if DEBUG
|
|
||||||
"#define DEBUG\n" +
|
|
||||||
#endif
|
|
||||||
(_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") +
|
|
||||||
_settings.Defines + "\n";
|
|
||||||
// The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
|
// The BuildOptions string is passed directly to clBuild and can be used to do debug builds etc
|
||||||
OCLMan.BuildOptions = "";
|
OCLMan.BuildOptions = "";
|
||||||
OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location);
|
OCLMan.SourcePath = System.IO.Path.GetDirectoryName(GetType().Assembly.Location);
|
||||||
@@ -1620,7 +1606,23 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
|
OCLMan.CreateDefaultContext(platformId, (DeviceType)_settings.DeviceType);
|
||||||
|
|
||||||
if (OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics"))
|
if (OCLMan.Context.Devices[0].Extensions.Contains("cl_khr_local_int32_extended_atomics"))
|
||||||
OCLMan.Defines += "#define HAVE_ATOM\n";
|
_settings.Defines += "#define HAVE_ATOM\n";
|
||||||
|
else
|
||||||
|
_settings.GPUOnly = false;
|
||||||
|
|
||||||
|
// The Defines string gets prepended to any and all sources that are compiled
|
||||||
|
// and serve as a convenient way to pass configuration information to the compilation process
|
||||||
|
OCLMan.Defines =
|
||||||
|
"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
|
||||||
|
"#define GROUP_SIZE " + groupSize.ToString() + "\n" +
|
||||||
|
"#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
|
||||||
|
(_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") +
|
||||||
|
(_settings.DoRice ? "#define DO_RICE\n" : "") +
|
||||||
|
#if DEBUG
|
||||||
|
"#define DEBUG\n" +
|
||||||
|
#endif
|
||||||
|
(_settings.DeviceType == OpenCLDeviceType.CPU ? "#define FLACCL_CPU\n" : "") +
|
||||||
|
_settings.Defines + "\n";
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
@@ -2146,7 +2148,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
|
|
||||||
public bool do_seektable;
|
public bool do_seektable;
|
||||||
|
|
||||||
public int flake_set_defaults(int lvl, bool encode_on_cpu)
|
public int flake_set_defaults(int lvl)
|
||||||
{
|
{
|
||||||
compression = lvl;
|
compression = lvl;
|
||||||
|
|
||||||
@@ -2165,7 +2167,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
min_prediction_order = 1;
|
min_prediction_order = 1;
|
||||||
max_prediction_order = 12;
|
max_prediction_order = 12;
|
||||||
min_partition_order = 0;
|
min_partition_order = 0;
|
||||||
max_partition_order = 6;
|
max_partition_order = 8;
|
||||||
variable_block_size = 0;
|
variable_block_size = 0;
|
||||||
lpc_min_precision_search = 0;
|
lpc_min_precision_search = 0;
|
||||||
lpc_max_precision_search = 0;
|
lpc_max_precision_search = 0;
|
||||||
@@ -2183,7 +2185,6 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
do_midside = false;
|
do_midside = false;
|
||||||
window_function = WindowFunction.Bartlett;
|
window_function = WindowFunction.Bartlett;
|
||||||
orders_per_window = 1;
|
orders_per_window = 1;
|
||||||
max_partition_order = 4;
|
|
||||||
max_prediction_order = 7;
|
max_prediction_order = 7;
|
||||||
min_fixed_order = 3;
|
min_fixed_order = 3;
|
||||||
max_fixed_order = 2;
|
max_fixed_order = 2;
|
||||||
@@ -2197,7 +2198,6 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
min_fixed_order = 2;
|
min_fixed_order = 2;
|
||||||
max_fixed_order = 2;
|
max_fixed_order = 2;
|
||||||
max_prediction_order = 7;
|
max_prediction_order = 7;
|
||||||
max_partition_order = 4;
|
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
do_constant = false;
|
do_constant = false;
|
||||||
@@ -2207,7 +2207,6 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
min_fixed_order = 2;
|
min_fixed_order = 2;
|
||||||
max_fixed_order = 2;
|
max_fixed_order = 2;
|
||||||
max_prediction_order = 8;
|
max_prediction_order = 8;
|
||||||
max_partition_order = 4;
|
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
do_constant = false;
|
do_constant = false;
|
||||||
@@ -2272,9 +2271,6 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!encode_on_cpu)
|
|
||||||
max_partition_order = 8;
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2456,14 +2452,14 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
clSelectedTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen);
|
clSelectedTasks = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, selectedLen);
|
||||||
clRiceOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen);
|
clRiceOutput = openCLProgram.Context.CreateBuffer(MemFlags.READ_WRITE | MemFlags.ALLOC_HOST_PTR, riceLen);
|
||||||
|
|
||||||
clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.WRITE, 0, samplesBufferLen / 2);
|
clSamplesBytesPtr = openCLCQ.EnqueueMapBuffer(clSamplesBytes, true, MapFlags.READ_WRITE, 0, samplesBufferLen / 2);
|
||||||
clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidual, true, MapFlags.WRITE, 0, residualBufferLen);
|
clResidualPtr = openCLCQ.EnqueueMapBuffer(clResidual, true, MapFlags.READ_WRITE, 0, residualBufferLen);
|
||||||
clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParams, true, MapFlags.WRITE, 0, riceParamsLen / 4);
|
clBestRiceParamsPtr = openCLCQ.EnqueueMapBuffer(clBestRiceParams, true, MapFlags.READ_WRITE, 0, riceParamsLen / 4);
|
||||||
clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasks, true, MapFlags.WRITE, 0, residualTasksLen);
|
clResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clResidualTasks, true, MapFlags.READ_WRITE, 0, residualTasksLen);
|
||||||
clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasks, true, MapFlags.WRITE, 0, bestResidualTasksLen);
|
clBestResidualTasksPtr = openCLCQ.EnqueueMapBuffer(clBestResidualTasks, true, MapFlags.READ_WRITE, 0, bestResidualTasksLen);
|
||||||
clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctions, true, MapFlags.WRITE, 0, wndLen);
|
clWindowFunctionsPtr = openCLCQ.EnqueueMapBuffer(clWindowFunctions, true, MapFlags.READ_WRITE, 0, wndLen);
|
||||||
clSelectedTasksPtr = openCLCQ.EnqueueMapBuffer(clSelectedTasks, true, MapFlags.WRITE, 0, selectedLen);
|
clSelectedTasksPtr = openCLCQ.EnqueueMapBuffer(clSelectedTasks, true, MapFlags.READ_WRITE, 0, selectedLen);
|
||||||
clRiceOutputPtr = openCLCQ.EnqueueMapBuffer(clRiceOutput, true, MapFlags.WRITE, 0, riceLen);
|
clRiceOutputPtr = openCLCQ.EnqueueMapBuffer(clRiceOutput, true, MapFlags.READ_WRITE, 0, riceLen);
|
||||||
|
|
||||||
//clSamplesBytesPtr = clSamplesBytes.HostPtr;
|
//clSamplesBytesPtr = clSamplesBytes.HostPtr;
|
||||||
//clResidualPtr = clResidual.HostPtr;
|
//clResidualPtr = clResidual.HostPtr;
|
||||||
|
|||||||
@@ -865,6 +865,10 @@ void clEstimateResidual(
|
|||||||
obits * bs);
|
obits * bs);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
#define MAX_BLOCKSIZE 4096
|
||||||
|
#define ESTPARTLOG 5
|
||||||
|
|
||||||
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
|
||||||
void clEstimateResidual(
|
void clEstimateResidual(
|
||||||
__global int*samples,
|
__global int*samples,
|
||||||
@@ -877,7 +881,7 @@ void clEstimateResidual(
|
|||||||
__local volatile int idata[GROUP_SIZE];
|
__local volatile int idata[GROUP_SIZE];
|
||||||
#endif
|
#endif
|
||||||
__local FLACCLSubframeTask task;
|
__local FLACCLSubframeTask task;
|
||||||
__local int psum[64];
|
__local int psum[MAX_BLOCKSIZE >> ESTPARTLOG];
|
||||||
__local float fcoef[32];
|
__local float fcoef[32];
|
||||||
__local int selectedTask;
|
__local int selectedTask;
|
||||||
|
|
||||||
@@ -896,15 +900,13 @@ void clEstimateResidual(
|
|||||||
if (tid < 32)
|
if (tid < 32)
|
||||||
//fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
|
//fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
|
||||||
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
|
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
|
||||||
if (tid < 64)
|
for (int offs = tid; offs < (MAX_BLOCKSIZE >> ESTPARTLOG); offs += GROUP_SIZE)
|
||||||
psum[tid] = 0;
|
psum[offs] = 0;
|
||||||
data[tid] = 0.0f;
|
data[tid] = 0.0f;
|
||||||
// need to initialize "extra" data, because NaNs can produce wierd results even when multipled by zero extra coefs
|
// need to initialize "extra" data, because NaNs can produce wierd results even when multipled by zero extra coefs
|
||||||
if (tid < 32)
|
if (tid < 32)
|
||||||
data[GROUP_SIZE * 2 + tid] = 0.0f;
|
data[GROUP_SIZE * 2 + tid] = 0.0f;
|
||||||
|
|
||||||
int partOrder = max(6, clz(64) - clz(bs - 1) + 1);
|
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
#ifdef AMD
|
#ifdef AMD
|
||||||
@@ -960,35 +962,35 @@ void clEstimateResidual(
|
|||||||
// convert to unsigned
|
// convert to unsigned
|
||||||
t = (t << 1) ^ (t >> 31);
|
t = (t << 1) ^ (t >> 31);
|
||||||
#if !defined(AMD) || !defined(HAVE_ATOM)
|
#if !defined(AMD) || !defined(HAVE_ATOM)
|
||||||
// convert to unsigned
|
|
||||||
idata[tid] = t;
|
idata[tid] = t;
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
for (int l = 16; l > 1; l >>= 1)
|
||||||
int ps = (1 << partOrder) - 1;
|
idata[tid] += idata[tid + l];
|
||||||
int lane = tid & ps;
|
if ((tid & 31) == 0)
|
||||||
for (int l = 1 << (partOrder - 1); l > WARP_SIZE; l >>= 1)
|
psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1];
|
||||||
{
|
|
||||||
if (lane < l) idata[tid] += idata[tid + l];
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
if (lane < WARP_SIZE)
|
|
||||||
for (int l = WARP_SIZE; l > 0; l >>= 1)
|
|
||||||
idata[tid] += idata[tid + l];
|
|
||||||
if (lane == 0)
|
|
||||||
psum[min(63,offs >> partOrder)] += idata[tid];
|
|
||||||
#else
|
#else
|
||||||
atom_add(&psum[min(63,offs >> partOrder)], t);
|
atom_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// calculate rice partition bit length for every (1 << partOrder) samples
|
// calculate rice partition bit length for every 32 samples
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
if (tid < 64)
|
// Bug: if (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) > GROUP_SIZE
|
||||||
{
|
int pl = get_local_id(0) < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? pl = psum[tid * 2] + psum[tid * 2 + 1] : 0;
|
||||||
int k = iclamp(clz(1 << partOrder) - clz(psum[tid]), 0, 14); // 27 - clz(res) == clz(16) - clz(res) == log2(res / 16)
|
|
||||||
psum[tid] = (k << partOrder) + (psum[tid] >> k);
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
for (int l = 32; l > 0; l >>= 1)
|
// for (int pos = 0; pos < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2; pos += GROUP_SIZE)
|
||||||
|
// {
|
||||||
|
//int offs = pos + tid;
|
||||||
|
//int pl = offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2 ? psum[offs * 2] + psum[offs * 2 + 1] : 0;
|
||||||
|
////int pl = psum[offs * 2] + psum[offs * 2 + 1];
|
||||||
|
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
//if (offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
|
||||||
|
// psum[offs] = pl;
|
||||||
|
// }
|
||||||
|
int k = iclamp(31 - (ESTPARTLOG + 1) - clz(pl), 0, 14); // 26 - clz(res) == clz(32) - clz(res) == log2(res / 32)
|
||||||
|
if (tid < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
|
||||||
|
psum[tid] = (k << (ESTPARTLOG + 1)) + (pl >> k);
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
for (int l = MAX_BLOCKSIZE >> (ESTPARTLOG + 2); l > 0; l >>= 1)
|
||||||
{
|
{
|
||||||
if (tid < l)
|
if (tid < l)
|
||||||
psum[tid] += psum[tid + l];
|
psum[tid] += psum[tid + l];
|
||||||
@@ -1796,7 +1798,11 @@ void clRiceEncoding(
|
|||||||
unsigned int bb = bw.bit_buf << bw.bit_left;
|
unsigned int bb = bw.bit_buf << bw.bit_left;
|
||||||
bw.bit_buf = 0;
|
bw.bit_buf = 0;
|
||||||
bw.bit_left += (32 - b);
|
bw.bit_left += (32 - b);
|
||||||
|
#ifdef AMD
|
||||||
|
bw.buffer[bw.buf_ptr++] = as_int(as_char4(bb).wzyx);
|
||||||
|
#else
|
||||||
bw.buffer[bw.buf_ptr++] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000);
|
bw.buffer[bw.buf_ptr++] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
bits -= b;
|
bits -= b;
|
||||||
}
|
}
|
||||||
@@ -1811,7 +1817,11 @@ void clRiceEncoding(
|
|||||||
unsigned int bb = (bw.bit_buf << bw.bit_left) | (val >> (bits - bw.bit_left));
|
unsigned int bb = (bw.bit_buf << bw.bit_left) | (val >> (bits - bw.bit_left));
|
||||||
bw.bit_buf = val;
|
bw.bit_buf = val;
|
||||||
bw.bit_left += (32 - bits);
|
bw.bit_left += (32 - bits);
|
||||||
|
#ifdef AMD
|
||||||
|
bw.buffer[bw.buf_ptr++] = as_int(as_char4(bb).wzyx);
|
||||||
|
#else
|
||||||
bw.buffer[bw.buf_ptr++] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000);
|
bw.buffer[bw.buf_ptr++] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
////if (get_group_id(0) == 0) printf("%x ", v);
|
////if (get_group_id(0) == 0) printf("%x ", v);
|
||||||
//writebits(&bw, (v >> k) + 1, 1);
|
//writebits(&bw, (v >> k) + 1, 1);
|
||||||
@@ -1916,10 +1926,8 @@ void clRiceEncoding(
|
|||||||
atom_or(&data[qpos0 + 1], qval1);
|
atom_or(&data[qpos0 + 1], qval1);
|
||||||
start = mypos[GROUP_SIZE - 1];
|
start = mypos[GROUP_SIZE - 1];
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
unsigned int bb = data[tid];
|
|
||||||
bb = (bb >> 24) | ((bb >> 8) & 0xff00U) | ((bb << 8) & 0xff0000U) | (bb << 24);
|
|
||||||
if ((start32 + tid) * 32 <= start)
|
if ((start32 + tid) * 32 <= start)
|
||||||
output[start32 + tid] = bb;
|
output[start32 + tid] = as_int(as_char4(data[tid]).wzyx);
|
||||||
unsigned int remainder = data[start / 32 - start32];
|
unsigned int remainder = data[start / 32 - start32];
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
data[tid] = select(0U, remainder, tid == 0);
|
data[tid] = select(0U, remainder, tid == 0);
|
||||||
|
|||||||
Reference in New Issue
Block a user