mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
experimenting with rice encoding on GPU
This commit is contained in:
@@ -37,6 +37,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
{
|
{
|
||||||
this.DoVerify = false;
|
this.DoVerify = false;
|
||||||
this.GPUOnly = true;
|
this.GPUOnly = true;
|
||||||
|
this.DoRice = false;
|
||||||
this.MappedMemory = false;
|
this.MappedMemory = false;
|
||||||
this.DoMD5 = true;
|
this.DoMD5 = true;
|
||||||
this.GroupSize = 128;
|
this.GroupSize = 128;
|
||||||
@@ -57,6 +58,10 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
[SRDescription(typeof(Properties.Resources), "DescriptionGPUOnly")]
|
[SRDescription(typeof(Properties.Resources), "DescriptionGPUOnly")]
|
||||||
public bool GPUOnly { get; set; }
|
public bool GPUOnly { get; set; }
|
||||||
|
|
||||||
|
[DefaultValue(false)]
|
||||||
|
[SRDescription(typeof(Properties.Resources), "DescriptionDoRice")]
|
||||||
|
public bool DoRice { get; set; }
|
||||||
|
|
||||||
[DefaultValue(false)]
|
[DefaultValue(false)]
|
||||||
[SRDescription(typeof(Properties.Resources), "DescriptionMappedMemory")]
|
[SRDescription(typeof(Properties.Resources), "DescriptionMappedMemory")]
|
||||||
public bool MappedMemory { get; set; }
|
public bool MappedMemory { get; set; }
|
||||||
@@ -837,7 +842,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
//assert(porder >= 0);
|
//assert(porder >= 0);
|
||||||
frame.writer.writebits(4, porder);
|
frame.writer.writebits(4, porder);
|
||||||
|
|
||||||
if (task.riceOnGPU)
|
if (_settings.DoRice)
|
||||||
{
|
{
|
||||||
if (task.BestResidualTasks[index].size != (int)sub.best.size)
|
if (task.BestResidualTasks[index].size != (int)sub.best.size)
|
||||||
throw new Exception("Encoding offset mismatch");
|
throw new Exception("Encoding offset mismatch");
|
||||||
@@ -846,6 +851,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
if (task.BestResidualTasks[index].encodingOffset != frame.writer.BitLength)
|
if (task.BestResidualTasks[index].encodingOffset != frame.writer.BitLength)
|
||||||
throw new Exception("Encoding offset mismatch");
|
throw new Exception("Encoding offset mismatch");
|
||||||
int len = task.BestResidualTasks[index].size - task.BestResidualTasks[index].headerLen;
|
int len = task.BestResidualTasks[index].size - task.BestResidualTasks[index].headerLen;
|
||||||
|
//Console.WriteLine("{0:x} => {1:x}", _totalSize + frame.writer.BitLength / 8, _totalSize + (frame.writer.BitLength + len) / 8);
|
||||||
// task.BestResidualTasks[index].headerLen
|
// task.BestResidualTasks[index].headerLen
|
||||||
frame.writer.writeints(len, (byte*)task.clRiceOutputPtr);
|
frame.writer.writeints(len, (byte*)task.clRiceOutputPtr);
|
||||||
if (task.BestResidualTasks[index].encodingOffset + len != frame.writer.BitLength)
|
if (task.BestResidualTasks[index].encodingOffset + len != frame.writer.BitLength)
|
||||||
@@ -1196,7 +1202,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
|
|
||||||
#if DEBUG
|
#if DEBUG
|
||||||
// check size
|
// check size
|
||||||
if (_settings.GPUOnly && !task.riceOnGPU)
|
if (_settings.GPUOnly && !_settings.DoRice)
|
||||||
{
|
{
|
||||||
uint real_size = measure_subframe(task.frame, task.frame.subframes[ch]);
|
uint real_size = measure_subframe(task.frame, task.frame.subframes[ch]);
|
||||||
if (real_size != task.frame.subframes[ch].best.size)
|
if (real_size != task.frame.subframes[ch].best.size)
|
||||||
@@ -1204,7 +1210,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly) && !task.riceOnGPU)
|
if (((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly) && !_settings.DoRice)
|
||||||
{
|
{
|
||||||
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
|
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
|
||||||
if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32)
|
if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32)
|
||||||
@@ -1289,10 +1295,10 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
frame.subframes[ch].wbits = task.BestResidualTasks[index].wbits;
|
frame.subframes[ch].wbits = task.BestResidualTasks[index].wbits;
|
||||||
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
|
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
|
||||||
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
|
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
|
||||||
if (_settings.GPUOnly && !task.riceOnGPU && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
|
frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
|
||||||
|
if (_settings.GPUOnly && !_settings.DoRice && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
|
||||||
//if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
|
//if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC))
|
||||||
{
|
{
|
||||||
frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder;
|
|
||||||
int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder);
|
int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder);
|
||||||
fixed (int* dstParams = frame.subframes[ch].best.rc.rparams)
|
fixed (int* dstParams = frame.subframes[ch].best.rc.rparams)
|
||||||
AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder));
|
AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder));
|
||||||
@@ -1305,7 +1311,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (task.riceOnGPU)
|
if (_settings.DoRice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size)
|
||||||
throw new Exception("size reported incorrectly");
|
throw new Exception("size reported incorrectly");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1579,6 +1585,8 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
|
"#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" +
|
||||||
"#define GROUP_SIZE " + groupSize.ToString() + "\n" +
|
"#define GROUP_SIZE " + groupSize.ToString() + "\n" +
|
||||||
"#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
|
"#define FLACCL_VERSION \"" + vendor_string + "\"\n" +
|
||||||
|
(_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") +
|
||||||
|
(_settings.DoRice ? "#define DO_RICE\n" : "") +
|
||||||
#if DEBUG
|
#if DEBUG
|
||||||
"#define DEBUG\n" +
|
"#define DEBUG\n" +
|
||||||
#endif
|
#endif
|
||||||
@@ -1802,7 +1810,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
|
|
||||||
public string Path { get { return _path; } }
|
public string Path { get { return _path; } }
|
||||||
|
|
||||||
public static readonly string vendor_string = "FLACCL#0.2";
|
public static readonly string vendor_string = "FLACCL#0.3";
|
||||||
|
|
||||||
int select_blocksize(int samplerate, int time_ms)
|
int select_blocksize(int samplerate, int time_ms)
|
||||||
{
|
{
|
||||||
@@ -2362,8 +2370,6 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
public int nWindowFunctions = 0;
|
public int nWindowFunctions = 0;
|
||||||
public int max_porder = 0;
|
public int max_porder = 0;
|
||||||
|
|
||||||
public bool riceOnGPU = false;
|
|
||||||
|
|
||||||
public FlakeReader verify;
|
public FlakeReader verify;
|
||||||
|
|
||||||
public Thread workThread = null;
|
public Thread workThread = null;
|
||||||
@@ -2492,7 +2498,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
clSumPartition = openCLProgram.CreateKernel("clSumPartition");
|
clSumPartition = openCLProgram.CreateKernel("clSumPartition");
|
||||||
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
|
clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter");
|
||||||
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
|
clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder");
|
||||||
if (riceOnGPU)
|
if (writer._settings.DoRice)
|
||||||
{
|
{
|
||||||
clCalcOutputOffsets = openCLProgram.CreateKernel("clCalcOutputOffsets");
|
clCalcOutputOffsets = openCLProgram.CreateKernel("clCalcOutputOffsets");
|
||||||
clRiceEncoding = openCLProgram.CreateKernel("clRiceEncoding");
|
clRiceEncoding = openCLProgram.CreateKernel("clRiceEncoding");
|
||||||
@@ -2545,7 +2551,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
clSumPartition.Dispose();
|
clSumPartition.Dispose();
|
||||||
clFindRiceParameter.Dispose();
|
clFindRiceParameter.Dispose();
|
||||||
clFindPartitionOrder.Dispose();
|
clFindPartitionOrder.Dispose();
|
||||||
if (riceOnGPU)
|
if (writer._settings.DoRice)
|
||||||
{
|
{
|
||||||
clCalcOutputOffsets.Dispose();
|
clCalcOutputOffsets.Dispose();
|
||||||
clRiceEncoding.Dispose();
|
clRiceEncoding.Dispose();
|
||||||
@@ -2875,7 +2881,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
groupSize,
|
groupSize,
|
||||||
channels * frameCount);
|
channels * frameCount);
|
||||||
|
|
||||||
if (riceOnGPU)
|
if (writer._settings.DoRice)
|
||||||
{
|
{
|
||||||
clCalcOutputOffsets.SetArgs(
|
clCalcOutputOffsets.SetArgs(
|
||||||
clResidual,
|
clResidual,
|
||||||
@@ -2907,7 +2913,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
if (!writer._settings.MappedMemory)
|
if (!writer._settings.MappedMemory)
|
||||||
{
|
{
|
||||||
openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParamsPtr);
|
openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParamsPtr);
|
||||||
if (riceOnGPU)
|
if (writer._settings.DoRice)
|
||||||
openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clRiceOutputPtr);
|
openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clRiceOutputPtr);
|
||||||
else
|
else
|
||||||
openCLCQ.EnqueueReadBuffer(clResidual, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clResidualPtr);
|
openCLCQ.EnqueueReadBuffer(clResidual, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clResidualPtr);
|
||||||
|
|||||||
@@ -1007,6 +1007,7 @@ void clChooseBestMethod(
|
|||||||
tasks_out[get_global_id(0)] = tasks[best_no];
|
tasks_out[get_global_id(0)] = tasks[best_no];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef DO_PARTITIONS
|
||||||
#ifdef __CPU__
|
#ifdef __CPU__
|
||||||
// get_group_id(0) == task index
|
// get_group_id(0) == task index
|
||||||
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
|
||||||
@@ -1292,8 +1293,11 @@ void clCalcPartition16(
|
|||||||
s = sum.x + sum.y + sum.z + sum.w;
|
s = sum.x + sum.y + sum.z + sum.w;
|
||||||
|
|
||||||
const int lpos = (15 << (max_porder + 1)) * get_group_id(0) + (k << (max_porder + 1)) + offs / 16;
|
const int lpos = (15 << (max_porder + 1)) * get_group_id(0) + (k << (max_porder + 1)) + offs / 16;
|
||||||
if (k <= 14)
|
if (k <= 14 && offs < bs)
|
||||||
partition_lengths[lpos] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
|
partition_lengths[lpos] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1);
|
||||||
|
|
||||||
|
// if (task.data.blocksize == 16 && x == 0 && k <= 14)
|
||||||
|
// printf("[%d] = %d = s:%d + %d * (k:%d + 1), ro=%d, offs=%d, lpos=%d\n", k, partition_lengths[lpos], s, (16 - select(0, ro, offs < 16)), k, ro, offs, lpos);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@@ -1531,20 +1535,30 @@ void clFindPartitionOrder(
|
|||||||
|
|
||||||
if (get_local_id(0) == 0)
|
if (get_local_id(0) == 0)
|
||||||
{
|
{
|
||||||
tasks[get_group_id(0)].data.porder = best_porder;
|
task.porder = best_porder;
|
||||||
int obits = task.obits - task.wbits;
|
int obits = task.obits - task.wbits;
|
||||||
tasks[get_group_id(0)].data.size =
|
task.headerLen =
|
||||||
task.type == Fixed ? task.residualOrder * obits + 6 + best_length :
|
task.type == Fixed ? task.residualOrder * obits + 6 :
|
||||||
task.type == LPC ? task.residualOrder * obits + 6 + best_length + 4 + 5 + task.residualOrder * task.cbits :
|
task.type == LPC ? task.residualOrder * obits + 6 + 4 + 5 + task.residualOrder * task.cbits :
|
||||||
task.type == Constant ? obits : obits * task.blocksize;
|
task.type == Constant ? obits :
|
||||||
|
/* task.type == Verbatim ? */ obits * task.blocksize;
|
||||||
|
task.size = task.headerLen + select(0, best_length, task.type == Fixed || task.type == LPC);
|
||||||
|
if (task.size >= obits * task.blocksize)
|
||||||
|
{
|
||||||
|
task.headerLen = task.size = obits * task.blocksize;
|
||||||
|
task.type = Verbatim;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (get_local_id(0) < sizeof(task) / sizeof(int))
|
||||||
|
((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)] = ((__local int*)&task)[get_local_id(0)];
|
||||||
for (int offs = get_local_id(0); offs < (1 << best_porder); offs += GROUP_SIZE)
|
for (int offs = get_local_id(0); offs < (1 << best_porder); offs += GROUP_SIZE)
|
||||||
best_rice_parameters[(get_group_id(0) << max_porder) + offs] = rice_parameters[pos - (2 << best_porder) + offs];
|
best_rice_parameters[(get_group_id(0) << max_porder) + offs] = rice_parameters[pos - (2 << best_porder) + offs];
|
||||||
// FIXME: should be bytes?
|
// FIXME: should be bytes?
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef DO_RICE
|
||||||
#ifdef __CPU__
|
#ifdef __CPU__
|
||||||
typedef struct BitWriter_t
|
typedef struct BitWriter_t
|
||||||
{
|
{
|
||||||
@@ -1720,6 +1734,100 @@ void clRiceEncoding(
|
|||||||
//if (get_group_id(0) == 0) printf("\n");
|
//if (get_group_id(0) == 0) printf("\n");
|
||||||
flush(&bw);
|
flush(&bw);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
__local FLACCLSubframeData task;
|
||||||
|
__local int riceparams[256];
|
||||||
|
__local int mypos[GROUP_SIZE];
|
||||||
|
__local unsigned int data[GROUP_SIZE];
|
||||||
|
__local int start;
|
||||||
|
|
||||||
|
int tid = get_local_id(0);
|
||||||
|
if (tid < sizeof(task) / sizeof(int))
|
||||||
|
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
for (int offs = tid; offs < (1 << task.porder); offs += GROUP_SIZE)
|
||||||
|
riceparams[offs] = best_rice_parameters[(get_group_id(0) << max_porder) + offs];
|
||||||
|
if (tid == 0)
|
||||||
|
start = task.encodingOffset;
|
||||||
|
data[tid] = 0;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
int bs = task.blocksize;
|
||||||
|
int partlen = bs >> task.porder;
|
||||||
|
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
|
||||||
|
{
|
||||||
|
int offs = pos + tid;
|
||||||
|
int v = offs < bs ? residual[task.residualOffs + offs] : 0;
|
||||||
|
int k = offs < bs ? riceparams[offs / partlen] : 0;
|
||||||
|
int pstart = offs == task.residualOrder || (offs % partlen) == 0;
|
||||||
|
v = (v << 1) ^ (v >> 31);
|
||||||
|
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
|
||||||
|
mypos[tid] = mylen;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
// Inclusive scan(+)
|
||||||
|
for (int offset = 1; offset < GROUP_SIZE; offset <<= 1)
|
||||||
|
{
|
||||||
|
int t = tid >= offset ? mypos[tid - offset] : 0;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
mypos[tid] += t;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
// make it exclusive
|
||||||
|
//if ((get_global_id(0) == 64 || get_global_id(0) == 63) && pos == 0)
|
||||||
|
// printf("v=%x,k=%d,mylen=%d,mypos=%d,pstart=%d,partlen=%d,start=%d\n", v, k, mylen, mypos[tid-1], pstart, partlen, start);
|
||||||
|
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
mypos[tid] += start;
|
||||||
|
int start32 = start / 32;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
//if ((get_global_id(0) == 64 || get_global_id(0) == 63) && pos == 0)
|
||||||
|
// printf("v=%x,k=%d,mylen=%d,mypos=%d,pstart=%d,partlen=%d\n", v, k, mylen, mypos[tid], pstart, partlen);
|
||||||
|
if (mylen > 0)
|
||||||
|
{
|
||||||
|
if (pstart)
|
||||||
|
{
|
||||||
|
int kpos = mypos[tid] - mylen;
|
||||||
|
unsigned int kval = (k << 28);
|
||||||
|
// if (get_group_id(0) == 0 && kpos / 32 - task.encodingOffset / 32 == 5 && pos == 0)
|
||||||
|
//printf("{%08X |= %08X}\n", data[kpos / 32 - start32], kval >> (kpos & 31));
|
||||||
|
atom_or(&data[kpos / 32 - start32], kval >> (kpos & 31));
|
||||||
|
if ((kpos & 31) != 0)
|
||||||
|
atom_or(&data[kpos / 32 - start32 + 1], kval << (32 - (kpos & 31)));
|
||||||
|
}
|
||||||
|
int qpos = mypos[tid] - k - 1;
|
||||||
|
unsigned int qval = (1U << 31) | (v << (31 - k));
|
||||||
|
//if (get_group_id(0) == 0 && qpos / 32 - task.encodingOffset / 32 == 5 && pos == 0)
|
||||||
|
// printf("(%08X |= %08X) tid == %d, qpos == %d, qval == %08X\n", data[qpos / 32 - start32], qval >> (qpos & 31), tid, qpos, qval);
|
||||||
|
// if (get_group_id(0) == 0 && pos == 0)
|
||||||
|
// {
|
||||||
|
// printf("[%08X] (%08X |= %08X) qval==%08x qpos==%08x\n", qpos / 32 - start32, data[qpos / 32 - start32], qval >> (qpos & 31), qval, qpos);
|
||||||
|
//if (qval << (32 - (qpos & 31)) != 0)
|
||||||
|
// printf("[%08X] (%08X |= %08X)\n", qpos / 32 - start32 + 1, data[qpos / 32 - start32+1], qval << (32 - (qpos & 31)));
|
||||||
|
// }
|
||||||
|
atom_or(&data[qpos / 32 - start32], qval >> (qpos & 31));
|
||||||
|
if ((qpos & 31) != 0)
|
||||||
|
atom_or(&data[qpos / 32 - start32 + 1], qval << (32 - (qpos & 31)));
|
||||||
|
}
|
||||||
|
if (tid == GROUP_SIZE - 1)
|
||||||
|
start = mypos[tid];
|
||||||
|
//if (get_group_id(0) == 0 && pos == 0)
|
||||||
|
// printf("[%d] == %d\n", tid, mypos[tid]);
|
||||||
|
//if (get_group_id(0) == 0 && pos == 0)
|
||||||
|
// printf("%d == %d\n", (((qpos % 32) / 8) * 16 + 7 - qpos % 32), (((qpos << 1) & 48) + 7 - qpos & 31));
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
unsigned int bb = data[tid];
|
||||||
|
if ((start32 + tid) * 32 <= start)
|
||||||
|
output[start32 + tid] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000);
|
||||||
|
//if (get_group_id(0) == 0 && pos == 0 && bb != 0)
|
||||||
|
// printf("[%08x] == %08X\n", 0x2dc8 + (tid + start32) * 4, data[tid]);
|
||||||
|
int remainder = data[start / 32 - start32];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
data[tid] = select(0, remainder, tid == 0);
|
||||||
|
//if (start / 32 - start32 > GROUP_SIZE)
|
||||||
|
// printf("buffer overflow: %d > %d\n", start / 32 - start32, GROUP_SIZE);
|
||||||
|
}
|
||||||
|
// if (tid == 0 && start != task.encodingOffset - task.headerLen + task.size)
|
||||||
|
//printf("size mismatch: %d != %d\n", start, task.encodingOffset - task.headerLen + task.size);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
#endif /* DO_RICE */
|
||||||
|
#endif /* DO_PARTITIONS */
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ namespace CUETools.FLACCL.cmd
|
|||||||
min_lpc_order = -1, max_lpc_order = -1,
|
min_lpc_order = -1, max_lpc_order = -1,
|
||||||
min_fixed_order = -1, max_fixed_order = -1,
|
min_fixed_order = -1, max_fixed_order = -1,
|
||||||
min_precision = -1, max_precision = -1,
|
min_precision = -1, max_precision = -1,
|
||||||
orders_per_window = -1,
|
orders_per_window = -1, orders_per_channel = -1,
|
||||||
blocksize = -1;
|
blocksize = -1;
|
||||||
int level = -1, padding = -1, vbr_mode = -1;
|
int level = -1, padding = -1, vbr_mode = -1;
|
||||||
bool do_seektable = true;
|
bool do_seektable = true;
|
||||||
@@ -99,6 +99,8 @@ namespace CUETools.FLACCL.cmd
|
|||||||
do_seektable = false;
|
do_seektable = false;
|
||||||
else if (args[arg] == "--slow-gpu")
|
else if (args[arg] == "--slow-gpu")
|
||||||
settings.GPUOnly = false;
|
settings.GPUOnly = false;
|
||||||
|
else if (args[arg] == "--do-rice")
|
||||||
|
settings.DoRice = true;
|
||||||
else if (args[arg] == "--no-md5")
|
else if (args[arg] == "--no-md5")
|
||||||
settings.DoMD5 = false;
|
settings.DoMD5 = false;
|
||||||
else if (args[arg] == "--buffered")
|
else if (args[arg] == "--buffered")
|
||||||
@@ -155,6 +157,8 @@ namespace CUETools.FLACCL.cmd
|
|||||||
ok = (++arg < args.Length) && int.TryParse(args[arg], out vbr_mode);
|
ok = (++arg < args.Length) && int.TryParse(args[arg], out vbr_mode);
|
||||||
else if (args[arg] == "--orders-per-window")
|
else if (args[arg] == "--orders-per-window")
|
||||||
ok = (++arg < args.Length) && int.TryParse(args[arg], out orders_per_window);
|
ok = (++arg < args.Length) && int.TryParse(args[arg], out orders_per_window);
|
||||||
|
else if (args[arg] == "--orders-per-channel")
|
||||||
|
ok = (++arg < args.Length) && int.TryParse(args[arg], out orders_per_channel);
|
||||||
else if ((args[arg] == "-b" || args[arg] == "--blocksize") && ++arg < args.Length)
|
else if ((args[arg] == "-b" || args[arg] == "--blocksize") && ++arg < args.Length)
|
||||||
ok = int.TryParse(args[arg], out blocksize);
|
ok = int.TryParse(args[arg], out blocksize);
|
||||||
else if ((args[arg] == "-p" || args[arg] == "--padding") && ++arg < args.Length)
|
else if ((args[arg] == "-p" || args[arg] == "--padding") && ++arg < args.Length)
|
||||||
@@ -248,6 +252,8 @@ namespace CUETools.FLACCL.cmd
|
|||||||
encoder.VBRMode = vbr_mode;
|
encoder.VBRMode = vbr_mode;
|
||||||
if (orders_per_window >= 0)
|
if (orders_per_window >= 0)
|
||||||
encoder.OrdersPerWindow = orders_per_window;
|
encoder.OrdersPerWindow = orders_per_window;
|
||||||
|
if (orders_per_channel >= 0)
|
||||||
|
encoder.OrdersPerChannel = orders_per_channel;
|
||||||
encoder.DoSeekTable = do_seektable;
|
encoder.DoSeekTable = do_seektable;
|
||||||
}
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
@@ -327,7 +333,7 @@ namespace CUETools.FLACCL.cmd
|
|||||||
Console.Out.WriteLine("{0}\t{1}\t{2}\t{3}\t{4} ({5})\t{6}/{7}+{12}{13}\t{8}..{9}\t{10}\t{11}",
|
Console.Out.WriteLine("{0}\t{1}\t{2}\t{3}\t{4} ({5})\t{6}/{7}+{12}{13}\t{8}..{9}\t{10}\t{11}",
|
||||||
encoder.TotalSize,
|
encoder.TotalSize,
|
||||||
encoder.UserProcessorTime.TotalSeconds > 0 ? encoder.UserProcessorTime.TotalSeconds : totalElapsed.TotalSeconds,
|
encoder.UserProcessorTime.TotalSeconds > 0 ? encoder.UserProcessorTime.TotalSeconds : totalElapsed.TotalSeconds,
|
||||||
encoder.StereoMethod.ToString().PadRight(15),
|
(encoder.StereoMethod.ToString() + (encoder.OrdersPerChannel == 32 ? "" : "(" + encoder.OrdersPerChannel.ToString() + ")")).PadRight(15),
|
||||||
encoder.WindowFunction.ToString().PadRight(15),
|
encoder.WindowFunction.ToString().PadRight(15),
|
||||||
encoder.MaxPartitionOrder,
|
encoder.MaxPartitionOrder,
|
||||||
settings.GPUOnly ? "GPU" : "CPU",
|
settings.GPUOnly ? "GPU" : "CPU",
|
||||||
|
|||||||
Reference in New Issue
Block a user