mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
optimizations
This commit is contained in:
@@ -842,7 +842,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
//assert(porder >= 0);
|
//assert(porder >= 0);
|
||||||
frame.writer.writebits(4, porder);
|
frame.writer.writebits(4, porder);
|
||||||
|
|
||||||
if (_settings.DoRice)
|
if (_settings.GPUOnly && _settings.DoRice)
|
||||||
{
|
{
|
||||||
if (task.BestResidualTasks[index].size != (int)sub.best.size)
|
if (task.BestResidualTasks[index].size != (int)sub.best.size)
|
||||||
throw new Exception("Encoding offset mismatch");
|
throw new Exception("Encoding offset mismatch");
|
||||||
@@ -1210,7 +1210,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly) && !_settings.DoRice)
|
if (((csum << task.frame.subframes[ch].obits) >= 1UL << 32 && !_settings.DoRice) || !_settings.GPUOnly)
|
||||||
{
|
{
|
||||||
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
|
if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true;
|
||||||
if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32)
|
if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32)
|
||||||
@@ -1311,7 +1311,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (_settings.DoRice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size)
|
if (_settings.GPUOnly && _settings.DoRice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size)
|
||||||
throw new Exception("size reported incorrectly");
|
throw new Exception("size reported incorrectly");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2914,7 +2914,7 @@ namespace CUETools.Codecs.FLACCL
|
|||||||
{
|
{
|
||||||
openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParamsPtr);
|
openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParamsPtr);
|
||||||
if (writer._settings.DoRice)
|
if (writer._settings.DoRice)
|
||||||
openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clRiceOutputPtr);
|
openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, (channels * frameSize * 17 + 128) / 8 * frameCount, clRiceOutputPtr);
|
||||||
else
|
else
|
||||||
openCLCQ.EnqueueReadBuffer(clResidual, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clResidualPtr);
|
openCLCQ.EnqueueReadBuffer(clResidual, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clResidualPtr);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1735,9 +1735,10 @@ void clRiceEncoding(
|
|||||||
flush(&bw);
|
flush(&bw);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
#define WARP_SIZE 32
|
||||||
__local FLACCLSubframeData task;
|
__local FLACCLSubframeData task;
|
||||||
__local int riceparams[256];
|
__local int riceparams[256];
|
||||||
__local int mypos[GROUP_SIZE];
|
__local int mypos[GROUP_SIZE+1];
|
||||||
__local unsigned int data[GROUP_SIZE];
|
__local unsigned int data[GROUP_SIZE];
|
||||||
__local int start;
|
__local int start;
|
||||||
|
|
||||||
@@ -1749,6 +1750,8 @@ void clRiceEncoding(
|
|||||||
riceparams[offs] = best_rice_parameters[(get_group_id(0) << max_porder) + offs];
|
riceparams[offs] = best_rice_parameters[(get_group_id(0) << max_porder) + offs];
|
||||||
if (tid == 0)
|
if (tid == 0)
|
||||||
start = task.encodingOffset;
|
start = task.encodingOffset;
|
||||||
|
if (tid == 0)
|
||||||
|
mypos[GROUP_SIZE] = 0;
|
||||||
data[tid] = 0;
|
data[tid] = 0;
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
int bs = task.blocksize;
|
int bs = task.blocksize;
|
||||||
@@ -1762,8 +1765,32 @@ void clRiceEncoding(
|
|||||||
v = (v << 1) ^ (v >> 31);
|
v = (v << 1) ^ (v >> 31);
|
||||||
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
|
int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs);
|
||||||
mypos[tid] = mylen;
|
mypos[tid] = mylen;
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
// Inclusive scan(+)
|
// Inclusive scan(+)
|
||||||
|
#if 1
|
||||||
|
for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
|
||||||
|
mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, offset <= (tid & (WARP_SIZE - 1)))];
|
||||||
|
#if 1
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
for (int j = GROUP_SIZE - WARP_SIZE; j > 0; j -= WARP_SIZE)
|
||||||
|
{
|
||||||
|
if (tid >= j)
|
||||||
|
mypos[tid] += mypos[j - 1];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if ((tid & (WARP_SIZE - 1)) == WARP_SIZE - 1)
|
||||||
|
warppos[tid/WARP_SIZE] = mypos[tid];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
for (int offset = 1; offset < GROUP_SIZE/WARP_SIZE; offset <<= 1)
|
||||||
|
{
|
||||||
|
if (offset <= tid && tid < GROUP_SIZE/WARP_SIZE)
|
||||||
|
warppos[tid] += warppos[tid - offset];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
mypos[tid] += tid / WARP_SIZE == 0 ? 0 : warppos[tid / WARP_SIZE - 1];
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
for (int offset = 1; offset < GROUP_SIZE; offset <<= 1)
|
for (int offset = 1; offset < GROUP_SIZE; offset <<= 1)
|
||||||
{
|
{
|
||||||
int t = tid >= offset ? mypos[tid - offset] : 0;
|
int t = tid >= offset ? mypos[tid - offset] : 0;
|
||||||
@@ -1771,7 +1798,7 @@ void clRiceEncoding(
|
|||||||
mypos[tid] += t;
|
mypos[tid] += t;
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
// make it exclusive
|
#endif
|
||||||
//if ((get_global_id(0) == 64 || get_global_id(0) == 63) && pos == 0)
|
//if ((get_global_id(0) == 64 || get_global_id(0) == 63) && pos == 0)
|
||||||
// printf("v=%x,k=%d,mylen=%d,mypos=%d,pstart=%d,partlen=%d,start=%d\n", v, k, mylen, mypos[tid-1], pstart, partlen, start);
|
// printf("v=%x,k=%d,mylen=%d,mypos=%d,pstart=%d,partlen=%d,start=%d\n", v, k, mylen, mypos[tid-1], pstart, partlen, start);
|
||||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|||||||
Reference in New Issue
Block a user