mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
optimizations
This commit is contained in:
@@ -101,9 +101,11 @@ namespace CUETools.Codecs.FlaCuda
|
||||
int nResidualTasks = 0;
|
||||
int nAutocorTasks = 0;
|
||||
|
||||
bool encode_on_cpu = true;
|
||||
|
||||
public const int MAX_BLOCKSIZE = 4608 * 4;
|
||||
internal const int maxFrames = 8;
|
||||
internal const int maxResidualParts = (MAX_BLOCKSIZE + 32 * 3) / (32 * 3);
|
||||
internal const int maxResidualParts = 64;
|
||||
internal const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32);
|
||||
|
||||
public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
|
||||
@@ -439,47 +441,6 @@ namespace CUETools.Codecs.FlaCuda
|
||||
samplesInBuffer += block;
|
||||
}
|
||||
|
||||
static uint rice_encode_count(uint sum, uint n, uint k)
|
||||
{
|
||||
return n*(k+1) + ((sum-(n>>1))>>(int)k);
|
||||
}
|
||||
|
||||
//static unsafe uint find_optimal_rice_param(uint sum, uint n)
|
||||
//{
|
||||
// uint* nbits = stackalloc uint[Flake.MAX_RICE_PARAM + 1];
|
||||
// int k_opt = 0;
|
||||
|
||||
// nbits[0] = UINT32_MAX;
|
||||
// for (int k = 0; k <= Flake.MAX_RICE_PARAM; k++)
|
||||
// {
|
||||
// nbits[k] = rice_encode_count(sum, n, (uint)k);
|
||||
// if (nbits[k] < nbits[k_opt])
|
||||
// k_opt = k;
|
||||
// }
|
||||
// return (uint)k_opt;
|
||||
//}
|
||||
|
||||
static unsafe int find_optimal_rice_param(uint sum, uint n, out uint nbits_best)
|
||||
{
|
||||
int k_opt = 0;
|
||||
uint a = n;
|
||||
uint b = sum - (n >> 1);
|
||||
uint nbits = a + b;
|
||||
for (int k = 1; k <= Flake.MAX_RICE_PARAM; k++)
|
||||
{
|
||||
a += n;
|
||||
b >>= 1;
|
||||
uint nbits_k = a + b;
|
||||
if (nbits_k < nbits)
|
||||
{
|
||||
k_opt = k;
|
||||
nbits = nbits_k;
|
||||
}
|
||||
}
|
||||
nbits_best = nbits;
|
||||
return k_opt;
|
||||
}
|
||||
|
||||
unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int* src, int blocksize)
|
||||
{
|
||||
for (int i = 0; i < blocksize; i++)
|
||||
@@ -546,6 +507,47 @@ namespace CUETools.Codecs.FlaCuda
|
||||
}
|
||||
}
|
||||
|
||||
static uint rice_encode_count(uint sum, uint n, uint k)
|
||||
{
|
||||
return n * (k + 1) + ((sum - (n >> 1)) >> (int)k);
|
||||
}
|
||||
|
||||
//static unsafe uint find_optimal_rice_param(uint sum, uint n)
|
||||
//{
|
||||
// uint* nbits = stackalloc uint[Flake.MAX_RICE_PARAM + 1];
|
||||
// int k_opt = 0;
|
||||
|
||||
// nbits[0] = UINT32_MAX;
|
||||
// for (int k = 0; k <= Flake.MAX_RICE_PARAM; k++)
|
||||
// {
|
||||
// nbits[k] = rice_encode_count(sum, n, (uint)k);
|
||||
// if (nbits[k] < nbits[k_opt])
|
||||
// k_opt = k;
|
||||
// }
|
||||
// return (uint)k_opt;
|
||||
//}
|
||||
|
||||
static unsafe int find_optimal_rice_param(uint sum, uint n, out uint nbits_best)
|
||||
{
|
||||
int k_opt = 0;
|
||||
uint a = n;
|
||||
uint b = sum - (n >> 1);
|
||||
uint nbits = a + b;
|
||||
for (int k = 1; k <= Flake.MAX_RICE_PARAM; k++)
|
||||
{
|
||||
a += n;
|
||||
b >>= 1;
|
||||
uint nbits_k = a + b;
|
||||
if (nbits_k < nbits)
|
||||
{
|
||||
k_opt = k;
|
||||
nbits = nbits_k;
|
||||
}
|
||||
}
|
||||
nbits_best = nbits;
|
||||
return k_opt;
|
||||
}
|
||||
|
||||
static unsafe uint calc_optimal_rice_params(ref RiceContext rc, int porder, uint* sums, uint n, uint pred_order)
|
||||
{
|
||||
uint part = (1U << porder);
|
||||
@@ -563,22 +565,28 @@ namespace CUETools.Codecs.FlaCuda
|
||||
return all_bits;
|
||||
}
|
||||
|
||||
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
|
||||
static unsafe void calc_sums(int pmin, int pmax, int* data, uint n, uint pred_order, uint* sums)
|
||||
{
|
||||
// sums for highest level
|
||||
int parts = (1 << pmax);
|
||||
uint* res = data + pred_order;
|
||||
int* res = data + pred_order;
|
||||
uint cnt = (n >> pmax) - pred_order;
|
||||
uint sum = 0;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
sum += *(res++);
|
||||
{
|
||||
int val = *(res++);
|
||||
sum += (uint)((val << 1) ^ (val >> 31));
|
||||
}
|
||||
sums[pmax * Flake.MAX_PARTITIONS + 0] = sum;
|
||||
cnt = (n >> pmax);
|
||||
for (int i = 1; i < parts; i++)
|
||||
{
|
||||
sum = 0;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
sum += *(res++);
|
||||
{
|
||||
int val = *(res++);
|
||||
sum += (uint)((val << 1) ^ (val >> 31));
|
||||
}
|
||||
sums[pmax * Flake.MAX_PARTITIONS + i] = sum;
|
||||
}
|
||||
// sums for lower levels
|
||||
@@ -596,7 +604,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
|
||||
static unsafe uint calc_rice_params(ref RiceContext rc, ref RiceContext tmp_rc, int pmin, int pmax, int* data, uint n, uint pred_order)
|
||||
{
|
||||
uint* udata = stackalloc uint[(int)n];
|
||||
//uint* udata = stackalloc uint[(int)n];
|
||||
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
|
||||
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
|
||||
|
||||
@@ -604,10 +612,10 @@ namespace CUETools.Codecs.FlaCuda
|
||||
//assert(pmax >= 0 && pmax <= Flake.MAX_PARTITION_ORDER);
|
||||
//assert(pmin <= pmax);
|
||||
|
||||
for (uint i = 0; i < n; i++)
|
||||
udata[i] = (uint) ((2 * data[i]) ^ (data[i] >> 31));
|
||||
//for (uint i = 0; i < n; i++)
|
||||
// udata[i] = (uint) ((2 * data[i]) ^ (data[i] >> 31));
|
||||
|
||||
calc_sums(pmin, pmax, udata, n, pred_order, sums);
|
||||
calc_sums(pmin, pmax, data, n, pred_order, sums);
|
||||
|
||||
int opt_porder = pmin;
|
||||
uint opt_bits = AudioSamples.UINT32_MAX;
|
||||
@@ -790,45 +798,6 @@ namespace CUETools.Codecs.FlaCuda
|
||||
bitwriter.flush();
|
||||
}
|
||||
|
||||
unsafe uint measure_frame_size(FlacFrame frame, bool do_midside)
|
||||
{
|
||||
// crude estimation of header/footer size
|
||||
uint total = (uint)(32 + ((BitReader.log2i(frame_count) + 4) / 5) * 8 + (eparams.variable_block_size != 0 ? 16 : 0) + 16);
|
||||
|
||||
if (do_midside)
|
||||
{
|
||||
uint bitsBest = AudioSamples.UINT32_MAX;
|
||||
ChannelMode modeBest = ChannelMode.LeftRight;
|
||||
|
||||
if (bitsBest > frame.subframes[2].best.size + frame.subframes[3].best.size)
|
||||
{
|
||||
bitsBest = frame.subframes[2].best.size + frame.subframes[3].best.size;
|
||||
modeBest = ChannelMode.MidSide;
|
||||
}
|
||||
if (bitsBest > frame.subframes[3].best.size + frame.subframes[1].best.size)
|
||||
{
|
||||
bitsBest = frame.subframes[3].best.size + frame.subframes[1].best.size;
|
||||
modeBest = ChannelMode.RightSide;
|
||||
}
|
||||
if (bitsBest > frame.subframes[3].best.size + frame.subframes[0].best.size)
|
||||
{
|
||||
bitsBest = frame.subframes[3].best.size + frame.subframes[0].best.size;
|
||||
modeBest = ChannelMode.LeftSide;
|
||||
}
|
||||
if (bitsBest > frame.subframes[0].best.size + frame.subframes[1].best.size)
|
||||
{
|
||||
bitsBest = frame.subframes[0].best.size + frame.subframes[1].best.size;
|
||||
modeBest = ChannelMode.LeftRight;
|
||||
}
|
||||
frame.ch_mode = modeBest;
|
||||
return total + bitsBest;
|
||||
}
|
||||
|
||||
for (int ch = 0; ch < channels; ch++)
|
||||
total += frame.subframes[ch].best.size;
|
||||
return total;
|
||||
}
|
||||
|
||||
unsafe delegate void window_function(float* window, int size);
|
||||
|
||||
unsafe void calculate_window(float* window, window_function func, WindowFunction flag)
|
||||
@@ -869,39 +838,58 @@ namespace CUETools.Codecs.FlaCuda
|
||||
for (int order = 1; order <= max_order; order++)
|
||||
{
|
||||
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.LPC;
|
||||
task.ResidualTasks[nResidualTasks].channel = ch;
|
||||
task.ResidualTasks[nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
|
||||
task.ResidualTasks[nResidualTasks].blocksize = blocksize;
|
||||
task.ResidualTasks[nResidualTasks].residualOrder = order <= max_order ? order : 0;
|
||||
task.ResidualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
|
||||
task.ResidualTasks[nResidualTasks].residualOffs = task.ResidualTasks[nResidualTasks].samplesOffs;
|
||||
nResidualTasks++;
|
||||
}
|
||||
}
|
||||
// Fixed prediction
|
||||
for (int order = 1; order <= max_order; order++)
|
||||
{
|
||||
task.ResidualTasks[nResidualTasks].type = order <= 5 ? (int)SubframeType.Fixed : (int)SubframeType.Verbatim;
|
||||
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Verbatim;
|
||||
task.ResidualTasks[nResidualTasks].channel = ch;
|
||||
task.ResidualTasks[nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
|
||||
task.ResidualTasks[nResidualTasks].blocksize = blocksize;
|
||||
task.ResidualTasks[nResidualTasks].residualOrder = order <= 4 ? order : 0;
|
||||
task.ResidualTasks[nResidualTasks].residualOrder = 0;
|
||||
task.ResidualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
|
||||
task.ResidualTasks[nResidualTasks].residualOffs = task.ResidualTasks[nResidualTasks].samplesOffs;
|
||||
task.ResidualTasks[nResidualTasks].shift = 0;
|
||||
switch (order)
|
||||
{
|
||||
case 5:
|
||||
break;
|
||||
case 1:
|
||||
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Constant;
|
||||
task.ResidualTasks[nResidualTasks].residualOrder = 1;
|
||||
task.ResidualTasks[nResidualTasks].coefs[0] = 1;
|
||||
break;
|
||||
case 3:
|
||||
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed;
|
||||
task.ResidualTasks[nResidualTasks].residualOrder = 0;
|
||||
break;
|
||||
case 4:
|
||||
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed;
|
||||
task.ResidualTasks[nResidualTasks].residualOrder = 1;
|
||||
task.ResidualTasks[nResidualTasks].coefs[0] = 1;
|
||||
break;
|
||||
case 2:
|
||||
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed;
|
||||
task.ResidualTasks[nResidualTasks].residualOrder = 2;
|
||||
task.ResidualTasks[nResidualTasks].coefs[1] = 2;
|
||||
task.ResidualTasks[nResidualTasks].coefs[0] = -1;
|
||||
break;
|
||||
case 3:
|
||||
case 5:
|
||||
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed;
|
||||
task.ResidualTasks[nResidualTasks].residualOrder = 3;
|
||||
task.ResidualTasks[nResidualTasks].coefs[2] = 3;
|
||||
task.ResidualTasks[nResidualTasks].coefs[1] = -3;
|
||||
task.ResidualTasks[nResidualTasks].coefs[0] = 1;
|
||||
break;
|
||||
case 4:
|
||||
case 6:
|
||||
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed;
|
||||
task.ResidualTasks[nResidualTasks].residualOrder = 4;
|
||||
task.ResidualTasks[nResidualTasks].coefs[3] = 4;
|
||||
task.ResidualTasks[nResidualTasks].coefs[2] = -6;
|
||||
task.ResidualTasks[nResidualTasks].coefs[1] = 4;
|
||||
@@ -950,7 +938,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
csum += (ulong)Math.Abs(coefs[i - 1]);
|
||||
if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32)
|
||||
lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
|
||||
else
|
||||
else if (encode_on_cpu)
|
||||
lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
|
||||
|
||||
int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||
@@ -973,33 +961,33 @@ namespace CUETools.Codecs.FlaCuda
|
||||
|
||||
unsafe void select_best_methods(FlacFrame frame, int channelsCount, int max_order, int iFrame, FlaCudaTask task)
|
||||
{
|
||||
for (int ch = 0; ch < channelsCount; ch++)
|
||||
if (channelsCount == 4 && channels == 2)
|
||||
{
|
||||
int i;
|
||||
for (i = 1; i < frame.blocksize; i++)
|
||||
if (frame.subframes[ch].samples[i] != frame.subframes[ch].samples[0])
|
||||
break;
|
||||
// CONSTANT
|
||||
if (i == frame.blocksize)
|
||||
{
|
||||
frame.subframes[ch].best.type = SubframeType.Constant;
|
||||
frame.subframes[ch].best.size = frame.subframes[ch].obits;
|
||||
}
|
||||
// VERBATIM
|
||||
if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
|
||||
frame.ch_mode = ChannelMode.LeftRight;
|
||||
else if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3)
|
||||
frame.ch_mode = ChannelMode.LeftSide;
|
||||
else if (task.BestResidualTasks[iFrame * 2].channel == 3 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
|
||||
frame.ch_mode = ChannelMode.RightSide;
|
||||
else if (task.BestResidualTasks[iFrame * 2].channel == 2 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3)
|
||||
frame.ch_mode = ChannelMode.MidSide;
|
||||
else
|
||||
throw new Exception("internal error: invalid stereo mode");
|
||||
frame.SwapSubframes(0, task.BestResidualTasks[iFrame * 2].channel);
|
||||
frame.SwapSubframes(1, task.BestResidualTasks[iFrame * 2 + 1].channel);
|
||||
}
|
||||
else
|
||||
frame.ch_mode = channels != 2 ? ChannelMode.NotStereo : ChannelMode.LeftRight;
|
||||
|
||||
for (int ch = 0; ch < channels; ch++)
|
||||
{
|
||||
frame.subframes[ch].best.type = SubframeType.Verbatim;
|
||||
frame.subframes[ch].best.size = frame.subframes[ch].obits * (uint)frame.blocksize;
|
||||
}
|
||||
}
|
||||
|
||||
if (frame.blocksize <= 4)
|
||||
return;
|
||||
|
||||
for (int ch = 0; ch < channelsCount; ch++)
|
||||
{
|
||||
int index = ch + iFrame * channelsCount;
|
||||
if (frame.subframes[ch].best.size > task.BestResidualTasks[index].size)
|
||||
int index = ch + iFrame * channels;
|
||||
if (task.BestResidualTasks[index].size < 0)
|
||||
throw new Exception("internal error");
|
||||
if (frame.blocksize > 4 && frame.subframes[ch].best.size > task.BestResidualTasks[index].size)
|
||||
{
|
||||
frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type;
|
||||
frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size;
|
||||
@@ -1008,47 +996,10 @@ namespace CUETools.Codecs.FlaCuda
|
||||
frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift;
|
||||
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
|
||||
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
|
||||
AudioSamples.MemCpy(frame.subframes[ch].best.residual + frame.subframes[ch].best.order, (int*)task.residualBufferPtr + task.BestResidualTasks[index].samplesOffs, frame.blocksize - frame.subframes[ch].best.order);
|
||||
if (!encode_on_cpu)
|
||||
AudioSamples.MemCpy(frame.subframes[ch].best.residual + frame.subframes[ch].best.order, (int*)task.residualBufferPtr + task.BestResidualTasks[index].residualOffs, frame.blocksize - frame.subframes[ch].best.order);
|
||||
}
|
||||
//for (int iWindow = 0; iWindow < _windowcount; iWindow++)
|
||||
//{
|
||||
// for (int order = 1; order <= max_order && order < frame.blocksize; order++)
|
||||
// {
|
||||
// int index = (order - 1) + max_order * (iWindow + (_windowcount + 1) * (ch + iFrame * channelsCount));
|
||||
// if (task.ResidualTasks[index].residualOrder != order || task.ResidualTasks[index].type != (int)SubframeType.LPC)
|
||||
// throw new Exception("oops");
|
||||
// if (frame.subframes[ch].best.size > task.ResidualTasks[index].size)
|
||||
// {
|
||||
// frame.subframes[ch].best.type = SubframeType.LPC;
|
||||
// frame.subframes[ch].best.size = (uint)task.ResidualTasks[index].size;
|
||||
// frame.subframes[ch].best.order = task.ResidualTasks[index].residualOrder;
|
||||
// //frame.subframes[ch].best.window = iWindow;
|
||||
// frame.subframes[ch].best.cbits = task.ResidualTasks[index].cbits;
|
||||
// frame.subframes[ch].best.shift = task.ResidualTasks[index].shift;
|
||||
// for (int i = 0; i < order; i++)
|
||||
// frame.subframes[ch].best.coefs[i] = task.ResidualTasks[index].coefs[order - 1 - i];
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
}
|
||||
|
||||
// FIXED
|
||||
//for (int ch = 0; ch < channelsCount; ch++)
|
||||
//{
|
||||
// for (int order = 1; order <= 5 && order <= max_order && order < frame.blocksize; order++)
|
||||
// {
|
||||
// int index = (order - 1) + max_order * (_windowcount + (_windowcount + 1) * (ch + iFrame * channelsCount));
|
||||
// int forder = order == 5 ? 0 : order;
|
||||
// if (task.ResidualTasks[index].residualOrder != (order == 5 ? 1 : order))
|
||||
// throw new Exception("oops");
|
||||
// if (frame.subframes[ch].best.size > task.ResidualTasks[index].size)
|
||||
// {
|
||||
// frame.subframes[ch].best.type = SubframeType.Fixed;
|
||||
// frame.subframes[ch].best.size = (uint)task.ResidualTasks[index].size;
|
||||
// frame.subframes[ch].best.order = forder;
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
}
|
||||
|
||||
unsafe void estimate_residual(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
|
||||
@@ -1090,18 +1041,30 @@ namespace CUETools.Codecs.FlaCuda
|
||||
|
||||
cuda.SetParameter(task.cudaSumResidual, 0, (uint)task.cudaResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaSumResidual, sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
|
||||
cuda.SetParameter(task.cudaSumResidual, sizeof(uint) * 2, (uint)partSize);
|
||||
cuda.SetParameter(task.cudaSumResidual, sizeof(uint) * 3, (uint)partCount);
|
||||
cuda.SetParameterSize(task.cudaSumResidual, sizeof(uint) * 4U);
|
||||
cuda.SetParameter(task.cudaSumResidual, 2 * sizeof(uint), (uint)partCount);
|
||||
cuda.SetParameterSize(task.cudaSumResidual, sizeof(uint) * 3U);
|
||||
cuda.SetFunctionBlockShape(task.cudaSumResidual, 64, 1, 1);
|
||||
|
||||
int tasksPerChannel = (_windowcount + 1) * max_order;
|
||||
int nBestTasks = nResidualTasks / tasksPerChannel;
|
||||
cuda.SetParameter(task.cudaChooseBestResidual, 0, (uint)task.cudaBestResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaChooseBestResidual, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaChooseBestResidual, 2 * sizeof(uint), (uint)tasksPerChannel);
|
||||
cuda.SetParameterSize(task.cudaChooseBestResidual, sizeof(uint) * 3U);
|
||||
cuda.SetFunctionBlockShape(task.cudaChooseBestResidual, 256, 1, 1);
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)partCount);
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)tasksPerChannel);
|
||||
cuda.SetParameterSize(task.cudaChooseBestMethod, sizeof(uint) * 4U);
|
||||
cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 16, 1);
|
||||
|
||||
cuda.SetParameter(task.cudaCopyBestMethod, 0, (uint)task.cudaBestResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaCopyBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaCopyBestMethod, 2 * sizeof(uint), (uint)tasksPerChannel);
|
||||
cuda.SetParameterSize(task.cudaCopyBestMethod, sizeof(uint) * 3U);
|
||||
cuda.SetFunctionBlockShape(task.cudaCopyBestMethod, 64, 1, 1);
|
||||
|
||||
cuda.SetParameter(task.cudaCopyBestMethodStereo, 0, (uint)task.cudaBestResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaCopyBestMethodStereo, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaCopyBestMethodStereo, 2 * sizeof(uint), (uint)tasksPerChannel);
|
||||
cuda.SetParameterSize(task.cudaCopyBestMethodStereo, sizeof(uint) * 3U);
|
||||
cuda.SetFunctionBlockShape(task.cudaCopyBestMethodStereo, 64, 1, 1);
|
||||
|
||||
cuda.SetParameter(task.cudaEncodeResidual, 0, (uint)task.cudaResidual.Pointer);
|
||||
cuda.SetParameter(task.cudaEncodeResidual, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
|
||||
@@ -1111,11 +1074,20 @@ namespace CUETools.Codecs.FlaCuda
|
||||
|
||||
// issue work to the GPU
|
||||
cuda.LaunchAsync(task.cudaEstimateResidual, partCount, (nResidualTasks / threads_y * nFrames) / maxFrames, task.stream);
|
||||
cuda.LaunchAsync(task.cudaSumResidual, 1, (nResidualTasks * nFrames) / maxFrames, task.stream);
|
||||
cuda.LaunchAsync(task.cudaChooseBestResidual, 1, (nBestTasks * nFrames) / maxFrames, task.stream);
|
||||
//cuda.LaunchAsync(task.cudaEncodeResidual, partCount, (nBestTasks * nFrames) / maxFrames, task.stream);
|
||||
//cuda.LaunchAsync(task.cudaSumResidual, 1, (nResidualTasks * nFrames) / maxFrames, task.stream);
|
||||
cuda.LaunchAsync(task.cudaChooseBestMethod, 1, (nBestTasks * nFrames) / maxFrames, task.stream);
|
||||
if (channels == 2 && channelsCount == 4)
|
||||
{
|
||||
cuda.LaunchAsync(task.cudaCopyBestMethodStereo, 1, (nBestTasks * nFrames) / maxFrames / 4, task.stream);
|
||||
nBestTasks /= 2;
|
||||
}
|
||||
else
|
||||
cuda.LaunchAsync(task.cudaCopyBestMethod, 1, (nBestTasks * nFrames) / maxFrames, task.stream);
|
||||
if (!encode_on_cpu)
|
||||
cuda.LaunchAsync(task.cudaEncodeResidual, partCount, (nBestTasks * nFrames) / maxFrames, task.stream);
|
||||
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nBestTasks * nFrames) / maxFrames)), task.stream);
|
||||
//cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), task.stream);
|
||||
if (!encode_on_cpu)
|
||||
cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channels), task.stream);
|
||||
}
|
||||
|
||||
unsafe void compute_autocorellation(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
|
||||
@@ -1169,14 +1141,6 @@ namespace CUETools.Codecs.FlaCuda
|
||||
|
||||
select_best_methods(frame, channelCount, eparams.max_prediction_order, iFrame, task);
|
||||
|
||||
if (doMidside)
|
||||
{
|
||||
measure_frame_size(frame, true);
|
||||
frame.ChooseSubframes();
|
||||
}
|
||||
else
|
||||
frame.ch_mode = channels != 2 ? ChannelMode.NotStereo : ChannelMode.LeftRight;
|
||||
|
||||
encode_residual(frame);
|
||||
|
||||
frame_writer.Reset();
|
||||
@@ -1610,6 +1574,8 @@ namespace CUETools.Codecs.FlaCuda
|
||||
frame_buffer = new byte[max_frame_size + 1];
|
||||
frame_writer = new BitWriter(frame_buffer, 0, max_frame_size + 1);
|
||||
|
||||
encode_on_cpu = eparams.lpc_max_precision_search <= 1;
|
||||
|
||||
return header_len;
|
||||
}
|
||||
}
|
||||
@@ -1792,7 +1758,10 @@ namespace CUETools.Codecs.FlaCuda
|
||||
public int type;
|
||||
public int obits;
|
||||
public int blocksize;
|
||||
public fixed int reserved[8];
|
||||
public int best_index;
|
||||
public int channel;
|
||||
public int residualOffs;
|
||||
public fixed int reserved[5];
|
||||
public fixed int coefs[32];
|
||||
};
|
||||
|
||||
@@ -1802,7 +1771,9 @@ namespace CUETools.Codecs.FlaCuda
|
||||
public CUfunction cudaComputeAutocor;
|
||||
public CUfunction cudaComputeLPC;
|
||||
public CUfunction cudaEstimateResidual;
|
||||
public CUfunction cudaChooseBestResidual;
|
||||
public CUfunction cudaChooseBestMethod;
|
||||
public CUfunction cudaCopyBestMethod;
|
||||
public CUfunction cudaCopyBestMethodStereo;
|
||||
//public CUfunction cudaSumResidualChunks;
|
||||
public CUfunction cudaSumResidual;
|
||||
public CUfunction cudaEncodeResidual;
|
||||
@@ -1842,7 +1813,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FlaCudaWriter.maxAutocorParts));
|
||||
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
|
||||
cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
|
||||
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * FlaCudaWriter.maxResidualParts));
|
||||
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * FlaCudaWriter.maxResidualParts * FlaCudaWriter.maxFrames));
|
||||
CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen);
|
||||
if (cuErr == CUResult.Success)
|
||||
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
|
||||
@@ -1866,7 +1837,9 @@ namespace CUETools.Codecs.FlaCuda
|
||||
cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
|
||||
cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
|
||||
cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
|
||||
cudaChooseBestResidual = cuda.GetModuleFunction("cudaChooseBestResidual");
|
||||
cudaChooseBestMethod = cuda.GetModuleFunction("cudaChooseBestMethod");
|
||||
cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod");
|
||||
cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo");
|
||||
cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
|
||||
//cudaSumResidualChunks = cuda.GetModuleFunction("cudaSumResidualChunks");
|
||||
|
||||
|
||||
@@ -46,7 +46,10 @@ typedef struct
|
||||
int type;
|
||||
int obits;
|
||||
int blocksize;
|
||||
int reserved[8];
|
||||
int best_index;
|
||||
int channel;
|
||||
int residualOffs;
|
||||
int reserved[5];
|
||||
int coefs[32];
|
||||
} encodeResidualTaskStruct;
|
||||
|
||||
@@ -246,7 +249,7 @@ extern "C" __global__ void cudaEstimateResidual(
|
||||
for (c = 0; c < shared.task[threadIdx.y].residualOrder; c++)
|
||||
sum += __mul24(shared.data[ptr + c], shared.task[threadIdx.y].coefs[c]);
|
||||
sum = shared.data[ptr + c] - (sum >> shared.task[threadIdx.y].shift);
|
||||
shared.residual[tid] += __mul24(ptr < residualLen, (sum << 1) ^ (sum >> 31));
|
||||
shared.residual[tid] += __mul24(ptr < residualLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
|
||||
}
|
||||
|
||||
// enable this line when using blockDim.x == 64
|
||||
@@ -258,13 +261,14 @@ extern "C" __global__ void cudaEstimateResidual(
|
||||
shared.residual[tid] += shared.residual[tid + 1];
|
||||
|
||||
// rice parameter search
|
||||
shared.residual[tid] = __mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y * blockDim.x] - (residualLen >> 1)) >> threadIdx.x);
|
||||
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y * blockDim.x] != 0) *
|
||||
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y * blockDim.x] - (residualLen >> 1)) >> threadIdx.x));
|
||||
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 8]);
|
||||
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 4]);
|
||||
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 2]);
|
||||
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 1]);
|
||||
if (threadIdx.x == 0)
|
||||
output[(blockIdx.y * blockDim.y + threadIdx.y) * gridDim.x + blockIdx.x] = shared.residual[tid];
|
||||
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid];
|
||||
}
|
||||
|
||||
// blockDim.x == 256
|
||||
@@ -327,12 +331,11 @@ extern "C" __global__ void cudaSumResidualChunks(
|
||||
extern "C" __global__ void cudaSumResidual(
|
||||
encodeResidualTaskStruct *tasks,
|
||||
int *residual,
|
||||
int partSize,
|
||||
int partCount // <= blockDim.y (256)
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
int partLen[256];
|
||||
volatile int partLen[256];
|
||||
encodeResidualTaskStruct task;
|
||||
} shared;
|
||||
|
||||
@@ -355,47 +358,154 @@ extern "C" __global__ void cudaSumResidual(
|
||||
shared.partLen[tid] += shared.partLen[tid + 1];
|
||||
// return sum
|
||||
if (tid == 0)
|
||||
tasks[blockIdx.y].size = shared.task.type == Fixed ?
|
||||
shared.task.residualOrder * shared.task.obits + 6 + shared.partLen[0] : shared.task.type == LPC ?
|
||||
shared.task.residualOrder * shared.task.obits + 4 + 5 + shared.task.residualOrder * shared.task.cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[0] :
|
||||
shared.task.obits * shared.task.blocksize;
|
||||
tasks[blockIdx.y].size = min(shared.task.obits * shared.task.blocksize,
|
||||
shared.task.type == Fixed ? shared.task.residualOrder * shared.task.obits + 6 + shared.partLen[0] :
|
||||
shared.task.type == LPC ? shared.task.residualOrder * shared.task.obits + 4 + 5 + shared.task.residualOrder * shared.task.cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[0] :
|
||||
shared.task.type == Constant ? shared.task.obits * (1 + shared.task.blocksize * (shared.partLen[0] != 0)) :
|
||||
shared.task.obits * shared.task.blocksize);
|
||||
}
|
||||
|
||||
#define BEST_INDEX(a,b) ((a) + ((b) - (a)) * (shared.length[b] < shared.length[a]))
|
||||
|
||||
extern "C" __global__ void cudaChooseBestResidual(
|
||||
extern "C" __global__ void cudaChooseBestMethod(
|
||||
encodeResidualTaskStruct *tasks,
|
||||
int *residual,
|
||||
int partCount, // <= blockDim.y (256)
|
||||
int taskCount
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
volatile int index[128];
|
||||
volatile int partLen[512];
|
||||
int length[256];
|
||||
volatile encodeResidualTaskStruct task[16];
|
||||
} shared;
|
||||
const int tid = threadIdx.x + threadIdx.y * 32;
|
||||
|
||||
if (tid < 256) shared.length[tid] = 0x7fffffff;
|
||||
for (int task = 0; task < taskCount; task += blockDim.y)
|
||||
if (task + threadIdx.y < taskCount)
|
||||
{
|
||||
// fetch task data
|
||||
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(tasks + task + threadIdx.y + taskCount * blockIdx.y))[threadIdx.x];
|
||||
|
||||
int sum = 0;
|
||||
for (int pos = 0; pos < partCount; pos += blockDim.x)
|
||||
sum += (pos + threadIdx.x < partCount ? residual[pos + threadIdx.x + 64 * (task + threadIdx.y + taskCount * blockIdx.y)] : 0);
|
||||
shared.partLen[tid] = sum;
|
||||
|
||||
// length sum: reduction in shared mem
|
||||
shared.partLen[tid] += shared.partLen[tid + 16];
|
||||
shared.partLen[tid] += shared.partLen[tid + 8];
|
||||
shared.partLen[tid] += shared.partLen[tid + 4];
|
||||
shared.partLen[tid] += shared.partLen[tid + 2];
|
||||
shared.partLen[tid] += shared.partLen[tid + 1];
|
||||
// return sum
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
shared.length[task + threadIdx.y] =
|
||||
min(shared.task[threadIdx.y].obits * shared.task[threadIdx.y].blocksize,
|
||||
shared.task[threadIdx.y].type == Fixed ? shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].obits + 6 + shared.partLen[threadIdx.y * 32] :
|
||||
shared.task[threadIdx.y].type == LPC ? shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].obits + 4 + 5 + shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[threadIdx.y * 32] :
|
||||
shared.task[threadIdx.y].type == Constant ? shared.task[threadIdx.y].obits * (1 + shared.task[threadIdx.y].blocksize * (shared.partLen[threadIdx.y * 32] != 0)) :
|
||||
shared.task[threadIdx.y].obits * shared.task[threadIdx.y].blocksize);
|
||||
}
|
||||
}
|
||||
//shared.index[threadIdx.x] = threadIdx.x;
|
||||
//shared.length[threadIdx.x] = (threadIdx.x < taskCount) ? tasks[threadIdx.x + taskCount * blockIdx.y].size : 0x7fffffff;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
//if (tid < 128) shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 128]); __syncthreads();
|
||||
if (tid < 128) shared.index[tid] = BEST_INDEX(tid, tid + 128); __syncthreads();
|
||||
if (tid < 64) shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 64]); __syncthreads();
|
||||
if (tid < 32)
|
||||
{
|
||||
shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 32]);
|
||||
shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 16]);
|
||||
shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 8]);
|
||||
shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 4]);
|
||||
shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 2]);
|
||||
shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 1]);
|
||||
}
|
||||
__syncthreads();
|
||||
// if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
|
||||
//((int*)(tasks_out + blockIdx.y))[threadIdx.x] = ((int*)(tasks + taskCount * blockIdx.y + shared.index[0]))[threadIdx.x];
|
||||
if (tid == 0)
|
||||
tasks[taskCount * blockIdx.y].best_index = taskCount * blockIdx.y + shared.index[0];
|
||||
if (tid < taskCount)
|
||||
tasks[tid + taskCount * blockIdx.y].size = shared.length[tid];
|
||||
}
|
||||
|
||||
extern "C" __global__ void cudaCopyBestMethod(
|
||||
encodeResidualTaskStruct *tasks_out,
|
||||
encodeResidualTaskStruct *tasks,
|
||||
int count
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
volatile int index[128];
|
||||
int length[256];
|
||||
int best_index;
|
||||
} shared;
|
||||
|
||||
//shared.index[threadIdx.x] = threadIdx.x;
|
||||
shared.length[threadIdx.x] = (threadIdx.x < count) ? tasks[threadIdx.x + count * blockIdx.y].size : 0x7fffffff;
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
shared.best_index = tasks[count * blockIdx.y].best_index;
|
||||
__syncthreads();
|
||||
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
|
||||
((int*)(tasks_out + blockIdx.y))[threadIdx.x] = ((int*)(tasks + shared.best_index))[threadIdx.x];
|
||||
}
|
||||
|
||||
//if (threadIdx.x < 128) shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 128]); __syncthreads();
|
||||
if (threadIdx.x < 128) shared.index[threadIdx.x] = BEST_INDEX(threadIdx.x, threadIdx.x + 128); __syncthreads();
|
||||
if (threadIdx.x < 64) shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 64]); __syncthreads();
|
||||
if (threadIdx.x < 32)
|
||||
extern "C" __global__ void cudaCopyBestMethodStereo(
|
||||
encodeResidualTaskStruct *tasks_out,
|
||||
encodeResidualTaskStruct *tasks,
|
||||
int count
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
int best_index[4];
|
||||
int best_size[4];
|
||||
int lr_index[2];
|
||||
} shared;
|
||||
if (threadIdx.x < 4)
|
||||
shared.best_index[threadIdx.x] = tasks[count * (blockIdx.y * 4 + threadIdx.x)].best_index;
|
||||
if (threadIdx.x < 4)
|
||||
shared.best_size[threadIdx.x] = tasks[shared.best_index[threadIdx.x]].size;
|
||||
__syncthreads();
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 32]);
|
||||
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 16]);
|
||||
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 8]);
|
||||
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 4]);
|
||||
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 2]);
|
||||
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 1]);
|
||||
int bitsBest = 0x7fffffff;
|
||||
if (bitsBest > shared.best_size[2] + shared.best_size[3]) // MidSide
|
||||
{
|
||||
bitsBest = shared.best_size[2] + shared.best_size[3];
|
||||
shared.lr_index[0] = shared.best_index[2];
|
||||
shared.lr_index[1] = shared.best_index[3];
|
||||
}
|
||||
if (bitsBest > shared.best_size[3] + shared.best_size[1]) // RightSide
|
||||
{
|
||||
bitsBest = shared.best_size[3] + shared.best_size[1];
|
||||
shared.lr_index[0] = shared.best_index[3];
|
||||
shared.lr_index[1] = shared.best_index[1];
|
||||
}
|
||||
if (bitsBest > shared.best_size[0] + shared.best_size[3]) // LeftSide
|
||||
{
|
||||
bitsBest = shared.best_size[0] + shared.best_size[3];
|
||||
shared.lr_index[0] = shared.best_index[0];
|
||||
shared.lr_index[1] = shared.best_index[3];
|
||||
}
|
||||
if (bitsBest > shared.best_size[0] + shared.best_size[1]) // LeftRight
|
||||
{
|
||||
bitsBest = shared.best_size[0] + shared.best_size[1];
|
||||
shared.lr_index[0] = shared.best_index[0];
|
||||
shared.lr_index[1] = shared.best_index[1];
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
|
||||
((int*)(tasks_out + blockIdx.y))[threadIdx.x] = ((int*)(tasks + count * blockIdx.y + shared.index[0]))[threadIdx.x];
|
||||
// if (threadIdx.x == 0)
|
||||
//tasks[count * blockIdx.y].best = count * blockIdx.y + shared.index[0];
|
||||
((int*)(tasks_out + 2 * blockIdx.y))[threadIdx.x] = ((int*)(tasks + shared.lr_index[0]))[threadIdx.x];
|
||||
if (threadIdx.x == 0)
|
||||
tasks_out[2 * blockIdx.y].residualOffs = tasks[shared.best_index[0]].residualOffs;
|
||||
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
|
||||
((int*)(tasks_out + 2 * blockIdx.y + 1))[threadIdx.x] = ((int*)(tasks + shared.lr_index[1]))[threadIdx.x];
|
||||
if (threadIdx.x == 0)
|
||||
tasks_out[2 * blockIdx.y + 1].residualOffs = tasks[shared.best_index[1]].residualOffs;
|
||||
}
|
||||
|
||||
extern "C" __global__ void cudaEncodeResidual(
|
||||
@@ -428,6 +538,6 @@ extern "C" __global__ void cudaEncodeResidual(
|
||||
for (int c = 0; c < shared.task.residualOrder; c++)
|
||||
sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
|
||||
if (tid < residualLen)
|
||||
output[shared.task.samplesOffs + pos + tid] = shared.data[tid + shared.task.residualOrder] - (sum >> shared.task.shift);
|
||||
output[shared.task.residualOffs + pos + tid] = shared.data[tid + shared.task.residualOrder] - (sum >> shared.task.shift);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1,93 +1,6 @@
|
||||
architecture {sm_10}
|
||||
abiversion {1}
|
||||
modname {cubin}
|
||||
code {
|
||||
name = cudaChooseBestResidual
|
||||
lmem = 0
|
||||
smem = 1564
|
||||
reg = 7
|
||||
bar = 1
|
||||
const {
|
||||
segname = const
|
||||
segnum = 1
|
||||
offset = 0
|
||||
bytes = 16
|
||||
mem {
|
||||
0x0000007f 0x0000003f 0x0000001f 0x0000002f
|
||||
}
|
||||
}
|
||||
bincode {
|
||||
0xa0000009 0x04000780 0x3002cdfd 0x6420c7c8
|
||||
0xa0010003 0x00000000 0x1000f003 0x00000280
|
||||
0x1000cc01 0x0423c780 0x40014e05 0x00200780
|
||||
0x30100205 0xc4100780 0x60004e01 0x00204780
|
||||
0x20000001 0x04008780 0x30070005 0xc4100780
|
||||
0x30060001 0xc4100780 0x20008200 0x2100ea00
|
||||
0x20108001 0x00000003 0xd00e0001 0x80c00780
|
||||
0x10010003 0x00000780 0x103f8001 0x07ffffff
|
||||
0x00020405 0xc0000782 0x04010e01 0xe4200780
|
||||
0x3002040d 0xc4100780 0x861ffe03 0x00000000
|
||||
0x308005fd 0x644107c8 0xa001f003 0x00000000
|
||||
0x1001f003 0x00000280 0x00000605 0xc0000780
|
||||
0xd408380d 0x20000780 0xd4043809 0x20000780
|
||||
0x1c00c001 0x0423c780 0x3800c1fd 0x6c2107c8
|
||||
0x20008401 0x0000000b 0x10000401 0x0403c500
|
||||
0x04000e01 0xe4200780 0xf0000001 0xe0000002
|
||||
0x861ffe03 0x00000000 0x308105fd 0x644107c8
|
||||
0xa0030003 0x00000000 0x10030003 0x00000280
|
||||
0x00000605 0xc0000780 0xd4023809 0x20000780
|
||||
0x0802c00d 0xc0200780 0x0402ce11 0xc0200780
|
||||
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
|
||||
0xd0043811 0x20000784 0x1d00e004 0x2940e000
|
||||
0x3001c005 0x6c20c784 0xd0010001 0x04020780
|
||||
0x2400ce01 0x04200780 0x04000e01 0xe4200780
|
||||
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
||||
0x308205fd 0x644107c8 0xa0074003 0x00000000
|
||||
0x10074003 0x00000280 0x00000605 0xc0000780
|
||||
0xd4013809 0x20000780 0x0802c00d 0xc0200780
|
||||
0x0402ce11 0xc0200780 0xdc04380d 0x20000780
|
||||
0x1400ce01 0x0423c780 0xd0043811 0x20000784
|
||||
0x1d00e004 0x2940e000 0x3001c005 0x6c20c784
|
||||
0xd0010001 0x04020780 0x2400ce01 0x04200780
|
||||
0x04000e01 0xe4200780 0x0402ee0d 0xc0200780
|
||||
0x0402ce09 0xc0200780 0xdc04380d 0x20000780
|
||||
0x1400ce01 0x0423c780 0xd8043809 0x20000780
|
||||
0x1c00c005 0x0423c780 0x2440ee01 0x04200780
|
||||
0x3801c005 0x6c20c780 0xd0010001 0x04020780
|
||||
0x2400ce01 0x04200780 0x04000e01 0xe4200780
|
||||
0x0402de0d 0xc0200780 0x0402ce09 0xc0200780
|
||||
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
|
||||
0xd8043809 0x20000780 0x1d00e004 0x2540fe00
|
||||
0x3801c005 0x6c20c780 0xd0010001 0x04020780
|
||||
0x2400ce01 0x04200780 0x04000e01 0xe4200780
|
||||
0x0402d60d 0xc0200780 0x0402ce09 0xc0200780
|
||||
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
|
||||
0xd8043809 0x20000780 0x1d00e004 0x2540f600
|
||||
0x3801c005 0x6c20c780 0xd0010001 0x04020780
|
||||
0x2400ce01 0x04200780 0x04000e01 0xe4200780
|
||||
0x0402d20d 0xc0200780 0x0402ce09 0xc0200780
|
||||
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
|
||||
0xd8043809 0x20000780 0x1d00e004 0x2540f200
|
||||
0x3801c005 0x6c20c780 0xd0010001 0x04020780
|
||||
0x2400ce01 0x04200780 0x04000e01 0xe4200780
|
||||
0x0402d00d 0xc0200780 0x0402ce09 0xc0200780
|
||||
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
|
||||
0xd8043809 0x20000780 0x1d00e004 0x2540f000
|
||||
0x3801c005 0x6c20c780 0xd0010001 0x04020780
|
||||
0x2400ce01 0x04200780 0x04000e01 0xe4200780
|
||||
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
||||
0x308305fd 0x644107c8 0x30000003 0x00000280
|
||||
0xa0004e05 0x04200780 0x1000cc01 0x0423c780
|
||||
0x40020209 0x00000780 0x30100409 0xc4100780
|
||||
0x60020001 0x00008780 0x3007ce09 0xc4300780
|
||||
0x3006ce11 0xc4300780 0x30070015 0xc4100780
|
||||
0x30060019 0xc4100780 0x30070201 0xc4100780
|
||||
0x30060205 0xc4100780 0x20048408 0x20068a10
|
||||
0x20018000 0x20028608 0x2104ea10 0x2100e804
|
||||
0x20000401 0x04010780 0xd00e0001 0x80c00780
|
||||
0x20000605 0x04004780 0xd00e0201 0xa0c00781
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaComputeAutocor
|
||||
lmem = 0
|
||||
@@ -190,10 +103,10 @@ code {
|
||||
segname = const
|
||||
segnum = 1
|
||||
offset = 0
|
||||
bytes = 20
|
||||
bytes = 24
|
||||
mem {
|
||||
0x000003ff 0x0000000f 0x0000001f 0x00000001
|
||||
0x0000000e
|
||||
0x007fffff 0x0000000e
|
||||
}
|
||||
}
|
||||
bincode {
|
||||
@@ -244,8 +157,8 @@ code {
|
||||
0x1000f801 0x0403c780 0x08047401 0xe4200782
|
||||
0xd4112809 0x20000780 0x3883c005 0x6c608780
|
||||
0xa0004401 0x04200780 0xd001001d 0x04000780
|
||||
0x30000ffd 0x640187c8 0xa0093003 0x00000000
|
||||
0x10091003 0x00000280 0x3003d201 0xac200780
|
||||
0x30000ffd 0x640187c8 0xa0094003 0x00000000
|
||||
0x10092003 0x00000280 0x3003d201 0xac200780
|
||||
0x307c000d 0x8c000780 0xd4110009 0x20000780
|
||||
0x387cc1fd 0x6c20c7c8 0xa007c003 0x00000000
|
||||
0x1000f801 0x0403c780 0x1000f825 0x0403c780
|
||||
@@ -266,42 +179,242 @@ code {
|
||||
0xd0000205 0x04008780 0x30080601 0x6c010780
|
||||
0x00020c0d 0xc0000780 0xdc095009 0x20000780
|
||||
0xa0000021 0x2c014780 0x1800c001 0x0423c780
|
||||
0x60011005 0x80000780 0x20018e1d 0x00000003
|
||||
0xa0004401 0x04200780 0x30000ffd 0x640147c8
|
||||
0x0c025401 0xe4204780 0x10063003 0x00000280
|
||||
0x10093003 0x00000780 0x3003d201 0xac200780
|
||||
0x307c000d 0x8c000780 0x00020c0d 0xc0000782
|
||||
0xdc095005 0x20000780 0x1400e001 0x0423c780
|
||||
0x2400c001 0x04200780 0x0c025401 0xe4200780
|
||||
0x1500f000 0x2500e000 0x0c025401 0xe4200780
|
||||
0x1500e800 0x2500e000 0x0c025401 0xe4200780
|
||||
0x1500e400 0x2500e000 0x0c025401 0xe4200780
|
||||
0x1400c205 0x0423c780 0x20018801 0x00000003
|
||||
0x2400c005 0x04204780 0x0c025401 0xe4204780
|
||||
0x40010c05 0x00000780 0x60000e05 0x00004780
|
||||
0x30100205 0xc4100780 0x60000c01 0x00004780
|
||||
0x30840805 0x64410780 0x00020a09 0xc0000780
|
||||
0xd8095009 0x20000780 0xa0000215 0x2c014780
|
||||
0x30010605 0xec100780 0x407f8a0d 0x0007ffff
|
||||
0x2941e004 0x2003800c 0x30040201 0xec000780
|
||||
0x20000001 0x0400c780 0x0c025401 0xe4200780
|
||||
0x1400d001 0x0423c780 0x3400c001 0xac200780
|
||||
0x0c025401 0xe4200780 0x1400c801 0x0423c780
|
||||
0x3400c001 0xac200780 0x0c025401 0xe4200780
|
||||
0x1400c401 0x0423c780 0x3400c001 0xac200780
|
||||
0x0c025401 0xe4200780 0x1400c201 0x0423c780
|
||||
0x3400c001 0xac200780 0x307c09fd 0x640147c8
|
||||
0x0c025401 0xe4200780 0x30000003 0x00000280
|
||||
0x30840205 0xac400780 0x60011005 0x80000780
|
||||
0x20018e1d 0x00000003 0xa0004401 0x04200780
|
||||
0x30000ffd 0x640147c8 0x0c025401 0xe4204780
|
||||
0x10063003 0x00000280 0x10094003 0x00000780
|
||||
0x3003d201 0xac200780 0x307c000d 0x8c000780
|
||||
0x00020c0d 0xc0000782 0xdc095009 0x20000780
|
||||
0x1800e001 0x0423c780 0x2800c001 0x04200780
|
||||
0x0c025401 0xe4200780 0x1900f000 0x2900e000
|
||||
0x0c025401 0xe4200780 0x1900e800 0x2900e000
|
||||
0x0c025401 0xe4200780 0x1900e400 0x2900e000
|
||||
0x0c025401 0xe4200780 0x1900e200 0x2900e000
|
||||
0x0c025401 0xe4200780 0xd4112805 0x20000780
|
||||
0x347cc1fd 0x6c2147c8 0xa00ac003 0x00000000
|
||||
0x100a9003 0x00000280 0x00020a05 0xc0000780
|
||||
0xd4095005 0x20000780 0x347cc1fd 0x6c2087c8
|
||||
0x100ab003 0x00000280 0x10018001 0x00000003
|
||||
0x100ac003 0x00000780 0x1000f801 0x0403c780
|
||||
0xf0000001 0xe0000002 0x20018805 0x00000003
|
||||
0x40030c1d 0x00000780 0x60020e1d 0x0001c780
|
||||
0x00020a05 0xc0000780 0xd4095005 0x20000780
|
||||
0x30010615 0xec100780 0x30100e1d 0xc4100780
|
||||
0x2440c015 0x04214780 0x60020c0d 0x0001c780
|
||||
0x307c09fd 0x640147c8 0x30040a05 0xec000780
|
||||
0x30850811 0x64410780 0xa0000811 0x2c014780
|
||||
0x407f8811 0x0007ffff 0x2004860c 0x20038204
|
||||
0x4003000d 0x00000780 0x6002020d 0x0000c780
|
||||
0x3010060d 0xc4100780 0x60020001 0x0000c780
|
||||
0x00020c09 0xc0000780 0x08025401 0xe4200780
|
||||
0xd8095005 0x20000780 0x1400d001 0x0423c780
|
||||
0x3400c001 0xac200780 0x08025401 0xe4200780
|
||||
0x1400c801 0x0423c780 0x3400c001 0xac200780
|
||||
0x08025401 0xe4200780 0x1400c401 0x0423c780
|
||||
0x3400c001 0xac200780 0x08025401 0xe4200780
|
||||
0x1400c201 0x0423c780 0x3400c001 0xac200780
|
||||
0x08025401 0xe4200780 0x30000003 0x00000280
|
||||
0xa0004401 0x04200780 0x40014e05 0x00200780
|
||||
0x30100205 0xc4100780 0x60004e01 0x00204780
|
||||
0x20000001 0x04008780 0x40014805 0x00200780
|
||||
0x30100205 0xc4100780 0x60004801 0x00204780
|
||||
0xa0004c0d 0x04200780 0x20000001 0x0400c780
|
||||
0x20000001 0x04008780 0x30060001 0xc4100780
|
||||
0xa0004c0d 0x04200780 0x20000601 0x04000780
|
||||
0x00020c0d 0xc0000780 0xdc095005 0x20000780
|
||||
0x30020005 0xc4100780 0x1500e000 0x2101e804
|
||||
0xd00e0201 0xa0c00781
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaChooseBestMethod
|
||||
lmem = 0
|
||||
smem = 6688
|
||||
reg = 10
|
||||
bar = 1
|
||||
const {
|
||||
segname = const
|
||||
segnum = 1
|
||||
offset = 0
|
||||
bytes = 32
|
||||
mem {
|
||||
0x000003ff 0x000000ff 0x00000008 0x00000020
|
||||
0x00000001 0x0000007f 0x0000003f 0x0000001f
|
||||
}
|
||||
}
|
||||
bincode {
|
||||
0xd0800209 0x00400780 0xa000040d 0x04000780
|
||||
0xa0000009 0x04000780 0xd0800209 0x00400780
|
||||
0xa0000405 0x04000780 0x30050205 0xc4100780
|
||||
0x20000409 0x04004780 0x308105fd 0x6c4107c8
|
||||
0xa0013003 0x00000000 0x00070609 0xc0000780
|
||||
0x10013003 0x00000280 0xa0000009 0x04000780
|
||||
0xd0800209 0x00400780 0xa0000405 0x04000780
|
||||
0x30050205 0xc4100780 0x20000409 0x04004780
|
||||
0x00020405 0xc0000780 0x103f8005 0x07ffffff
|
||||
0x04051001 0xe4204780 0x307ccffd 0x6c20c7ca
|
||||
0x100a3003 0x00000280 0x1000f811 0x0403c780
|
||||
0x20000815 0x0400c780 0x3005cffd 0x6420c7c8
|
||||
0xa009f003 0x00000000 0x1009f003 0x00000280
|
||||
0x1000ce05 0x0423c780 0x40034e09 0x00200780
|
||||
0x30100409 0xc4100780 0x60024e1d 0x00208780
|
||||
0x30070605 0xc4100780 0x30060609 0xc4100780
|
||||
0x20000205 0x04008780 0x30070e09 0xc4100780
|
||||
0x30060e19 0xc4100780 0x20000409 0x04018780
|
||||
0xa0000019 0x04000780 0x30020c21 0xc4100780
|
||||
0x20029008 0x20088220 0x0000100d 0xc0000780
|
||||
0x30070821 0xc4100780 0x30060825 0xc4100780
|
||||
0x20099020 0x2108e820 0x00000205 0xc0000780
|
||||
0x20088204 0x20018404 0xd00e0205 0x80c00780
|
||||
0x0c071001 0xe4204780 0x307ccdfd 0x6c20c7c8
|
||||
0x1000f805 0x0403c780 0x10043003 0x00000280
|
||||
0xa0042003 0x00000000 0x10008c08 0x2106ec18
|
||||
0xa0004221 0x04200780 0x3002cdfd 0x6420c7c8
|
||||
0xa003f003 0x00000000 0x1003f003 0x00000280
|
||||
0x20000a25 0x0401c780 0x30061225 0xc4100780
|
||||
0x20000425 0x04024780 0x30021225 0xc4100780
|
||||
0x2000ca25 0x04224780 0xd00e1225 0x80c00780
|
||||
0x20001205 0x04004780 0x20000409 0x04020782
|
||||
0x300605fd 0x6c0047c8 0x10035003 0x00000280
|
||||
0xf0000001 0xe0000002 0xa0000009 0x04000780
|
||||
0xd0800231 0x00400780 0xa0001819 0x04000780
|
||||
0x30050c19 0xc4100780 0x20000409 0x04018780
|
||||
0x0002040d 0xc0000780 0x0c011001 0xe4204780
|
||||
0xdc044011 0x20000780 0x1000e005 0x0423c784
|
||||
0x2000c005 0x04204784 0x0c011001 0xe4204780
|
||||
0x1000d005 0x0423c784 0x2000c005 0x04204784
|
||||
0x0c011001 0xe4204780 0x1000c805 0x0423c784
|
||||
0x2000c005 0x04204784 0x0c011001 0xe4204780
|
||||
0x1000c405 0x0423c784 0x2000c005 0x04204784
|
||||
0x0c011001 0xe4204780 0x1000c205 0x0423c784
|
||||
0x2000c005 0x04204784 0xa0000019 0x04000780
|
||||
0x307c0dfd 0x640147c8 0x0c011001 0xe4204780
|
||||
0x1009f003 0x00000280 0xd41c680d 0x20000780
|
||||
0x1d00e204 0x1d00e408 0x40050419 0x00000780
|
||||
0x60040619 0x00018780 0x30100c19 0xc4100780
|
||||
0x3c82c1fd 0x6c6147c8 0x60040419 0x00018780
|
||||
0xa009c003 0x00000000 0x10070003 0x00000280
|
||||
0xd41c4005 0x20000780 0x1500e004 0x1500ec08
|
||||
0x4005041d 0x00000780 0x6004061d 0x0001c780
|
||||
0x30100e1d 0xc4100780 0x60040405 0x0001c780
|
||||
0xd8044005 0x20000780 0x2400c005 0x04204780
|
||||
0x20068205 0x00000003 0x1009c003 0x00000780
|
||||
0xd41c680d 0x20000780 0x3c83c1fd 0x6c6147c8
|
||||
0xa009b003 0x00000000 0x10085003 0x00000280
|
||||
0x3002cc05 0xc4300780 0x301f0209 0xec100780
|
||||
0xd0840409 0x04400780 0x2000041d 0x04004780
|
||||
0xd41c4005 0x20000780 0x1500ec08 0x1500e004
|
||||
0x2400c609 0x04208780 0x40050421 0x00000780
|
||||
0x60040621 0x00020780 0x30101021 0xc4100780
|
||||
0x60040405 0x00020780 0x30010e09 0xec100780
|
||||
0x20000205 0x04008780 0xd8044005 0x20000780
|
||||
0x2400c005 0x04204780 0x200f8205 0x00000003
|
||||
0x1009b003 0x00000780 0xd41c680d 0x20000780
|
||||
0x3c7cc1fd 0x6c2147c8 0xa009a003 0x00000000
|
||||
0x10094003 0x00000280 0xd804400d 0x20000780
|
||||
0xd41c7005 0x20000780 0x3c7cc1fd 0x6c2087c8
|
||||
0x2501e209 0x00000003 0x1400c005 0x0423c780
|
||||
0x10000809 0x2440c280 0x4005041d 0x00000780
|
||||
0x6004061d 0x0001c780 0x30100e1d 0xc4100780
|
||||
0x60040405 0x0001c780 0x1009a003 0x00000780
|
||||
0xd41c7005 0x20000780 0x1500e004 0x1500e208
|
||||
0x4005041d 0x00000780 0x6004061d 0x0001c780
|
||||
0x30100e1d 0xc4100780 0x60040405 0x0001c780
|
||||
0xf0000001 0xe0000002 0xf0000001 0xe0000002
|
||||
0x30060205 0xac000782 0x00020a05 0xc0000780
|
||||
0x04051001 0xe4204780 0xa0004405 0x04200782
|
||||
0x20000811 0x04004780 0x3004cffd 0x6c2107c8
|
||||
0x10016003 0x00000280 0x861ffe03 0x00000000
|
||||
0xa0000009 0x04000780 0xd0800209 0x00400780
|
||||
0xa0000405 0x04000780 0x30050205 0xc4100780
|
||||
0x20000409 0x04004780 0x308505fd 0x6c4107c8
|
||||
0xa00be003 0x00000000 0x100be003 0x00000280
|
||||
0xa0000009 0x04000780 0xd0800209 0x00400780
|
||||
0xa0000405 0x04000780 0x30050205 0xc4100780
|
||||
0x20000409 0x04004780 0x00020405 0xc0000780
|
||||
0xd418400d 0x20000780 0xd4144009 0x20000780
|
||||
0x1c00c005 0x0423c780 0x3801c1fd 0x6c2107c8
|
||||
0xa0000009 0x04000780 0xd0800209 0x00400780
|
||||
0xa0000405 0x04000780 0x30050205 0xc4100780
|
||||
0x20000409 0x04004780 0x20008405 0x0000000b
|
||||
0x10000405 0x0403c500 0x04001001 0xe4204780
|
||||
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
||||
0xa0000009 0x04000780 0xd0800209 0x00400780
|
||||
0xa0000405 0x04000780 0x30050205 0xc4100780
|
||||
0x20000409 0x04004780 0x308605fd 0x6c4107c8
|
||||
0xa00d9003 0x00000000 0x100d9003 0x00000280
|
||||
0xa0000009 0x04000780 0xd0800209 0x00400780
|
||||
0xa0000405 0x04000780 0x30050205 0xc4100780
|
||||
0x20000409 0x04004780 0x00020405 0xc0000780
|
||||
0xd4024009 0x20000780 0x0802c00d 0xc0200780
|
||||
0x0402d011 0xc0200780 0xdc14400d 0x20000780
|
||||
0x1400d005 0x0423c780 0xd0144011 0x20000784
|
||||
0x1d00e008 0x2941e004 0x3002c009 0x6c20c784
|
||||
0xd0020205 0x04020780 0x2400d005 0x04204780
|
||||
0x04001001 0xe4204780 0xf0000001 0xe0000002
|
||||
0x861ffe03 0x00000000 0xa0000009 0x04000780
|
||||
0xd0800209 0x00400780 0xa0000405 0x04000780
|
||||
0x30050205 0xc4100780 0x20000409 0x04004780
|
||||
0x308705fd 0x6c4107c8 0xa0128003 0x00000000
|
||||
0x10128003 0x00000280 0xa0000009 0x04000780
|
||||
0xd0800209 0x00400780 0xa0000405 0x04000780
|
||||
0x30050205 0xc4100780 0x20000409 0x04004780
|
||||
0x00020405 0xc0000780 0xd4014009 0x20000780
|
||||
0x0802c00d 0xc0200780 0x0402d011 0xc0200780
|
||||
0xdc14400d 0x20000780 0x1400d005 0x0423c780
|
||||
0xd0144011 0x20000784 0x1d00e008 0x2941e004
|
||||
0x3002c009 0x6c20c784 0xd0020205 0x04020780
|
||||
0x2400d005 0x04204780 0x04001001 0xe4204780
|
||||
0x0402f00d 0xc0200780 0x0402d009 0xc0200780
|
||||
0xdc14400d 0x20000780 0x1400d005 0x0423c780
|
||||
0xd8144009 0x20000780 0x1c00c009 0x0423c780
|
||||
0x2440f005 0x04204780 0x3802c009 0x6c20c780
|
||||
0xd0020205 0x04020780 0x2400d005 0x04204780
|
||||
0x04001001 0xe4204780 0x0402e00d 0xc0200780
|
||||
0x0402d009 0xc0200780 0xdc14400d 0x20000780
|
||||
0x1400d005 0x0423c780 0xd8144009 0x20000780
|
||||
0x1c00c009 0x0423c780 0x2440e005 0x04204780
|
||||
0x3802c009 0x6c20c780 0xd0020205 0x04020780
|
||||
0x2400d005 0x04204780 0x04001001 0xe4204780
|
||||
0x0402d80d 0xc0200780 0x0402d009 0xc0200780
|
||||
0xdc14400d 0x20000780 0x1400d005 0x0423c780
|
||||
0xd8144009 0x20000780 0x1d00e008 0x2541f804
|
||||
0x3802c009 0x6c20c780 0xd0020205 0x04020780
|
||||
0x2400d005 0x04204780 0x04001001 0xe4204780
|
||||
0x0402d40d 0xc0200780 0x0402d009 0xc0200780
|
||||
0xdc14400d 0x20000780 0x1400d005 0x0423c780
|
||||
0xd8144009 0x20000780 0x1d00e008 0x2541f404
|
||||
0x3802c009 0x6c20c780 0xd0020205 0x04020780
|
||||
0x2400d005 0x04204780 0x04001001 0xe4204780
|
||||
0x0402d20d 0xc0200780 0x0402d009 0xc0200780
|
||||
0xdc14400d 0x20000780 0x1400d005 0x0423c780
|
||||
0xd8144009 0x20000780 0x1d00e008 0x2541f204
|
||||
0x3802c009 0x6c20c780 0xd0020205 0x04020780
|
||||
0x2400d005 0x04204780 0x04001001 0xe4204780
|
||||
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
||||
0xa0000009 0x04000780 0xd0800209 0x00400780
|
||||
0xa0000405 0x04000780 0x30050205 0xc4100780
|
||||
0x20000409 0x04004780 0x307c05fd 0x6c0147c8
|
||||
0xa013c003 0x00000000 0x1013c003 0x00000280
|
||||
0x1000ce05 0x0423c780 0x40034e09 0x00200780
|
||||
0x30100409 0xc4100780 0x60024e05 0x00208780
|
||||
0x30070209 0xc4100780 0x3006020d 0xc4100780
|
||||
0x20038408 0x2102e808 0x2000d00d 0x04204780
|
||||
0x20208405 0x00000003 0xd00e020d 0xa0c00780
|
||||
0xa0000009 0x04000782 0xd0800209 0x00400780
|
||||
0xa0000405 0x04000780 0x30050205 0xc4100780
|
||||
0x20000409 0x04004780 0x3002cffd 0x6c20c7c8
|
||||
0x30000003 0x00000280 0x1000ce05 0x0423c780
|
||||
0x40034e09 0x00200780 0x30100409 0xc4100780
|
||||
0x60024e05 0x00208780 0xa0000009 0x04000780
|
||||
0xd0800219 0x00400780 0xa0000c0d 0x04000780
|
||||
0x3005060d 0xc4100780 0x20038408 0x20028204
|
||||
0x3007020d 0xc4100780 0x30060205 0xc4100780
|
||||
0xa0000009 0x04000780 0xd0800201 0x00400780
|
||||
0xa0000001 0x04000780 0x30050001 0xc4100780
|
||||
0x20000409 0x04000780 0x00020405 0xc0000780
|
||||
0x20000601 0x04004780 0xd4144005 0x20000780
|
||||
0x2100e804 0x1500e000 0x20108205 0x00000003
|
||||
0xd00e0201 0xa0c00781
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaEncodeResidual
|
||||
lmem = 0
|
||||
@@ -356,13 +469,46 @@ code {
|
||||
0x3005c1fd 0x6c2147cc 0x6800ce0d 0x8020c780
|
||||
0xd4000805 0x20000780 0x1000c001 0x0423c784
|
||||
0x10040003 0x00000280 0x300109fd 0x6c00c7c8
|
||||
0x30000003 0x00000280 0xd0094005 0x20000780
|
||||
0x2502e008 0x20008210 0x1500e200 0x20028204
|
||||
0x30000003 0x00000280 0xd0094805 0x20000780
|
||||
0x2502f008 0x20008210 0x1500e000 0x20028204
|
||||
0x00020805 0xc0000780 0x30000609 0xec000780
|
||||
0x30020201 0xc4100780 0x2542ee04 0x2100e800
|
||||
0xd00e0005 0xa0c00781
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaCopyBestMethod
|
||||
lmem = 0
|
||||
smem = 32
|
||||
reg = 5
|
||||
bar = 1
|
||||
const {
|
||||
segname = const
|
||||
segnum = 1
|
||||
offset = 0
|
||||
bytes = 4
|
||||
mem {
|
||||
0x0000002f
|
||||
}
|
||||
}
|
||||
bincode {
|
||||
0xa0000005 0x040007c0 0xa000d003 0x00000000
|
||||
0x1000d003 0x00000280 0x1000cc01 0x0423c780
|
||||
0x40014e09 0x00200780 0x30100409 0xc4100780
|
||||
0x60004e01 0x00208780 0x30070009 0xc4100780
|
||||
0x30060001 0xc4100780 0x20008400 0x2100ea00
|
||||
0x20208001 0x00000003 0xd00e0001 0x80c00780
|
||||
0x00000e01 0xe4200780 0xf0000001 0xe0000002
|
||||
0x861ffe03 0x00000000 0x308003fd 0x644107c8
|
||||
0x30000003 0x00000280 0xa0004e01 0x04200780
|
||||
0x3007ce0d 0xc4300780 0x3006ce11 0xc4300780
|
||||
0x30070009 0xc4100780 0x30060001 0xc4100780
|
||||
0x2004860c 0x20008400 0x30020205 0xc4100780
|
||||
0x2103ea0c 0x2100e808 0x20000201 0x0400c780
|
||||
0xd00e0001 0x80c00780 0x20000205 0x04008780
|
||||
0xd00e0201 0xa0c00781
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaSumResidualChunks
|
||||
lmem = 0
|
||||
@@ -572,11 +718,91 @@ code {
|
||||
0xf0000001 0xe0000001
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaCopyBestMethodStereo
|
||||
lmem = 0
|
||||
smem = 68
|
||||
reg = 6
|
||||
bar = 1
|
||||
const {
|
||||
segname = const
|
||||
segnum = 1
|
||||
offset = 0
|
||||
bytes = 16
|
||||
mem {
|
||||
0x00000003 0x00000004 0x7fffffff 0x0000002f
|
||||
}
|
||||
}
|
||||
bincode {
|
||||
0xa0000001 0x04000780 0x30800005 0x6440c7d0
|
||||
0xa00003fd 0x0c0147c8 0xa0012003 0x00000000
|
||||
0x10012003 0x00001100 0x60824e09 0x00600780
|
||||
0x1000cc05 0x0423c780 0x4005040d 0x00000780
|
||||
0x6004060d 0x0000c780 0x3010060d 0xc4100780
|
||||
0x60040405 0x0000c780 0x30070209 0xc4100780
|
||||
0x30060205 0xc4100780 0x20018404 0x2101ea04
|
||||
0x20208205 0x00000003 0xd00e0205 0x80c00780
|
||||
0x00020005 0xc0000780 0x04000e01 0xe4204780
|
||||
0xf0000001 0xe0000002 0xa001c003 0x00000000
|
||||
0x1001c003 0x00000100 0x00020005 0xc0000780
|
||||
0x3407ce05 0xc4300780 0x3406ce09 0xc4300780
|
||||
0x20028204 0x2101ea04 0x20108205 0x00000003
|
||||
0xd00e0205 0x80c00780 0x04001601 0xe4204780
|
||||
0xf0000001 0xe0000002 0x861ffe03 0x00000000
|
||||
0x307c0005 0x640087d0 0xa00003fd 0x0c0147c8
|
||||
0xa003f003 0x00000000 0x1003f003 0x00001100
|
||||
0x1100fc04 0x2101fa04 0x308203fd 0x6c4187d8
|
||||
0x1000d209 0x0423d500 0x00001e01 0xe4209500
|
||||
0x1000d409 0x0423d500 0x00002001 0xe4209500
|
||||
0x10000405 0x2440d280 0x1100fc08 0x2102f808
|
||||
0x300105fd 0x6c0187d8 0x10031003 0x00001280
|
||||
0x1000d405 0x0423c780 0x00001e01 0xe4204780
|
||||
0x1000d005 0x0423c780 0x00002001 0xe4204780
|
||||
0x10000405 0x0403c780 0x1100fc08 0x2102f608
|
||||
0x300105fd 0x6c0187d8 0x10039003 0x00001280
|
||||
0x1000ce05 0x0423c780 0x00001e01 0xe4204780
|
||||
0x1000d405 0x0423c780 0x00002001 0xe4204780
|
||||
0x10000405 0x0403c780 0x1100f808 0x2102f608
|
||||
0x300203fd 0x6c00c7d8 0x1000ce05 0x0423d500
|
||||
0x00001e01 0xe4205500 0x1000d005 0x0423d500
|
||||
0x00002001 0xe4205500 0xf0000001 0xe0000002
|
||||
0x861ffe03 0x00000000 0x30830005 0x6440c7e0
|
||||
0xa00003fd 0x0c0147d8 0xa0051003 0x00000000
|
||||
0x10051003 0x00002100 0x41022e05 0x00000003
|
||||
0x3007de0d 0xc4300780 0x3006de11 0xc4300780
|
||||
0x30070209 0xc4100780 0x30060205 0xc4100780
|
||||
0x2004860c 0x20018404 0x30020009 0xc4100780
|
||||
0x2103ea10 0x2101e80c 0x20000405 0x04010780
|
||||
0xd00e0205 0x80c00780 0x20000409 0x0400c780
|
||||
0xd00e0405 0xa0c00780 0xf0000001 0xe0000002
|
||||
0xa005f003 0x00000000 0x1005f003 0x00000100
|
||||
0x41022e05 0x00000003 0x3007ce09 0xc4300780
|
||||
0x3006ce0d 0xc4300780 0x30070211 0xc4100780
|
||||
0x30060215 0xc4100780 0x20038404 0x20058808
|
||||
0x2101ea04 0x2102e808 0x20288205 0x00000003
|
||||
0xd00e0205 0x80c00780 0x20288409 0x00000003
|
||||
0xd00e0405 0xa0c00780 0xf0000001 0xe0000002
|
||||
0xa006e003 0x00000000 0x1006e003 0x00001100
|
||||
0x41022e05 0x00000003 0x3007e00d 0xc4300780
|
||||
0x30070209 0xc4100780 0x30060205 0xc4100780
|
||||
0x3006e011 0xc4300780 0x20018404 0x20048608
|
||||
0x30020001 0xc4100780 0x2101e804 0x2102ea08
|
||||
0x20018004 0x20028000 0xd00e0001 0x80c00780
|
||||
0x20008205 0x0000000f 0xd00e0201 0xa0c00780
|
||||
0xf0000001 0xe0000002 0x30000003 0x00000100
|
||||
0x41022e01 0x00000003 0x3007d005 0xc4300780
|
||||
0x3006d009 0xc4300780 0x3007000d 0xc4100780
|
||||
0x30060011 0xc4100780 0x20028200 0x20048604
|
||||
0x2100ea00 0x2101e804 0x20288001 0x00000003
|
||||
0xd00e0001 0x80c00780 0x20288205 0x0000000f
|
||||
0xd00e0201 0xa0c00781
|
||||
}
|
||||
}
|
||||
code {
|
||||
name = cudaSumResidual
|
||||
lmem = 0
|
||||
smem = 1248
|
||||
reg = 5
|
||||
smem = 1244
|
||||
reg = 7
|
||||
bar = 1
|
||||
const {
|
||||
segname = const
|
||||
@@ -595,46 +821,59 @@ code {
|
||||
0x3007000d 0xc4100780 0x30060001 0xc4100780
|
||||
0x20008600 0x2100e800 0x20000401 0x04000780
|
||||
0xd00e0001 0x80c00780 0x00000405 0xc0000780
|
||||
0x04021001 0xe4200780 0xf0000001 0xe0000002
|
||||
0x861ffe03 0x00000000 0x3001cffd 0x6c20c7c8
|
||||
0x04020e01 0xe4200780 0xf0000001 0xe0000002
|
||||
0x861ffe03 0x00000000 0x3001cdfd 0x6c20c7c8
|
||||
0xa001c003 0x00000000 0x1001b003 0x00000280
|
||||
0x1000ce01 0x0423c780 0x40014e0d 0x00200780
|
||||
0x1000cc01 0x0423c780 0x40014e0d 0x00200780
|
||||
0x3010060d 0xc4100780 0x60004e01 0x0020c780
|
||||
0x20000001 0x04004780 0x30020001 0xc4100780
|
||||
0x2000ca01 0x04200780 0xd00e0001 0x80c00780
|
||||
0x1001c003 0x00000780 0x1000f801 0x0403c780
|
||||
0x00000405 0xc0000782 0x04001001 0xe4200780
|
||||
0x00000405 0xc0000782 0x04000e01 0xe4200780
|
||||
0x861ffe03 0x00000000 0x308103fd 0x6c4107c8
|
||||
0x00000405 0xc0000500 0xd4014009 0x20000500
|
||||
0x1800c001 0x0423c500 0x2400d001 0x04200500
|
||||
0x04001001 0xe4200500 0x861ffe03 0x00000000
|
||||
0x00000405 0xc0000780 0x1400f001 0x0423c780
|
||||
0x2400d001 0x04200780 0x04001001 0xe4200780
|
||||
0x2400e001 0x04200780 0x04001001 0xe4200780
|
||||
0x2400d801 0x04200780 0x04001001 0xe4200780
|
||||
0x2400d401 0x04200780 0x04001001 0xe4200780
|
||||
0x2400d201 0x04200780 0x307c03fd 0x6c0147c8
|
||||
0x04001001 0xe4200780 0x30000003 0x00000280
|
||||
0xd0086805 0x20000780 0x3482c1fd 0x6c6147c8
|
||||
0x10040003 0x00000280 0xd0084005 0x20000780
|
||||
0x1500ec00 0x1500e004 0x40030009 0x00000780
|
||||
0x60020209 0x00008780 0x30100409 0xc4100780
|
||||
0x60020001 0x00008780 0x2000d001 0x04200780
|
||||
0x20068001 0x00000003 0x10058003 0x00000780
|
||||
0xd0086805 0x20000780 0x3483c1fd 0x6c6147c8
|
||||
0x10052003 0x00000280 0xd0084005 0x20000780
|
||||
0x1500e604 0x1500e000 0x2400cc05 0x04204780
|
||||
0x3002ce0d 0xc4300780 0x40030009 0x00000780
|
||||
0x301f0611 0xec100780 0x60020209 0x00008780
|
||||
0xd0840811 0x04400780 0x30100409 0xc4100780
|
||||
0x2000080d 0x0400c780 0x60020001 0x00008780
|
||||
0x30010605 0xec100780 0x20018000 0x2100f000
|
||||
0x200f8001 0x00000003 0x10058003 0x00000780
|
||||
0xd0087005 0x20000780 0x1500e000 0x1500e204
|
||||
0x00000405 0xc0000500 0xd4013809 0x20000500
|
||||
0x1800c001 0x0423c500 0x2400ce01 0x04200500
|
||||
0x04000e01 0xe4200500 0x861ffe03 0x00000000
|
||||
0x00000405 0xc0000780 0x1400ee01 0x0423c780
|
||||
0x2400ce01 0x04200780 0x04000e01 0xe4200780
|
||||
0x1500fe00 0x2500ee00 0x04000e01 0xe4200780
|
||||
0x1500f600 0x2500ee00 0x04000e01 0xe4200780
|
||||
0x1500f200 0x2500ee00 0x04000e01 0xe4200780
|
||||
0x1500f000 0x2500ee00 0x307c03fd 0x6c0147c8
|
||||
0x04000e01 0xe4200780 0x30000003 0x00000280
|
||||
0xd0086005 0x20000780 0x3482c1fd 0x6c6147c8
|
||||
0x10044003 0x00000280 0xd0083805 0x20000780
|
||||
0x1500ec00 0x1500e008 0x4005000c 0x1500ee04
|
||||
0x6004020d 0x0000c780 0x40020211 0x00000780
|
||||
0x3010060d 0xc4100780 0x60030011 0x00010780
|
||||
0x60040009 0x0000c780 0x3010080d 0xc4100780
|
||||
0x2000ce09 0x04208780 0x60020001 0x0000c780
|
||||
0x20068405 0x00000003 0x10070003 0x00000780
|
||||
0xd0086005 0x20000780 0x3483c1fd 0x6c6147c8
|
||||
0x1005c003 0x00000280 0xd0083805 0x20000780
|
||||
0x1500ec00 0x1500e008 0x2400c60d 0x04200780
|
||||
0x3002cc15 0xc4300780 0x40070805 0x00000780
|
||||
0x301f0a19 0xec100780 0x60060a11 0x00004780
|
||||
0x1400ce05 0x0423c780 0xd0840c19 0x04400780
|
||||
0x30100811 0xc4100780 0x20000c15 0x04014780
|
||||
0x60060809 0x00010780 0x40010411 0x00000780
|
||||
0x30010a0d 0xec100780 0x60000611 0x00010780
|
||||
0x20000409 0x0400c780 0x3010080d 0xc4100780
|
||||
0x2000ce09 0x04208780 0x60000401 0x0000c780
|
||||
0x200f8405 0x00000003 0x10070003 0x00000780
|
||||
0xd0086005 0x20000780 0x1500e400 0x1500e204
|
||||
0x40030009 0x00000780 0x60020209 0x00008780
|
||||
0x30100409 0xc4100780 0x60020001 0x00008780
|
||||
0xa0004e05 0x04200780 0x30070209 0xc4100780
|
||||
0x30060205 0xc4100780 0x20018404 0x2101e804
|
||||
0x20108205 0x00000003 0xd00e0201 0xa0c00781
|
||||
0x30100409 0xc4100780 0x347cc1fd 0x6c2147c8
|
||||
0x60020001 0x00008780 0x1006f003 0x00000280
|
||||
0xd0086805 0x20000780 0x2501e209 0x00000003
|
||||
0x1400c005 0x0423c780 0x4005040d 0x00000780
|
||||
0x6004060d 0x0000c780 0x3010060d 0xc4100780
|
||||
0x60040409 0x0000c780 0x307ccffd 0x6c2147c8
|
||||
0x1400c005 0x0423c780 0x10000405 0x0403c280
|
||||
0x10070003 0x00000780 0x10000005 0x0403c780
|
||||
0xa0004e09 0x04200780 0x3007040d 0xc4100780
|
||||
0x30060409 0xc4100780 0x20028608 0x2102e808
|
||||
0x30000201 0xac000780 0x20108405 0x00000003
|
||||
0xd00e0201 0xa0c00781
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user