optimizations

This commit is contained in:
chudov
2009-09-17 14:37:25 +00:00
parent 672c0cb20e
commit 7796ef40c6
3 changed files with 692 additions and 370 deletions

View File

@@ -101,9 +101,11 @@ namespace CUETools.Codecs.FlaCuda
int nResidualTasks = 0; int nResidualTasks = 0;
int nAutocorTasks = 0; int nAutocorTasks = 0;
bool encode_on_cpu = true;
public const int MAX_BLOCKSIZE = 4608 * 4; public const int MAX_BLOCKSIZE = 4608 * 4;
internal const int maxFrames = 8; internal const int maxFrames = 8;
internal const int maxResidualParts = (MAX_BLOCKSIZE + 32 * 3) / (32 * 3); internal const int maxResidualParts = 64;
internal const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32); internal const int maxAutocorParts = MAX_BLOCKSIZE / (256 - 32);
public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO) public FlaCudaWriter(string path, int bitsPerSample, int channelCount, int sampleRate, Stream IO)
@@ -439,47 +441,6 @@ namespace CUETools.Codecs.FlaCuda
samplesInBuffer += block; samplesInBuffer += block;
} }
static uint rice_encode_count(uint sum, uint n, uint k)
{
return n*(k+1) + ((sum-(n>>1))>>(int)k);
}
//static unsafe uint find_optimal_rice_param(uint sum, uint n)
//{
// uint* nbits = stackalloc uint[Flake.MAX_RICE_PARAM + 1];
// int k_opt = 0;
// nbits[0] = UINT32_MAX;
// for (int k = 0; k <= Flake.MAX_RICE_PARAM; k++)
// {
// nbits[k] = rice_encode_count(sum, n, (uint)k);
// if (nbits[k] < nbits[k_opt])
// k_opt = k;
// }
// return (uint)k_opt;
//}
static unsafe int find_optimal_rice_param(uint sum, uint n, out uint nbits_best)
{
int k_opt = 0;
uint a = n;
uint b = sum - (n >> 1);
uint nbits = a + b;
for (int k = 1; k <= Flake.MAX_RICE_PARAM; k++)
{
a += n;
b >>= 1;
uint nbits_k = a + b;
if (nbits_k < nbits)
{
k_opt = k;
nbits = nbits_k;
}
}
nbits_best = nbits;
return k_opt;
}
unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int* src, int blocksize) unsafe static void channel_decorrelation(int* leftS, int* rightS, int *leftM, int *rightM, int* src, int blocksize)
{ {
for (int i = 0; i < blocksize; i++) for (int i = 0; i < blocksize; i++)
@@ -546,6 +507,47 @@ namespace CUETools.Codecs.FlaCuda
} }
} }
static uint rice_encode_count(uint sum, uint n, uint k)
{
return n * (k + 1) + ((sum - (n >> 1)) >> (int)k);
}
//static unsafe uint find_optimal_rice_param(uint sum, uint n)
//{
// uint* nbits = stackalloc uint[Flake.MAX_RICE_PARAM + 1];
// int k_opt = 0;
// nbits[0] = UINT32_MAX;
// for (int k = 0; k <= Flake.MAX_RICE_PARAM; k++)
// {
// nbits[k] = rice_encode_count(sum, n, (uint)k);
// if (nbits[k] < nbits[k_opt])
// k_opt = k;
// }
// return (uint)k_opt;
//}
static unsafe int find_optimal_rice_param(uint sum, uint n, out uint nbits_best)
{
int k_opt = 0;
uint a = n;
uint b = sum - (n >> 1);
uint nbits = a + b;
for (int k = 1; k <= Flake.MAX_RICE_PARAM; k++)
{
a += n;
b >>= 1;
uint nbits_k = a + b;
if (nbits_k < nbits)
{
k_opt = k;
nbits = nbits_k;
}
}
nbits_best = nbits;
return k_opt;
}
static unsafe uint calc_optimal_rice_params(ref RiceContext rc, int porder, uint* sums, uint n, uint pred_order) static unsafe uint calc_optimal_rice_params(ref RiceContext rc, int porder, uint* sums, uint n, uint pred_order)
{ {
uint part = (1U << porder); uint part = (1U << porder);
@@ -563,22 +565,28 @@ namespace CUETools.Codecs.FlaCuda
return all_bits; return all_bits;
} }
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums) static unsafe void calc_sums(int pmin, int pmax, int* data, uint n, uint pred_order, uint* sums)
{ {
// sums for highest level // sums for highest level
int parts = (1 << pmax); int parts = (1 << pmax);
uint* res = data + pred_order; int* res = data + pred_order;
uint cnt = (n >> pmax) - pred_order; uint cnt = (n >> pmax) - pred_order;
uint sum = 0; uint sum = 0;
for (uint j = cnt; j > 0; j--) for (uint j = cnt; j > 0; j--)
sum += *(res++); {
int val = *(res++);
sum += (uint)((val << 1) ^ (val >> 31));
}
sums[pmax * Flake.MAX_PARTITIONS + 0] = sum; sums[pmax * Flake.MAX_PARTITIONS + 0] = sum;
cnt = (n >> pmax); cnt = (n >> pmax);
for (int i = 1; i < parts; i++) for (int i = 1; i < parts; i++)
{ {
sum = 0; sum = 0;
for (uint j = cnt; j > 0; j--) for (uint j = cnt; j > 0; j--)
sum += *(res++); {
int val = *(res++);
sum += (uint)((val << 1) ^ (val >> 31));
}
sums[pmax * Flake.MAX_PARTITIONS + i] = sum; sums[pmax * Flake.MAX_PARTITIONS + i] = sum;
} }
// sums for lower levels // sums for lower levels
@@ -596,7 +604,7 @@ namespace CUETools.Codecs.FlaCuda
static unsafe uint calc_rice_params(ref RiceContext rc, ref RiceContext tmp_rc, int pmin, int pmax, int* data, uint n, uint pred_order) static unsafe uint calc_rice_params(ref RiceContext rc, ref RiceContext tmp_rc, int pmin, int pmax, int* data, uint n, uint pred_order)
{ {
uint* udata = stackalloc uint[(int)n]; //uint* udata = stackalloc uint[(int)n];
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS]; uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER]; //uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
@@ -604,10 +612,10 @@ namespace CUETools.Codecs.FlaCuda
//assert(pmax >= 0 && pmax <= Flake.MAX_PARTITION_ORDER); //assert(pmax >= 0 && pmax <= Flake.MAX_PARTITION_ORDER);
//assert(pmin <= pmax); //assert(pmin <= pmax);
for (uint i = 0; i < n; i++) //for (uint i = 0; i < n; i++)
udata[i] = (uint) ((2 * data[i]) ^ (data[i] >> 31)); // udata[i] = (uint) ((2 * data[i]) ^ (data[i] >> 31));
calc_sums(pmin, pmax, udata, n, pred_order, sums); calc_sums(pmin, pmax, data, n, pred_order, sums);
int opt_porder = pmin; int opt_porder = pmin;
uint opt_bits = AudioSamples.UINT32_MAX; uint opt_bits = AudioSamples.UINT32_MAX;
@@ -790,45 +798,6 @@ namespace CUETools.Codecs.FlaCuda
bitwriter.flush(); bitwriter.flush();
} }
unsafe uint measure_frame_size(FlacFrame frame, bool do_midside)
{
// crude estimation of header/footer size
uint total = (uint)(32 + ((BitReader.log2i(frame_count) + 4) / 5) * 8 + (eparams.variable_block_size != 0 ? 16 : 0) + 16);
if (do_midside)
{
uint bitsBest = AudioSamples.UINT32_MAX;
ChannelMode modeBest = ChannelMode.LeftRight;
if (bitsBest > frame.subframes[2].best.size + frame.subframes[3].best.size)
{
bitsBest = frame.subframes[2].best.size + frame.subframes[3].best.size;
modeBest = ChannelMode.MidSide;
}
if (bitsBest > frame.subframes[3].best.size + frame.subframes[1].best.size)
{
bitsBest = frame.subframes[3].best.size + frame.subframes[1].best.size;
modeBest = ChannelMode.RightSide;
}
if (bitsBest > frame.subframes[3].best.size + frame.subframes[0].best.size)
{
bitsBest = frame.subframes[3].best.size + frame.subframes[0].best.size;
modeBest = ChannelMode.LeftSide;
}
if (bitsBest > frame.subframes[0].best.size + frame.subframes[1].best.size)
{
bitsBest = frame.subframes[0].best.size + frame.subframes[1].best.size;
modeBest = ChannelMode.LeftRight;
}
frame.ch_mode = modeBest;
return total + bitsBest;
}
for (int ch = 0; ch < channels; ch++)
total += frame.subframes[ch].best.size;
return total;
}
unsafe delegate void window_function(float* window, int size); unsafe delegate void window_function(float* window, int size);
unsafe void calculate_window(float* window, window_function func, WindowFunction flag) unsafe void calculate_window(float* window, window_function func, WindowFunction flag)
@@ -869,39 +838,58 @@ namespace CUETools.Codecs.FlaCuda
for (int order = 1; order <= max_order; order++) for (int order = 1; order <= max_order; order++)
{ {
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.LPC; task.ResidualTasks[nResidualTasks].type = (int)SubframeType.LPC;
task.ResidualTasks[nResidualTasks].channel = ch;
task.ResidualTasks[nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); task.ResidualTasks[nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
task.ResidualTasks[nResidualTasks].blocksize = blocksize; task.ResidualTasks[nResidualTasks].blocksize = blocksize;
task.ResidualTasks[nResidualTasks].residualOrder = order <= max_order ? order : 0; task.ResidualTasks[nResidualTasks].residualOrder = order <= max_order ? order : 0;
task.ResidualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize; task.ResidualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[nResidualTasks].residualOffs = task.ResidualTasks[nResidualTasks].samplesOffs;
nResidualTasks++; nResidualTasks++;
} }
} }
// Fixed prediction // Fixed prediction
for (int order = 1; order <= max_order; order++) for (int order = 1; order <= max_order; order++)
{ {
task.ResidualTasks[nResidualTasks].type = order <= 5 ? (int)SubframeType.Fixed : (int)SubframeType.Verbatim; task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Verbatim;
task.ResidualTasks[nResidualTasks].channel = ch;
task.ResidualTasks[nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); task.ResidualTasks[nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0);
task.ResidualTasks[nResidualTasks].blocksize = blocksize; task.ResidualTasks[nResidualTasks].blocksize = blocksize;
task.ResidualTasks[nResidualTasks].residualOrder = order <= 4 ? order : 0; task.ResidualTasks[nResidualTasks].residualOrder = 0;
task.ResidualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize; task.ResidualTasks[nResidualTasks].samplesOffs = ch * FlaCudaWriter.MAX_BLOCKSIZE + iFrame * blocksize;
task.ResidualTasks[nResidualTasks].residualOffs = task.ResidualTasks[nResidualTasks].samplesOffs;
task.ResidualTasks[nResidualTasks].shift = 0; task.ResidualTasks[nResidualTasks].shift = 0;
switch (order) switch (order)
{ {
case 5:
break;
case 1: case 1:
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Constant;
task.ResidualTasks[nResidualTasks].residualOrder = 1;
task.ResidualTasks[nResidualTasks].coefs[0] = 1;
break;
case 3:
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed;
task.ResidualTasks[nResidualTasks].residualOrder = 0;
break;
case 4:
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed;
task.ResidualTasks[nResidualTasks].residualOrder = 1;
task.ResidualTasks[nResidualTasks].coefs[0] = 1; task.ResidualTasks[nResidualTasks].coefs[0] = 1;
break; break;
case 2: case 2:
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed;
task.ResidualTasks[nResidualTasks].residualOrder = 2;
task.ResidualTasks[nResidualTasks].coefs[1] = 2; task.ResidualTasks[nResidualTasks].coefs[1] = 2;
task.ResidualTasks[nResidualTasks].coefs[0] = -1; task.ResidualTasks[nResidualTasks].coefs[0] = -1;
break; break;
case 3: case 5:
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed;
task.ResidualTasks[nResidualTasks].residualOrder = 3;
task.ResidualTasks[nResidualTasks].coefs[2] = 3; task.ResidualTasks[nResidualTasks].coefs[2] = 3;
task.ResidualTasks[nResidualTasks].coefs[1] = -3; task.ResidualTasks[nResidualTasks].coefs[1] = -3;
task.ResidualTasks[nResidualTasks].coefs[0] = 1; task.ResidualTasks[nResidualTasks].coefs[0] = 1;
break; break;
case 4: case 6:
task.ResidualTasks[nResidualTasks].type = (int)SubframeType.Fixed;
task.ResidualTasks[nResidualTasks].residualOrder = 4;
task.ResidualTasks[nResidualTasks].coefs[3] = 4; task.ResidualTasks[nResidualTasks].coefs[3] = 4;
task.ResidualTasks[nResidualTasks].coefs[2] = -6; task.ResidualTasks[nResidualTasks].coefs[2] = -6;
task.ResidualTasks[nResidualTasks].coefs[1] = 4; task.ResidualTasks[nResidualTasks].coefs[1] = 4;
@@ -950,7 +938,7 @@ namespace CUETools.Codecs.FlaCuda
csum += (ulong)Math.Abs(coefs[i - 1]); csum += (ulong)Math.Abs(coefs[i - 1]);
if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32) if ((csum << (int)frame.subframes[ch].obits) >= 1UL << 32)
lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift); lpc.encode_residual_long(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
else else if (encode_on_cpu)
lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift); lpc.encode_residual(frame.subframes[ch].best.residual, frame.subframes[ch].samples, frame.blocksize, frame.subframes[ch].best.order, coefs, frame.subframes[ch].best.shift);
int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order); int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
@@ -973,33 +961,33 @@ namespace CUETools.Codecs.FlaCuda
unsafe void select_best_methods(FlacFrame frame, int channelsCount, int max_order, int iFrame, FlaCudaTask task) unsafe void select_best_methods(FlacFrame frame, int channelsCount, int max_order, int iFrame, FlaCudaTask task)
{ {
for (int ch = 0; ch < channelsCount; ch++) if (channelsCount == 4 && channels == 2)
{ {
int i; if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
for (i = 1; i < frame.blocksize; i++) frame.ch_mode = ChannelMode.LeftRight;
if (frame.subframes[ch].samples[i] != frame.subframes[ch].samples[0]) else if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3)
break; frame.ch_mode = ChannelMode.LeftSide;
// CONSTANT else if (task.BestResidualTasks[iFrame * 2].channel == 3 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1)
if (i == frame.blocksize) frame.ch_mode = ChannelMode.RightSide;
{ else if (task.BestResidualTasks[iFrame * 2].channel == 2 && task.BestResidualTasks[iFrame * 2 + 1].channel == 3)
frame.subframes[ch].best.type = SubframeType.Constant; frame.ch_mode = ChannelMode.MidSide;
frame.subframes[ch].best.size = frame.subframes[ch].obits;
}
// VERBATIM
else else
throw new Exception("internal error: invalid stereo mode");
frame.SwapSubframes(0, task.BestResidualTasks[iFrame * 2].channel);
frame.SwapSubframes(1, task.BestResidualTasks[iFrame * 2 + 1].channel);
}
else
frame.ch_mode = channels != 2 ? ChannelMode.NotStereo : ChannelMode.LeftRight;
for (int ch = 0; ch < channels; ch++)
{ {
frame.subframes[ch].best.type = SubframeType.Verbatim; frame.subframes[ch].best.type = SubframeType.Verbatim;
frame.subframes[ch].best.size = frame.subframes[ch].obits * (uint)frame.blocksize; frame.subframes[ch].best.size = frame.subframes[ch].obits * (uint)frame.blocksize;
}
}
if (frame.blocksize <= 4) int index = ch + iFrame * channels;
return; if (task.BestResidualTasks[index].size < 0)
throw new Exception("internal error");
for (int ch = 0; ch < channelsCount; ch++) if (frame.blocksize > 4 && frame.subframes[ch].best.size > task.BestResidualTasks[index].size)
{
int index = ch + iFrame * channelsCount;
if (frame.subframes[ch].best.size > task.BestResidualTasks[index].size)
{ {
frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type; frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type;
frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size; frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size;
@@ -1008,47 +996,10 @@ namespace CUETools.Codecs.FlaCuda
frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift; frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift;
for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++) for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++)
frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i];
AudioSamples.MemCpy(frame.subframes[ch].best.residual + frame.subframes[ch].best.order, (int*)task.residualBufferPtr + task.BestResidualTasks[index].samplesOffs, frame.blocksize - frame.subframes[ch].best.order); if (!encode_on_cpu)
AudioSamples.MemCpy(frame.subframes[ch].best.residual + frame.subframes[ch].best.order, (int*)task.residualBufferPtr + task.BestResidualTasks[index].residualOffs, frame.blocksize - frame.subframes[ch].best.order);
} }
//for (int iWindow = 0; iWindow < _windowcount; iWindow++)
//{
// for (int order = 1; order <= max_order && order < frame.blocksize; order++)
// {
// int index = (order - 1) + max_order * (iWindow + (_windowcount + 1) * (ch + iFrame * channelsCount));
// if (task.ResidualTasks[index].residualOrder != order || task.ResidualTasks[index].type != (int)SubframeType.LPC)
// throw new Exception("oops");
// if (frame.subframes[ch].best.size > task.ResidualTasks[index].size)
// {
// frame.subframes[ch].best.type = SubframeType.LPC;
// frame.subframes[ch].best.size = (uint)task.ResidualTasks[index].size;
// frame.subframes[ch].best.order = task.ResidualTasks[index].residualOrder;
// //frame.subframes[ch].best.window = iWindow;
// frame.subframes[ch].best.cbits = task.ResidualTasks[index].cbits;
// frame.subframes[ch].best.shift = task.ResidualTasks[index].shift;
// for (int i = 0; i < order; i++)
// frame.subframes[ch].best.coefs[i] = task.ResidualTasks[index].coefs[order - 1 - i];
// }
// }
//}
} }
// FIXED
//for (int ch = 0; ch < channelsCount; ch++)
//{
// for (int order = 1; order <= 5 && order <= max_order && order < frame.blocksize; order++)
// {
// int index = (order - 1) + max_order * (_windowcount + (_windowcount + 1) * (ch + iFrame * channelsCount));
// int forder = order == 5 ? 0 : order;
// if (task.ResidualTasks[index].residualOrder != (order == 5 ? 1 : order))
// throw new Exception("oops");
// if (frame.subframes[ch].best.size > task.ResidualTasks[index].size)
// {
// frame.subframes[ch].best.type = SubframeType.Fixed;
// frame.subframes[ch].best.size = (uint)task.ResidualTasks[index].size;
// frame.subframes[ch].best.order = forder;
// }
// }
//}
} }
unsafe void estimate_residual(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task) unsafe void estimate_residual(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
@@ -1090,18 +1041,30 @@ namespace CUETools.Codecs.FlaCuda
cuda.SetParameter(task.cudaSumResidual, 0, (uint)task.cudaResidualTasks.Pointer); cuda.SetParameter(task.cudaSumResidual, 0, (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaSumResidual, sizeof(uint), (uint)task.cudaResidualOutput.Pointer); cuda.SetParameter(task.cudaSumResidual, sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
cuda.SetParameter(task.cudaSumResidual, sizeof(uint) * 2, (uint)partSize); cuda.SetParameter(task.cudaSumResidual, 2 * sizeof(uint), (uint)partCount);
cuda.SetParameter(task.cudaSumResidual, sizeof(uint) * 3, (uint)partCount); cuda.SetParameterSize(task.cudaSumResidual, sizeof(uint) * 3U);
cuda.SetParameterSize(task.cudaSumResidual, sizeof(uint) * 4U);
cuda.SetFunctionBlockShape(task.cudaSumResidual, 64, 1, 1); cuda.SetFunctionBlockShape(task.cudaSumResidual, 64, 1, 1);
int tasksPerChannel = (_windowcount + 1) * max_order; int tasksPerChannel = (_windowcount + 1) * max_order;
int nBestTasks = nResidualTasks / tasksPerChannel; int nBestTasks = nResidualTasks / tasksPerChannel;
cuda.SetParameter(task.cudaChooseBestResidual, 0, (uint)task.cudaBestResidualTasks.Pointer); cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaChooseBestResidual, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer); cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
cuda.SetParameter(task.cudaChooseBestResidual, 2 * sizeof(uint), (uint)tasksPerChannel); cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)partCount);
cuda.SetParameterSize(task.cudaChooseBestResidual, sizeof(uint) * 3U); cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)tasksPerChannel);
cuda.SetFunctionBlockShape(task.cudaChooseBestResidual, 256, 1, 1); cuda.SetParameterSize(task.cudaChooseBestMethod, sizeof(uint) * 4U);
cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 16, 1);
cuda.SetParameter(task.cudaCopyBestMethod, 0, (uint)task.cudaBestResidualTasks.Pointer);
cuda.SetParameter(task.cudaCopyBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaCopyBestMethod, 2 * sizeof(uint), (uint)tasksPerChannel);
cuda.SetParameterSize(task.cudaCopyBestMethod, sizeof(uint) * 3U);
cuda.SetFunctionBlockShape(task.cudaCopyBestMethod, 64, 1, 1);
cuda.SetParameter(task.cudaCopyBestMethodStereo, 0, (uint)task.cudaBestResidualTasks.Pointer);
cuda.SetParameter(task.cudaCopyBestMethodStereo, 1 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaCopyBestMethodStereo, 2 * sizeof(uint), (uint)tasksPerChannel);
cuda.SetParameterSize(task.cudaCopyBestMethodStereo, sizeof(uint) * 3U);
cuda.SetFunctionBlockShape(task.cudaCopyBestMethodStereo, 64, 1, 1);
cuda.SetParameter(task.cudaEncodeResidual, 0, (uint)task.cudaResidual.Pointer); cuda.SetParameter(task.cudaEncodeResidual, 0, (uint)task.cudaResidual.Pointer);
cuda.SetParameter(task.cudaEncodeResidual, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer); cuda.SetParameter(task.cudaEncodeResidual, 1 * sizeof(uint), (uint)task.cudaSamples.Pointer);
@@ -1111,11 +1074,20 @@ namespace CUETools.Codecs.FlaCuda
// issue work to the GPU // issue work to the GPU
cuda.LaunchAsync(task.cudaEstimateResidual, partCount, (nResidualTasks / threads_y * nFrames) / maxFrames, task.stream); cuda.LaunchAsync(task.cudaEstimateResidual, partCount, (nResidualTasks / threads_y * nFrames) / maxFrames, task.stream);
cuda.LaunchAsync(task.cudaSumResidual, 1, (nResidualTasks * nFrames) / maxFrames, task.stream); //cuda.LaunchAsync(task.cudaSumResidual, 1, (nResidualTasks * nFrames) / maxFrames, task.stream);
cuda.LaunchAsync(task.cudaChooseBestResidual, 1, (nBestTasks * nFrames) / maxFrames, task.stream); cuda.LaunchAsync(task.cudaChooseBestMethod, 1, (nBestTasks * nFrames) / maxFrames, task.stream);
//cuda.LaunchAsync(task.cudaEncodeResidual, partCount, (nBestTasks * nFrames) / maxFrames, task.stream); if (channels == 2 && channelsCount == 4)
{
cuda.LaunchAsync(task.cudaCopyBestMethodStereo, 1, (nBestTasks * nFrames) / maxFrames / 4, task.stream);
nBestTasks /= 2;
}
else
cuda.LaunchAsync(task.cudaCopyBestMethod, 1, (nBestTasks * nFrames) / maxFrames, task.stream);
if (!encode_on_cpu)
cuda.LaunchAsync(task.cudaEncodeResidual, partCount, (nBestTasks * nFrames) / maxFrames, task.stream);
cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nBestTasks * nFrames) / maxFrames)), task.stream); cuda.CopyDeviceToHostAsync(task.cudaBestResidualTasks, task.bestResidualTasksPtr, (uint)(sizeof(encodeResidualTaskStruct) * ((nBestTasks * nFrames) / maxFrames)), task.stream);
//cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelsCount), task.stream); if (!encode_on_cpu)
cuda.CopyDeviceToHostAsync(task.cudaResidual, task.residualBufferPtr, (uint)(sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channels), task.stream);
} }
unsafe void compute_autocorellation(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task) unsafe void compute_autocorellation(int blocksize, int channelsCount, int max_order, int nFrames, FlaCudaTask task)
@@ -1169,14 +1141,6 @@ namespace CUETools.Codecs.FlaCuda
select_best_methods(frame, channelCount, eparams.max_prediction_order, iFrame, task); select_best_methods(frame, channelCount, eparams.max_prediction_order, iFrame, task);
if (doMidside)
{
measure_frame_size(frame, true);
frame.ChooseSubframes();
}
else
frame.ch_mode = channels != 2 ? ChannelMode.NotStereo : ChannelMode.LeftRight;
encode_residual(frame); encode_residual(frame);
frame_writer.Reset(); frame_writer.Reset();
@@ -1610,6 +1574,8 @@ namespace CUETools.Codecs.FlaCuda
frame_buffer = new byte[max_frame_size + 1]; frame_buffer = new byte[max_frame_size + 1];
frame_writer = new BitWriter(frame_buffer, 0, max_frame_size + 1); frame_writer = new BitWriter(frame_buffer, 0, max_frame_size + 1);
encode_on_cpu = eparams.lpc_max_precision_search <= 1;
return header_len; return header_len;
} }
} }
@@ -1792,7 +1758,10 @@ namespace CUETools.Codecs.FlaCuda
public int type; public int type;
public int obits; public int obits;
public int blocksize; public int blocksize;
public fixed int reserved[8]; public int best_index;
public int channel;
public int residualOffs;
public fixed int reserved[5];
public fixed int coefs[32]; public fixed int coefs[32];
}; };
@@ -1802,7 +1771,9 @@ namespace CUETools.Codecs.FlaCuda
public CUfunction cudaComputeAutocor; public CUfunction cudaComputeAutocor;
public CUfunction cudaComputeLPC; public CUfunction cudaComputeLPC;
public CUfunction cudaEstimateResidual; public CUfunction cudaEstimateResidual;
public CUfunction cudaChooseBestResidual; public CUfunction cudaChooseBestMethod;
public CUfunction cudaCopyBestMethod;
public CUfunction cudaCopyBestMethodStereo;
//public CUfunction cudaSumResidualChunks; //public CUfunction cudaSumResidualChunks;
public CUfunction cudaSumResidual; public CUfunction cudaSumResidual;
public CUfunction cudaEncodeResidual; public CUfunction cudaEncodeResidual;
@@ -1842,7 +1813,7 @@ namespace CUETools.Codecs.FlaCuda
cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FlaCudaWriter.maxAutocorParts)); cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * FlaCudaWriter.maxAutocorParts));
cudaResidualTasks = cuda.Allocate((uint)residualTasksLen); cudaResidualTasks = cuda.Allocate((uint)residualTasksLen);
cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen); cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen);
cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * FlaCudaWriter.maxResidualParts)); cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS + 1) * lpc.MAX_LPC_ORDER * FlaCudaWriter.maxResidualParts * FlaCudaWriter.maxFrames));
CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen); CUResult cuErr = CUDADriver.cuMemAllocHost(ref samplesBufferPtr, (uint)samplesBufferLen);
if (cuErr == CUResult.Success) if (cuErr == CUResult.Success)
cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen); cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen);
@@ -1866,7 +1837,9 @@ namespace CUETools.Codecs.FlaCuda
cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC"); cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual"); cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual"); cudaSumResidual = cuda.GetModuleFunction("cudaSumResidual");
cudaChooseBestResidual = cuda.GetModuleFunction("cudaChooseBestResidual"); cudaChooseBestMethod = cuda.GetModuleFunction("cudaChooseBestMethod");
cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod");
cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo");
cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual"); cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
//cudaSumResidualChunks = cuda.GetModuleFunction("cudaSumResidualChunks"); //cudaSumResidualChunks = cuda.GetModuleFunction("cudaSumResidualChunks");

View File

@@ -46,7 +46,10 @@ typedef struct
int type; int type;
int obits; int obits;
int blocksize; int blocksize;
int reserved[8]; int best_index;
int channel;
int residualOffs;
int reserved[5];
int coefs[32]; int coefs[32];
} encodeResidualTaskStruct; } encodeResidualTaskStruct;
@@ -246,7 +249,7 @@ extern "C" __global__ void cudaEstimateResidual(
for (c = 0; c < shared.task[threadIdx.y].residualOrder; c++) for (c = 0; c < shared.task[threadIdx.y].residualOrder; c++)
sum += __mul24(shared.data[ptr + c], shared.task[threadIdx.y].coefs[c]); sum += __mul24(shared.data[ptr + c], shared.task[threadIdx.y].coefs[c]);
sum = shared.data[ptr + c] - (sum >> shared.task[threadIdx.y].shift); sum = shared.data[ptr + c] - (sum >> shared.task[threadIdx.y].shift);
shared.residual[tid] += __mul24(ptr < residualLen, (sum << 1) ^ (sum >> 31)); shared.residual[tid] += __mul24(ptr < residualLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
} }
// enable this line when using blockDim.x == 64 // enable this line when using blockDim.x == 64
@@ -258,13 +261,14 @@ extern "C" __global__ void cudaEstimateResidual(
shared.residual[tid] += shared.residual[tid + 1]; shared.residual[tid] += shared.residual[tid + 1];
// rice parameter search // rice parameter search
shared.residual[tid] = __mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y * blockDim.x] - (residualLen >> 1)) >> threadIdx.x); shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y * blockDim.x] != 0) *
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y * blockDim.x] - (residualLen >> 1)) >> threadIdx.x));
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 8]); shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 8]);
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 4]); shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 4]);
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 2]); shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 2]);
shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 1]); shared.residual[tid] = min(shared.residual[tid], shared.residual[tid + 1]);
if (threadIdx.x == 0) if (threadIdx.x == 0)
output[(blockIdx.y * blockDim.y + threadIdx.y) * gridDim.x + blockIdx.x] = shared.residual[tid]; output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid];
} }
// blockDim.x == 256 // blockDim.x == 256
@@ -327,12 +331,11 @@ extern "C" __global__ void cudaSumResidualChunks(
extern "C" __global__ void cudaSumResidual( extern "C" __global__ void cudaSumResidual(
encodeResidualTaskStruct *tasks, encodeResidualTaskStruct *tasks,
int *residual, int *residual,
int partSize,
int partCount // <= blockDim.y (256) int partCount // <= blockDim.y (256)
) )
{ {
__shared__ struct { __shared__ struct {
int partLen[256]; volatile int partLen[256];
encodeResidualTaskStruct task; encodeResidualTaskStruct task;
} shared; } shared;
@@ -355,47 +358,154 @@ extern "C" __global__ void cudaSumResidual(
shared.partLen[tid] += shared.partLen[tid + 1]; shared.partLen[tid] += shared.partLen[tid + 1];
// return sum // return sum
if (tid == 0) if (tid == 0)
tasks[blockIdx.y].size = shared.task.type == Fixed ? tasks[blockIdx.y].size = min(shared.task.obits * shared.task.blocksize,
shared.task.residualOrder * shared.task.obits + 6 + shared.partLen[0] : shared.task.type == LPC ? shared.task.type == Fixed ? shared.task.residualOrder * shared.task.obits + 6 + shared.partLen[0] :
shared.task.residualOrder * shared.task.obits + 4 + 5 + shared.task.residualOrder * shared.task.cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[0] : shared.task.type == LPC ? shared.task.residualOrder * shared.task.obits + 4 + 5 + shared.task.residualOrder * shared.task.cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[0] :
shared.task.obits * shared.task.blocksize; shared.task.type == Constant ? shared.task.obits * (1 + shared.task.blocksize * (shared.partLen[0] != 0)) :
shared.task.obits * shared.task.blocksize);
} }
#define BEST_INDEX(a,b) ((a) + ((b) - (a)) * (shared.length[b] < shared.length[a])) #define BEST_INDEX(a,b) ((a) + ((b) - (a)) * (shared.length[b] < shared.length[a]))
extern "C" __global__ void cudaChooseBestResidual( extern "C" __global__ void cudaChooseBestMethod(
encodeResidualTaskStruct *tasks,
int *residual,
int partCount, // <= blockDim.y (256)
int taskCount
)
{
__shared__ struct {
volatile int index[128];
volatile int partLen[512];
int length[256];
volatile encodeResidualTaskStruct task[16];
} shared;
const int tid = threadIdx.x + threadIdx.y * 32;
if (tid < 256) shared.length[tid] = 0x7fffffff;
for (int task = 0; task < taskCount; task += blockDim.y)
if (task + threadIdx.y < taskCount)
{
// fetch task data
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(tasks + task + threadIdx.y + taskCount * blockIdx.y))[threadIdx.x];
int sum = 0;
for (int pos = 0; pos < partCount; pos += blockDim.x)
sum += (pos + threadIdx.x < partCount ? residual[pos + threadIdx.x + 64 * (task + threadIdx.y + taskCount * blockIdx.y)] : 0);
shared.partLen[tid] = sum;
// length sum: reduction in shared mem
shared.partLen[tid] += shared.partLen[tid + 16];
shared.partLen[tid] += shared.partLen[tid + 8];
shared.partLen[tid] += shared.partLen[tid + 4];
shared.partLen[tid] += shared.partLen[tid + 2];
shared.partLen[tid] += shared.partLen[tid + 1];
// return sum
if (threadIdx.x == 0)
{
shared.length[task + threadIdx.y] =
min(shared.task[threadIdx.y].obits * shared.task[threadIdx.y].blocksize,
shared.task[threadIdx.y].type == Fixed ? shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].obits + 6 + shared.partLen[threadIdx.y * 32] :
shared.task[threadIdx.y].type == LPC ? shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].obits + 4 + 5 + shared.task[threadIdx.y].residualOrder * shared.task[threadIdx.y].cbits + 6 + (4 * partCount/2)/* << porder */ + shared.partLen[threadIdx.y * 32] :
shared.task[threadIdx.y].type == Constant ? shared.task[threadIdx.y].obits * (1 + shared.task[threadIdx.y].blocksize * (shared.partLen[threadIdx.y * 32] != 0)) :
shared.task[threadIdx.y].obits * shared.task[threadIdx.y].blocksize);
}
}
//shared.index[threadIdx.x] = threadIdx.x;
//shared.length[threadIdx.x] = (threadIdx.x < taskCount) ? tasks[threadIdx.x + taskCount * blockIdx.y].size : 0x7fffffff;
__syncthreads();
//if (tid < 128) shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 128]); __syncthreads();
if (tid < 128) shared.index[tid] = BEST_INDEX(tid, tid + 128); __syncthreads();
if (tid < 64) shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 64]); __syncthreads();
if (tid < 32)
{
shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 32]);
shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 16]);
shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 8]);
shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 4]);
shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 2]);
shared.index[tid] = BEST_INDEX(shared.index[tid], shared.index[tid + 1]);
}
__syncthreads();
// if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
//((int*)(tasks_out + blockIdx.y))[threadIdx.x] = ((int*)(tasks + taskCount * blockIdx.y + shared.index[0]))[threadIdx.x];
if (tid == 0)
tasks[taskCount * blockIdx.y].best_index = taskCount * blockIdx.y + shared.index[0];
if (tid < taskCount)
tasks[tid + taskCount * blockIdx.y].size = shared.length[tid];
}
extern "C" __global__ void cudaCopyBestMethod(
encodeResidualTaskStruct *tasks_out, encodeResidualTaskStruct *tasks_out,
encodeResidualTaskStruct *tasks, encodeResidualTaskStruct *tasks,
int count int count
) )
{ {
__shared__ struct { __shared__ struct {
volatile int index[128]; int best_index;
int length[256];
} shared; } shared;
if (threadIdx.x == 0)
//shared.index[threadIdx.x] = threadIdx.x; shared.best_index = tasks[count * blockIdx.y].best_index;
shared.length[threadIdx.x] = (threadIdx.x < count) ? tasks[threadIdx.x + count * blockIdx.y].size : 0x7fffffff;
__syncthreads(); __syncthreads();
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
((int*)(tasks_out + blockIdx.y))[threadIdx.x] = ((int*)(tasks + shared.best_index))[threadIdx.x];
}
//if (threadIdx.x < 128) shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 128]); __syncthreads(); extern "C" __global__ void cudaCopyBestMethodStereo(
if (threadIdx.x < 128) shared.index[threadIdx.x] = BEST_INDEX(threadIdx.x, threadIdx.x + 128); __syncthreads(); encodeResidualTaskStruct *tasks_out,
if (threadIdx.x < 64) shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 64]); __syncthreads(); encodeResidualTaskStruct *tasks,
if (threadIdx.x < 32) int count
)
{ {
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 32]); __shared__ struct {
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 16]); int best_index[4];
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 8]); int best_size[4];
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 4]); int lr_index[2];
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 2]); } shared;
shared.index[threadIdx.x] = BEST_INDEX(shared.index[threadIdx.x], shared.index[threadIdx.x + 1]); if (threadIdx.x < 4)
shared.best_index[threadIdx.x] = tasks[count * (blockIdx.y * 4 + threadIdx.x)].best_index;
if (threadIdx.x < 4)
shared.best_size[threadIdx.x] = tasks[shared.best_index[threadIdx.x]].size;
__syncthreads();
if (threadIdx.x == 0)
{
int bitsBest = 0x7fffffff;
if (bitsBest > shared.best_size[2] + shared.best_size[3]) // MidSide
{
bitsBest = shared.best_size[2] + shared.best_size[3];
shared.lr_index[0] = shared.best_index[2];
shared.lr_index[1] = shared.best_index[3];
}
if (bitsBest > shared.best_size[3] + shared.best_size[1]) // RightSide
{
bitsBest = shared.best_size[3] + shared.best_size[1];
shared.lr_index[0] = shared.best_index[3];
shared.lr_index[1] = shared.best_index[1];
}
if (bitsBest > shared.best_size[0] + shared.best_size[3]) // LeftSide
{
bitsBest = shared.best_size[0] + shared.best_size[3];
shared.lr_index[0] = shared.best_index[0];
shared.lr_index[1] = shared.best_index[3];
}
if (bitsBest > shared.best_size[0] + shared.best_size[1]) // LeftRight
{
bitsBest = shared.best_size[0] + shared.best_size[1];
shared.lr_index[0] = shared.best_index[0];
shared.lr_index[1] = shared.best_index[1];
}
} }
__syncthreads(); __syncthreads();
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int)) if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
((int*)(tasks_out + blockIdx.y))[threadIdx.x] = ((int*)(tasks + count * blockIdx.y + shared.index[0]))[threadIdx.x]; ((int*)(tasks_out + 2 * blockIdx.y))[threadIdx.x] = ((int*)(tasks + shared.lr_index[0]))[threadIdx.x];
// if (threadIdx.x == 0) if (threadIdx.x == 0)
//tasks[count * blockIdx.y].best = count * blockIdx.y + shared.index[0]; tasks_out[2 * blockIdx.y].residualOffs = tasks[shared.best_index[0]].residualOffs;
if (threadIdx.x < sizeof(encodeResidualTaskStruct)/sizeof(int))
((int*)(tasks_out + 2 * blockIdx.y + 1))[threadIdx.x] = ((int*)(tasks + shared.lr_index[1]))[threadIdx.x];
if (threadIdx.x == 0)
tasks_out[2 * blockIdx.y + 1].residualOffs = tasks[shared.best_index[1]].residualOffs;
} }
extern "C" __global__ void cudaEncodeResidual( extern "C" __global__ void cudaEncodeResidual(
@@ -428,6 +538,6 @@ extern "C" __global__ void cudaEncodeResidual(
for (int c = 0; c < shared.task.residualOrder; c++) for (int c = 0; c < shared.task.residualOrder; c++)
sum += __mul24(shared.data[tid + c], shared.task.coefs[c]); sum += __mul24(shared.data[tid + c], shared.task.coefs[c]);
if (tid < residualLen) if (tid < residualLen)
output[shared.task.samplesOffs + pos + tid] = shared.data[tid + shared.task.residualOrder] - (sum >> shared.task.shift); output[shared.task.residualOffs + pos + tid] = shared.data[tid + shared.task.residualOrder] - (sum >> shared.task.shift);
} }
#endif #endif

View File

@@ -1,93 +1,6 @@
architecture {sm_10} architecture {sm_10}
abiversion {1} abiversion {1}
modname {cubin} modname {cubin}
code {
name = cudaChooseBestResidual
lmem = 0
smem = 1564
reg = 7
bar = 1
const {
segname = const
segnum = 1
offset = 0
bytes = 16
mem {
0x0000007f 0x0000003f 0x0000001f 0x0000002f
}
}
bincode {
0xa0000009 0x04000780 0x3002cdfd 0x6420c7c8
0xa0010003 0x00000000 0x1000f003 0x00000280
0x1000cc01 0x0423c780 0x40014e05 0x00200780
0x30100205 0xc4100780 0x60004e01 0x00204780
0x20000001 0x04008780 0x30070005 0xc4100780
0x30060001 0xc4100780 0x20008200 0x2100ea00
0x20108001 0x00000003 0xd00e0001 0x80c00780
0x10010003 0x00000780 0x103f8001 0x07ffffff
0x00020405 0xc0000782 0x04010e01 0xe4200780
0x3002040d 0xc4100780 0x861ffe03 0x00000000
0x308005fd 0x644107c8 0xa001f003 0x00000000
0x1001f003 0x00000280 0x00000605 0xc0000780
0xd408380d 0x20000780 0xd4043809 0x20000780
0x1c00c001 0x0423c780 0x3800c1fd 0x6c2107c8
0x20008401 0x0000000b 0x10000401 0x0403c500
0x04000e01 0xe4200780 0xf0000001 0xe0000002
0x861ffe03 0x00000000 0x308105fd 0x644107c8
0xa0030003 0x00000000 0x10030003 0x00000280
0x00000605 0xc0000780 0xd4023809 0x20000780
0x0802c00d 0xc0200780 0x0402ce11 0xc0200780
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
0xd0043811 0x20000784 0x1d00e004 0x2940e000
0x3001c005 0x6c20c784 0xd0010001 0x04020780
0x2400ce01 0x04200780 0x04000e01 0xe4200780
0xf0000001 0xe0000002 0x861ffe03 0x00000000
0x308205fd 0x644107c8 0xa0074003 0x00000000
0x10074003 0x00000280 0x00000605 0xc0000780
0xd4013809 0x20000780 0x0802c00d 0xc0200780
0x0402ce11 0xc0200780 0xdc04380d 0x20000780
0x1400ce01 0x0423c780 0xd0043811 0x20000784
0x1d00e004 0x2940e000 0x3001c005 0x6c20c784
0xd0010001 0x04020780 0x2400ce01 0x04200780
0x04000e01 0xe4200780 0x0402ee0d 0xc0200780
0x0402ce09 0xc0200780 0xdc04380d 0x20000780
0x1400ce01 0x0423c780 0xd8043809 0x20000780
0x1c00c005 0x0423c780 0x2440ee01 0x04200780
0x3801c005 0x6c20c780 0xd0010001 0x04020780
0x2400ce01 0x04200780 0x04000e01 0xe4200780
0x0402de0d 0xc0200780 0x0402ce09 0xc0200780
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
0xd8043809 0x20000780 0x1d00e004 0x2540fe00
0x3801c005 0x6c20c780 0xd0010001 0x04020780
0x2400ce01 0x04200780 0x04000e01 0xe4200780
0x0402d60d 0xc0200780 0x0402ce09 0xc0200780
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
0xd8043809 0x20000780 0x1d00e004 0x2540f600
0x3801c005 0x6c20c780 0xd0010001 0x04020780
0x2400ce01 0x04200780 0x04000e01 0xe4200780
0x0402d20d 0xc0200780 0x0402ce09 0xc0200780
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
0xd8043809 0x20000780 0x1d00e004 0x2540f200
0x3801c005 0x6c20c780 0xd0010001 0x04020780
0x2400ce01 0x04200780 0x04000e01 0xe4200780
0x0402d00d 0xc0200780 0x0402ce09 0xc0200780
0xdc04380d 0x20000780 0x1400ce01 0x0423c780
0xd8043809 0x20000780 0x1d00e004 0x2540f000
0x3801c005 0x6c20c780 0xd0010001 0x04020780
0x2400ce01 0x04200780 0x04000e01 0xe4200780
0xf0000001 0xe0000002 0x861ffe03 0x00000000
0x308305fd 0x644107c8 0x30000003 0x00000280
0xa0004e05 0x04200780 0x1000cc01 0x0423c780
0x40020209 0x00000780 0x30100409 0xc4100780
0x60020001 0x00008780 0x3007ce09 0xc4300780
0x3006ce11 0xc4300780 0x30070015 0xc4100780
0x30060019 0xc4100780 0x30070201 0xc4100780
0x30060205 0xc4100780 0x20048408 0x20068a10
0x20018000 0x20028608 0x2104ea10 0x2100e804
0x20000401 0x04010780 0xd00e0001 0x80c00780
0x20000605 0x04004780 0xd00e0201 0xa0c00781
}
}
code { code {
name = cudaComputeAutocor name = cudaComputeAutocor
lmem = 0 lmem = 0
@@ -190,10 +103,10 @@ code {
segname = const segname = const
segnum = 1 segnum = 1
offset = 0 offset = 0
bytes = 20 bytes = 24
mem { mem {
0x000003ff 0x0000000f 0x0000001f 0x00000001 0x000003ff 0x0000000f 0x0000001f 0x00000001
0x0000000e 0x007fffff 0x0000000e
} }
} }
bincode { bincode {
@@ -244,8 +157,8 @@ code {
0x1000f801 0x0403c780 0x08047401 0xe4200782 0x1000f801 0x0403c780 0x08047401 0xe4200782
0xd4112809 0x20000780 0x3883c005 0x6c608780 0xd4112809 0x20000780 0x3883c005 0x6c608780
0xa0004401 0x04200780 0xd001001d 0x04000780 0xa0004401 0x04200780 0xd001001d 0x04000780
0x30000ffd 0x640187c8 0xa0093003 0x00000000 0x30000ffd 0x640187c8 0xa0094003 0x00000000
0x10091003 0x00000280 0x3003d201 0xac200780 0x10092003 0x00000280 0x3003d201 0xac200780
0x307c000d 0x8c000780 0xd4110009 0x20000780 0x307c000d 0x8c000780 0xd4110009 0x20000780
0x387cc1fd 0x6c20c7c8 0xa007c003 0x00000000 0x387cc1fd 0x6c20c7c8 0xa007c003 0x00000000
0x1000f801 0x0403c780 0x1000f825 0x0403c780 0x1000f801 0x0403c780 0x1000f825 0x0403c780
@@ -266,42 +179,242 @@ code {
0xd0000205 0x04008780 0x30080601 0x6c010780 0xd0000205 0x04008780 0x30080601 0x6c010780
0x00020c0d 0xc0000780 0xdc095009 0x20000780 0x00020c0d 0xc0000780 0xdc095009 0x20000780
0xa0000021 0x2c014780 0x1800c001 0x0423c780 0xa0000021 0x2c014780 0x1800c001 0x0423c780
0x60011005 0x80000780 0x20018e1d 0x00000003 0x30840205 0xac400780 0x60011005 0x80000780
0xa0004401 0x04200780 0x30000ffd 0x640147c8 0x20018e1d 0x00000003 0xa0004401 0x04200780
0x0c025401 0xe4204780 0x10063003 0x00000280 0x30000ffd 0x640147c8 0x0c025401 0xe4204780
0x10093003 0x00000780 0x3003d201 0xac200780 0x10063003 0x00000280 0x10094003 0x00000780
0x307c000d 0x8c000780 0x00020c0d 0xc0000782 0x3003d201 0xac200780 0x307c000d 0x8c000780
0xdc095005 0x20000780 0x1400e001 0x0423c780 0x00020c0d 0xc0000782 0xdc095009 0x20000780
0x2400c001 0x04200780 0x0c025401 0xe4200780 0x1800e001 0x0423c780 0x2800c001 0x04200780
0x1500f000 0x2500e000 0x0c025401 0xe4200780 0x0c025401 0xe4200780 0x1900f000 0x2900e000
0x1500e800 0x2500e000 0x0c025401 0xe4200780 0x0c025401 0xe4200780 0x1900e800 0x2900e000
0x1500e400 0x2500e000 0x0c025401 0xe4200780 0x0c025401 0xe4200780 0x1900e400 0x2900e000
0x1400c205 0x0423c780 0x20018801 0x00000003 0x0c025401 0xe4200780 0x1900e200 0x2900e000
0x2400c005 0x04204780 0x0c025401 0xe4204780 0x0c025401 0xe4200780 0xd4112805 0x20000780
0x40010c05 0x00000780 0x60000e05 0x00004780 0x347cc1fd 0x6c2147c8 0xa00ac003 0x00000000
0x30100205 0xc4100780 0x60000c01 0x00004780 0x100a9003 0x00000280 0x00020a05 0xc0000780
0x30840805 0x64410780 0x00020a09 0xc0000780 0xd4095005 0x20000780 0x347cc1fd 0x6c2087c8
0xd8095009 0x20000780 0xa0000215 0x2c014780 0x100ab003 0x00000280 0x10018001 0x00000003
0x30010605 0xec100780 0x407f8a0d 0x0007ffff 0x100ac003 0x00000780 0x1000f801 0x0403c780
0x2941e004 0x2003800c 0x30040201 0xec000780 0xf0000001 0xe0000002 0x20018805 0x00000003
0x20000001 0x0400c780 0x0c025401 0xe4200780 0x40030c1d 0x00000780 0x60020e1d 0x0001c780
0x1400d001 0x0423c780 0x3400c001 0xac200780 0x00020a05 0xc0000780 0xd4095005 0x20000780
0x0c025401 0xe4200780 0x1400c801 0x0423c780 0x30010615 0xec100780 0x30100e1d 0xc4100780
0x3400c001 0xac200780 0x0c025401 0xe4200780 0x2440c015 0x04214780 0x60020c0d 0x0001c780
0x1400c401 0x0423c780 0x3400c001 0xac200780 0x307c09fd 0x640147c8 0x30040a05 0xec000780
0x0c025401 0xe4200780 0x1400c201 0x0423c780 0x30850811 0x64410780 0xa0000811 0x2c014780
0x3400c001 0xac200780 0x307c09fd 0x640147c8 0x407f8811 0x0007ffff 0x2004860c 0x20038204
0x0c025401 0xe4200780 0x30000003 0x00000280 0x4003000d 0x00000780 0x6002020d 0x0000c780
0x3010060d 0xc4100780 0x60020001 0x0000c780
0x00020c09 0xc0000780 0x08025401 0xe4200780
0xd8095005 0x20000780 0x1400d001 0x0423c780
0x3400c001 0xac200780 0x08025401 0xe4200780
0x1400c801 0x0423c780 0x3400c001 0xac200780
0x08025401 0xe4200780 0x1400c401 0x0423c780
0x3400c001 0xac200780 0x08025401 0xe4200780
0x1400c201 0x0423c780 0x3400c001 0xac200780
0x08025401 0xe4200780 0x30000003 0x00000280
0xa0004401 0x04200780 0x40014e05 0x00200780 0xa0004401 0x04200780 0x40014e05 0x00200780
0x30100205 0xc4100780 0x60004e01 0x00204780 0x30100205 0xc4100780 0x60004e01 0x00204780
0x20000001 0x04008780 0x40014805 0x00200780 0x20000001 0x04008780 0x30060001 0xc4100780
0x30100205 0xc4100780 0x60004801 0x00204780 0xa0004c0d 0x04200780 0x20000601 0x04000780
0xa0004c0d 0x04200780 0x20000001 0x0400c780
0x00020c0d 0xc0000780 0xdc095005 0x20000780 0x00020c0d 0xc0000780 0xdc095005 0x20000780
0x30020005 0xc4100780 0x1500e000 0x2101e804 0x30020005 0xc4100780 0x1500e000 0x2101e804
0xd00e0201 0xa0c00781 0xd00e0201 0xa0c00781
} }
} }
code {
name = cudaChooseBestMethod
lmem = 0
smem = 6688
reg = 10
bar = 1
const {
segname = const
segnum = 1
offset = 0
bytes = 32
mem {
0x000003ff 0x000000ff 0x00000008 0x00000020
0x00000001 0x0000007f 0x0000003f 0x0000001f
}
}
bincode {
0xd0800209 0x00400780 0xa000040d 0x04000780
0xa0000009 0x04000780 0xd0800209 0x00400780
0xa0000405 0x04000780 0x30050205 0xc4100780
0x20000409 0x04004780 0x308105fd 0x6c4107c8
0xa0013003 0x00000000 0x00070609 0xc0000780
0x10013003 0x00000280 0xa0000009 0x04000780
0xd0800209 0x00400780 0xa0000405 0x04000780
0x30050205 0xc4100780 0x20000409 0x04004780
0x00020405 0xc0000780 0x103f8005 0x07ffffff
0x04051001 0xe4204780 0x307ccffd 0x6c20c7ca
0x100a3003 0x00000280 0x1000f811 0x0403c780
0x20000815 0x0400c780 0x3005cffd 0x6420c7c8
0xa009f003 0x00000000 0x1009f003 0x00000280
0x1000ce05 0x0423c780 0x40034e09 0x00200780
0x30100409 0xc4100780 0x60024e1d 0x00208780
0x30070605 0xc4100780 0x30060609 0xc4100780
0x20000205 0x04008780 0x30070e09 0xc4100780
0x30060e19 0xc4100780 0x20000409 0x04018780
0xa0000019 0x04000780 0x30020c21 0xc4100780
0x20029008 0x20088220 0x0000100d 0xc0000780
0x30070821 0xc4100780 0x30060825 0xc4100780
0x20099020 0x2108e820 0x00000205 0xc0000780
0x20088204 0x20018404 0xd00e0205 0x80c00780
0x0c071001 0xe4204780 0x307ccdfd 0x6c20c7c8
0x1000f805 0x0403c780 0x10043003 0x00000280
0xa0042003 0x00000000 0x10008c08 0x2106ec18
0xa0004221 0x04200780 0x3002cdfd 0x6420c7c8
0xa003f003 0x00000000 0x1003f003 0x00000280
0x20000a25 0x0401c780 0x30061225 0xc4100780
0x20000425 0x04024780 0x30021225 0xc4100780
0x2000ca25 0x04224780 0xd00e1225 0x80c00780
0x20001205 0x04004780 0x20000409 0x04020782
0x300605fd 0x6c0047c8 0x10035003 0x00000280
0xf0000001 0xe0000002 0xa0000009 0x04000780
0xd0800231 0x00400780 0xa0001819 0x04000780
0x30050c19 0xc4100780 0x20000409 0x04018780
0x0002040d 0xc0000780 0x0c011001 0xe4204780
0xdc044011 0x20000780 0x1000e005 0x0423c784
0x2000c005 0x04204784 0x0c011001 0xe4204780
0x1000d005 0x0423c784 0x2000c005 0x04204784
0x0c011001 0xe4204780 0x1000c805 0x0423c784
0x2000c005 0x04204784 0x0c011001 0xe4204780
0x1000c405 0x0423c784 0x2000c005 0x04204784
0x0c011001 0xe4204780 0x1000c205 0x0423c784
0x2000c005 0x04204784 0xa0000019 0x04000780
0x307c0dfd 0x640147c8 0x0c011001 0xe4204780
0x1009f003 0x00000280 0xd41c680d 0x20000780
0x1d00e204 0x1d00e408 0x40050419 0x00000780
0x60040619 0x00018780 0x30100c19 0xc4100780
0x3c82c1fd 0x6c6147c8 0x60040419 0x00018780
0xa009c003 0x00000000 0x10070003 0x00000280
0xd41c4005 0x20000780 0x1500e004 0x1500ec08
0x4005041d 0x00000780 0x6004061d 0x0001c780
0x30100e1d 0xc4100780 0x60040405 0x0001c780
0xd8044005 0x20000780 0x2400c005 0x04204780
0x20068205 0x00000003 0x1009c003 0x00000780
0xd41c680d 0x20000780 0x3c83c1fd 0x6c6147c8
0xa009b003 0x00000000 0x10085003 0x00000280
0x3002cc05 0xc4300780 0x301f0209 0xec100780
0xd0840409 0x04400780 0x2000041d 0x04004780
0xd41c4005 0x20000780 0x1500ec08 0x1500e004
0x2400c609 0x04208780 0x40050421 0x00000780
0x60040621 0x00020780 0x30101021 0xc4100780
0x60040405 0x00020780 0x30010e09 0xec100780
0x20000205 0x04008780 0xd8044005 0x20000780
0x2400c005 0x04204780 0x200f8205 0x00000003
0x1009b003 0x00000780 0xd41c680d 0x20000780
0x3c7cc1fd 0x6c2147c8 0xa009a003 0x00000000
0x10094003 0x00000280 0xd804400d 0x20000780
0xd41c7005 0x20000780 0x3c7cc1fd 0x6c2087c8
0x2501e209 0x00000003 0x1400c005 0x0423c780
0x10000809 0x2440c280 0x4005041d 0x00000780
0x6004061d 0x0001c780 0x30100e1d 0xc4100780
0x60040405 0x0001c780 0x1009a003 0x00000780
0xd41c7005 0x20000780 0x1500e004 0x1500e208
0x4005041d 0x00000780 0x6004061d 0x0001c780
0x30100e1d 0xc4100780 0x60040405 0x0001c780
0xf0000001 0xe0000002 0xf0000001 0xe0000002
0x30060205 0xac000782 0x00020a05 0xc0000780
0x04051001 0xe4204780 0xa0004405 0x04200782
0x20000811 0x04004780 0x3004cffd 0x6c2107c8
0x10016003 0x00000280 0x861ffe03 0x00000000
0xa0000009 0x04000780 0xd0800209 0x00400780
0xa0000405 0x04000780 0x30050205 0xc4100780
0x20000409 0x04004780 0x308505fd 0x6c4107c8
0xa00be003 0x00000000 0x100be003 0x00000280
0xa0000009 0x04000780 0xd0800209 0x00400780
0xa0000405 0x04000780 0x30050205 0xc4100780
0x20000409 0x04004780 0x00020405 0xc0000780
0xd418400d 0x20000780 0xd4144009 0x20000780
0x1c00c005 0x0423c780 0x3801c1fd 0x6c2107c8
0xa0000009 0x04000780 0xd0800209 0x00400780
0xa0000405 0x04000780 0x30050205 0xc4100780
0x20000409 0x04004780 0x20008405 0x0000000b
0x10000405 0x0403c500 0x04001001 0xe4204780
0xf0000001 0xe0000002 0x861ffe03 0x00000000
0xa0000009 0x04000780 0xd0800209 0x00400780
0xa0000405 0x04000780 0x30050205 0xc4100780
0x20000409 0x04004780 0x308605fd 0x6c4107c8
0xa00d9003 0x00000000 0x100d9003 0x00000280
0xa0000009 0x04000780 0xd0800209 0x00400780
0xa0000405 0x04000780 0x30050205 0xc4100780
0x20000409 0x04004780 0x00020405 0xc0000780
0xd4024009 0x20000780 0x0802c00d 0xc0200780
0x0402d011 0xc0200780 0xdc14400d 0x20000780
0x1400d005 0x0423c780 0xd0144011 0x20000784
0x1d00e008 0x2941e004 0x3002c009 0x6c20c784
0xd0020205 0x04020780 0x2400d005 0x04204780
0x04001001 0xe4204780 0xf0000001 0xe0000002
0x861ffe03 0x00000000 0xa0000009 0x04000780
0xd0800209 0x00400780 0xa0000405 0x04000780
0x30050205 0xc4100780 0x20000409 0x04004780
0x308705fd 0x6c4107c8 0xa0128003 0x00000000
0x10128003 0x00000280 0xa0000009 0x04000780
0xd0800209 0x00400780 0xa0000405 0x04000780
0x30050205 0xc4100780 0x20000409 0x04004780
0x00020405 0xc0000780 0xd4014009 0x20000780
0x0802c00d 0xc0200780 0x0402d011 0xc0200780
0xdc14400d 0x20000780 0x1400d005 0x0423c780
0xd0144011 0x20000784 0x1d00e008 0x2941e004
0x3002c009 0x6c20c784 0xd0020205 0x04020780
0x2400d005 0x04204780 0x04001001 0xe4204780
0x0402f00d 0xc0200780 0x0402d009 0xc0200780
0xdc14400d 0x20000780 0x1400d005 0x0423c780
0xd8144009 0x20000780 0x1c00c009 0x0423c780
0x2440f005 0x04204780 0x3802c009 0x6c20c780
0xd0020205 0x04020780 0x2400d005 0x04204780
0x04001001 0xe4204780 0x0402e00d 0xc0200780
0x0402d009 0xc0200780 0xdc14400d 0x20000780
0x1400d005 0x0423c780 0xd8144009 0x20000780
0x1c00c009 0x0423c780 0x2440e005 0x04204780
0x3802c009 0x6c20c780 0xd0020205 0x04020780
0x2400d005 0x04204780 0x04001001 0xe4204780
0x0402d80d 0xc0200780 0x0402d009 0xc0200780
0xdc14400d 0x20000780 0x1400d005 0x0423c780
0xd8144009 0x20000780 0x1d00e008 0x2541f804
0x3802c009 0x6c20c780 0xd0020205 0x04020780
0x2400d005 0x04204780 0x04001001 0xe4204780
0x0402d40d 0xc0200780 0x0402d009 0xc0200780
0xdc14400d 0x20000780 0x1400d005 0x0423c780
0xd8144009 0x20000780 0x1d00e008 0x2541f404
0x3802c009 0x6c20c780 0xd0020205 0x04020780
0x2400d005 0x04204780 0x04001001 0xe4204780
0x0402d20d 0xc0200780 0x0402d009 0xc0200780
0xdc14400d 0x20000780 0x1400d005 0x0423c780
0xd8144009 0x20000780 0x1d00e008 0x2541f204
0x3802c009 0x6c20c780 0xd0020205 0x04020780
0x2400d005 0x04204780 0x04001001 0xe4204780
0xf0000001 0xe0000002 0x861ffe03 0x00000000
0xa0000009 0x04000780 0xd0800209 0x00400780
0xa0000405 0x04000780 0x30050205 0xc4100780
0x20000409 0x04004780 0x307c05fd 0x6c0147c8
0xa013c003 0x00000000 0x1013c003 0x00000280
0x1000ce05 0x0423c780 0x40034e09 0x00200780
0x30100409 0xc4100780 0x60024e05 0x00208780
0x30070209 0xc4100780 0x3006020d 0xc4100780
0x20038408 0x2102e808 0x2000d00d 0x04204780
0x20208405 0x00000003 0xd00e020d 0xa0c00780
0xa0000009 0x04000782 0xd0800209 0x00400780
0xa0000405 0x04000780 0x30050205 0xc4100780
0x20000409 0x04004780 0x3002cffd 0x6c20c7c8
0x30000003 0x00000280 0x1000ce05 0x0423c780
0x40034e09 0x00200780 0x30100409 0xc4100780
0x60024e05 0x00208780 0xa0000009 0x04000780
0xd0800219 0x00400780 0xa0000c0d 0x04000780
0x3005060d 0xc4100780 0x20038408 0x20028204
0x3007020d 0xc4100780 0x30060205 0xc4100780
0xa0000009 0x04000780 0xd0800201 0x00400780
0xa0000001 0x04000780 0x30050001 0xc4100780
0x20000409 0x04000780 0x00020405 0xc0000780
0x20000601 0x04004780 0xd4144005 0x20000780
0x2100e804 0x1500e000 0x20108205 0x00000003
0xd00e0201 0xa0c00781
}
}
code { code {
name = cudaEncodeResidual name = cudaEncodeResidual
lmem = 0 lmem = 0
@@ -356,13 +469,46 @@ code {
0x3005c1fd 0x6c2147cc 0x6800ce0d 0x8020c780 0x3005c1fd 0x6c2147cc 0x6800ce0d 0x8020c780
0xd4000805 0x20000780 0x1000c001 0x0423c784 0xd4000805 0x20000780 0x1000c001 0x0423c784
0x10040003 0x00000280 0x300109fd 0x6c00c7c8 0x10040003 0x00000280 0x300109fd 0x6c00c7c8
0x30000003 0x00000280 0xd0094005 0x20000780 0x30000003 0x00000280 0xd0094805 0x20000780
0x2502e008 0x20008210 0x1500e200 0x20028204 0x2502f008 0x20008210 0x1500e000 0x20028204
0x00020805 0xc0000780 0x30000609 0xec000780 0x00020805 0xc0000780 0x30000609 0xec000780
0x30020201 0xc4100780 0x2542ee04 0x2100e800 0x30020201 0xc4100780 0x2542ee04 0x2100e800
0xd00e0005 0xa0c00781 0xd00e0005 0xa0c00781
} }
} }
code {
name = cudaCopyBestMethod
lmem = 0
smem = 32
reg = 5
bar = 1
const {
segname = const
segnum = 1
offset = 0
bytes = 4
mem {
0x0000002f
}
}
bincode {
0xa0000005 0x040007c0 0xa000d003 0x00000000
0x1000d003 0x00000280 0x1000cc01 0x0423c780
0x40014e09 0x00200780 0x30100409 0xc4100780
0x60004e01 0x00208780 0x30070009 0xc4100780
0x30060001 0xc4100780 0x20008400 0x2100ea00
0x20208001 0x00000003 0xd00e0001 0x80c00780
0x00000e01 0xe4200780 0xf0000001 0xe0000002
0x861ffe03 0x00000000 0x308003fd 0x644107c8
0x30000003 0x00000280 0xa0004e01 0x04200780
0x3007ce0d 0xc4300780 0x3006ce11 0xc4300780
0x30070009 0xc4100780 0x30060001 0xc4100780
0x2004860c 0x20008400 0x30020205 0xc4100780
0x2103ea0c 0x2100e808 0x20000201 0x0400c780
0xd00e0001 0x80c00780 0x20000205 0x04008780
0xd00e0201 0xa0c00781
}
}
code { code {
name = cudaSumResidualChunks name = cudaSumResidualChunks
lmem = 0 lmem = 0
@@ -572,11 +718,91 @@ code {
0xf0000001 0xe0000001 0xf0000001 0xe0000001
} }
} }
code {
name = cudaCopyBestMethodStereo
lmem = 0
smem = 68
reg = 6
bar = 1
const {
segname = const
segnum = 1
offset = 0
bytes = 16
mem {
0x00000003 0x00000004 0x7fffffff 0x0000002f
}
}
bincode {
0xa0000001 0x04000780 0x30800005 0x6440c7d0
0xa00003fd 0x0c0147c8 0xa0012003 0x00000000
0x10012003 0x00001100 0x60824e09 0x00600780
0x1000cc05 0x0423c780 0x4005040d 0x00000780
0x6004060d 0x0000c780 0x3010060d 0xc4100780
0x60040405 0x0000c780 0x30070209 0xc4100780
0x30060205 0xc4100780 0x20018404 0x2101ea04
0x20208205 0x00000003 0xd00e0205 0x80c00780
0x00020005 0xc0000780 0x04000e01 0xe4204780
0xf0000001 0xe0000002 0xa001c003 0x00000000
0x1001c003 0x00000100 0x00020005 0xc0000780
0x3407ce05 0xc4300780 0x3406ce09 0xc4300780
0x20028204 0x2101ea04 0x20108205 0x00000003
0xd00e0205 0x80c00780 0x04001601 0xe4204780
0xf0000001 0xe0000002 0x861ffe03 0x00000000
0x307c0005 0x640087d0 0xa00003fd 0x0c0147c8
0xa003f003 0x00000000 0x1003f003 0x00001100
0x1100fc04 0x2101fa04 0x308203fd 0x6c4187d8
0x1000d209 0x0423d500 0x00001e01 0xe4209500
0x1000d409 0x0423d500 0x00002001 0xe4209500
0x10000405 0x2440d280 0x1100fc08 0x2102f808
0x300105fd 0x6c0187d8 0x10031003 0x00001280
0x1000d405 0x0423c780 0x00001e01 0xe4204780
0x1000d005 0x0423c780 0x00002001 0xe4204780
0x10000405 0x0403c780 0x1100fc08 0x2102f608
0x300105fd 0x6c0187d8 0x10039003 0x00001280
0x1000ce05 0x0423c780 0x00001e01 0xe4204780
0x1000d405 0x0423c780 0x00002001 0xe4204780
0x10000405 0x0403c780 0x1100f808 0x2102f608
0x300203fd 0x6c00c7d8 0x1000ce05 0x0423d500
0x00001e01 0xe4205500 0x1000d005 0x0423d500
0x00002001 0xe4205500 0xf0000001 0xe0000002
0x861ffe03 0x00000000 0x30830005 0x6440c7e0
0xa00003fd 0x0c0147d8 0xa0051003 0x00000000
0x10051003 0x00002100 0x41022e05 0x00000003
0x3007de0d 0xc4300780 0x3006de11 0xc4300780
0x30070209 0xc4100780 0x30060205 0xc4100780
0x2004860c 0x20018404 0x30020009 0xc4100780
0x2103ea10 0x2101e80c 0x20000405 0x04010780
0xd00e0205 0x80c00780 0x20000409 0x0400c780
0xd00e0405 0xa0c00780 0xf0000001 0xe0000002
0xa005f003 0x00000000 0x1005f003 0x00000100
0x41022e05 0x00000003 0x3007ce09 0xc4300780
0x3006ce0d 0xc4300780 0x30070211 0xc4100780
0x30060215 0xc4100780 0x20038404 0x20058808
0x2101ea04 0x2102e808 0x20288205 0x00000003
0xd00e0205 0x80c00780 0x20288409 0x00000003
0xd00e0405 0xa0c00780 0xf0000001 0xe0000002
0xa006e003 0x00000000 0x1006e003 0x00001100
0x41022e05 0x00000003 0x3007e00d 0xc4300780
0x30070209 0xc4100780 0x30060205 0xc4100780
0x3006e011 0xc4300780 0x20018404 0x20048608
0x30020001 0xc4100780 0x2101e804 0x2102ea08
0x20018004 0x20028000 0xd00e0001 0x80c00780
0x20008205 0x0000000f 0xd00e0201 0xa0c00780
0xf0000001 0xe0000002 0x30000003 0x00000100
0x41022e01 0x00000003 0x3007d005 0xc4300780
0x3006d009 0xc4300780 0x3007000d 0xc4100780
0x30060011 0xc4100780 0x20028200 0x20048604
0x2100ea00 0x2101e804 0x20288001 0x00000003
0xd00e0001 0x80c00780 0x20288205 0x0000000f
0xd00e0201 0xa0c00781
}
}
code { code {
name = cudaSumResidual name = cudaSumResidual
lmem = 0 lmem = 0
smem = 1248 smem = 1244
reg = 5 reg = 7
bar = 1 bar = 1
const { const {
segname = const segname = const
@@ -595,46 +821,59 @@ code {
0x3007000d 0xc4100780 0x30060001 0xc4100780 0x3007000d 0xc4100780 0x30060001 0xc4100780
0x20008600 0x2100e800 0x20000401 0x04000780 0x20008600 0x2100e800 0x20000401 0x04000780
0xd00e0001 0x80c00780 0x00000405 0xc0000780 0xd00e0001 0x80c00780 0x00000405 0xc0000780
0x04021001 0xe4200780 0xf0000001 0xe0000002 0x04020e01 0xe4200780 0xf0000001 0xe0000002
0x861ffe03 0x00000000 0x3001cffd 0x6c20c7c8 0x861ffe03 0x00000000 0x3001cdfd 0x6c20c7c8
0xa001c003 0x00000000 0x1001b003 0x00000280 0xa001c003 0x00000000 0x1001b003 0x00000280
0x1000ce01 0x0423c780 0x40014e0d 0x00200780 0x1000cc01 0x0423c780 0x40014e0d 0x00200780
0x3010060d 0xc4100780 0x60004e01 0x0020c780 0x3010060d 0xc4100780 0x60004e01 0x0020c780
0x20000001 0x04004780 0x30020001 0xc4100780 0x20000001 0x04004780 0x30020001 0xc4100780
0x2000ca01 0x04200780 0xd00e0001 0x80c00780 0x2000ca01 0x04200780 0xd00e0001 0x80c00780
0x1001c003 0x00000780 0x1000f801 0x0403c780 0x1001c003 0x00000780 0x1000f801 0x0403c780
0x00000405 0xc0000782 0x04001001 0xe4200780 0x00000405 0xc0000782 0x04000e01 0xe4200780
0x861ffe03 0x00000000 0x308103fd 0x6c4107c8 0x861ffe03 0x00000000 0x308103fd 0x6c4107c8
0x00000405 0xc0000500 0xd4014009 0x20000500 0x00000405 0xc0000500 0xd4013809 0x20000500
0x1800c001 0x0423c500 0x2400d001 0x04200500 0x1800c001 0x0423c500 0x2400ce01 0x04200500
0x04001001 0xe4200500 0x861ffe03 0x00000000 0x04000e01 0xe4200500 0x861ffe03 0x00000000
0x00000405 0xc0000780 0x1400f001 0x0423c780 0x00000405 0xc0000780 0x1400ee01 0x0423c780
0x2400d001 0x04200780 0x04001001 0xe4200780 0x2400ce01 0x04200780 0x04000e01 0xe4200780
0x2400e001 0x04200780 0x04001001 0xe4200780 0x1500fe00 0x2500ee00 0x04000e01 0xe4200780
0x2400d801 0x04200780 0x04001001 0xe4200780 0x1500f600 0x2500ee00 0x04000e01 0xe4200780
0x2400d401 0x04200780 0x04001001 0xe4200780 0x1500f200 0x2500ee00 0x04000e01 0xe4200780
0x2400d201 0x04200780 0x307c03fd 0x6c0147c8 0x1500f000 0x2500ee00 0x307c03fd 0x6c0147c8
0x04001001 0xe4200780 0x30000003 0x00000280 0x04000e01 0xe4200780 0x30000003 0x00000280
0xd0086805 0x20000780 0x3482c1fd 0x6c6147c8 0xd0086005 0x20000780 0x3482c1fd 0x6c6147c8
0x10040003 0x00000280 0xd0084005 0x20000780 0x10044003 0x00000280 0xd0083805 0x20000780
0x1500ec00 0x1500e004 0x40030009 0x00000780 0x1500ec00 0x1500e008 0x4005000c 0x1500ee04
0x60020209 0x00008780 0x30100409 0xc4100780 0x6004020d 0x0000c780 0x40020211 0x00000780
0x60020001 0x00008780 0x2000d001 0x04200780 0x3010060d 0xc4100780 0x60030011 0x00010780
0x20068001 0x00000003 0x10058003 0x00000780 0x60040009 0x0000c780 0x3010080d 0xc4100780
0xd0086805 0x20000780 0x3483c1fd 0x6c6147c8 0x2000ce09 0x04208780 0x60020001 0x0000c780
0x10052003 0x00000280 0xd0084005 0x20000780 0x20068405 0x00000003 0x10070003 0x00000780
0x1500e604 0x1500e000 0x2400cc05 0x04204780 0xd0086005 0x20000780 0x3483c1fd 0x6c6147c8
0x3002ce0d 0xc4300780 0x40030009 0x00000780 0x1005c003 0x00000280 0xd0083805 0x20000780
0x301f0611 0xec100780 0x60020209 0x00008780 0x1500ec00 0x1500e008 0x2400c60d 0x04200780
0xd0840811 0x04400780 0x30100409 0xc4100780 0x3002cc15 0xc4300780 0x40070805 0x00000780
0x2000080d 0x0400c780 0x60020001 0x00008780 0x301f0a19 0xec100780 0x60060a11 0x00004780
0x30010605 0xec100780 0x20018000 0x2100f000 0x1400ce05 0x0423c780 0xd0840c19 0x04400780
0x200f8001 0x00000003 0x10058003 0x00000780 0x30100811 0xc4100780 0x20000c15 0x04014780
0xd0087005 0x20000780 0x1500e000 0x1500e204 0x60060809 0x00010780 0x40010411 0x00000780
0x30010a0d 0xec100780 0x60000611 0x00010780
0x20000409 0x0400c780 0x3010080d 0xc4100780
0x2000ce09 0x04208780 0x60000401 0x0000c780
0x200f8405 0x00000003 0x10070003 0x00000780
0xd0086005 0x20000780 0x1500e400 0x1500e204
0x40030009 0x00000780 0x60020209 0x00008780 0x40030009 0x00000780 0x60020209 0x00008780
0x30100409 0xc4100780 0x60020001 0x00008780 0x30100409 0xc4100780 0x347cc1fd 0x6c2147c8
0xa0004e05 0x04200780 0x30070209 0xc4100780 0x60020001 0x00008780 0x1006f003 0x00000280
0x30060205 0xc4100780 0x20018404 0x2101e804 0xd0086805 0x20000780 0x2501e209 0x00000003
0x20108205 0x00000003 0xd00e0201 0xa0c00781 0x1400c005 0x0423c780 0x4005040d 0x00000780
0x6004060d 0x0000c780 0x3010060d 0xc4100780
0x60040409 0x0000c780 0x307ccffd 0x6c2147c8
0x1400c005 0x0423c780 0x10000405 0x0403c280
0x10070003 0x00000780 0x10000005 0x0403c780
0xa0004e09 0x04200780 0x3007040d 0xc4100780
0x30060409 0xc4100780 0x20028608 0x2102e808
0x30000201 0xac000780 0x20108405 0x00000003
0xd00e0201 0xa0c00781
} }
} }