optimizations

This commit is contained in:
chudov
2009-10-27 22:12:24 +00:00
parent 28e8c70a9f
commit 8f3b3bef9f
3 changed files with 626 additions and 703 deletions

View File

@@ -557,131 +557,152 @@ namespace CUETools.Codecs.FlaCuda
}
}
static uint rice_encode_count(uint sum, uint n, uint k)
{
return n * (k + 1) + ((sum - (n >> 1)) >> (int)k);
}
//static unsafe uint find_optimal_rice_param(uint sum, uint n)
//{
// uint* nbits = stackalloc uint[Flake.MAX_RICE_PARAM + 1];
// int k_opt = 0;
// nbits[0] = UINT32_MAX;
// for (int k = 0; k <= Flake.MAX_RICE_PARAM; k++)
// {
// nbits[k] = rice_encode_count(sum, n, (uint)k);
// if (nbits[k] < nbits[k_opt])
// k_opt = k;
// }
// return (uint)k_opt;
//}
static unsafe int find_optimal_rice_param(uint sum, uint n, out uint nbits_best)
{
int k_opt = 0;
uint a = n;
uint b = sum - (n >> 1);
uint nbits = a + b;
for (int k = 1; k <= Flake.MAX_RICE_PARAM; k++)
{
a += n;
b >>= 1;
uint nbits_k = a + b;
if (nbits_k < nbits)
{
k_opt = k;
nbits = nbits_k;
}
}
nbits_best = nbits;
return k_opt;
}
static unsafe uint calc_optimal_rice_params(ref RiceContext rc, int porder, uint* sums, uint n, uint pred_order)
static unsafe uint calc_optimal_rice_params(int porder, int* parm, uint* sums, uint n, uint pred_order)
{
uint part = (1U << porder);
uint all_bits = 0;
rc.rparams[0] = find_optimal_rice_param(sums[0], (n >> porder) - pred_order, out all_bits);
uint cnt = (n >> porder);
uint cnt = (n >> porder) - pred_order;
int k = cnt > 0 ? Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[0] / cnt)) : 0;
uint all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k);
parm[0] = k;
cnt = (n >> porder);
for (uint i = 1; i < part; i++)
{
uint nbits;
rc.rparams[i] = find_optimal_rice_param(sums[i], cnt, out nbits);
all_bits += nbits;
k = Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[i] / cnt));
all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k);
parm[i] = k;
}
all_bits += (4 * part);
rc.porder = porder;
return all_bits;
return all_bits + (4 * part);
}
static unsafe void calc_sums(int pmin, int pmax, int* data, uint n, uint pred_order, uint* sums)
static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums)
{
// sums for highest level
int parts = (1 << pmax);
int* res = data + pred_order;
uint cnt = (n >> pmax) - pred_order;
uint sum = 0;
for (uint j = cnt; j > 0; j--)
{
int val = *(res++);
sum += (uint)((val << 1) ^ (val >> 31));
}
sums[pmax * Flake.MAX_PARTITIONS + 0] = sum;
cnt = (n >> pmax);
for (int i = 1; i < parts; i++)
{
sum = 0;
for (uint j = cnt; j > 0; j--)
{
int val = *(res++);
sum += (uint)((val << 1) ^ (val >> 31));
}
sums[pmax * Flake.MAX_PARTITIONS + i] = sum;
}
// sums for lower levels
for (int i = pmax - 1; i >= pmin; i--)
{
parts = (1 << i);
for (int j = 0; j < parts; j++)
for (int j = 0; j < (1 << i); j++)
{
sums[i * Flake.MAX_PARTITIONS + j] =
sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j] +
sums[i * Flake.MAX_PARTITIONS + j] =
sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j] +
sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j + 1];
}
}
}
static unsafe uint calc_rice_params(ref RiceContext rc, ref RiceContext tmp_rc, int pmin, int pmax, int* data, uint n, uint pred_order)
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
{
//uint* udata = stackalloc uint[(int)n];
int parts = (1 << pmax);
uint* res = data + pred_order;
uint cnt = (n >> pmax) - pred_order;
uint sum = 0;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[0] = sum;
cnt = (n >> pmax);
for (int i = 1; i < parts; i++)
{
sum = 0;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[i] = sum;
}
}
/// <summary>
/// Special case when (n >> pmax) == 18
/// </summary>
/// <param name="pmin"></param>
/// <param name="pmax"></param>
/// <param name="data"></param>
/// <param name="n"></param>
/// <param name="pred_order"></param>
/// <param name="sums"></param>
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
{
int parts = (1 << pmax);
uint* res = data + pred_order;
uint cnt = 18 - pred_order;
uint sum = 0;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[0] = sum;
for (int i = 1; i < parts; i++)
{
sums[i] =
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++);
}
}
/// <summary>
/// Special case when (n >> pmax) == 18
/// </summary>
/// <param name="pmin"></param>
/// <param name="pmax"></param>
/// <param name="data"></param>
/// <param name="n"></param>
/// <param name="pred_order"></param>
/// <param name="sums"></param>
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
{
int parts = (1 << pmax);
uint* res = data + pred_order;
uint cnt = 16 - pred_order;
uint sum = 0;
for (uint j = cnt; j > 0; j--)
sum += *(res++);
sums[0] = sum;
for (int i = 1; i < parts; i++)
{
sums[i] =
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++) +
*(res++) + *(res++) + *(res++) + *(res++);
}
}
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order)
{
uint* udata = stackalloc uint[(int)n];
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS];
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
//assert(pmin >= 0 && pmin <= Flake.MAX_PARTITION_ORDER);
//assert(pmax >= 0 && pmax <= Flake.MAX_PARTITION_ORDER);
//assert(pmin <= pmax);
//for (uint i = 0; i < n; i++)
// udata[i] = (uint) ((2 * data[i]) ^ (data[i] >> 31));
for (uint i = 0; i < n; i++)
udata[i] = (uint)((data[i] << 1) ^ (data[i] >> 31));
calc_sums(pmin, pmax, data, n, pred_order, sums);
// sums for highest level
if ((n >> pmax) == 18)
calc_sums18(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
else if ((n >> pmax) == 16)
calc_sums16(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
else
calc_sums(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
// sums for lower levels
calc_lower_sums(pmin, pmax, sums);
int opt_porder = pmin;
uint opt_bits = AudioSamples.UINT32_MAX;
int opt_porder = pmin;
for (int i = pmin; i <= pmax; i++)
{
uint bits = calc_optimal_rice_params(ref tmp_rc, i, sums + i * Flake.MAX_PARTITIONS, n, pred_order);
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order);
if (bits <= opt_bits)
{
opt_porder = i;
opt_bits = bits;
RiceContext tmp_rc2 = rc;
rc = tmp_rc;
tmp_rc = tmp_rc2;
opt_porder = i;
}
}
rc.porder = opt_porder;
fixed (int* rparms = rc.rparams)
AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder));
return opt_bits;
}
@@ -990,7 +1011,7 @@ namespace CUETools.Codecs.FlaCuda
int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 6;
frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
frame.subframes[ch].best.size = bits + calc_rice_params(frame.subframes[ch].best.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
}
break;
case SubframeType.LPC:
@@ -1009,7 +1030,7 @@ namespace CUETools.Codecs.FlaCuda
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits + 6;
//uint oldsize = frame.subframes[ch].best.size;
frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
frame.subframes[ch].best.size = bits + calc_rice_params(frame.subframes[ch].best.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
//if (frame.subframes[ch].best.size > frame.subframes[ch].obits * (uint)frame.blocksize &&
// oldsize <= frame.subframes[ch].obits * (uint)frame.blocksize)
// throw new Exception("oops");
@@ -1213,9 +1234,10 @@ namespace CUETools.Codecs.FlaCuda
cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)residualPartCount);
cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameterSize(task.cudaChooseBestMethod, sizeof(uint) * 4U);
cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)residualPartSize);
cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)residualPartCount);
cuda.SetParameter(task.cudaChooseBestMethod, 4 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
cuda.SetParameterSize(task.cudaChooseBestMethod, 5U * sizeof(uint));
cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 8, 1);
cuda.SetParameter(task.cudaCopyBestMethod, 0, (uint)task.cudaBestResidualTasks.Pointer);

View File

@@ -61,6 +61,7 @@ typedef struct
#define SUM512(buf,tid,op) if (tid < 256) buf[tid] op buf[tid + 256]; __syncthreads(); SUM256(buf,tid,op)
#define FSQR(s) ((s)*(s))
#define FASTMUL(a,b) __mul24(a,b)
extern "C" __global__ void cudaStereoDecorr(
int *samples,
@@ -581,14 +582,8 @@ extern "C" __global__ void cudaEstimateResidual(
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
shared.residual[tid] += shared.residual[tid + 1];
// rice parameter search
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
if (threadIdx.x == 0)
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid] + shared.residual[tid + 1];
}
extern "C" __global__ void cudaEstimateResidual1(
@@ -612,7 +607,7 @@ extern "C" __global__ void cudaEstimateResidual1(
if (tid == 0)
{
shared.pos = blockIdx.x * partSize;
shared.dataLen = min(shared.task.data.blocksize - shared.pos, partSize + max_order);
shared.dataLen = min(shared.task.data.blocksize - shared.pos, partSize + shared.task.data.residualOrder);
}
__syncthreads();
@@ -631,22 +626,10 @@ extern "C" __global__ void cudaEstimateResidual1(
shared.residual[tid] = __mul24(ptr < shared.dataLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
__syncthreads();
SUM256(shared.residual, tid, +=);
if (threadIdx.y == 0)
{
const int residualLen = max(0,min(shared.task.data.blocksize - shared.pos - shared.task.data.residualOrder, partSize));
// rice parameter search
shared.residual[threadIdx.x] = (shared.task.data.type != Constant || shared.residual[0] != 0) *
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[0] - (residualLen >> 1)) >> threadIdx.x));
shared.residual[threadIdx.x] = min(min(shared.residual[threadIdx.x], shared.residual[threadIdx.x + 4]), min(shared.residual[threadIdx.x + 8], shared.residual[threadIdx.x + 12]));
if (threadIdx.x == 0)
output[blockIdx.y * 64 + blockIdx.x] = min(min(shared.residual[threadIdx.x], shared.residual[threadIdx.x + 1]), min(shared.residual[threadIdx.x + 2], shared.residual[threadIdx.x + 3]));
}
if (tid == 0)
output[blockIdx.y * 64 + blockIdx.x] = shared.residual[0];
}
#define FASTMUL(a,b) __mul24(a,b)
extern "C" __global__ void cudaEstimateResidual8(
int*output,
int*samples,
@@ -656,57 +639,61 @@ extern "C" __global__ void cudaEstimateResidual8(
)
{
__shared__ struct {
int data[32*9];
volatile int data[32*9];
volatile int residual[32*8];
FlaCudaSubframeData task[8];
int coefs[32*8];
volatile int pos;
volatile int dataLen;
volatile int dataOffs;
} shared;
const int tid = threadIdx.x + threadIdx.y * 32;
const int taskNo = FASTMUL(blockIdx.y, blockDim.y) + threadIdx.y;
if (threadIdx.x < sizeof(FlaCudaSubframeData)/sizeof(int))
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[blockIdx.y * blockDim.y + threadIdx.y]))[threadIdx.x];
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[taskNo]))[threadIdx.x];
const int ro = shared.task[threadIdx.y].residualOrder;
shared.coefs[tid] = threadIdx.x < ro ? tasks[taskNo].coefs[threadIdx.x] : 0;
if (tid == 0)
{
shared.pos = FASTMUL(blockIdx.x, partSize);
shared.dataLen = min(shared.task[0].blocksize - shared.pos, partSize + max_order);
shared.dataOffs = shared.task[0].samplesOffs + shared.pos;
}
__syncthreads();
const int pos = blockIdx.x * partSize;
const int dataLen = min(shared.task[0].blocksize - pos, partSize + max_order);
// fetch samples
shared.data[tid] = tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] >> shared.task[0].wbits : 0;
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task[0].samplesOffs + pos + tid + partSize] >> shared.task[0].wbits : 0;
if (tid < shared.dataLen)
shared.data[tid] = samples[shared.dataOffs + tid] >> shared.task[0].wbits;
if (tid + partSize < shared.dataLen)
shared.data[tid + partSize] = samples[shared.dataOffs + tid + partSize] >> shared.task[0].wbits;
__syncthreads();
shared.residual[tid] = 0;
shared.coefs[tid] = threadIdx.x < shared.task[threadIdx.y].residualOrder ? tasks[blockIdx.y * blockDim.y + threadIdx.y].coefs[threadIdx.x] : 0;
const int residualLen = max(0,min(shared.task[0].blocksize - pos - shared.task[threadIdx.y].residualOrder, partSize));
const int residualLen = max(0,min(shared.dataLen - ro, partSize));
const int ptr2 = threadIdx.y << 5;
int s = 0;
for (int ptr = threadIdx.x + blockDim.y * 32 * (shared.task[threadIdx.y].type == Verbatim); ptr < blockDim.y * 32 + threadIdx.x; ptr += 32)
for (int ptr = threadIdx.x; ptr < residualLen; ptr += 32)
{
// compute residual
int sum =
__mul24(shared.data[ptr + 0], shared.coefs[ptr2 + 0]) +
__mul24(shared.data[ptr + 1], shared.coefs[ptr2 + 1]) +
__mul24(shared.data[ptr + 2], shared.coefs[ptr2 + 2]) +
__mul24(shared.data[ptr + 3], shared.coefs[ptr2 + 3]) +
__mul24(shared.data[ptr + 3], shared.coefs[ptr2 + 3]);
sum +=
__mul24(shared.data[ptr + 4], shared.coefs[ptr2 + 4]) +
__mul24(shared.data[ptr + 5], shared.coefs[ptr2 + 5]) +
__mul24(shared.data[ptr + 6], shared.coefs[ptr2 + 6]) +
__mul24(shared.data[ptr + 7], shared.coefs[ptr2 + 7]);
sum = shared.data[ptr + shared.task[threadIdx.y].residualOrder] - (sum >> shared.task[threadIdx.y].shift);
s += __mul24(ptr < residualLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
sum = shared.data[ptr + ro] - (sum >> shared.task[threadIdx.y].shift);
s += min(0x7fffff,(sum << 1) ^ (sum >> 31));
}
shared.residual[tid] = s;
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
shared.residual[tid] += shared.residual[tid + 1];
// rice parameter search
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
if (threadIdx.x == 0)
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid] + shared.residual[tid + 1];
}
extern "C" __global__ void cudaEstimateResidual12(
@@ -727,8 +714,11 @@ extern "C" __global__ void cudaEstimateResidual12(
volatile int dataOffs;
} shared;
const int tid = threadIdx.x + threadIdx.y * 32;
const int taskNo = FASTMUL(blockIdx.y, blockDim.y) + threadIdx.y;
if (threadIdx.x < sizeof(FlaCudaSubframeData)/sizeof(int))
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[FASTMUL(blockIdx.y, blockDim.y) + threadIdx.y]))[threadIdx.x];
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[taskNo]))[threadIdx.x];
const int ro = shared.task[threadIdx.y].residualOrder;
shared.coefs[tid] = threadIdx.x < ro ? tasks[taskNo].coefs[threadIdx.x] : 0;
if (tid == 0)
{
shared.pos = FASTMUL(blockIdx.x, partSize);
@@ -738,30 +728,30 @@ extern "C" __global__ void cudaEstimateResidual12(
__syncthreads();
// fetch samples
shared.data[tid] = tid < shared.dataLen ? samples[shared.dataOffs + tid] >> shared.task[0].wbits : 0;
if (tid < 32) shared.data[tid + partSize] = tid + partSize < shared.dataLen ? samples[shared.dataOffs + tid + partSize] >> shared.task[0].wbits : 0;
if (tid < shared.dataLen)
shared.data[tid] = samples[shared.dataOffs + tid] >> shared.task[0].wbits;
if (tid + partSize < shared.dataLen)
shared.data[tid + partSize] = samples[shared.dataOffs + tid + partSize] >> shared.task[0].wbits;
__syncthreads();
const int ro = shared.task[threadIdx.y].residualOrder;
const int residualLen = max(0,min(shared.task[0].blocksize - shared.pos - ro, partSize));
int residualLen = max(0,min(shared.dataLen - ro, partSize));
const int ptr2 = threadIdx.y << 5;
shared.coefs[tid] = threadIdx.x < ro ? tasks[FASTMUL(blockIdx.y, blockDim.y) + threadIdx.y].coefs[threadIdx.x] : 0;
int s = 0;
for (int ptr = shared.task[threadIdx.y].type == Verbatim ? residualLen : threadIdx.x; ptr < residualLen; ptr += 32)
for (int ptr = threadIdx.x; ptr < residualLen; ptr += 32)
{
// compute residual
int sum =
FASTMUL(shared.data[ptr + 0], shared.coefs[ptr2 + 0]) +
FASTMUL(shared.data[ptr + 1], shared.coefs[ptr2 + 1]) +
FASTMUL(shared.data[ptr + 2], shared.coefs[ptr2 + 2]) +
FASTMUL(shared.data[ptr + 3], shared.coefs[ptr2 + 3]) +
FASTMUL(shared.data[ptr + 3], shared.coefs[ptr2 + 3]);
sum +=
FASTMUL(shared.data[ptr + 4], shared.coefs[ptr2 + 4]) +
FASTMUL(shared.data[ptr + 5], shared.coefs[ptr2 + 5]) +
FASTMUL(shared.data[ptr + 6], shared.coefs[ptr2 + 6]) +
FASTMUL(shared.data[ptr + 7], shared.coefs[ptr2 + 7]) +
FASTMUL(shared.data[ptr + 7], shared.coefs[ptr2 + 7]);
sum +=
FASTMUL(shared.data[ptr + 8], shared.coefs[ptr2 + 8]) +
FASTMUL(shared.data[ptr + 9], shared.coefs[ptr2 + 9]) +
FASTMUL(shared.data[ptr + 10], shared.coefs[ptr2 + 10]) +
@@ -773,19 +763,14 @@ extern "C" __global__ void cudaEstimateResidual12(
shared.residual[tid] = s;
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
shared.residual[tid] += shared.residual[tid + 1];
// rice parameter search
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
(__mul24(threadIdx.x >= 15, 0x7fffff) + FASTMUL(residualLen, threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
if (threadIdx.x == 0)
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid] + shared.residual[tid + 1];
}
extern "C" __global__ void cudaChooseBestMethod(
FlaCudaSubframeTask *tasks,
int *residual,
int partSize,
int partCount, // <= blockDim.y (256)
int taskCount
)
@@ -806,8 +791,18 @@ extern "C" __global__ void cudaChooseBestMethod(
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(tasks + task + threadIdx.y + taskCount * blockIdx.y))[threadIdx.x];
int sum = 0;
for (int pos = 0; pos < partCount; pos += blockDim.x)
sum += (pos + threadIdx.x < partCount ? residual[pos + threadIdx.x + 64 * (task + threadIdx.y + taskCount * blockIdx.y)] : 0);
for (int pos = threadIdx.x; pos < partCount; pos += blockDim.x)
{
// fetch part sum
int psum = residual[pos + 64 * (task + threadIdx.y + taskCount * blockIdx.y)];
// calculate part size
int residualLen = max(0,min(shared.task[threadIdx.y].data.blocksize - FASTMUL(pos, partSize) - shared.task[threadIdx.y].data.residualOrder, partSize));
residualLen = FASTMUL(residualLen, shared.task[threadIdx.y].data.type != Constant || psum != 0);
// calculate rice parameter
int k = max(0, min(14, __float2int_rz(__log2f((psum + 0.000001f) / (residualLen + 0.000001f) + 0.5f))));
// calculate part bit length
sum += FASTMUL(residualLen, k + 1) + (psum >> k);
}
shared.partLen[tid] = sum;
// length sum: reduction in shared mem

File diff suppressed because it is too large Load Diff