mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
optimizations
This commit is contained in:
@@ -557,131 +557,152 @@ namespace CUETools.Codecs.FlaCuda
|
||||
}
|
||||
}
|
||||
|
||||
static uint rice_encode_count(uint sum, uint n, uint k)
|
||||
{
|
||||
return n * (k + 1) + ((sum - (n >> 1)) >> (int)k);
|
||||
}
|
||||
|
||||
//static unsafe uint find_optimal_rice_param(uint sum, uint n)
|
||||
//{
|
||||
// uint* nbits = stackalloc uint[Flake.MAX_RICE_PARAM + 1];
|
||||
// int k_opt = 0;
|
||||
|
||||
// nbits[0] = UINT32_MAX;
|
||||
// for (int k = 0; k <= Flake.MAX_RICE_PARAM; k++)
|
||||
// {
|
||||
// nbits[k] = rice_encode_count(sum, n, (uint)k);
|
||||
// if (nbits[k] < nbits[k_opt])
|
||||
// k_opt = k;
|
||||
// }
|
||||
// return (uint)k_opt;
|
||||
//}
|
||||
|
||||
static unsafe int find_optimal_rice_param(uint sum, uint n, out uint nbits_best)
|
||||
{
|
||||
int k_opt = 0;
|
||||
uint a = n;
|
||||
uint b = sum - (n >> 1);
|
||||
uint nbits = a + b;
|
||||
for (int k = 1; k <= Flake.MAX_RICE_PARAM; k++)
|
||||
{
|
||||
a += n;
|
||||
b >>= 1;
|
||||
uint nbits_k = a + b;
|
||||
if (nbits_k < nbits)
|
||||
{
|
||||
k_opt = k;
|
||||
nbits = nbits_k;
|
||||
}
|
||||
}
|
||||
nbits_best = nbits;
|
||||
return k_opt;
|
||||
}
|
||||
|
||||
static unsafe uint calc_optimal_rice_params(ref RiceContext rc, int porder, uint* sums, uint n, uint pred_order)
|
||||
static unsafe uint calc_optimal_rice_params(int porder, int* parm, uint* sums, uint n, uint pred_order)
|
||||
{
|
||||
uint part = (1U << porder);
|
||||
uint all_bits = 0;
|
||||
rc.rparams[0] = find_optimal_rice_param(sums[0], (n >> porder) - pred_order, out all_bits);
|
||||
uint cnt = (n >> porder);
|
||||
uint cnt = (n >> porder) - pred_order;
|
||||
int k = cnt > 0 ? Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[0] / cnt)) : 0;
|
||||
uint all_bits = cnt * ((uint)k + 1U) + (sums[0] >> k);
|
||||
parm[0] = k;
|
||||
cnt = (n >> porder);
|
||||
for (uint i = 1; i < part; i++)
|
||||
{
|
||||
uint nbits;
|
||||
rc.rparams[i] = find_optimal_rice_param(sums[i], cnt, out nbits);
|
||||
all_bits += nbits;
|
||||
k = Math.Min(Flake.MAX_RICE_PARAM, BitReader.log2i(sums[i] / cnt));
|
||||
all_bits += cnt * ((uint)k + 1U) + (sums[i] >> k);
|
||||
parm[i] = k;
|
||||
}
|
||||
all_bits += (4 * part);
|
||||
rc.porder = porder;
|
||||
return all_bits;
|
||||
return all_bits + (4 * part);
|
||||
}
|
||||
|
||||
static unsafe void calc_sums(int pmin, int pmax, int* data, uint n, uint pred_order, uint* sums)
|
||||
static unsafe void calc_lower_sums(int pmin, int pmax, uint* sums)
|
||||
{
|
||||
// sums for highest level
|
||||
int parts = (1 << pmax);
|
||||
int* res = data + pred_order;
|
||||
uint cnt = (n >> pmax) - pred_order;
|
||||
uint sum = 0;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
{
|
||||
int val = *(res++);
|
||||
sum += (uint)((val << 1) ^ (val >> 31));
|
||||
}
|
||||
sums[pmax * Flake.MAX_PARTITIONS + 0] = sum;
|
||||
cnt = (n >> pmax);
|
||||
for (int i = 1; i < parts; i++)
|
||||
{
|
||||
sum = 0;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
{
|
||||
int val = *(res++);
|
||||
sum += (uint)((val << 1) ^ (val >> 31));
|
||||
}
|
||||
sums[pmax * Flake.MAX_PARTITIONS + i] = sum;
|
||||
}
|
||||
// sums for lower levels
|
||||
for (int i = pmax - 1; i >= pmin; i--)
|
||||
{
|
||||
parts = (1 << i);
|
||||
for (int j = 0; j < parts; j++)
|
||||
for (int j = 0; j < (1 << i); j++)
|
||||
{
|
||||
sums[i * Flake.MAX_PARTITIONS + j] =
|
||||
sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j] +
|
||||
sums[i * Flake.MAX_PARTITIONS + j] =
|
||||
sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j] +
|
||||
sums[(i + 1) * Flake.MAX_PARTITIONS + 2 * j + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static unsafe uint calc_rice_params(ref RiceContext rc, ref RiceContext tmp_rc, int pmin, int pmax, int* data, uint n, uint pred_order)
|
||||
static unsafe void calc_sums(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
|
||||
{
|
||||
//uint* udata = stackalloc uint[(int)n];
|
||||
int parts = (1 << pmax);
|
||||
uint* res = data + pred_order;
|
||||
uint cnt = (n >> pmax) - pred_order;
|
||||
uint sum = 0;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
sum += *(res++);
|
||||
sums[0] = sum;
|
||||
cnt = (n >> pmax);
|
||||
for (int i = 1; i < parts; i++)
|
||||
{
|
||||
sum = 0;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
sum += *(res++);
|
||||
sums[i] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Special case when (n >> pmax) == 18
|
||||
/// </summary>
|
||||
/// <param name="pmin"></param>
|
||||
/// <param name="pmax"></param>
|
||||
/// <param name="data"></param>
|
||||
/// <param name="n"></param>
|
||||
/// <param name="pred_order"></param>
|
||||
/// <param name="sums"></param>
|
||||
static unsafe void calc_sums18(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
|
||||
{
|
||||
int parts = (1 << pmax);
|
||||
uint* res = data + pred_order;
|
||||
uint cnt = 18 - pred_order;
|
||||
uint sum = 0;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
sum += *(res++);
|
||||
sums[0] = sum;
|
||||
for (int i = 1; i < parts; i++)
|
||||
{
|
||||
sums[i] =
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Special case when (n >> pmax) == 18
|
||||
/// </summary>
|
||||
/// <param name="pmin"></param>
|
||||
/// <param name="pmax"></param>
|
||||
/// <param name="data"></param>
|
||||
/// <param name="n"></param>
|
||||
/// <param name="pred_order"></param>
|
||||
/// <param name="sums"></param>
|
||||
static unsafe void calc_sums16(int pmin, int pmax, uint* data, uint n, uint pred_order, uint* sums)
|
||||
{
|
||||
int parts = (1 << pmax);
|
||||
uint* res = data + pred_order;
|
||||
uint cnt = 16 - pred_order;
|
||||
uint sum = 0;
|
||||
for (uint j = cnt; j > 0; j--)
|
||||
sum += *(res++);
|
||||
sums[0] = sum;
|
||||
for (int i = 1; i < parts; i++)
|
||||
{
|
||||
sums[i] =
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++) +
|
||||
*(res++) + *(res++) + *(res++) + *(res++);
|
||||
}
|
||||
}
|
||||
|
||||
static unsafe uint calc_rice_params(RiceContext rc, int pmin, int pmax, int* data, uint n, uint pred_order)
|
||||
{
|
||||
uint* udata = stackalloc uint[(int)n];
|
||||
uint* sums = stackalloc uint[(pmax + 1) * Flake.MAX_PARTITIONS];
|
||||
int* parm = stackalloc int[(pmax + 1) * Flake.MAX_PARTITIONS];
|
||||
//uint* bits = stackalloc uint[Flake.MAX_PARTITION_ORDER];
|
||||
|
||||
//assert(pmin >= 0 && pmin <= Flake.MAX_PARTITION_ORDER);
|
||||
//assert(pmax >= 0 && pmax <= Flake.MAX_PARTITION_ORDER);
|
||||
//assert(pmin <= pmax);
|
||||
|
||||
//for (uint i = 0; i < n; i++)
|
||||
// udata[i] = (uint) ((2 * data[i]) ^ (data[i] >> 31));
|
||||
for (uint i = 0; i < n; i++)
|
||||
udata[i] = (uint)((data[i] << 1) ^ (data[i] >> 31));
|
||||
|
||||
calc_sums(pmin, pmax, data, n, pred_order, sums);
|
||||
// sums for highest level
|
||||
if ((n >> pmax) == 18)
|
||||
calc_sums18(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
|
||||
else if ((n >> pmax) == 16)
|
||||
calc_sums16(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
|
||||
else
|
||||
calc_sums(pmin, pmax, udata, n, pred_order, sums + pmax * Flake.MAX_PARTITIONS);
|
||||
// sums for lower levels
|
||||
calc_lower_sums(pmin, pmax, sums);
|
||||
|
||||
int opt_porder = pmin;
|
||||
uint opt_bits = AudioSamples.UINT32_MAX;
|
||||
int opt_porder = pmin;
|
||||
for (int i = pmin; i <= pmax; i++)
|
||||
{
|
||||
uint bits = calc_optimal_rice_params(ref tmp_rc, i, sums + i * Flake.MAX_PARTITIONS, n, pred_order);
|
||||
uint bits = calc_optimal_rice_params(i, parm + i * Flake.MAX_PARTITIONS, sums + i * Flake.MAX_PARTITIONS, n, pred_order);
|
||||
if (bits <= opt_bits)
|
||||
{
|
||||
opt_porder = i;
|
||||
opt_bits = bits;
|
||||
RiceContext tmp_rc2 = rc;
|
||||
rc = tmp_rc;
|
||||
tmp_rc = tmp_rc2;
|
||||
opt_porder = i;
|
||||
}
|
||||
}
|
||||
|
||||
rc.porder = opt_porder;
|
||||
fixed (int* rparms = rc.rparams)
|
||||
AudioSamples.MemCpy(rparms, parm + opt_porder * Flake.MAX_PARTITIONS, (1 << opt_porder));
|
||||
|
||||
return opt_bits;
|
||||
}
|
||||
|
||||
@@ -990,7 +1011,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
int pmin = get_max_p_order(eparams.min_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||
uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 6;
|
||||
frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
|
||||
frame.subframes[ch].best.size = bits + calc_rice_params(frame.subframes[ch].best.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
|
||||
}
|
||||
break;
|
||||
case SubframeType.LPC:
|
||||
@@ -1009,7 +1030,7 @@ namespace CUETools.Codecs.FlaCuda
|
||||
int pmax = get_max_p_order(eparams.max_partition_order, frame.blocksize, frame.subframes[ch].best.order);
|
||||
uint bits = (uint)frame.subframes[ch].best.order * frame.subframes[ch].obits + 4 + 5 + (uint)frame.subframes[ch].best.order * (uint)frame.subframes[ch].best.cbits + 6;
|
||||
//uint oldsize = frame.subframes[ch].best.size;
|
||||
frame.subframes[ch].best.size = bits + calc_rice_params(ref frame.subframes[ch].best.rc, ref frame.current.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
|
||||
frame.subframes[ch].best.size = bits + calc_rice_params(frame.subframes[ch].best.rc, pmin, pmax, frame.subframes[ch].best.residual, (uint)frame.blocksize, (uint)frame.subframes[ch].best.order);
|
||||
//if (frame.subframes[ch].best.size > frame.subframes[ch].obits * (uint)frame.blocksize &&
|
||||
// oldsize <= frame.subframes[ch].obits * (uint)frame.blocksize)
|
||||
// throw new Exception("oops");
|
||||
@@ -1213,9 +1234,10 @@ namespace CUETools.Codecs.FlaCuda
|
||||
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 0 * sizeof(uint), (uint)task.cudaResidualTasks.Pointer);
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 1 * sizeof(uint), (uint)task.cudaResidualOutput.Pointer);
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)residualPartCount);
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
|
||||
cuda.SetParameterSize(task.cudaChooseBestMethod, sizeof(uint) * 4U);
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 2 * sizeof(uint), (uint)residualPartSize);
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 3 * sizeof(uint), (uint)residualPartCount);
|
||||
cuda.SetParameter(task.cudaChooseBestMethod, 4 * sizeof(uint), (uint)task.nResidualTasksPerChannel);
|
||||
cuda.SetParameterSize(task.cudaChooseBestMethod, 5U * sizeof(uint));
|
||||
cuda.SetFunctionBlockShape(task.cudaChooseBestMethod, 32, 8, 1);
|
||||
|
||||
cuda.SetParameter(task.cudaCopyBestMethod, 0, (uint)task.cudaBestResidualTasks.Pointer);
|
||||
|
||||
@@ -61,6 +61,7 @@ typedef struct
|
||||
#define SUM512(buf,tid,op) if (tid < 256) buf[tid] op buf[tid + 256]; __syncthreads(); SUM256(buf,tid,op)
|
||||
|
||||
#define FSQR(s) ((s)*(s))
|
||||
#define FASTMUL(a,b) __mul24(a,b)
|
||||
|
||||
extern "C" __global__ void cudaStereoDecorr(
|
||||
int *samples,
|
||||
@@ -581,14 +582,8 @@ extern "C" __global__ void cudaEstimateResidual(
|
||||
|
||||
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
|
||||
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
|
||||
shared.residual[tid] += shared.residual[tid + 1];
|
||||
|
||||
// rice parameter search
|
||||
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
|
||||
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
|
||||
shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
|
||||
if (threadIdx.x == 0)
|
||||
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
|
||||
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid] + shared.residual[tid + 1];
|
||||
}
|
||||
|
||||
extern "C" __global__ void cudaEstimateResidual1(
|
||||
@@ -612,7 +607,7 @@ extern "C" __global__ void cudaEstimateResidual1(
|
||||
if (tid == 0)
|
||||
{
|
||||
shared.pos = blockIdx.x * partSize;
|
||||
shared.dataLen = min(shared.task.data.blocksize - shared.pos, partSize + max_order);
|
||||
shared.dataLen = min(shared.task.data.blocksize - shared.pos, partSize + shared.task.data.residualOrder);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -631,22 +626,10 @@ extern "C" __global__ void cudaEstimateResidual1(
|
||||
shared.residual[tid] = __mul24(ptr < shared.dataLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
|
||||
__syncthreads();
|
||||
SUM256(shared.residual, tid, +=);
|
||||
|
||||
if (threadIdx.y == 0)
|
||||
{
|
||||
const int residualLen = max(0,min(shared.task.data.blocksize - shared.pos - shared.task.data.residualOrder, partSize));
|
||||
|
||||
// rice parameter search
|
||||
shared.residual[threadIdx.x] = (shared.task.data.type != Constant || shared.residual[0] != 0) *
|
||||
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[0] - (residualLen >> 1)) >> threadIdx.x));
|
||||
shared.residual[threadIdx.x] = min(min(shared.residual[threadIdx.x], shared.residual[threadIdx.x + 4]), min(shared.residual[threadIdx.x + 8], shared.residual[threadIdx.x + 12]));
|
||||
if (threadIdx.x == 0)
|
||||
output[blockIdx.y * 64 + blockIdx.x] = min(min(shared.residual[threadIdx.x], shared.residual[threadIdx.x + 1]), min(shared.residual[threadIdx.x + 2], shared.residual[threadIdx.x + 3]));
|
||||
}
|
||||
if (tid == 0)
|
||||
output[blockIdx.y * 64 + blockIdx.x] = shared.residual[0];
|
||||
}
|
||||
|
||||
#define FASTMUL(a,b) __mul24(a,b)
|
||||
|
||||
extern "C" __global__ void cudaEstimateResidual8(
|
||||
int*output,
|
||||
int*samples,
|
||||
@@ -656,57 +639,61 @@ extern "C" __global__ void cudaEstimateResidual8(
|
||||
)
|
||||
{
|
||||
__shared__ struct {
|
||||
int data[32*9];
|
||||
volatile int data[32*9];
|
||||
volatile int residual[32*8];
|
||||
FlaCudaSubframeData task[8];
|
||||
int coefs[32*8];
|
||||
volatile int pos;
|
||||
volatile int dataLen;
|
||||
volatile int dataOffs;
|
||||
} shared;
|
||||
const int tid = threadIdx.x + threadIdx.y * 32;
|
||||
const int taskNo = FASTMUL(blockIdx.y, blockDim.y) + threadIdx.y;
|
||||
if (threadIdx.x < sizeof(FlaCudaSubframeData)/sizeof(int))
|
||||
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[blockIdx.y * blockDim.y + threadIdx.y]))[threadIdx.x];
|
||||
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[taskNo]))[threadIdx.x];
|
||||
const int ro = shared.task[threadIdx.y].residualOrder;
|
||||
shared.coefs[tid] = threadIdx.x < ro ? tasks[taskNo].coefs[threadIdx.x] : 0;
|
||||
if (tid == 0)
|
||||
{
|
||||
shared.pos = FASTMUL(blockIdx.x, partSize);
|
||||
shared.dataLen = min(shared.task[0].blocksize - shared.pos, partSize + max_order);
|
||||
shared.dataOffs = shared.task[0].samplesOffs + shared.pos;
|
||||
}
|
||||
__syncthreads();
|
||||
const int pos = blockIdx.x * partSize;
|
||||
const int dataLen = min(shared.task[0].blocksize - pos, partSize + max_order);
|
||||
|
||||
// fetch samples
|
||||
shared.data[tid] = tid < dataLen ? samples[shared.task[0].samplesOffs + pos + tid] >> shared.task[0].wbits : 0;
|
||||
if (tid < 32) shared.data[tid + partSize] = tid + partSize < dataLen ? samples[shared.task[0].samplesOffs + pos + tid + partSize] >> shared.task[0].wbits : 0;
|
||||
if (tid < shared.dataLen)
|
||||
shared.data[tid] = samples[shared.dataOffs + tid] >> shared.task[0].wbits;
|
||||
if (tid + partSize < shared.dataLen)
|
||||
shared.data[tid + partSize] = samples[shared.dataOffs + tid + partSize] >> shared.task[0].wbits;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
shared.residual[tid] = 0;
|
||||
shared.coefs[tid] = threadIdx.x < shared.task[threadIdx.y].residualOrder ? tasks[blockIdx.y * blockDim.y + threadIdx.y].coefs[threadIdx.x] : 0;
|
||||
|
||||
const int residualLen = max(0,min(shared.task[0].blocksize - pos - shared.task[threadIdx.y].residualOrder, partSize));
|
||||
const int residualLen = max(0,min(shared.dataLen - ro, partSize));
|
||||
const int ptr2 = threadIdx.y << 5;
|
||||
int s = 0;
|
||||
for (int ptr = threadIdx.x + blockDim.y * 32 * (shared.task[threadIdx.y].type == Verbatim); ptr < blockDim.y * 32 + threadIdx.x; ptr += 32)
|
||||
for (int ptr = threadIdx.x; ptr < residualLen; ptr += 32)
|
||||
{
|
||||
// compute residual
|
||||
int sum =
|
||||
__mul24(shared.data[ptr + 0], shared.coefs[ptr2 + 0]) +
|
||||
__mul24(shared.data[ptr + 1], shared.coefs[ptr2 + 1]) +
|
||||
__mul24(shared.data[ptr + 2], shared.coefs[ptr2 + 2]) +
|
||||
__mul24(shared.data[ptr + 3], shared.coefs[ptr2 + 3]) +
|
||||
__mul24(shared.data[ptr + 3], shared.coefs[ptr2 + 3]);
|
||||
sum +=
|
||||
__mul24(shared.data[ptr + 4], shared.coefs[ptr2 + 4]) +
|
||||
__mul24(shared.data[ptr + 5], shared.coefs[ptr2 + 5]) +
|
||||
__mul24(shared.data[ptr + 6], shared.coefs[ptr2 + 6]) +
|
||||
__mul24(shared.data[ptr + 7], shared.coefs[ptr2 + 7]);
|
||||
sum = shared.data[ptr + shared.task[threadIdx.y].residualOrder] - (sum >> shared.task[threadIdx.y].shift);
|
||||
s += __mul24(ptr < residualLen, min(0x7fffff,(sum << 1) ^ (sum >> 31)));
|
||||
sum = shared.data[ptr + ro] - (sum >> shared.task[threadIdx.y].shift);
|
||||
s += min(0x7fffff,(sum << 1) ^ (sum >> 31));
|
||||
}
|
||||
|
||||
shared.residual[tid] = s;
|
||||
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
|
||||
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
|
||||
shared.residual[tid] += shared.residual[tid + 1];
|
||||
|
||||
// rice parameter search
|
||||
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
|
||||
(__mul24(threadIdx.x >= 15, 0x7fffff) + residualLen * (threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
|
||||
shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
|
||||
if (threadIdx.x == 0)
|
||||
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
|
||||
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid] + shared.residual[tid + 1];
|
||||
}
|
||||
|
||||
extern "C" __global__ void cudaEstimateResidual12(
|
||||
@@ -727,8 +714,11 @@ extern "C" __global__ void cudaEstimateResidual12(
|
||||
volatile int dataOffs;
|
||||
} shared;
|
||||
const int tid = threadIdx.x + threadIdx.y * 32;
|
||||
const int taskNo = FASTMUL(blockIdx.y, blockDim.y) + threadIdx.y;
|
||||
if (threadIdx.x < sizeof(FlaCudaSubframeData)/sizeof(int))
|
||||
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[FASTMUL(blockIdx.y, blockDim.y) + threadIdx.y]))[threadIdx.x];
|
||||
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(&tasks[taskNo]))[threadIdx.x];
|
||||
const int ro = shared.task[threadIdx.y].residualOrder;
|
||||
shared.coefs[tid] = threadIdx.x < ro ? tasks[taskNo].coefs[threadIdx.x] : 0;
|
||||
if (tid == 0)
|
||||
{
|
||||
shared.pos = FASTMUL(blockIdx.x, partSize);
|
||||
@@ -738,30 +728,30 @@ extern "C" __global__ void cudaEstimateResidual12(
|
||||
__syncthreads();
|
||||
|
||||
// fetch samples
|
||||
shared.data[tid] = tid < shared.dataLen ? samples[shared.dataOffs + tid] >> shared.task[0].wbits : 0;
|
||||
if (tid < 32) shared.data[tid + partSize] = tid + partSize < shared.dataLen ? samples[shared.dataOffs + tid + partSize] >> shared.task[0].wbits : 0;
|
||||
if (tid < shared.dataLen)
|
||||
shared.data[tid] = samples[shared.dataOffs + tid] >> shared.task[0].wbits;
|
||||
if (tid + partSize < shared.dataLen)
|
||||
shared.data[tid + partSize] = samples[shared.dataOffs + tid + partSize] >> shared.task[0].wbits;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const int ro = shared.task[threadIdx.y].residualOrder;
|
||||
const int residualLen = max(0,min(shared.task[0].blocksize - shared.pos - ro, partSize));
|
||||
int residualLen = max(0,min(shared.dataLen - ro, partSize));
|
||||
const int ptr2 = threadIdx.y << 5;
|
||||
|
||||
shared.coefs[tid] = threadIdx.x < ro ? tasks[FASTMUL(blockIdx.y, blockDim.y) + threadIdx.y].coefs[threadIdx.x] : 0;
|
||||
|
||||
int s = 0;
|
||||
for (int ptr = shared.task[threadIdx.y].type == Verbatim ? residualLen : threadIdx.x; ptr < residualLen; ptr += 32)
|
||||
for (int ptr = threadIdx.x; ptr < residualLen; ptr += 32)
|
||||
{
|
||||
// compute residual
|
||||
int sum =
|
||||
FASTMUL(shared.data[ptr + 0], shared.coefs[ptr2 + 0]) +
|
||||
FASTMUL(shared.data[ptr + 1], shared.coefs[ptr2 + 1]) +
|
||||
FASTMUL(shared.data[ptr + 2], shared.coefs[ptr2 + 2]) +
|
||||
FASTMUL(shared.data[ptr + 3], shared.coefs[ptr2 + 3]) +
|
||||
FASTMUL(shared.data[ptr + 3], shared.coefs[ptr2 + 3]);
|
||||
sum +=
|
||||
FASTMUL(shared.data[ptr + 4], shared.coefs[ptr2 + 4]) +
|
||||
FASTMUL(shared.data[ptr + 5], shared.coefs[ptr2 + 5]) +
|
||||
FASTMUL(shared.data[ptr + 6], shared.coefs[ptr2 + 6]) +
|
||||
FASTMUL(shared.data[ptr + 7], shared.coefs[ptr2 + 7]) +
|
||||
FASTMUL(shared.data[ptr + 7], shared.coefs[ptr2 + 7]);
|
||||
sum +=
|
||||
FASTMUL(shared.data[ptr + 8], shared.coefs[ptr2 + 8]) +
|
||||
FASTMUL(shared.data[ptr + 9], shared.coefs[ptr2 + 9]) +
|
||||
FASTMUL(shared.data[ptr + 10], shared.coefs[ptr2 + 10]) +
|
||||
@@ -773,19 +763,14 @@ extern "C" __global__ void cudaEstimateResidual12(
|
||||
shared.residual[tid] = s;
|
||||
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 8] + shared.residual[tid + 16] + shared.residual[tid + 24];
|
||||
shared.residual[tid] = shared.residual[tid] + shared.residual[tid + 2] + shared.residual[tid + 4] + shared.residual[tid + 6];
|
||||
shared.residual[tid] += shared.residual[tid + 1];
|
||||
|
||||
// rice parameter search
|
||||
shared.residual[tid] = (shared.task[threadIdx.y].type != Constant || shared.residual[threadIdx.y << 5] != 0) *
|
||||
(__mul24(threadIdx.x >= 15, 0x7fffff) + FASTMUL(residualLen, threadIdx.x + 1) + ((shared.residual[threadIdx.y << 5] - (residualLen >> 1)) >> threadIdx.x));
|
||||
shared.residual[tid] = min(min(shared.residual[tid], shared.residual[tid + 4]), min(shared.residual[tid + 8], shared.residual[tid + 12]));
|
||||
if (threadIdx.x == 0)
|
||||
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = min(min(shared.residual[tid], shared.residual[tid + 1]), min(shared.residual[tid + 2], shared.residual[tid + 3]));
|
||||
output[(blockIdx.y * blockDim.y + threadIdx.y) * 64 + blockIdx.x] = shared.residual[tid] + shared.residual[tid + 1];
|
||||
}
|
||||
|
||||
extern "C" __global__ void cudaChooseBestMethod(
|
||||
FlaCudaSubframeTask *tasks,
|
||||
int *residual,
|
||||
int partSize,
|
||||
int partCount, // <= blockDim.y (256)
|
||||
int taskCount
|
||||
)
|
||||
@@ -806,8 +791,18 @@ extern "C" __global__ void cudaChooseBestMethod(
|
||||
((int*)&shared.task[threadIdx.y])[threadIdx.x] = ((int*)(tasks + task + threadIdx.y + taskCount * blockIdx.y))[threadIdx.x];
|
||||
|
||||
int sum = 0;
|
||||
for (int pos = 0; pos < partCount; pos += blockDim.x)
|
||||
sum += (pos + threadIdx.x < partCount ? residual[pos + threadIdx.x + 64 * (task + threadIdx.y + taskCount * blockIdx.y)] : 0);
|
||||
for (int pos = threadIdx.x; pos < partCount; pos += blockDim.x)
|
||||
{
|
||||
// fetch part sum
|
||||
int psum = residual[pos + 64 * (task + threadIdx.y + taskCount * blockIdx.y)];
|
||||
// calculate part size
|
||||
int residualLen = max(0,min(shared.task[threadIdx.y].data.blocksize - FASTMUL(pos, partSize) - shared.task[threadIdx.y].data.residualOrder, partSize));
|
||||
residualLen = FASTMUL(residualLen, shared.task[threadIdx.y].data.type != Constant || psum != 0);
|
||||
// calculate rice parameter
|
||||
int k = max(0, min(14, __float2int_rz(__log2f((psum + 0.000001f) / (residualLen + 0.000001f) + 0.5f))));
|
||||
// calculate part bit length
|
||||
sum += FASTMUL(residualLen, k + 1) + (psum >> k);
|
||||
}
|
||||
shared.partLen[tid] = sum;
|
||||
|
||||
// length sum: reduction in shared mem
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user