Flake optimizations

This commit is contained in:
Grigory Chudov
2014-09-19 22:01:20 -04:00
parent 3572745968
commit fac9689155
4 changed files with 138 additions and 276 deletions

View File

@@ -1360,16 +1360,18 @@ namespace CUETools.Codecs.FLACCL
case SubframeType.LPC:
if (!task.UseGPUOnly)
{
int pmin = get_max_p_order(m_settings.MinPartitionOrder, task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmax = get_max_p_order(m_settings.MaxPartitionOrder, task.frame.blocksize, task.frame.subframes[ch].best.order);
ulong* sums = stackalloc ulong[(pmax + 1) * Flake.MAX_PARTITIONS];
fixed (int* coefs = task.frame.subframes[ch].best.coefs)
{
if (Settings.PCM.BitsPerSample > 16)
lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift, sums + pmax * Flake.MAX_PARTITIONS, pmax);
else
lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift);
lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift, sums + pmax * Flake.MAX_PARTITIONS, pmax);
}
int pmin = get_max_p_order(m_settings.MinPartitionOrder, task.frame.blocksize, task.frame.subframes[ch].best.order);
int pmax = get_max_p_order(m_settings.MaxPartitionOrder, task.frame.blocksize, task.frame.subframes[ch].best.order);
calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, Settings.PCM.BitsPerSample > 16 ? 1 : 0);
}
break;

View File

@@ -1156,12 +1156,12 @@ new int[] { // 30
fixed (int* coefs = frame.current.coefs)
{
if ((csum << frame.subframes[ch].obits) >= 1UL << 32)
lpc.encode_residual_long(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift);
lpc.encode_residual_long(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift, sums + pmax * Flake.MAX_PARTITIONS, pmax);
else
lpc.encode_residual(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift);
lpc.encode_residual(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift, sums + pmax * Flake.MAX_PARTITIONS, pmax);
}
var cur_size = calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order, Settings.PCM.BitsPerSample);
var cur_size = calc_rice_params_sums(frame.current.rc, pmin, pmax, sums, (uint)frame.blocksize, (uint)frame.current.order, Settings.PCM.BitsPerSample);
frame.current.size = (uint)(frame.current.order * frame.subframes[ch].obits + 4 + 5 + frame.current.order * frame.current.cbits + 6 + (int)cur_size);
if (frame.current.size < best_size)
@@ -1261,6 +1261,10 @@ new int[] { // 30
frame.current.window = iWindow;
frame.current.cbits = (int)cbits;
int pmax = get_max_p_order(m_settings.MaxPartitionOrder, frame.blocksize, frame.current.order);
int pmin = Math.Min(m_settings.MinPartitionOrder, pmax);
ulong* sums = stackalloc ulong[(pmax + 1) * Flake.MAX_PARTITIONS];
ulong csum = 0;
fixed (int* coefs = frame.current.coefs)
{
lpc.quantize_lpc_coefs(lpcs + (frame.current.order - 1) * lpc.MAX_LPC_ORDER,
@@ -1269,36 +1273,16 @@ new int[] { // 30
if (frame.current.shift < 0 || frame.current.shift > 15)
throw new Exception("negative shift");
ulong csum = 0;
for (int i = frame.current.order; i > 0; i--)
csum += (ulong)Math.Abs(coefs[i - 1]);
if ((csum << frame.subframes[ch].obits) >= 1UL << 32)
lpc.encode_residual_long(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift);
lpc.encode_residual_long(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift, sums + pmax * Flake.MAX_PARTITIONS, pmax);
else
lpc.encode_residual(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift);
lpc.encode_residual(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift, sums + pmax * Flake.MAX_PARTITIONS, pmax);
}
int pmax = get_max_p_order(m_settings.MaxPartitionOrder, frame.blocksize, frame.current.order);
int pmin = Math.Min(m_settings.MinPartitionOrder, pmax);
uint best_size = calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order, Settings.PCM.BitsPerSample);
// not working
//for (int o = 1; o <= frame.current.order; o++)
//{
// if (frame.current.coefs[o - 1] > -(1 << frame.current.shift))
// {
// for (int i = o; i < frame.blocksize; i++)
// frame.current.residual[i] += frame.subframes[ch].samples[i - o] >> frame.current.shift;
// frame.current.coefs[o - 1]--;
// uint new_size = calc_rice_params(ref frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order);
// if (new_size > best_size)
// {
// for (int i = o; i < frame.blocksize; i++)
// frame.current.residual[i] -= frame.subframes[ch].samples[i - o] >> frame.current.shift;
// frame.current.coefs[o - 1]++;
// }
// }
//}
uint best_size = calc_rice_params_sums(frame.current.rc, pmin, pmax, sums, (uint)frame.blocksize, (uint)frame.current.order, Settings.PCM.BitsPerSample);
frame.current.size = (uint)(frame.current.order * frame.subframes[ch].obits + 4 + 5 + frame.current.order * (int)cbits + 6 + (int)best_size);
frame.ChooseBestSubframe(ch);
//if (frame.current.size >= frame.subframes[ch].best.size)
@@ -1522,7 +1506,7 @@ new int[] { // 30
{
case OrderMethod.Akaike:
//lpc_ctx.SortOrdersAkaike(frame.blocksize, eparams.estimation_depth, max_order, 7.1, 0.0);
lpc_ctx.SortOrdersAkaike(frame.blocksize, eparams.estimation_depth, min_order, max_order, 4.5, 0.0);
lpc_ctx.SortOrdersAkaike(frame.blocksize, eparams.estimation_depth, min_order, max_order, 4.5, 0);
break;
default:
throw new Exception("unknown order method");
@@ -2175,13 +2159,17 @@ new int[] { // 30
{
frame2.InitSize(frame.blocksize / 2, true);
frame2.window_buffer = frame.window_buffer + frame.blocksize;
frame2.nSeg++;
frame2.nSeg = frame.nSeg + 1;
frame2.current.residual = r + tumbler * 5 * Flake.MAX_BLOCKSIZE;
for (int ch = 0; ch < 4; ch++)
frame2.subframes[ch].Init(frame.subframes[ch].samples, frame2.current.residual + (ch + 1) * frame2.blocksize,
frame.subframes[ch].obits + frame.subframes[ch].wbits, frame.subframes[ch].wbits);
estimate_frame(frame2, true);
uint fs2 = measure_frame_size(frame2, true);
//measure_frame_size(frame2, true);
//frame2.ChooseSubframes();
//encode_estimated_frame(frame2);
//uint fs2 = measure_frame_size(frame2, false);
uint fs2 = measure_frame_size(frame2, true);
uint fs3 = fs2;
if (eparams.variable_block_size == 2 || eparams.variable_block_size == 4)
{

View File

@@ -295,8 +295,7 @@ namespace CUETools.Codecs
return;
}
#endif
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bit_buf >> 56)]);
*(buf++) = (byte)(bit_buf >> 56);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bit_buf >> 56))]);
bit_buf <<= 8;
bits -= 8;
}
@@ -320,23 +319,15 @@ namespace CUETools.Codecs
}
#endif
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 56)]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 48)]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 40)]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 32)]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 24)]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 16)]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 8)]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb)]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 56))]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 48))]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 40))]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 32))]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 24))]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 16))]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 8))]);
crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb))]);
*(buf++) = (byte)(bb >> 56);
*(buf++) = (byte)(bb >> 48);
*(buf++) = (byte)(bb >> 40);
*(buf++) = (byte)(bb >> 32);
*(buf++) = (byte)(bb >> 24);
*(buf++) = (byte)(bb >> 16);
*(buf++) = (byte)(bb >> 8);
*(buf++) = (byte)(bb);
bit_left += 64 - bits;
bit_buf = (val << bit_left - 1) << 1;
}

View File

@@ -515,80 +515,82 @@ namespace CUETools.Codecs
shift = sh;
}
public static unsafe void
encode_residual(int* res, int* smp, int n, int order,
int* coefs, int shift)
private static unsafe ulong
encode_residual_partition(int* s, int* r, int* seg_end, int* coefs, int shift, int order)
{
for (int i = 0; i < order; i++)
res[i] = smp[i];
int* s = smp;
int* r = res + order;
ulong sum = 0ul;
int c0 = coefs[0];
int c1 = coefs[1];
switch (order)
{
case 1:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
int pred = c0 * *(s++);
*(r++) = *s - (pred >> shift);
//*(r++) = *s - (pred >> shift);
int d = *(r++) = *s - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
}
break;
case 2:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
int pred = c1 * *(s++);
pred += c0 * *(s++);
*(r++) = *(s--) - (pred >> shift);
int d = *(r++) = *(s--) - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
}
break;
case 3:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
int pred = coefs[2] * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
int d = *(r++) = *s - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 2;
}
break;
case 4:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
int d = *(r++) = *s - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 3;
}
break;
case 5:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
int d = *(r++) = *s - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 4;
}
break;
case 6:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
int d = *(r++) = *s - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 5;
}
break;
case 7:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
int* c = coefs + order - 1;
int pred =
@@ -596,12 +598,13 @@ namespace CUETools.Codecs
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
int d = *(r++) = *s - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 6;
}
break;
case 8:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
int* c = coefs + order - 1;
int pred =
@@ -609,26 +612,28 @@ namespace CUETools.Codecs
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
int d = *(r++) = *s - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 7;
}
break;
case 9:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) +
*(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
int d = *(r++) = *s - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 8;
}
break;
case 10:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
int* c = coefs + order - 1;
int pred =
@@ -637,12 +642,13 @@ namespace CUETools.Codecs
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
int d = *(r++) = *s - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 9;
}
break;
case 11:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
int* c = coefs + order - 1;
int pred =
@@ -652,12 +658,13 @@ namespace CUETools.Codecs
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
int d = *(r++) = *s - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 10;
}
break;
case 12:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
int* c = coefs + order - 1;
int pred =
@@ -667,14 +674,14 @@ namespace CUETools.Codecs
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
int d = *(r++) = *s - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 11;
}
break;
default:
for (int i = order; i < n; i++)
while (s < seg_end)
{
s = smp + i - order;
int pred = 0;
int* c = coefs + order - 1;
int* c11 = coefs + 11;
@@ -687,75 +694,99 @@ namespace CUETools.Codecs
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
int d = *(r++) = *s - (pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= order - 1;
}
break;
}
return sum;
}
public static unsafe void
encode_residual_long(int* res, int* smp, int n, int order,
int* coefs, int shift)
encode_residual(int* res, int* smp, int n, int order,
int* coefs, int shift, ulong* sums, int pmax)
{
// encode_residual: writes the LPC residual of smp[0..n) into res[0..n) and,
// as a by-product, fills sums[] with one value per segment of (n >> pmax)
// samples. Each value is whatever encode_residual_partition returns for that
// segment (in the visible code: the running total of (d << 1) ^ (d >> 31)
// over the residuals d it wrote).
// NOTE(review): assumes n is a multiple of (1 << pmax) and (n >> pmax) > order,
// otherwise the last seg_end would not land on s_end — confirm against the
// caller's get_max_p_order.

// The first `order` samples have no full history; they are copied verbatim.
for (int i = 0; i < order; i++)
res[i] = smp[i];
// s is the history cursor: it trails the sample being predicted by `order`.
int* s = smp;
// History-cursor position at which all n - order predictions have been made.
int* s_end = smp + n - order;
// The first segment is shorter by `order`, because the verbatim samples
// occupy its first `order` slots.
int* seg_end = s + (n >> pmax) - order;
// Residual output starts right after the verbatim samples.
int* r = res + order;
while (s < s_end)
{
// Encode one segment and record its sum in the next sums[] slot.
*(sums++) = encode_residual_partition(s, r, seg_end, coefs, shift, order);
// s and r are passed by value, so advance both here by the number of
// residuals the segment produced.
r += seg_end - s;
s = seg_end;
// Every later segment spans a full (n >> pmax) samples.
seg_end += n >> pmax;
}
}
private static unsafe ulong
encode_residual_long_partition(int* s, int* r, int* seg_end, int* coefs, int shift, int order)
{
ulong sum = 0ul;
int c0 = coefs[0];
int c1 = coefs[1];
switch (order)
{
case 1:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
long pred = c0 * (long)*(s++);
*(r++) = *s - (int)(pred >> shift);
int d = *(r++) = *s - (int)(pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
}
break;
case 2:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
long pred = c1 * (long)*(s++);
pred += c0 * (long)*(s++);
*(r++) = *(s--) - (int)(pred >> shift);
int d = *(r++) = *(s--) - (int)(pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
}
break;
case 3:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
long pred = coefs[2] * (long)*(s++);
pred += c1 * (long)*(s++);
pred += c0 * (long)*(s++);
*(r++) = *s - (int)(pred >> shift);
int d = *(r++) = *s - (int)(pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 2;
}
break;
case 4:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
long pred = coefs[3] * (long)*(s++);
pred += coefs[2] * (long)*(s++);
pred += c1 * (long)*(s++);
pred += c0 * (long)*(s++);
*(r++) = *s - (int)(pred >> shift);
int d = *(r++) = *s - (int)(pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 3;
}
break;
case 5:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
long pred = coefs[4] * (long)*(s++);
pred += coefs[3] * (long)*(s++);
pred += coefs[2] * (long)*(s++);
pred += c1 * (long)*(s++);
pred += c0 * (long)*(s++);
*(r++) = *s - (int)(pred >> shift);
int d = *(r++) = *s - (int)(pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 4;
}
break;
case 6:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
long pred = coefs[5] * (long)*(s++);
pred += coefs[4] * (long)*(s++);
@@ -763,12 +794,13 @@ namespace CUETools.Codecs
pred += coefs[2] * (long)*(s++);
pred += c1 * (long)*(s++);
pred += c0 * (long)*(s++);
*(r++) = *s - (int)(pred >> shift);
int d = *(r++) = *s - (int)(pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 5;
}
break;
case 7:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
long pred = coefs[6] * (long)*(s++);
pred += coefs[5] * (long)*(s++);
@@ -777,12 +809,13 @@ namespace CUETools.Codecs
pred += coefs[2] * (long)*(s++);
pred += c1 * (long)*(s++);
pred += c0 * (long)*(s++);
*(r++) = *s - (int)(pred >> shift);
int d = *(r++) = *s - (int)(pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 6;
}
break;
case 8:
for (int i = n - order; i > 0; i--)
while (s < seg_end)
{
long pred = coefs[7] * (long)*(s++);
pred += coefs[6] * (long)*(s++);
@@ -792,14 +825,14 @@ namespace CUETools.Codecs
pred += coefs[2] * (long)*(s++);
pred += c1 * (long)*(s++);
pred += c0 * (long)*(s++);
*(r++) = *s - (int)(pred >> shift);
int d = *(r++) = *s - (int)(pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= 7;
}
break;
default:
for (int i = order; i < n; i++)
while (s < seg_end)
{
s = smp + i - order;
long pred = 0;
int* co = coefs + order - 1;
int* c7 = coefs + 7;
@@ -813,188 +846,36 @@ namespace CUETools.Codecs
pred += coefs[2] * (long)*(s++);
pred += c1 * (long)*(s++);
pred += c0 * (long)*(s++);
*(r++) = *s - (int)(pred >> shift);
int d = *(r++) = *s - (int)(pred >> shift);
sum += (uint)((d << 1) ^ (d >> 31));
s -= order - 1;
}
break;
}
return sum;
}
public static unsafe void
encode_residual2(int* res, int* smp, int n, int order,
int* coefs, int shift)
encode_residual_long(int* res, int* smp, int n, int order,
int* coefs, int shift, ulong* sums, int pmax)
{
for (int i = 0; i < order; i++)
res[i] = smp[i];
int* s = smp;
int* r = res;
int c0 = coefs[0];
int c1 = coefs[1];
switch (order)
int* s_end = smp + n - order;
int* seg_end = s + (n >> pmax) - order;
int* r = res + order;
while (s < s_end)
{
case 1:
for (int i = n - order; i > 0; i--)
{
int pred = c0 * *(s++);
*(r++) = *s - (pred >> shift);
}
break;
case 2:
for (int i = n - order; i > 0; i--)
{
int pred = c1 * *(s++);
pred += c0 * *(s++);
*(r++) = *(s--) - (pred >> shift);
}
break;
case 3:
for (int i = n - order; i > 0; i--)
{
int pred = coefs[2] * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
s -= 2;
}
break;
case 4:
for (int i = n - order; i > 0; i--)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
s -= 3;
}
break;
case 5:
for (int i = n - order; i > 0; i--)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
s -= 4;
}
break;
case 6:
for (int i = n - order; i > 0; i--)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
s -= 5;
}
break;
case 7:
for (int i = n - order; i > 0; i--)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
s -= 6;
}
break;
case 8:
for (int i = n - order; i > 0; i--)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
s -= 7;
}
break;
case 9:
for (int i = n - order; i > 0; i--)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
s -= 8;
}
break;
case 10:
for (int i = n - order; i > 0; i--)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
s -= 9;
}
break;
case 11:
for (int i = n - order; i > 0; i--)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
s -= 10;
}
break;
case 12:
for (int i = n - order; i > 0; i--)
{
int* c = coefs + order - 1;
int pred =
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
s -= 11;
}
break;
default:
for (int i = order; i < n; i++)
{
s = smp + i - order;
int pred = 0;
int* c = coefs + order - 1;
int* c11 = coefs + 11;
while (c > c11)
pred += *(c--) * *(s++);
pred +=
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
*(c--) * *(s++) + *(c--) * *(s++) +
c1 * *(s++) + c0 * *(s++);
*(r++) = *s - (pred >> shift);
}
break;
*(sums++) = encode_residual_long_partition(s, r, seg_end, coefs, shift, order);
r += seg_end - s;
s = seg_end;
seg_end += n >> pmax;
}
}
public static unsafe void
public static unsafe void
decode_residual(int* res, int* smp, int n, int order,
int* coefs, int shift)
{