From fac96891553874495f726264fe67520ac3b00bc7 Mon Sep 17 00:00:00 2001 From: Grigory Chudov Date: Fri, 19 Sep 2014 22:01:20 -0400 Subject: [PATCH] Flake optimizations --- CUETools.Codecs.FLACCL/FLACCLWriter.cs | 10 +- CUETools.Codecs.FLAKE/FlakeWriter.cs | 46 ++-- CUETools.Codecs/BitWriter.cs | 27 +- CUETools.Codecs/LPC.cs | 331 ++++++++----------------- 4 files changed, 138 insertions(+), 276 deletions(-) diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index 0de51f0..2826aae 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -1360,16 +1360,18 @@ namespace CUETools.Codecs.FLACCL case SubframeType.LPC: if (!task.UseGPUOnly) { + int pmin = get_max_p_order(m_settings.MinPartitionOrder, task.frame.blocksize, task.frame.subframes[ch].best.order); + int pmax = get_max_p_order(m_settings.MaxPartitionOrder, task.frame.blocksize, task.frame.subframes[ch].best.order); + ulong* sums = stackalloc ulong[(pmax + 1) * Flake.MAX_PARTITIONS]; + fixed (int* coefs = task.frame.subframes[ch].best.coefs) { if (Settings.PCM.BitsPerSample > 16) - lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift); + lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift, sums + pmax * Flake.MAX_PARTITIONS, pmax); else - lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift); + lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift, sums + pmax * Flake.MAX_PARTITIONS, pmax); } - int pmin = get_max_p_order(m_settings.MinPartitionOrder, task.frame.blocksize, task.frame.subframes[ch].best.order); - int pmax = get_max_p_order(m_settings.MaxPartitionOrder, task.frame.blocksize, task.frame.subframes[ch].best.order); calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, Settings.PCM.BitsPerSample > 16 ? 1 : 0); } break; diff --git a/CUETools.Codecs.FLAKE/FlakeWriter.cs b/CUETools.Codecs.FLAKE/FlakeWriter.cs index d8d1486..2eb6304 100644 --- a/CUETools.Codecs.FLAKE/FlakeWriter.cs +++ b/CUETools.Codecs.FLAKE/FlakeWriter.cs @@ -1156,12 +1156,12 @@ new int[] { // 30 fixed (int* coefs = frame.current.coefs) { if ((csum << frame.subframes[ch].obits) >= 1UL << 32) - lpc.encode_residual_long(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift); + lpc.encode_residual_long(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift, sums + pmax * Flake.MAX_PARTITIONS, pmax); else - lpc.encode_residual(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift); + lpc.encode_residual(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift, sums + pmax * Flake.MAX_PARTITIONS, pmax); } - var cur_size = calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order, Settings.PCM.BitsPerSample); + var cur_size = calc_rice_params_sums(frame.current.rc, pmin, pmax, sums, (uint)frame.blocksize, (uint)frame.current.order, Settings.PCM.BitsPerSample); frame.current.size = (uint)(frame.current.order * frame.subframes[ch].obits + 4 + 5 + frame.current.order * frame.current.cbits + 6 + (int)cur_size); if (frame.current.size < best_size) @@ -1261,6 +1261,10 @@ new int[] { // 30 frame.current.window = iWindow; frame.current.cbits = (int)cbits; + int pmax = get_max_p_order(m_settings.MaxPartitionOrder, frame.blocksize, frame.current.order); + int pmin = Math.Min(m_settings.MinPartitionOrder, pmax); + ulong* sums = stackalloc ulong[(pmax + 1) * Flake.MAX_PARTITIONS]; + ulong csum = 0; fixed (int* coefs = frame.current.coefs) { lpc.quantize_lpc_coefs(lpcs + (frame.current.order - 1) * lpc.MAX_LPC_ORDER, @@ -1269,36 +1273,16 @@ new int[] { // 30 if (frame.current.shift < 0 || frame.current.shift > 15) throw new Exception("negative shift"); - ulong csum = 0; for (int i = frame.current.order; i > 0; i--) csum += (ulong)Math.Abs(coefs[i - 1]); if ((csum << frame.subframes[ch].obits) >= 1UL << 32) - lpc.encode_residual_long(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift); + lpc.encode_residual_long(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift, sums + pmax * Flake.MAX_PARTITIONS, pmax); else - lpc.encode_residual(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift); + lpc.encode_residual(frame.current.residual, frame.subframes[ch].samples, frame.blocksize, frame.current.order, coefs, frame.current.shift, sums + pmax * Flake.MAX_PARTITIONS, pmax); } - int pmax = get_max_p_order(m_settings.MaxPartitionOrder, frame.blocksize, frame.current.order); - int pmin = Math.Min(m_settings.MinPartitionOrder, pmax); - uint best_size = calc_rice_params(frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order, Settings.PCM.BitsPerSample); - // not working - //for (int o = 1; o <= frame.current.order; o++) - //{ - // if (frame.current.coefs[o - 1] > -(1 << frame.current.shift)) - // { - // for (int i = o; i < frame.blocksize; i++) - // frame.current.residual[i] += frame.subframes[ch].samples[i - o] >> frame.current.shift; - // frame.current.coefs[o - 1]--; - // uint new_size = calc_rice_params(ref frame.current.rc, pmin, pmax, frame.current.residual, (uint)frame.blocksize, (uint)frame.current.order); - // if (new_size > best_size) - // { - // for (int i = o; i < frame.blocksize; i++) - // frame.current.residual[i] -= frame.subframes[ch].samples[i - o] >> frame.current.shift; - // frame.current.coefs[o - 1]++; - // } - // } - //} + uint best_size = calc_rice_params_sums(frame.current.rc, pmin, pmax, sums, (uint)frame.blocksize, (uint)frame.current.order, Settings.PCM.BitsPerSample); frame.current.size = (uint)(frame.current.order * frame.subframes[ch].obits + 4 + 5 + frame.current.order * (int)cbits + 6 + (int)best_size); frame.ChooseBestSubframe(ch); //if (frame.current.size >= frame.subframes[ch].best.size) @@ -1522,7 +1506,7 @@ new int[] { // 30 { case OrderMethod.Akaike: //lpc_ctx.SortOrdersAkaike(frame.blocksize, eparams.estimation_depth, max_order, 7.1, 0.0); - lpc_ctx.SortOrdersAkaike(frame.blocksize, eparams.estimation_depth, min_order, max_order, 4.5, 0.0); + lpc_ctx.SortOrdersAkaike(frame.blocksize, eparams.estimation_depth, min_order, max_order, 4.5, 0); break; default: throw new Exception("unknown order method"); @@ -2175,13 +2159,17 @@ new int[] { // 30 { frame2.InitSize(frame.blocksize / 2, true); frame2.window_buffer = frame.window_buffer + frame.blocksize; - frame2.nSeg++; + frame2.nSeg = frame.nSeg + 1; frame2.current.residual = r + tumbler * 5 * Flake.MAX_BLOCKSIZE; for (int ch = 0; ch < 4; ch++) frame2.subframes[ch].Init(frame.subframes[ch].samples, frame2.current.residual + (ch + 1) * frame2.blocksize, frame.subframes[ch].obits + frame.subframes[ch].wbits, frame.subframes[ch].wbits); estimate_frame(frame2, true); - uint fs2 = measure_frame_size(frame2, true); + //measure_frame_size(frame2, true); + //frame2.ChooseSubframes(); + //encode_estimated_frame(frame2); + //uint fs2 = measure_frame_size(frame2, false); + uint fs2 = measure_frame_size(frame2, true); uint fs3 = fs2; if (eparams.variable_block_size == 2 || eparams.variable_block_size == 4) { diff --git a/CUETools.Codecs/BitWriter.cs b/CUETools.Codecs/BitWriter.cs index bfaa195..518110d 100644 --- a/CUETools.Codecs/BitWriter.cs +++ b/CUETools.Codecs/BitWriter.cs @@ -295,8 +295,7 @@ namespace CUETools.Codecs return; } #endif - crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bit_buf >> 56)]); - *(buf++) = (byte)(bit_buf >> 56); + crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bit_buf >> 56))]); bit_buf <<= 8; bits -= 8; } @@ -320,23 +319,15 @@ namespace CUETools.Codecs } #endif - crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 56)]); - crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 48)]); - crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 40)]); - crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 32)]); - crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 24)]); - crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 16)]); - crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb >> 8)]); - crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (byte)(bb)]); + crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 56))]); + crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 48))]); + crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 40))]); + crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 32))]); + crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 24))]); + crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 16))]); + crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb >> 8))]); + crc16 = (ushort)((crc16 << 8) ^ crc16_t[(crc16 >> 8) ^ (*(buf++) = (byte)(bb))]); - *(buf++) = (byte)(bb >> 56); - *(buf++) = (byte)(bb >> 48); - *(buf++) = (byte)(bb >> 40); - *(buf++) = (byte)(bb >> 32); - *(buf++) = (byte)(bb >> 24); - *(buf++) = (byte)(bb >> 16); - *(buf++) = (byte)(bb >> 8); - *(buf++) = (byte)(bb); bit_left += 64 - bits; bit_buf = (val << bit_left - 1) << 1; } diff --git a/CUETools.Codecs/LPC.cs b/CUETools.Codecs/LPC.cs index d5975b9..89064f9 100644 --- a/CUETools.Codecs/LPC.cs +++ b/CUETools.Codecs/LPC.cs @@ -515,80 +515,82 @@ namespace CUETools.Codecs shift = sh; } - public static unsafe void - encode_residual(int* res, int* smp, int n, int order, - int* coefs, int shift) + private static unsafe ulong + encode_residual_partition(int* s, int* r, int* seg_end, int* coefs, int shift, int order) { - for (int i = 0; i < order; i++) - res[i] = smp[i]; - - int* s = smp; - int* r = res + order; + ulong sum = 0ul; int c0 = coefs[0]; int c1 = coefs[1]; switch (order) { case 1: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { int pred = c0 * *(s++); - *(r++) = *s - (pred >> shift); + //*(r++) = *s - (pred >> shift); + int d = *(r++) = *s - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); } break; case 2: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { int pred = c1 * *(s++); pred += c0 * *(s++); - *(r++) = *(s--) - (pred >> shift); + int d = *(r++) = *(s--) - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); } break; case 3: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { int pred = coefs[2] * *(s++) + c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); + int d = *(r++) = *s - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 2; } break; case 4: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { int* c = coefs + order - 1; int pred = *(c--) * *(s++) + *(c--) * *(s++) + c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); + int d = *(r++) = *s - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 3; } break; case 5: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { int* c = coefs + order - 1; int pred = *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); + int d = *(r++) = *s - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 4; } break; case 6: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { int* c = coefs + order - 1; int pred = *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); + int d = *(r++) = *s - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 5; } break; case 7: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { int* c = coefs + order - 1; int pred = @@ -596,12 +598,13 @@ namespace CUETools.Codecs *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); + int d = *(r++) = *s - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 6; } break; case 8: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { int* c = coefs + order - 1; int pred = @@ -609,26 +612,28 @@ namespace CUETools.Codecs *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); + int d = *(r++) = *s - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 7; } break; case 9: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { int* c = coefs + order - 1; int pred = - *(c--) * *(s++) + + *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); + int d = *(r++) = *s - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 8; } break; case 10: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { int* c = coefs + order - 1; int pred = @@ -637,12 +642,13 @@ namespace CUETools.Codecs *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); + int d = *(r++) = *s - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 9; } break; case 11: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { int* c = coefs + order - 1; int pred = @@ -652,12 +658,13 @@ namespace CUETools.Codecs *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); + int d = *(r++) = *s - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 10; } break; case 12: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { int* c = coefs + order - 1; int pred = @@ -667,14 +674,14 @@ namespace CUETools.Codecs *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); + int d = *(r++) = *s - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 11; } break; default: - for (int i = order; i < n; i++) + while (s < seg_end) { - s = smp + i - order; int pred = 0; int* c = coefs + order - 1; int* c11 = coefs + 11; @@ -687,75 +694,99 @@ namespace CUETools.Codecs *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + *(c--) * *(s++) + c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); + int d = *(r++) = *s - (pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); + s -= order - 1; } break; } + return sum; } public static unsafe void - encode_residual_long(int* res, int* smp, int n, int order, - int* coefs, int shift) + encode_residual(int* res, int* smp, int n, int order, + int* coefs, int shift, ulong* sums, int pmax) { for (int i = 0; i < order; i++) res[i] = smp[i]; int* s = smp; + int* s_end = smp + n - order; + int* seg_end = s + (n >> pmax) - order; int* r = res + order; + while (s < s_end) + { + *(sums++) = encode_residual_partition(s, r, seg_end, coefs, shift, order); + r += seg_end - s; + s = seg_end; + seg_end += n >> pmax; + } + } + + private static unsafe ulong + encode_residual_long_partition(int* s, int* r, int* seg_end, int* coefs, int shift, int order) + { + ulong sum = 0ul; int c0 = coefs[0]; int c1 = coefs[1]; switch (order) { case 1: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { long pred = c0 * (long)*(s++); - *(r++) = *s - (int)(pred >> shift); + int d = *(r++) = *s - (int)(pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); } break; case 2: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { long pred = c1 * (long)*(s++); pred += c0 * (long)*(s++); - *(r++) = *(s--) - (int)(pred >> shift); + int d = *(r++) = *(s--) - (int)(pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); } break; case 3: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { long pred = coefs[2] * (long)*(s++); pred += c1 * (long)*(s++); pred += c0 * (long)*(s++); *(r++) = *s - (int)(pred >> shift); + int d = *(r++) = *s - (int)(pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 2; } break; case 4: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { long pred = coefs[3] * (long)*(s++); pred += coefs[2] * (long)*(s++); pred += c1 * (long)*(s++); pred += c0 * (long)*(s++); - *(r++) = *s - (int)(pred >> shift); + int d = *(r++) = *s - (int)(pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 3; } break; case 5: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { long pred = coefs[4] * (long)*(s++); pred += coefs[3] * (long)*(s++); pred += coefs[2] * (long)*(s++); pred += c1 * (long)*(s++); pred += c0 * (long)*(s++); - *(r++) = *s - (int)(pred >> shift); + int d = *(r++) = *s - (int)(pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 4; } break; case 6: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { long pred = coefs[5] * (long)*(s++); pred += coefs[4] * (long)*(s++); @@ -763,12 +794,13 @@ namespace CUETools.Codecs pred += coefs[2] * (long)*(s++); pred += c1 * (long)*(s++); pred += c0 * (long)*(s++); - *(r++) = *s - (int)(pred >> shift); + int d = *(r++) = *s - (int)(pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 5; } break; case 7: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { long pred = coefs[6] * (long)*(s++); pred += coefs[5] * (long)*(s++); @@ -777,12 +809,13 @@ namespace CUETools.Codecs pred += coefs[2] * (long)*(s++); pred += c1 * (long)*(s++); pred += c0 * (long)*(s++); - *(r++) = *s - (int)(pred >> shift); + int d = *(r++) = *s - (int)(pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 6; } break; case 8: - for (int i = n - order; i > 0; i--) + while (s < seg_end) { long pred = coefs[7] * (long)*(s++); pred += coefs[6] * (long)*(s++); @@ -792,14 +825,14 @@ namespace CUETools.Codecs pred += coefs[2] * (long)*(s++); pred += c1 * (long)*(s++); pred += c0 * (long)*(s++); - *(r++) = *s - (int)(pred >> shift); + int d = *(r++) = *s - (int)(pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); s -= 7; } break; default: - for (int i = order; i < n; i++) + while (s < seg_end) { - s = smp + i - order; long pred = 0; int* co = coefs + order - 1; int* c7 = coefs + 7; @@ -813,188 +846,36 @@ namespace CUETools.Codecs pred += coefs[2] * (long)*(s++); pred += c1 * (long)*(s++); pred += c0 * (long)*(s++); - *(r++) = *s - (int)(pred >> shift); + int d = *(r++) = *s - (int)(pred >> shift); + sum += (uint)((d << 1) ^ (d >> 31)); + s -= order - 1; } break; } + return sum; } public static unsafe void - encode_residual2(int* res, int* smp, int n, int order, - int* coefs, int shift) + encode_residual_long(int* res, int* smp, int n, int order, + int* coefs, int shift, ulong* sums, int pmax) { + for (int i = 0; i < order; i++) + res[i] = smp[i]; + int* s = smp; - int* r = res; - int c0 = coefs[0]; - int c1 = coefs[1]; - switch (order) + int* s_end = smp + n - order; + int* seg_end = s + (n >> pmax) - order; + int* r = res + order; + while (s < s_end) { - case 1: - for (int i = n - order; i > 0; i--) - { - int pred = c0 * *(s++); - *(r++) = *s - (pred >> shift); - } - break; - case 2: - for (int i = n - order; i > 0; i--) - { - int pred = c1 * *(s++); - pred += c0 * *(s++); - *(r++) = *(s--) - (pred >> shift); - } - break; - case 3: - for (int i = n - order; i > 0; i--) - { - int pred = coefs[2] * *(s++) + - c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); - s -= 2; - } - break; - case 4: - for (int i = n - order; i > 0; i--) - { - int* c = coefs + order - 1; - int pred = - *(c--) * *(s++) + *(c--) * *(s++) + - c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); - s -= 3; - } - break; - case 5: - for (int i = n - order; i > 0; i--) - { - int* c = coefs + order - 1; - int pred = - *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); - s -= 4; - } - break; - case 6: - for (int i = n - order; i > 0; i--) - { - int* c = coefs + order - 1; - int pred = - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); - s -= 5; - } - break; - case 7: - for (int i = n - order; i > 0; i--) - { - int* c = coefs + order - 1; - int pred = - *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); - s -= 6; - } - break; - case 8: - for (int i = n - order; i > 0; i--) - { - int* c = coefs + order - 1; - int pred = - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); - s -= 7; - } - break; - case 9: - for (int i = n - order; i > 0; i--) - { - int* c = coefs + order - 1; - int pred = - *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); - s -= 8; - } - break; - case 10: - for (int i = n - order; i > 0; i--) - { - int* c = coefs + order - 1; - int pred = - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); - s -= 9; - } - break; - case 11: - for (int i = n - order; i > 0; i--) - { - int* c = coefs + order - 1; - int pred = - *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); - s -= 10; - } - break; - case 12: - for (int i = n - order; i > 0; i--) - { - int* c = coefs + order - 1; - int pred = - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); - s -= 11; - } - break; - default: - for (int i = order; i < n; i++) - { - s = smp + i - order; - int pred = 0; - int* c = coefs + order - 1; - int* c11 = coefs + 11; - while (c > c11) - pred += *(c--) * *(s++); - pred += - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - *(c--) * *(s++) + *(c--) * *(s++) + - c1 * *(s++) + c0 * *(s++); - *(r++) = *s - (pred >> shift); - } - break; + *(sums++) = encode_residual_long_partition(s, r, seg_end, coefs, shift, order); + r += seg_end - s; + s = seg_end; + seg_end += n >> pmax; } } - public static unsafe void + public static unsafe void decode_residual(int* res, int* smp, int n, int order, int* coefs, int shift) {