BinaryObjectScanner/BinaryObjectScanner.Compression/bzip2/blocksort.cs

using static BinaryObjectScanner.Compression.bzip2.Constants;

namespace BinaryObjectScanner.Compression.bzip2
{
    /// <summary>
    /// Block sorting machinery
    /// </summary>
    /// <see href="https://github.com/ladislav-zezula/StormLib/blob/master/src/bzip2/blocksort.c"/>
    internal static unsafe class blocksort
    {
        /// <summary>
        /// Fallback O(N log(N)^2) sorting algorithm, for repetitive blocks.
        /// Reorders the entries of <paramref name="fmap"/> in the inclusive range
        /// [<paramref name="lo"/>, <paramref name="hi"/>] so that the
        /// <paramref name="eclass"/> values they index are in ascending order.
        /// </summary>
        /// <param name="fmap">Index array that is permuted in place</param>
        /// <param name="eclass">Key array, indexed by the values stored in <paramref name="fmap"/></param>
        /// <param name="lo">First position of the range to sort (inclusive)</param>
        /// <param name="hi">Last position of the range to sort (inclusive)</param>
        public static void fallbackSimpleSort(uint* fmap, uint* eclass, int lo, int hi)
        {
            // A one-element range is already in order
            if (lo == hi)
                return;

            // Coarse pass: insertion with a stride of 4, only when the range
            // holds more than four entries
            if (hi - lo > 3)
            {
                for (int start = hi - 4; start >= lo; start--)
                {
                    int moving = (int)fmap[start];
                    uint movingKey = eclass[moving];

                    // Slide every stride-4 neighbour with a smaller key down one slot
                    int probe = start + 4;
                    while (probe <= hi && movingKey > eclass[fmap[probe]])
                    {
                        fmap[probe - 4] = fmap[probe];
                        probe += 4;
                    }

                    fmap[probe - 4] = (uint)moving;
                }
            }

            // Final pass: plain insertion sort with a stride of 1
            for (int start = hi - 1; start >= lo; start--)
            {
                int moving = (int)fmap[start];
                uint movingKey = eclass[moving];

                int probe = start + 1;
                while (probe <= hi && movingKey > eclass[fmap[probe]])
                {
                    fmap[probe - 1] = fmap[probe];
                    probe++;
                }

                fmap[probe - 1] = (uint)moving;
            }
        }

        /// <summary>
        /// Iterative three-way quicksort over fmap[loSt .. hiSt] (inclusive), keyed on
        /// eclass[fmap[i]]. Pending sub-ranges are kept on an explicit stack, and ranges
        /// shorter than FALLBACK_QSORT_SMALL_THRESH are handed to
        /// <see cref="fallbackSimpleSort"/>.
        /// </summary>
        /// <param name="fmap">Index array that is permuted in place</param>
        /// <param name="eclass">Key array, indexed by the values stored in <paramref name="fmap"/></param>
        /// <param name="loSt">First position of the range to sort (inclusive)</param>
        /// <param name="hiSt">Last position of the range to sort (inclusive)</param>
        public static void fallbackQSort3(uint* fmap, uint* eclass, int loSt, int hiSt)
        {
            int[] pendingLo = new int[FALLBACK_QSORT_STACK_SIZE];
            int[] pendingHi = new int[FALLBACK_QSORT_STACK_SIZE];
            int depth = 0;
            int lo = 0, hi = 0;
            uint seed = 0;

            fpush(loSt, hiSt, pendingLo, pendingHi, ref depth);

            while (depth > 0)
            {
                //AssertH(depth < FALLBACK_QSORT_STACK_SIZE - 1, 1004);

                fpop(ref lo, ref hi, pendingLo, pendingHi, ref depth);

                if (hi - lo < FALLBACK_QSORT_SMALL_THRESH)
                {
                    fallbackSimpleSort(fmap, eclass, lo, hi);
                    continue;
                }

                /* Random partitioning.  Median of 3 sometimes fails to
                   avoid bad cases.  Median of 9 seems to help but
                   looks rather expensive.  This too seems to work but
                   is cheaper.  Guidance for the magic constants
                   7621 and 32768 is taken from Sedgewick's algorithms
                   book, chapter 35.
                */
                seed = ((seed * 7621) + 1) % 32768;

                int pivotAt;
                switch (seed % 3)
                {
                    case 0:
                        pivotAt = lo;
                        break;
                    case 1:
                        pivotAt = (lo + hi) >> 1;
                        break;
                    default:
                        pivotAt = hi;
                        break;
                }

                uint pivot = eclass[fmap[pivotAt]];

                // scanLo/scanHi walk towards each other; entries equal to the pivot
                // are parked at the outer ends, tracked by eqLo/eqHi
                int scanLo = lo, eqLo = lo;
                int scanHi = hi, eqHi = hi;

                while (true)
                {
                    // Advance from the left until a key greater than the pivot is found
                    while (scanLo <= scanHi)
                    {
                        int diff = (int)eclass[fmap[scanLo]] - (int)pivot;
                        if (diff > 0)
                            break;

                        if (diff == 0)
                        {
                            fswap(ref fmap[scanLo], ref fmap[eqLo]);
                            eqLo++;
                        }

                        scanLo++;
                    }

                    // Retreat from the right until a key smaller than the pivot is found
                    while (scanLo <= scanHi)
                    {
                        int diff = (int)eclass[fmap[scanHi]] - (int)pivot;
                        if (diff < 0)
                            break;

                        if (diff == 0)
                        {
                            fswap(ref fmap[scanHi], ref fmap[eqHi]);
                            eqHi--;
                        }

                        scanHi--;
                    }

                    if (scanLo > scanHi)
                        break;

                    fswap(ref fmap[scanLo], ref fmap[scanHi]);
                    scanLo++;
                    scanHi--;
                }

                //AssertD(scanHi == scanLo - 1, "fallbackQSort3(2)");

                // Every key equalled the pivot: nothing left to order in this range
                if (eqHi < eqLo)
                    continue;

                // Move the parked pivot-equal entries from the ends into the middle
                int leftCount = fmin(eqLo - lo, scanLo - eqLo);
                fvswap(fmap, lo, scanLo - leftCount, leftCount);

                int rightCount = fmin(hi - eqHi, eqHi - scanHi);
                fvswap(fmap, scanLo, hi - rightCount + 1, rightCount);

                int lessEnd = lo + scanLo - eqLo - 1;
                int greaterStart = hi - (eqHi - scanHi) + 1;

                // Push the larger side first so the smaller one is popped next
                if (lessEnd - lo > hi - greaterStart)
                {
                    fpush(lo, lessEnd, pendingLo, pendingHi, ref depth);
                    fpush(greaterStart, hi, pendingLo, pendingHi, ref depth);
                }
                else
                {
                    fpush(greaterStart, hi, pendingLo, pendingHi, ref depth);
                    fpush(lo, lessEnd, pendingLo, pendingHi, ref depth);
                }
            }
        }

        /*
        Pre:
            nblock > 0
            eclass exists for [0 .. nblock-1]
            ((byte*)eclass) [0 .. nblock-1] holds block
            fmap exists for [0 .. nblock-1]
        Post:
            ((byte*)eclass) [0 .. nblock-1] holds block
            All other areas of eclass destroyed
            fmap [0 .. nblock-1] holds sorted order
            bhtab [ 0 .. 2+(nblock/32) ] destroyed
        */

        /// <summary>
        /// Fallback block sort. Starts from a one-byte bucket sort of the block and then
        /// repeatedly refines the buckets, doubling the compared depth H on every round,
        /// until no unresolved bucket remains or H exceeds <paramref name="nblock"/>.
        /// </summary>
        /// <param name="fmap">Receives the sorted order in [0 .. nblock-1]</param>
        /// <param name="eclass">
        /// On entry its first <paramref name="nblock"/> bytes hold the block; it is used as
        /// per-position class storage while sorting, and the block bytes are rebuilt
        /// in its first <paramref name="nblock"/> bytes before returning
        /// </param>
        /// <param name="bhtab">Scratch bit table for bucket-header flags, accessed through the *_BH helpers</param>
        /// <param name="nblock">Number of bytes in the block</param>
        /// <param name="verb">Verbosity level; only referenced by the commented-out trace output</param>
        public static void fallbackSort(uint* fmap, uint* eclass, uint* bhtab, int nblock, int verb)
        {
            int[] ftab = new int[257];
            int[] ftabCopy = new int[256];
            int H, i, j, k, l, r, cc, cc1;
            int nNotDone;
            int nBhtab;
            // Byte view of eclass: this is where the block itself lives on entry
            byte* eclass8 = (byte*)eclass;

            /*--
               Initial 1-char radix sort to generate
               initial fmap and initial BH bits.
            --*/
            // if (verb >= 4)
            //     VPrintf0("        bucket sorting ...\n");
            for (i = 0; i < 257; i++)
            {
                ftab[i] = 0;
            }

            // Count how often each byte value occurs
            for (i = 0; i < nblock; i++)
            {
                ftab[eclass8[i]]++;
            }

            // Keep the raw counts: they are needed at the end to rebuild the block
            for (i = 0; i < 256; i++)
            {
                ftabCopy[i] = ftab[i];
            }

            // Turn the counts into running totals (end position of each bucket)
            for (i = 1; i < 257; i++)
            {
                ftab[i] += ftab[i - 1];
            }

            // Drop every position into its bucket, filling each bucket from its end
            for (i = 0; i < nblock; i++)
            {
                j = eclass8[i];
                k = ftab[j] - 1;
                ftab[j] = k;
                fmap[k] = (uint)i;
            }

            nBhtab = 2 + (nblock / 32);
            for (i = 0; i < nBhtab; i++)
            {
                bhtab[i] = 0;
            }

            // Flag the first position of every 1-byte bucket
            for (i = 0; i < 256; i++)
            {
                SET_BH(ftab[i], bhtab);
            }

            /*--
               Inductively refine the buckets.  Kind-of an
               "exponential radix sort" (!), inspired by the
               Manber-Myers suffix array construction algorithm.
            --*/

            /*-- set sentinel bits for block-end detection --*/
            // Alternating set/clear bits past nblock stop the bit scans below
            for (i = 0; i < 32; i++)
            {
                SET_BH(nblock + 2 * i, bhtab);
                CLEAR_BH(nblock + 2 * i + 1, bhtab);
            }

            /*-- the log(N) loop --*/
            H = 1;
            while (true)
            {
                // if (verb >= 4)
                //     VPrintf1("        depth %6d has ", H);

                // For each sorted position, record the start index of its bucket (j)
                // as the class of the position H places earlier, wrapping around
                // the start of the block
                j = 0;
                for (i = 0; i < nblock; i++)
                {
                    if (ISSET_BH(i, bhtab))
                        j = i;

                    k = (int)(fmap[i] - H);
                    if (k < 0)
                        k += nblock;

                    eclass[k] = (uint)j;
                }

                nNotDone = 0;
                r = -1;
                while (true)
                {

                    /*-- find the next non-singleton bucket --*/
                    // Skip over set bits: bit by bit up to a word boundary, then a
                    // whole word at a time, then bit by bit again
                    k = r + 1;
                    while (ISSET_BH(k, bhtab) && UNALIGNED_BH(k) != 0)
                    {
                        k++;
                    }

                    if (ISSET_BH(k, bhtab))
                    {
                        while (WORD_BH(k, bhtab) == 0xffffffff)
                        {
                            k += 32;
                        }

                        while (ISSET_BH(k, bhtab))
                        {
                            k++;
                        }
                    }

                    l = k - 1;
                    if (l >= nblock)
                        break;

                    // Now skip over clear bits the same way to find the bucket's end
                    while (!ISSET_BH(k, bhtab) && UNALIGNED_BH(k) != 0)
                    {
                        k++;
                    }

                    if (!ISSET_BH(k, bhtab))
                    {
                        while (WORD_BH(k, bhtab) == 0x00000000)
                        {
                            k += 32;
                        }

                        while (!ISSET_BH(k, bhtab))
                        {
                            k++;
                        }
                    }

                    r = k - 1;
                    if (r >= nblock)
                        break;

                    /*-- now [l, r] bracket current bucket --*/
                    if (r > l)
                    {
                        nNotDone += (r - l + 1);
                        fallbackQSort3(fmap, eclass, l, r);

                        /*-- scan bucket and generate header bits-- */
                        // A change of class inside the sorted bucket starts a new bucket
                        cc = -1;
                        for (i = l; i <= r; i++)
                        {
                            cc1 = (int)eclass[fmap[i]];
                            if (cc != cc1)
                            {
                                SET_BH(i, bhtab);
                                cc = cc1;
                            };
                        }
                    }
                }

                // if (verb >= 4)
                //     VPrintf1("%6d unresolved strings\n", nNotDone);

                H *= 2;
                if (H > nblock || nNotDone == 0)
                    break;
            }

            /*--
               Reconstruct the original block in
               eclass8 [0 .. nblock-1], since the
               previous phase destroyed it.
            --*/
            // if (verb >= 4)
            //     VPrintf0("        reconstructing block ...\n");

            // Walk the sorted order: its first ftabCopy[0] entries are byte value 0,
            // the next ftabCopy[1] are byte value 1, and so on
            j = 0;
            for (i = 0; i < nblock; i++)
            {
                while (ftabCopy[j] == 0)
                {
                    j++;
                }

                ftabCopy[j]--;
                eclass8[fmap[i]] = (byte)j;
            }

            //AssertH(j < 256, 1005);
        }

        /// <summary>
        /// Comparison primitive of the main sorting algorithm: decides whether the
        /// rotation of the block starting at <paramref name="i1"/> sorts strictly after
        /// the rotation starting at <paramref name="i2"/>.
        /// </summary>
        /// <remarks>
        /// The first 12 positions are compared on <paramref name="block"/> bytes only.
        /// After that, positions are compared in groups of 8 on both the byte and the
        /// <paramref name="quadrant"/> value. Indices are wrapped back by
        /// <paramref name="nblock"/> only once per group, so both arrays must be readable
        /// some way past index nblock - 1; the caller is expected to provide that
        /// overshoot area.
        /// </remarks>
        /// <param name="i1">Start index of the first rotation</param>
        /// <param name="i2">Start index of the second rotation</param>
        /// <param name="block">Block bytes, including the overshoot area</param>
        /// <param name="quadrant">Per-position ordering cache, including the overshoot area</param>
        /// <param name="nblock">Number of bytes in the block</param>
        /// <param name="budget">Work counter, decremented once per completed group of 8 comparisons</param>
        /// <returns>
        /// true when the first differing byte or quadrant value is greater on the
        /// <paramref name="i1"/> side; false when it is smaller, or when no difference
        /// is found after scanning more than <paramref name="nblock"/> positions
        /// </returns>
        public static bool mainGtU(uint i1, uint i2, byte* block, ushort* quadrant, uint nblock, int* budget)
        {
            // Must be signed: the loop below counts k down past zero and stops once it
            // goes negative. With an unsigned counter "k >= 0" is always true, so two
            // identical rotations would never leave the loop.
            int k;
            byte c1, c2;
            ushort s1, s2;

            //AssertD(i1 != i2, "mainGtU");
            /* 1 */
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            i1++; i2++;
            /* 2 */
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            i1++; i2++;
            /* 3 */
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            i1++; i2++;
            /* 4 */
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            i1++; i2++;
            /* 5 */
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            i1++; i2++;
            /* 6 */
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            i1++; i2++;
            /* 7 */
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            i1++; i2++;
            /* 8 */
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            i1++; i2++;
            /* 9 */
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            i1++; i2++;
            /* 10 */
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            i1++; i2++;
            /* 11 */
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            i1++; i2++;
            /* 12 */
            c1 = block[i1]; c2 = block[i2];
            if (c1 != c2) return (c1 > c2);
            i1++; i2++;

            // Enough groups of 8 to cover the whole block once more before giving up
            k = (int)nblock + 8;

            do
            {
                /* 1 */
                c1 = block[i1]; c2 = block[i2];
                if (c1 != c2) return (c1 > c2);
                s1 = quadrant[i1]; s2 = quadrant[i2];
                if (s1 != s2) return (s1 > s2);
                i1++; i2++;
                /* 2 */
                c1 = block[i1]; c2 = block[i2];
                if (c1 != c2) return (c1 > c2);
                s1 = quadrant[i1]; s2 = quadrant[i2];
                if (s1 != s2) return (s1 > s2);
                i1++; i2++;
                /* 3 */
                c1 = block[i1]; c2 = block[i2];
                if (c1 != c2) return (c1 > c2);
                s1 = quadrant[i1]; s2 = quadrant[i2];
                if (s1 != s2) return (s1 > s2);
                i1++; i2++;
                /* 4 */
                c1 = block[i1]; c2 = block[i2];
                if (c1 != c2) return (c1 > c2);
                s1 = quadrant[i1]; s2 = quadrant[i2];
                if (s1 != s2) return (s1 > s2);
                i1++; i2++;
                /* 5 */
                c1 = block[i1]; c2 = block[i2];
                if (c1 != c2) return (c1 > c2);
                s1 = quadrant[i1]; s2 = quadrant[i2];
                if (s1 != s2) return (s1 > s2);
                i1++; i2++;
                /* 6 */
                c1 = block[i1]; c2 = block[i2];
                if (c1 != c2) return (c1 > c2);
                s1 = quadrant[i1]; s2 = quadrant[i2];
                if (s1 != s2) return (s1 > s2);
                i1++; i2++;
                /* 7 */
                c1 = block[i1]; c2 = block[i2];
                if (c1 != c2) return (c1 > c2);
                s1 = quadrant[i1]; s2 = quadrant[i2];
                if (s1 != s2) return (s1 > s2);
                i1++; i2++;
                /* 8 */
                c1 = block[i1]; c2 = block[i2];
                if (c1 != c2) return (c1 > c2);
                s1 = quadrant[i1]; s2 = quadrant[i2];
                if (s1 != s2) return (s1 > s2);
                i1++; i2++;

                // Wrap once per group rather than once per comparison
                if (i1 >= nblock) i1 -= nblock;
                if (i2 >= nblock) i2 -= nblock;

                k -= 8;
                (*budget)--;
            }
            while (k >= 0);

            // No difference found: the rotations are treated as equal
            return false;
        }

        /// <summary>
        /// Shell sort of ptr[lo .. hi] (inclusive) using the gap sequence in incs and
        /// <see cref="mainGtU"/> as the comparison, with every compared index offset
        /// by <paramref name="d"/>.
        /// </summary>
        /// <param name="ptr">Index array that is permuted in place</param>
        /// <param name="block">Block bytes, forwarded to <see cref="mainGtU"/></param>
        /// <param name="quadrant">Ordering cache, forwarded to <see cref="mainGtU"/></param>
        /// <param name="nblock">Number of bytes in the block</param>
        /// <param name="lo">First position of the range to sort (inclusive)</param>
        /// <param name="hi">Last position of the range to sort (inclusive)</param>
        /// <param name="d">Offset added to every index before it is compared</param>
        /// <param name="budget">
        /// Work counter consumed by <see cref="mainGtU"/>; sorting is abandoned as soon
        /// as it is seen to be negative
        /// </param>
        public static void mainSimpleSort(uint* ptr, byte* block, ushort* quadrant, int nblock, int lo, int hi, int d, int* budget)
        {
            int count = hi - lo + 1;
            if (count < 2)
                return;

            // Pick the largest gap that is still smaller than the range
            int gapIndex = 0;
            while (incs[gapIndex] < count)
            {
                gapIndex++;
            }

            gapIndex--;

            while (gapIndex >= 0)
            {
                int gap = incs[gapIndex];
                int cursor = lo + gap;
                bool exhausted = false;

                while (true)
                {
                    // Insert up to three entries between budget checks
                    for (int copy = 0; copy < 3; copy++)
                    {
                        if (cursor > hi)
                        {
                            exhausted = true;
                            break;
                        }

                        uint held = ptr[cursor];
                        int slot = cursor;
                        while (mainGtU((uint)(ptr[slot - gap] + d), (uint)(held + d), block, quadrant, (uint)nblock, budget))
                        {
                            ptr[slot] = ptr[slot - gap];
                            slot -= gap;
                            if (slot <= (lo + gap - 1))
                                break;
                        }

                        ptr[slot] = held;
                        cursor++;
                    }

                    // Running off the end of the range moves on to the next gap
                    // without looking at the budget
                    if (exhausted)
                        break;

                    if (*budget < 0)
                        return;
                }

                gapIndex--;
            }
        }

        /*--
            The following is an implementation of
            an elegant 3-way quicksort for strings,
            described in a paper "Fast Algorithms for
            Sorting and Searching Strings", by Robert
            Sedgewick and Jon L. Bentley.
        --*/
        /// <summary>
        /// Returns the median of three byte values.
        /// </summary>
        /// <param name="a">First value</param>
        /// <param name="b">Second value</param>
        /// <param name="c">Third value</param>
        /// <returns>The middle value of <paramref name="a"/>, <paramref name="b"/> and <paramref name="c"/></returns>
        public static byte mmed3(byte a, byte b, byte c)
        {
            byte smaller = a > b ? b : a;
            byte larger = a > b ? a : b;

            // c is at or above both: the larger of the pair is the middle value
            if (larger <= c)
                return larger;

            // c is below the larger one: the middle is whichever of c and the smaller is greater
            return smaller > c ? smaller : c;
        }

        /// <summary>
        /// Iterative three-way quicksort over ptr[loSt .. hiSt] (inclusive) that
        /// partitions on the block byte found <c>d</c> positions after each entry,
        /// starting at depth <paramref name="dSt"/>. Ranges that are shorter than
        /// MAIN_QSORT_SMALL_THRESH, or deeper than MAIN_QSORT_DEPTH_THRESH, are handed
        /// to <see cref="mainSimpleSort"/>.
        /// </summary>
        /// <param name="ptr">Index array that is permuted in place</param>
        /// <param name="block">Block bytes</param>
        /// <param name="quadrant">Ordering cache, forwarded to <see cref="mainSimpleSort"/></param>
        /// <param name="nblock">Number of bytes in the block</param>
        /// <param name="loSt">First position of the range to sort (inclusive)</param>
        /// <param name="hiSt">Last position of the range to sort (inclusive)</param>
        /// <param name="dSt">Initial comparison depth</param>
        /// <param name="budget">Work counter; the sort returns early once it is seen to be negative</param>
        public static void mainQSort3(uint* ptr, byte* block, ushort* quadrant, int nblock, int loSt, int hiSt, int dSt, int* budget)
        {
            // Explicit stack of pending (lo, hi, depth) ranges
            int[] pendingLo = new int[MAIN_QSORT_STACK_SIZE];
            int[] pendingHi = new int[MAIN_QSORT_STACK_SIZE];
            int[] pendingD = new int[MAIN_QSORT_STACK_SIZE];

            // The three sub-ranges produced by one partitioning step
            int[] partLo = new int[3];
            int[] partHi = new int[3];
            int[] partD = new int[3];

            int depth = 0;
            int lo = 0, hi = 0, d = 0;

            mpush(loSt, hiSt, dSt, pendingLo, pendingHi, pendingD, ref depth);

            while (depth > 0)
            {
                //AssertH(depth < MAIN_QSORT_STACK_SIZE - 2, 1001);

                mpop(ref lo, ref hi, ref d, pendingLo, pendingHi, pendingD, ref depth);

                bool smallRange = hi - lo < MAIN_QSORT_SMALL_THRESH;
                if (smallRange || d > MAIN_QSORT_DEPTH_THRESH)
                {
                    mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
                    if (*budget < 0)
                        return;

                    continue;
                }

                int pivot = mmed3(block[ptr[lo] + d], block[ptr[hi] + d], block[ptr[(lo + hi) >> 1] + d]);

                // scanLo/scanHi walk towards each other; entries equal to the pivot
                // are parked at the outer ends, tracked by eqLo/eqHi
                int scanLo = lo, eqLo = lo;
                int scanHi = hi, eqHi = hi;

                while (true)
                {
                    // Advance from the left until a byte greater than the pivot is found
                    while (scanLo <= scanHi)
                    {
                        int diff = (block[ptr[scanLo] + d]) - pivot;
                        if (diff > 0)
                            break;

                        if (diff == 0)
                        {
                            mswap(ref ptr[scanLo], ref ptr[eqLo]);
                            eqLo++;
                        }

                        scanLo++;
                    }

                    // Retreat from the right until a byte smaller than the pivot is found
                    while (scanLo <= scanHi)
                    {
                        int diff = (block[ptr[scanHi] + d]) - pivot;
                        if (diff < 0)
                            break;

                        if (diff == 0)
                        {
                            mswap(ref ptr[scanHi], ref ptr[eqHi]);
                            eqHi--;
                        }

                        scanHi--;
                    }

                    if (scanLo > scanHi)
                        break;

                    mswap(ref ptr[scanLo], ref ptr[scanHi]);
                    scanLo++;
                    scanHi--;
                }

                //AssertD(scanHi == scanLo - 1, "mainQSort3(2)");

                // Every entry matched the pivot at this depth: retry one byte deeper
                if (eqHi < eqLo)
                {
                    mpush(lo, hi, d + 1, pendingLo, pendingHi, pendingD, ref depth);
                    continue;
                }

                // Move the parked pivot-equal entries from the ends into the middle
                int leftCount = mmin(eqLo - lo, scanLo - eqLo);
                mvswap(ptr, lo, scanLo - leftCount, leftCount);

                int rightCount = mmin(hi - eqHi, eqHi - scanHi);
                mvswap(ptr, scanLo, hi - rightCount + 1, rightCount);

                int lessEnd = lo + scanLo - eqLo - 1;
                int greaterStart = hi - (eqHi - scanHi) + 1;

                // [0] = less than pivot, [1] = greater than pivot, [2] = equal (one byte deeper)
                partLo[0] = lo;
                partHi[0] = lessEnd;
                partD[0] = d;

                partLo[1] = greaterStart;
                partHi[1] = hi;
                partD[1] = d;

                partLo[2] = lessEnd + 1;
                partHi[2] = greaterStart - 1;
                partD[2] = d + 1;

                // Three compare-and-swap steps order the parts largest first
                if (mnextsize(0, partLo, partHi) < mnextsize(1, partLo, partHi))
                    mnextswap(0, 1, partLo, partHi, partD);

                if (mnextsize(1, partLo, partHi) < mnextsize(2, partLo, partHi))
                    mnextswap(1, 2, partLo, partHi, partD);

                if (mnextsize(0, partLo, partHi) < mnextsize(1, partLo, partHi))
                    mnextswap(0, 1, partLo, partHi, partD);

                //AssertD(mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
                //AssertD(mnextsize(1) >= mnextsize(2), "mainQSort3(9)");

                for (int part = 0; part < 3; part++)
                {
                    mpush(partLo[part], partHi[part], partD[part], pendingLo, pendingHi, pendingD, ref depth);
                }
            }
        }

        /*
        Pre:
            nblock > N_OVERSHOOT
            block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
            ((byte*)block32) [0 .. nblock-1] holds block
            ptr exists for [0 .. nblock-1]
        Post:
            ((byte*)block32) [0 .. nblock-1] holds block
            All other areas of block32 destroyed
            ftab [0 .. 65536 ] destroyed
            ptr [0 .. nblock-1] holds sorted order
            if (*budget < 0), sorting was abandoned
        */

        /// <summary>
        /// Main block sort. Buckets every position by its leading two bytes, then
        /// works through the 256 "big" (first-byte) buckets from least to most
        /// populated, quicksorting the small buckets that are still unsorted and
        /// deriving the order of others by scanning buckets that are already complete.
        /// </summary>
        /// <param name="ptr">Receives the sorted order in [0 .. nblock-1]</param>
        /// <param name="block">
        /// Block bytes; the first BZ_N_OVERSHOOT bytes are copied to the area starting
        /// at index nblock, so that area must be writable
        /// </param>
        /// <param name="quadrant">
        /// Ordering cache consulted by the comparison routine; cleared here for
        /// [0 .. nblock-1 + BZ_N_OVERSHOOT] and updated as big buckets complete
        /// </param>
        /// <param name="ftab">Two-byte frequency table; indices 0 .. 65536 are written</param>
        /// <param name="nblock">Number of bytes in the block</param>
        /// <param name="verb">Verbosity level; only referenced by the commented-out trace output</param>
        /// <param name="budget">
        /// Work counter passed down to mainQSort3; this method returns immediately
        /// when it is negative after a quicksort call
        /// </param>
        public static void mainSort(uint* ptr, byte* block, ushort* quadrant, uint* ftab, int nblock, int verb, int* budget)
        {
            int i, j, k, ss, sb;
            int[] runningOrder = new int[256];
            bool[] bigDone = new bool[256];
            int[] copyStart = new int[256];
            int[] copyEnd = new int[256];
            byte c1;
            // Only reported by the commented-out trace output at the end
            int numQSorted;
            ushort s;

            // if (verb >= 4) VPrintf0("        main sort initialise ...\n");

            /*-- set up the 2-byte frequency table --*/
            for (i = 65536; i >= 0; i--)
            {
                ftab[i] = 0;
            }

            // Walk the block backwards, four positions per iteration, keeping the
            // current byte in the high half of j and the following byte in the low half
            j = block[0] << 8;
            i = nblock - 1;
            for (; i >= 3; i -= 4)
            {
                quadrant[i] = 0;
                j = (j >> 8) | ((block[i]) << 8);
                ftab[j]++;

                quadrant[i - 1] = 0;
                j = (j >> 8) | ((block[i - 1]) << 8);
                ftab[j]++;

                quadrant[i - 2] = 0;
                j = (j >> 8) | ((block[i - 2]) << 8);
                ftab[j]++;

                quadrant[i - 3] = 0;
                j = (j >> 8) | ((block[i - 3]) << 8);
                ftab[j]++;
            }

            // Remaining 0..3 positions that did not fill a group of four
            for (; i >= 0; i--)
            {
                quadrant[i] = 0;
                j = (j >> 8) | ((block[i]) << 8);
                ftab[j]++;
            }

            /*-- (emphasises close relationship of block & quadrant) --*/
            for (i = 0; i < BZ_N_OVERSHOOT; i++)
            {
                block[nblock + i] = block[i];
                quadrant[nblock + i] = 0;
            }

            // if (verb >= 4) VPrintf0("        bucket sorting ...\n");

            /*-- Complete the initial radix sort --*/
            for (i = 1; i <= 65536; i++) ftab[i] += ftab[i - 1];

            // Second backwards walk: drop each position into its 2-byte bucket,
            // filling every bucket from its end
            s = (ushort)(block[0] << 8);
            i = nblock - 1;
            for (; i >= 3; i -= 4)
            {
                s = (ushort)((s >> 8) | (block[i] << 8));
                j = (int)(ftab[s] - 1);
                ftab[s] = (uint)j;
                ptr[j] = (uint)i;

                s = (ushort)((s >> 8) | (block[i - 1] << 8));
                j = (int)(ftab[s] - 1);
                ftab[s] = (uint)j;
                ptr[j] = (uint)(i - 1);

                s = (ushort)((s >> 8) | (block[i - 2] << 8));
                j = (int)(ftab[s] - 1);
                ftab[s] = (uint)j;
                ptr[j] = (uint)(i - 2);

                s = (ushort)((s >> 8) | (block[i - 3] << 8));
                j = (int)(ftab[s] - 1);
                ftab[s] = (uint)j;
                ptr[j] = (uint)(i - 3);
            }

            for (; i >= 0; i--)
            {
                s = (ushort)((s >> 8) | (block[i] << 8));
                j = (int)(ftab[s] - 1);
                ftab[s] = (uint)j;
                ptr[j] = (uint)i;
            }

            /*--
               Now ftab contains the first loc of every small bucket.
               Calculate the running order, from smallest to largest
               big bucket.
            --*/
            for (i = 0; i <= 255; i++)
            {
                bigDone[i] = false;
                runningOrder[i] = i;
            }

            // Shell sort of runningOrder by BIGFREQ, with gaps h = ..., 121, 40, 13, 4, 1
            {
                int vv;
                int h = 1;
                do
                {
                    h = 3 * h + 1;
                }
                while (h <= 256);

                do
                {
                    h = h / 3;
                    for (i = h; i <= 255; i++)
                    {
                        vv = runningOrder[i];
                        j = i;
                        while (BIGFREQ(runningOrder[j - h], ftab) > BIGFREQ(vv, ftab))
                        {
                            runningOrder[j] = runningOrder[j - h];
                            j = j - h;
                            if (j <= (h - 1))
                                goto zero;
                        }

                    zero:
                        runningOrder[j] = vv;
                    }
                } while (h != 1);
            }

            /*--
               The main sorting loop.
            --*/

            numQSorted = 0;

            for (i = 0; i <= 255; i++)
            {

                /*--
                   Process big buckets, starting with the least full.
                   Basically this is a 3-step process in which we call
                   mainQSort3 to sort the small buckets [ss, j], but
                   also make a big effort to avoid the calls if we can.
                --*/
                ss = runningOrder[i];

                /*--
                   Step 1:
                   Complete the big bucket [ss] by quicksorting
                   any unsorted small buckets [ss, j], for j != ss.
                   Hopefully previous pointer-scanning phases have already
                   completed many of the small buckets [ss, j], so
                   we don't have to sort them at all.
                --*/
                for (j = 0; j <= 255; j++)
                {
                    if (j != ss)
                    {
                        sb = (ss << 8) + j;
                        // SETMASK in an ftab entry marks that small bucket as already sorted
                        if ((ftab[sb] & SETMASK) == 0)
                        {
                            int lo = (int)(ftab[sb] & CLEARMASK);
                            int hi = (int)((ftab[sb + 1] & CLEARMASK) - 1);
                            if (hi > lo)
                            {
                                // if (verb >= 4)
                                //     VPrintf4("        qsort [0x%x, 0x%x]   "

                                //                "done %d   this %d\n",
                                //                ss, j, numQSorted, hi - lo + 1);

                                mainQSort3(
                                   ptr, block, quadrant, nblock,
                                   lo, hi, BZ_N_RADIX, budget
                                );
                                numQSorted += (hi - lo + 1);
                                // Budget exhausted: abandon the sort
                                if (*budget < 0) return;
                            }
                        }

                        ftab[sb] |= SETMASK;
                    }
                }

                //AssertH(!bigDone[ss], 1006);

                /*--
                   Step 2:
                   Now scan this big bucket [ss] so as to synthesise the
                   sorted order for small buckets [t, ss] for all t,
                   including, magically, the bucket [ss,ss] too.
                   This will avoid doing Real Work in subsequent Step 1's.
                --*/
                {
                    for (j = 0; j <= 255; j++)
                    {
                        copyStart[j] = (int)(ftab[(j << 8) + ss] & CLEARMASK);
                        copyEnd[j] = (int)((ftab[(j << 8) + ss + 1] & CLEARMASK) - 1);
                    }

                    // Forward scan: the position before each entry (wrapping around the
                    // start of the block) is appended to the front of its [c1, ss] bucket
                    for (j = (int)(ftab[ss << 8] & CLEARMASK); j < copyStart[ss]; j++)
                    {
                        k = (int)(ptr[j] - 1);
                        if (k < 0)
                            k += nblock;

                        c1 = block[k];
                        if (!bigDone[c1])
                            ptr[copyStart[c1]++] = (uint)k;
                    }

                    // Backward scan: the same, filling each bucket from its end
                    for (j = (int)((ftab[(ss + 1) << 8] & CLEARMASK) - 1); j > copyEnd[ss]; j--)
                    {
                        k = (int)(ptr[j] - 1);
                        if (k < 0)
                            k += nblock;

                        c1 = block[k];
                        if (!bigDone[c1])
                            ptr[copyEnd[c1]--] = (uint)k;
                    }
                }

                // AssertH((copyStart[ss] - 1 == copyEnd[ss])
                //           ||
                //           /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
                //              Necessity for this case is demonstrated by compressing
                //              a sequence of approximately 48.5 million of character
                //              251; 1.0.0/1.0.1 will then die here. */
                //           (copyStart[ss] == 0 && copyEnd[ss] == nblock - 1),
                //           1007)


                // Every small bucket [j, ss] is now in order
                for (j = 0; j <= 255; j++)
                {
                    ftab[(j << 8) + ss] |= SETMASK;
                }

                /*--
                   Step 3:
                   The [ss] big bucket is now done.  Record this fact,
                   and update the quadrant descriptors.  Remember to
                   update quadrants in the overshoot area too, if
                   necessary.  The "if (i < 255)" test merely skips
                   this updating for the last bucket processed, since
                   updating for the last bucket is pointless.
                   The quadrant array provides a way to incrementally
                   cache sort orderings, as they appear, so as to
                   make subsequent comparisons in mainGtU() complete
                   faster.  For repetitive blocks this makes a big
                   difference (but not big enough to be able to avoid
                   the fallback sorting mechanism, exponential radix sort).
                   The precise meaning is: at all times:
                      for 0 <= i < nblock and 0 <= j <= nblock
                      if block[i] != block[j],
                         then the relative values of quadrant[i] and
                              quadrant[j] are meaningless.
                         else {
                            if quadrant[i] < quadrant[j]
                               then the string starting at i lexicographically
                               precedes the string starting at j
                            else if quadrant[i] > quadrant[j]
                               then the string starting at j lexicographically
                               precedes the string starting at i
                            else
                               the relative ordering of the strings starting
                               at i and j has not yet been determined.
                         }
                --*/
                bigDone[ss] = true;

                if (i < 255)
                {
                    int bbStart = (int)(ftab[ss << 8] & CLEARMASK);
                    int bbSize = (int)((ftab[(ss + 1) << 8] & CLEARMASK) - bbStart);
                    int shifts = 0;

                    // Scale ranks down until the largest one fits in a ushort
                    while ((bbSize >> shifts) > 65534) shifts++;

                    for (j = bbSize - 1; j >= 0; j--)
                    {
                        int a2update = (int)ptr[bbStart + j];
                        ushort qVal = (ushort)(j >> shifts);
                        quadrant[a2update] = qVal;
                        // Mirror into the overshoot area, matching the copy made for block
                        if (a2update < BZ_N_OVERSHOOT)
                            quadrant[a2update + nblock] = qVal;
                    }

                    // AssertH(((bbSize - 1) >> shifts) <= 65535, 1002);
                }

            }

            // if (verb >= 4)
            //     VPrintf3("        %d pointers, %d sorted, %d scanned\n",
            //                nblock, numQSorted, nblock - numQSorted);
        }

        /*
        Pre:
            nblock > 0
            arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
            ((byte*)arr2)  [0 .. nblock-1] holds block
            arr1 exists for [0 .. nblock-1]
        Post:
            ((byte*)arr2) [0 .. nblock-1] holds block
            All other areas of block destroyed
            ftab [ 0 .. 65536 ] destroyed
            arr1 [0 .. nblock-1] holds sorted order
        */

        /// <summary>
        /// Sorts the block described by <paramref name="s"/> and records in
        /// <c>s.origPtr</c> the index at which the sorted pointer array holds zero.
        /// </summary>
        /// <remarks>
        /// Blocks shorter than 10000 entries go straight to <c>fallbackSort</c>.
        /// Longer blocks are first handed to <c>mainSort</c> together with a work
        /// budget derived from the work factor; if that budget comes back negative
        /// the block is sorted with <c>fallbackSort</c> instead.
        /// </remarks>
        /// <param name="s">
        /// Compressor state supplying the buffers, block length, verbosity and
        /// work factor. Its <c>origPtr</c> member is overwritten.
        /// </param>
        public static void BZ2_blockSort(EState s)
        {
            uint* ptr = s.ptr;
            byte* block = s.block;
            uint* ftab = s.ftab;
            int nblock = s.nblock;
            int verb = s.verbosity;
            int wfact = s.workFactor;

            // Short blocks always use the fallback sorter; long blocks only
            // use it when the main sorter reports an exhausted budget.
            bool needFallback = nblock < 10000;

            if (!needFallback)
            {
                // The quadrant array is placed in the same buffer as block,
                // after the block data and its overshoot area, with the byte
                // offset rounded up to an even value so that the ushort
                // accesses are 2-byte aligned (assuming block itself is).
                int quadrantOffset = nblock + BZ_N_OVERSHOOT;
                quadrantOffset += quadrantOffset & 1;
                ushort* quadrant = (ushort*)(block + quadrantOffset);

                // Clamp the work factor to 1..100. (wfact - 1) / 3 scales it
                // into a per-entry allowance for the main sorter.
                if (wfact < 1)
                    wfact = 1;
                else if (wfact > 100)
                    wfact = 100;

                int budget = nblock * ((wfact - 1) / 3);
                mainSort(ptr, block, quadrant, ftab, nblock, verb, &budget);

                // A negative budget is the "too repetitive" signal.
                needFallback = budget < 0;
            }

            if (needFallback)
                fallbackSort(s.arr1, s.arr2, ftab, nblock, verb);

            // Locate the first position whose sorted pointer is 0.
            s.origPtr = -1;
            for (int pos = 0; pos < s.nblock; pos++)
            {
                if (ptr[pos] != 0)
                    continue;

                s.origPtr = pos;
                break;
            }

            // The reference C implementation asserts here (code 1003) that
            // origPtr != -1; this port does not perform that check.
        }

        #region Macros

        /// <summary>
        /// Exchanges the contents of two <see cref="int"/> variables.
        /// </summary>
        /// <param name="zz1">Receives the value originally held by <paramref name="zz2"/>.</param>
        /// <param name="zz2">Receives the value originally held by <paramref name="zz1"/>.</param>
        private static void fswap(ref int zz1, ref int zz2)
        {
            int second = zz2;
            zz2 = zz1;
            zz1 = second;
        }

        /// <summary>
        /// Exchanges the contents of two <see cref="uint"/> variables.
        /// </summary>
        /// <param name="zz1">Receives the value originally held by <paramref name="zz2"/>.</param>
        /// <param name="zz2">Receives the value originally held by <paramref name="zz1"/>.</param>
        private static void fswap(ref uint zz1, ref uint zz2)
        {
            uint second = zz2;
            zz2 = zz1;
            zz1 = second;
        }

        /// <summary>
        /// Swaps two runs of <paramref name="zzn"/> consecutive entries of
        /// <paramref name="fmap"/>, element by element in ascending order.
        /// </summary>
        /// <param name="fmap">Array whose entries are exchanged.</param>
        /// <param name="zzp1">Index of the first entry of the first run.</param>
        /// <param name="zzp2">Index of the first entry of the second run.</param>
        /// <param name="zzn">Number of entries to swap; nothing happens when it is not positive.</param>
        private static void fvswap(uint* fmap, int zzp1, int zzp2, int zzn)
        {
            for (int offset = 0; offset < zzn; offset++)
            {
                uint held = fmap[zzp1 + offset];
                fmap[zzp1 + offset] = fmap[zzp2 + offset];
                fmap[zzp2 + offset] = held;
            }
        }

        /// <summary>
        /// Returns the smaller of two integers.
        /// </summary>
        private static int fmin(int a, int b)
        {
            if (b <= a)
                return b;

            return a;
        }

        /// <summary>
        /// Pushes a (lo, hi) pair onto the pair of parallel stack arrays.
        /// </summary>
        /// <param name="lz">Value stored in <paramref name="stackLo"/>.</param>
        /// <param name="hz">Value stored in <paramref name="stackHi"/>.</param>
        /// <param name="stackLo">Stack storage for the low bounds.</param>
        /// <param name="stackHi">Stack storage for the high bounds.</param>
        /// <param name="sp">Index of the next free slot; incremented after both stores succeed.</param>
        private static void fpush(int lz, int hz, int[] stackLo, int[] stackHi, ref int sp)
        {
            int slot = sp;
            stackLo[slot] = lz;
            stackHi[slot] = hz;
            sp = slot + 1;
        }

        /// <summary>
        /// Pops the most recently pushed (lo, hi) pair from the parallel stack arrays.
        /// </summary>
        /// <param name="lz">Receives the popped low bound.</param>
        /// <param name="hz">Receives the popped high bound.</param>
        /// <param name="stackLo">Stack storage for the low bounds.</param>
        /// <param name="stackHi">Stack storage for the high bounds.</param>
        /// <param name="sp">Index of the next free slot; decremented before the entries are read.</param>
        private static void fpop(ref int lz, ref int hz, int[] stackLo, int[] stackHi, ref int sp)
        {
            int top = sp - 1;
            sp = top;
            lz = stackLo[top];
            hz = stackHi[top];
        }

        /// <summary>
        /// Sets bit <paramref name="zz"/> in the bit table <paramref name="bhtab"/>.
        /// </summary>
        /// <param name="zz">Bit number; bits are packed 32 per word, so the word is <c>zz &gt;&gt; 5</c> and the bit within it is <c>zz &amp; 31</c>.</param>
        /// <param name="bhtab">Bit table, one <see cref="uint"/> per 32 bits.</param>
        private static void SET_BH(int zz, uint* bhtab)
        {
            // Build the mask as an unsigned value. Shifting a signed 1 into
            // bit 31 yields a negative int, and converting that to uint throws
            // OverflowException when the assembly is compiled with overflow
            // checking enabled.
            bhtab[zz >> 5] |= 1u << (zz & 31);
        }

        /// <summary>
        /// Clears bit <paramref name="zz"/> in the bit table <paramref name="bhtab"/>.
        /// </summary>
        /// <param name="zz">Bit number; bits are packed 32 per word, so the word is <c>zz &gt;&gt; 5</c> and the bit within it is <c>zz &amp; 31</c>.</param>
        /// <param name="bhtab">Bit table, one <see cref="uint"/> per 32 bits.</param>
        private static void CLEAR_BH(int zz, uint* bhtab)
        {
            // Build and invert the mask as an unsigned value. The complement
            // of a signed single-bit mask is a negative int for every bit
            // except 31, and converting that to uint throws OverflowException
            // when the assembly is compiled with overflow checking enabled.
            bhtab[zz >> 5] &= ~(1u << (zz & 31));
        }

        /// <summary>
        /// Reports whether bit <paramref name="zz"/> is set in the bit table <paramref name="bhtab"/>.
        /// </summary>
        /// <param name="zz">Bit number; the word is <c>zz &gt;&gt; 5</c> and the bit within it is <c>zz &amp; 31</c>.</param>
        /// <param name="bhtab">Bit table, one <see cref="uint"/> per 32 bits.</param>
        /// <returns><c>true</c> when the bit is 1; otherwise <c>false</c>.</returns>
        private static bool ISSET_BH(int zz, uint* bhtab)
        {
            uint word = bhtab[zz >> 5];
            return ((word >> (zz & 31)) & 1u) != 0;
        }

        /// <summary>
        /// Returns the whole 32-bit word of <paramref name="bhtab"/> that contains bit <paramref name="zz"/>.
        /// </summary>
        private static uint WORD_BH(int zz, uint* bhtab)
        {
            int wordIndex = zz >> 5;
            return bhtab[wordIndex];
        }

        /// <summary>
        /// Returns the low five bits of <paramref name="zz"/>, i.e. its bit
        /// position within a 32-bit word; the result is zero exactly when
        /// <paramref name="zz"/> is a multiple of 32.
        /// </summary>
        private static int UNALIGNED_BH(int zz)
        {
            return zz & 31;
        }

        /// <summary>
        /// Exchanges the contents of two <see cref="uint"/> variables.
        /// </summary>
        /// <param name="zz1">Receives the value originally held by <paramref name="zz2"/>.</param>
        /// <param name="zz2">Receives the value originally held by <paramref name="zz1"/>.</param>
        private static void mswap(ref uint zz1, ref uint zz2)
        {
            uint carried = zz2;
            zz2 = zz1;
            zz1 = carried;
        }

        /// <summary>
        /// Swaps two runs of <paramref name="zzn"/> consecutive entries of
        /// <paramref name="ptr"/>, element by element in ascending order.
        /// </summary>
        /// <param name="ptr">Array whose entries are exchanged.</param>
        /// <param name="zzp1">Index of the first entry of the first run.</param>
        /// <param name="zzp2">Index of the first entry of the second run.</param>
        /// <param name="zzn">Number of entries to swap; nothing happens when it is not positive.</param>
        private static void mvswap(uint* ptr, int zzp1, int zzp2, int zzn)
        {
            for (int step = 0; step < zzn; step++)
            {
                int first = zzp1 + step;
                int second = zzp2 + step;

                uint carried = ptr[first];
                ptr[first] = ptr[second];
                ptr[second] = carried;
            }
        }

        /// <summary>
        /// Returns the smaller of two integers.
        /// </summary>
        private static int mmin(int a, int b)
        {
            int smaller = b;
            if (a < b)
                smaller = a;

            return smaller;
        }

        /// <summary>
        /// Pushes a (lo, hi, d) triple onto the three parallel stack arrays.
        /// </summary>
        /// <param name="lz">Value stored in <paramref name="stackLo"/>.</param>
        /// <param name="hz">Value stored in <paramref name="stackHi"/>.</param>
        /// <param name="dz">Value stored in <paramref name="stackD"/>.</param>
        /// <param name="stackLo">Stack storage for the low bounds.</param>
        /// <param name="stackHi">Stack storage for the high bounds.</param>
        /// <param name="stackD">Stack storage for the third component.</param>
        /// <param name="sp">Index of the next free slot; incremented after all three stores succeed.</param>
        private static void mpush(int lz, int hz, int dz, int[] stackLo, int[] stackHi, int[] stackD, ref int sp)
        {
            int slot = sp;
            stackLo[slot] = lz;
            stackHi[slot] = hz;
            stackD[slot] = dz;
            sp = slot + 1;
        }

        /// <summary>
        /// Pops the most recently pushed (lo, hi, d) triple from the three parallel stack arrays.
        /// </summary>
        /// <param name="lz">Receives the popped low bound.</param>
        /// <param name="hz">Receives the popped high bound.</param>
        /// <param name="dz">Receives the popped third component.</param>
        /// <param name="stackLo">Stack storage for the low bounds.</param>
        /// <param name="stackHi">Stack storage for the high bounds.</param>
        /// <param name="stackD">Stack storage for the third component.</param>
        /// <param name="sp">Index of the next free slot; decremented before the entries are read.</param>
        private static void mpop(ref int lz, ref int hz, ref int dz, int[] stackLo, int[] stackHi, int[] stackD, ref int sp)
        {
            int top = sp - 1;
            sp = top;
            lz = stackLo[top];
            hz = stackHi[top];
            dz = stackD[top];
        }

        /// <summary>
        /// Returns <c>nextHi[az] - nextLo[az]</c> for the candidate range stored at index <paramref name="az"/>.
        /// </summary>
        /// <param name="az">Index of the candidate range.</param>
        /// <param name="nextLo">Low bounds of the candidate ranges.</param>
        /// <param name="nextHi">High bounds of the candidate ranges.</param>
        private static int mnextsize(int az, int[] nextLo, int[] nextHi)
        {
            int upper = nextHi[az];
            int lower = nextLo[az];
            return upper - lower;
        }

        /// <summary>
        /// Exchanges the candidate ranges stored at indices <paramref name="az"/>
        /// and <paramref name="bz"/> across all three parallel arrays.
        /// </summary>
        /// <param name="az">Index of the first candidate.</param>
        /// <param name="bz">Index of the second candidate.</param>
        /// <param name="nextLo">Low bounds of the candidate ranges.</param>
        /// <param name="nextHi">High bounds of the candidate ranges.</param>
        /// <param name="nextD">Third component of the candidate ranges.</param>
        private static void mnextswap(int az, int bz, int[] nextLo, int[] nextHi, int[] nextD)
        {
            int heldLo = nextLo[az];
            nextLo[az] = nextLo[bz];
            nextLo[bz] = heldLo;

            int heldHi = nextHi[az];
            nextHi[az] = nextHi[bz];
            nextHi[bz] = heldHi;

            int heldD = nextD[az];
            nextD[az] = nextD[bz];
            nextD[bz] = heldD;
        }

        /// <summary>
        /// Returns the difference between the <paramref name="ftab"/> entries at
        /// <c>(b + 1) &lt;&lt; 8</c> and <c>b &lt;&lt; 8</c>.
        /// </summary>
        /// <param name="b">Bucket number; scaled by 256 to index <paramref name="ftab"/>.</param>
        /// <param name="ftab">Table indexed at multiples of 256.</param>
        private static uint BIGFREQ(int b, uint* ftab)
        {
            uint upper = ftab[(b + 1) << 8];
            uint lower = ftab[b << 8];
            return upper - lower;
        }

        #endregion
    }
}