Start filling out LZX

2026-04-22 06:03:34 +00:00 · 2022-12-14 13:57:54 -08:00
parent 1cb3157110
commit ed2e88c781
1 changed files with 393 additions and 7 deletions
--- a/BurnOutSharp/FileType/MicrosoftCAB.LZX.cs
+++ b/BurnOutSharp/FileType/MicrosoftCAB.LZX.cs
@@ -1,11 +1,397 @@
-namespace BurnOutSharp.FileType
+using System.Collections.Generic;
+/// <see href="https://learn.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-patch/cc78752a-b4af-4eee-88cb-01f4d8a4c2bf"/>
+/// <see href="https://interoperability.blob.core.windows.net/files/MS-PATCH/%5bMS-PATCH%5d.pdf"/>
+/// <see href="https://github.com/kyz/libmspack/blob/master/libmspack/mspack/lzx.h"/>
+/// <see href="https://github.com/kyz/libmspack/blob/master/libmspack/mspack/lzxc.c"/>
+/// <see href="https://github.com/kyz/libmspack/blob/master/libmspack/mspack/lzxd.c"/>
+/// <see href="https://wimlib.net/"/>
+/// <see href="http://xavprods.free.fr/lzx/"/>
+/// <see href="https://github.com/jhermsmeier/node-lzx"/>
+/// <see href="https://github.com/jhermsmeier/node-cabarc"/>
+namespace BurnOutSharp.FileType
 {
-    #region TEMPORARY AREA FOR LZX COMPRESSION FORMAT
+    /// <summary>
+    /// 3-bit block type
+    /// </summary>
+    public enum MSCABLXZBlockType : byte
+    {
+        /// <summary>
+        /// Not valid
+        /// </summary>
+        INVALID_0 = 0b000,

-    // See the following for details about implementation (there is no open spec):
-    // https://github.com/kyz/libmspack/blob/master/libmspack/mspack/lzx.h
-    // https://github.com/kyz/libmspack/blob/master/libmspack/mspack/lzxc.c
-    // https://github.com/kyz/libmspack/blob/master/libmspack/mspack/lzxd.c
+        /// <summary>
+        /// Verbatim block
+        /// </summary>
+        Verbatim = 0b001,

-    #endregion
+        /// <summary>
+        /// Aligned offset block
+        /// </summary>
+        AlignedOffset = 0b010,
+
+        /// <summary>
+        /// Uncompressed block
+        /// </summary>
+        Uncompressed = 0b011,
+
+        /// <summary>
+        /// Not valid
+        /// </summary>
+        INVALID_4 = 0b100,
+
+        /// <summary>
+        /// Not valid
+        /// </summary>
+        INVALID_5 = 0b101,
+
+        /// <summary>
+        /// Not valid
+        /// </summary>
+        INVALID_6 = 0b110,
+
+        /// <summary>
+        /// Not valid
+        /// </summary>
+        INVALID_7 = 0b111,
+    }
+
+    public class MSCABLZX
+    {
+        /// <summary>
+        /// The window size determines the number of window subdivisions, or position slots
+        /// </summary>
+        public static readonly Dictionary<int, int> PositionSlots = new Dictionary<int, int>()
+        {
+            [128 * 1024] = 34, // 128 KB
+            [256 * 1024] = 36, // 256 KB
+            [512 * 1024] = 38, // 512 KB
+            [1024 * 1024] = 42, // 1 MB
+            [2 * 1024 * 1024] = 50, // 2 MB
+            [4 * 1024 * 1024] = 66, // 4 MB
+            [8 * 1024 * 1024] = 98, // 8 MB
+            [16 * 1024 * 1024] = 162, // 16 MB
+            [32 * 1024 * 1024] = 290, // 32 MB
+        };
+    }
+
+    public class MSCABLZXHeader
+    {
+        /*
+        2.2 Header
+
+        2.2.1 Chunk Size
+
+        The LZXD compressor emits chunks of compressed data. A chunk represents exactly 32 KB of
+        uncompressed data until the last chunk in the stream, which can represent less than 32 KB. To
+        ensure that an exact number of input bytes represent an exact number of output bytes for each
+        chunk, after each 32 KB of uncompressed data is represented in the output compressed bitstream, the
+        output bitstream is padded with up to 15 bits of zeros to realign the bitstream on a 16-bit boundary
+        (even byte boundary) for the next 32 KB of data. This results in a compressed chunk of a byte-aligned
+        size. The compressed chunk could be smaller than 32 KB or larger than 32 KB if the data is
+        incompressible when the chunk is not the last one.
+
+        The LZXD engine encodes a compressed, chunk-size prefix field preceding each compressed chunk in
+        the compressed byte stream. The compressed, chunk-size prefix field is a byte aligned, little-endian,
+        16-bit field. The chunk prefix chain could be followed in the compressed stream without
+        decompressing any data. The next chunk prefix is at a location computed by the absolute byte offset
+        location of this chunk prefix plus 2 (for the size of the chunk-size prefix field) plus the current chunk
+        size.
+
+        2.2.2 E8 Call Translation
+
+        E8 call translation is an optional feature that can be used when the data to compress contains x86
+        instruction sequences. E8 translation operates as a preprocessing stage before compressing each
+        chunk, and the compressed stream header contains a bit that indicates whether the decoder shall
+        reverse the translation as a postprocessing step after decompressing each chunk.
+
+        The x86 instruction beginning with a byte value of 0xE8 is followed by a 32-bit, little-endian relative
+        displacement to the call target. When E8 call translation is enabled, the following preprocessing steps
+        are performed on the uncompressed input before compression (assuming little-endian byte ordering):
+
+        Let chunk_offset refer to the total number of uncompressed bytes preceding this chunk.
+
+        Let E8_file_size refer to the caller-specified value given to the compressor or decoded from the header
+        of the compressed stream during decompression.
+
+        The following example shows how E8 translation is performed for each 32-KB chunk of uncompressed
+        data (or less than 32 KB if last chunk to compress).
+
+            if (( chunk_offset < 0x40000000 ) && ( chunk_size > 10 ))
+                for ( i = 0; i < (chunk_size – 10); i++ )
+                    if ( chunk_byte[ i ] == 0xE8 )
+                        long current_pointer = chunk_offset + i;
+                        long displacement = chunk_byte[ i+1 ] |
+                        chunk_byte[ i+2 ] << 8 |
+                        chunk_byte[ i+3 ] << 16 |
+                        chunk_byte[ i+4 ] << 24;
+                        long target = current_pointer + displacement;
+                        if (( target >= 0 ) && ( target < E8_file_size+current_pointer))
+                            if ( target >= E8_file_size )
+                                target = displacement – E8_file_size;
+                            endif
+                            chunk_byte[ i+1 ] = (byte)( target );
+                            chunk_byte[ i+2 ] = (byte)( target >> 8 );
+                            chunk_byte[ i+3 ] = (byte)( target >> 16 );
+                            chunk_byte[ i+4 ] = (byte)( target >> 24 );
+                        endif
+                        i += 4;
+                    endif
+                endfor
+            endif
+
+        After decompression, the E8 scanning algorithm is the same. The following example shows how E8
+        translation reversal is performed.
+
+        long value = chunk_byte[ i+1 ] |
+        chunk_byte[ i+2 ] << 8 |
+        chunk_byte[ i+3 ] << 16 |
+        chunk_byte[ i+4 ] << 24;
+        if (( value >= -current_pointer ) && ( value < E8_file_size ))
+            if ( value >= 0 )
+                displacement = value – current_pointer;
+            else
+                displacement = value + E8_file_size;
+            endif
+            chunk_byte[ i+1 ] = (byte)( displacement );
+            chunk_byte[ i+2 ] = (byte)( displacement >> 8 );
+            chunk_byte[ i+3 ] = (byte)( displacement >> 16 );
+            chunk_byte[ i+4 ] = (byte)( displacement >> 24 );
+        endif
+
+        The first bit in the first chunk in the LZXD bitstream (following the 2-byte, chunk-size prefix described
+        in section 2.2.1) indicates the presence or absence of two 16-bit fields immediately following the
+        single bit. If the bit is set, E8 translation is enabled for all the following chunks in the stream using the
+        32-bit value derived from the two 16-bit fields as the E8_file_size provided to the compressor when E8
+        translation was enabled. Note that E8_file_size is completely independent of the length of the
+        uncompressed data. E8 call translation is disabled after the 32,768th chunk (after 1 gigabyte (GB) of
+        uncompressed data).
+
+        Field                       Comments                    Size
+        ----------------------------------------------------------------
+        E8 translation              0-disabled, 1-enabled       1 bit
+        Translation size high word  Only present if enabled     0 or 16 bits
+        Translation size low word   Only present if enabled     0 or 16 bits
+        */
+    }
+
+    /// <summary>
+    /// An LZXD block represents a sequence of compressed data that is encoded with the same set of
+    /// Huffman trees, or a sequence of uncompressed data. There can be one or more LZXD blocks in a
+    /// compressed stream, each with its own set of Huffman trees. Blocks do not have to start or end on a
+    /// chunk boundary; blocks can span multiple chunks, or a single chunk can contain multiple blocks. The
+    /// number of chunks is related to the size of the data being compressed, while the number of blocks is
+    /// related to how well the data is compressed. The Block Type field, as specified in section 2.3.1.1,
+    /// indicates which type of block follows, and the Block Size field, as specified in section 2.3.1.2,
+    /// indicates the number of uncompressed bytes represented by the block. Following the generic block
+    /// header is a type-specific header that describes the remainder of the block.
+    /// </summary>
+    public class MSCABLZXBlockHeader
+    {
+        /// <remarks>3 bits</remarks>
+        public MSCABLXZBlockType BlockType;
+
+        /// <summary>
+        /// Block size is the high 8 bits of 24
+        /// </summary>
+        /// <remarks>8 bits</remarks>
+        public byte BlockSizeMSB;
+
+        /// <summary>
+        /// Block size is the middle 8 bits of 24
+        /// </summary>
+        /// <remarks>8 bits</remarks>
+        public byte BlockSizeByte2;
+
+        /// <summary>
+        /// Block size is the low 8 bits of 24
+        /// </summary>
+        /// <remarks>8 bits</remarks>
+        public byte BlocksizeLSB;
+
+        /*
+        2.3.2 Block Data
+
+        2.3.2.3 Aligned Offset Block
+
+        An aligned offset block is identical to the verbatim block except for the presence of the aligned offset
+        tree preceding the other trees.
+
+        Entry                                           Comments                    Size
+        ----------------------------------------------------------------------------------
+        Aligned offset tree                             8 elements, 3 bits each     24 bits
+        Pretree for first 256 elements of main tree     20 elements, 4 bits each    80 bits
+        Path lengths of first 256 elements of main tree Encoded using pretree       Variable
+        Pretree for remainder of main tree              20 elements, 4 bits each    80 bits
+        Path lengths of remaining elements of main tree Encoded using pretree       Variable
+        Pretree for length tree                         20 elements, 4 bits each    80 bits
+        Path lengths of elements in length tree         Encoded using pretree       Variable
+        Token sequence (matches and literals)           Specified in section 2.6    Variable
+        */
+    }
+
+    /// <summary>
+    /// Following the generic block header, an uncompressed block begins with 1 to 16 bits of zero padding
+    /// to align the bit buffer on a 16-bit boundary. At this point, the bitstream ends and a byte stream
+    /// begins. Following the zero padding, new 32-bit values for R0, R1, and R2 are output in little-endian
+    /// form, followed by the uncompressed data bytes themselves. Finally, if the uncompressed data length
+    /// is odd, one extra byte of zero padding is encoded to realign the following bitstream.
+    /// 
+    /// Then the bitstream of byte-swapped 16-bit integers resumes for the next Block Type field (if there
+    /// are subsequent blocks).
+    /// 
+    /// The decoded R0, R1, and R2 values are used as initial repeated offset values to decode the
+    /// subsequent compressed block if present.
+    /// </summary>
+    public class MSCABLZXUncompressedBlock
+    {
+        /// <summary>
+        /// Generic block header
+        /// </summary>
+        public MSCABLZXBlockHeader Header;
+
+        /// <summary>
+        /// Padding to align following field on 16-bit boundary
+        /// </summary>
+        /// <remarks>Bits have a value of zero</remarks>
+        public ushort PaddingBits;
+
+        /// <summary>
+        /// Least significant to most significant byte (little-endian DWORD ([MS-DTYP]))
+        /// </summary>
+        /// <remarks>Encoded directly in the byte stream, not in the bitstream of byte-swapped 16-bit words</remarks>
+        public uint R0;
+
+        /// <summary>
+        /// Least significant to most significant byte (little-endian DWORD)
+        /// </summary>
+        /// <remarks>Encoded directly in the byte stream, not in the bitstream of byte-swapped 16-bit words</remarks>
+        public uint R1;
+
+        /// <summary>
+        /// Least significant to most significant byte (little-endian DWORD)
+        /// </summary>
+        /// <remarks>Encoded directly in the byte stream, not in the bitstream of byte-swapped 16-bit words</remarks>
+        public uint R2;
+
+        /// <summary>
+        /// Can use the direct memcpy function, as specified in [IEEE1003.1]
+        /// </summary>
+        /// <remarks>Encoded directly in the byte stream, not in the bitstream of byte-swapped 16-bit words</remarks>
+        public byte[] RawDataBytes;
+
+        /// <summary>
+        /// Only if uncompressed size is odd
+        /// </summary>
+        public byte AlignmentByte;
+    }
+
+    /// <summary>
+    /// The fields of a verbatim block that follow the generic block header
+    /// </summary>
+    public class MSCABLZXVerbatimBlock
+    {
+        /// <summary>
+        /// Generic block header
+        /// </summary>
+        public MSCABLZXBlockHeader Header;
+
+        /// <summary>
+        /// Pretree for first 256 elements of main tree
+        /// </summary>
+        /// <remarks>20 elements, 4 bits each</remarks>
+        public byte[] PretreeFirst256;
+
+        /// <summary>
+        /// Path lengths of first 256 elements of main tree
+        /// </summary>
+        /// <remarks>Encoded using pretree</remarks>
+        public int[] PathLengthsFirst256;
+
+        /// <summary>
+        /// Pretree for remainder of main tree
+        /// </summary>
+        /// <remarks>20 elements, 4 bits each</remarks>
+        public byte[] PretreeRemainder;
+
+        /// <summary>
+        /// Path lengths of remaining elements of main tree
+        /// </summary>
+        /// <remarks>Encoded using pretree</remarks>
+        public int[] PathLengthsRemainder;
+
+        /// <summary>
+        /// Pretree for length tree
+        /// </summary>
+        /// <remarks>20 elements, 4 bits each</remarks>
+        public byte[] PretreeLengthTree;
+
+        /// <summary>
+        /// Path lengths of elements in length tree
+        /// </summary>
+        /// <remarks>Encoded using pretree</remarks>
+        public int[] PathLengthsLengthTree;
+
+        // Entry                                    Comments                    Size
+        // ---------------------------------------------------------------------------------------
+        // Token sequence (matches and literals)    Specified in section 2.6    Variable
+    }
+
+    /// <summary>
+    /// An aligned offset block is identical to the verbatim block except for the presence of the aligned offset
+    /// tree preceding the other trees.
+    /// </summary>
+    public class MSCABLZXAlignedOffsetBlock
+    {
+        /// <summary>
+        /// Generic block header
+        /// </summary>
+        public MSCABLZXBlockHeader Header;
+
+        /// <summary>
+        /// Aligned offset tree
+        /// </summary>
+        /// <remarks>8 elements, 3 bits each</remarks>
+        public byte[] AlignedOffsetTree;
+
+        /// <summary>
+        /// Pretree for first 256 elements of main tree
+        /// </summary>
+        /// <remarks>20 elements, 4 bits each</remarks>
+        public byte[] PretreeFirst256;
+
+        /// <summary>
+        /// Path lengths of first 256 elements of main tree
+        /// </summary>
+        /// <remarks>Encoded using pretree</remarks>
+        public int[] PathLengthsFirst256;
+
+        /// <summary>
+        /// Pretree for remainder of main tree
+        /// </summary>
+        /// <remarks>20 elements, 4 bits each</remarks>
+        public byte[] PretreeRemainder;
+
+        /// <summary>
+        /// Path lengths of remaining elements of main tree
+        /// </summary>
+        /// <remarks>Encoded using pretree</remarks>
+        public int[] PathLengthsRemainder;
+
+        /// <summary>
+        /// Pretree for length tree
+        /// </summary>
+        /// <remarks>20 elements, 4 bits each</remarks>
+        public byte[] PretreeLengthTree;
+
+        /// <summary>
+        /// Path lengths of elements in length tree
+        /// </summary>
+        /// <remarks>Encoded using pretree</remarks>
+        public int[] PathLengthsLengthTree;
+
+        // Entry                                    Comments                    Size
+        // ---------------------------------------------------------------------------------------
+        // Token sequence (matches and literals)    Specified in section 2.6    Variable
+    }
 }