using System.Collections.Generic;

// See the following for details about implementation:
// https://github.com/kyz/libmspack/blob/master/libmspack/mspack/lzx.h
// https://github.com/kyz/libmspack/blob/master/libmspack/mspack/lzxc.c
// https://github.com/kyz/libmspack/blob/master/libmspack/mspack/lzxd.c
//
// The numbered sections (2.2, 2.3, ...) quoted in the comments below are excerpts from the
// LZXD format description that these models are based on.

namespace BurnOutSharp.FileType
{
    /// <summary>
    /// 3-bit block type
    /// </summary>
    /// <remarks>
    /// All eight 3-bit values are enumerated so that any value read from the stream maps to
    /// a named member; only 1, 2 and 3 describe real block types.
    /// </remarks>
    public enum MSCABLXZBlockType : byte
    {
        /// <summary>
        /// Not valid
        /// </summary>
        INVALID_0 = 0b000,

        /// <summary>
        /// Verbatim block
        /// </summary>
        Verbatim = 0b001,

        /// <summary>
        /// Aligned offset block
        /// </summary>
        AlignedOffset = 0b010,

        /// <summary>
        /// Uncompressed block
        /// </summary>
        Uncompressed = 0b011,

        /// <summary>
        /// Not valid
        /// </summary>
        INVALID_4 = 0b100,

        /// <summary>
        /// Not valid
        /// </summary>
        INVALID_5 = 0b101,

        /// <summary>
        /// Not valid
        /// </summary>
        INVALID_6 = 0b110,

        /// <summary>
        /// Not valid
        /// </summary>
        INVALID_7 = 0b111,
    }

    /// <summary>
    /// Holder for lookup data shared by the LZX model types
    /// </summary>
    public class MSCABLZX
    {
        /// <summary>
        /// The window size determines the number of window subdivisions, or position slots
        /// </summary>
        /// <remarks>
        /// Keyed by window size in bytes; the value is the number of position slots for that window size
        /// </remarks>
        public static readonly Dictionary<int, int> PositionSlots = new Dictionary<int, int>()
        {
            [128 * 1024] = 34,          // 128 KB
            [256 * 1024] = 36,          // 256 KB
            [512 * 1024] = 38,          // 512 KB
            [1024 * 1024] = 42,         // 1 MB
            [2 * 1024 * 1024] = 50,     // 2 MB
            [4 * 1024 * 1024] = 66,     // 4 MB
            [8 * 1024 * 1024] = 98,     // 8 MB
            [16 * 1024 * 1024] = 162,   // 16 MB
            [32 * 1024 * 1024] = 290,   // 32 MB
        };
    }

    /// <summary>
    /// Placeholder for the LZXD stream header. No members are declared yet; the layout
    /// the type is meant to model is described in the comment inside the class body.
    /// </summary>
    public class MSCABLZXHeader
    {
        /*
            2.2 Header

            2.2.1 Chunk Size

            The LZXD compressor emits chunks of compressed data. A chunk represents exactly 32 KB of
            uncompressed data until the last chunk in the stream, which can represent less than 32 KB. To
            ensure that an exact number of input bytes represent an exact number of output bytes for each
            chunk, after each 32 KB of uncompressed data is represented in the output compressed bitstream, the
            output bitstream is padded with up to 15 bits of zeros to realign the bitstream on a 16-bit boundary
            (even byte boundary) for the next 32 KB of data. This results in a compressed chunk of a byte-aligned
            size. The compressed chunk could be smaller than 32 KB or larger than 32 KB if the data is
            incompressible when the chunk is not the last one.

            The LZXD engine encodes a compressed, chunk-size prefix field preceding each compressed chunk in
            the compressed byte stream. The compressed, chunk-size prefix field is a byte aligned, little-endian,
            16-bit field. The chunk prefix chain could be followed in the compressed stream without
            decompressing any data. The next chunk prefix is at a location computed by the absolute byte offset
            location of this chunk prefix plus 2 (for the size of the chunk-size prefix field) plus the current chunk
            size.

            2.2.2 E8 Call Translation

            E8 call translation is an optional feature that can be used when the data to compress contains x86
            instruction sequences. E8 translation operates as a preprocessing stage before compressing each
            chunk, and the compressed stream header contains a bit that indicates whether the decoder shall
            reverse the translation as a postprocessing step after decompressing each chunk.

            The x86 instruction beginning with a byte value of 0xE8 is followed by a 32-bit, little-endian relative
            displacement to the call target. When E8 call translation is enabled, the following preprocessing steps
            are performed on the uncompressed input before compression (assuming little-endian byte ordering):

            Let chunk_offset refer to the total number of uncompressed bytes preceding this chunk.

            Let E8_file_size refer to the caller-specified value given to the compressor or decoded from the header
            of the compressed stream during decompression.

            The following example shows how E8 translation is performed for each 32-KB chunk of uncompressed
            data (or less than 32 KB if last chunk to compress).

                if (( chunk_offset < 0x40000000 ) && ( chunk_size > 10 ))
                    for ( i = 0; i < (chunk_size - 10); i++ )
                        if ( chunk_byte[ i ] == 0xE8 )
                            long current_pointer = chunk_offset + i;
                            long displacement = chunk_byte[ i+1 ]       |
                                                chunk_byte[ i+2 ] << 8  |
                                                chunk_byte[ i+3 ] << 16 |
                                                chunk_byte[ i+4 ] << 24;
                            long target = current_pointer + displacement;
                            if (( target >= 0 ) && ( target < E8_file_size+current_pointer))
                                if ( target >= E8_file_size )
                                    target = displacement - E8_file_size;
                                endif
                                chunk_byte[ i+1 ] = (byte)( target );
                                chunk_byte[ i+2 ] = (byte)( target >> 8 );
                                chunk_byte[ i+3 ] = (byte)( target >> 16 );
                                chunk_byte[ i+4 ] = (byte)( target >> 24 );
                            endif
                            i += 4;
                        endif
                    endfor
                endif

            After decompression, the E8 scanning algorithm is the same. The following example shows how E8
            translation reversal is performed.

                long value = chunk_byte[ i+1 ]       |
                             chunk_byte[ i+2 ] << 8  |
                             chunk_byte[ i+3 ] << 16 |
                             chunk_byte[ i+4 ] << 24;
                if (( value >= -current_pointer ) && ( value < E8_file_size ))
                    if ( value >= 0 )
                        displacement = value - current_pointer;
                    else
                        displacement = value + E8_file_size;
                    endif
                    chunk_byte[ i+1 ] = (byte)( displacement );
                    chunk_byte[ i+2 ] = (byte)( displacement >> 8 );
                    chunk_byte[ i+3 ] = (byte)( displacement >> 16 );
                    chunk_byte[ i+4 ] = (byte)( displacement >> 24 );
                endif

            The first bit in the first chunk in the LZXD bitstream (following the 2-byte, chunk-size prefix described
            in section 2.2.1) indicates the presence or absence of two 16-bit fields immediately following the
            single bit. If the bit is set, E8 translation is enabled for all the following chunks in the stream using the
            32-bit value derived from the two 16-bit fields as the E8_file_size provided to the compressor when E8
            translation was enabled. Note that E8_file_size is completely independent of the length of the
            uncompressed data. E8 call translation is disabled after the 32,768th chunk (after 1 gigabyte (GB) of
            uncompressed data).

            Field                       Comments                    Size
            ----------------------------------------------------------------
            E8 translation              0-disabled, 1-enabled       1 bit
            Translation size high word  Only present if enabled     0 or 16 bits
            Translation size low word   Only present if enabled     0 or 16 bits
        */
    }

    /// <summary>
    /// An LZXD block represents a sequence of compressed data that is encoded with the same set of
    /// Huffman trees, or a sequence of uncompressed data. There can be one or more LZXD blocks in a
    /// compressed stream, each with its own set of Huffman trees. Blocks do not have to start or end on a
    /// chunk boundary; blocks can span multiple chunks, or a single chunk can contain multiple blocks. The
    /// number of chunks is related to the size of the data being compressed, while the number of blocks is
    /// related to how well the data is compressed. The Block Type field, as specified in section 2.3.1.1,
    /// indicates which type of block follows, and the Block Size field, as specified in section 2.3.1.2,
    /// indicates the number of uncompressed bytes represented by the block. Following the generic block
    /// header is a type-specific header that describes the remainder of the block.
    /// </summary>
    public class MSCABLZXBlockHeader
    {
        /// <summary>
        /// Type of the block that follows this header
        /// </summary>
        /// <remarks>3 bits</remarks>
        public MSCABLXZBlockType BlockType;

        /// <summary>
        /// Block size is the high 8 bits of 24
        /// </summary>
        /// <remarks>8 bits</remarks>
        public byte BlockSizeMSB;

        /// <summary>
        /// Block size is the middle 8 bits of 24
        /// </summary>
        /// <remarks>8 bits</remarks>
        public byte BlockSizeByte2;

        /// <summary>
        /// Block size is the low 8 bits of 24
        /// </summary>
        /// <remarks>8 bits</remarks>
        public byte BlocksizeLSB;

        /*
            2.3.2 Block Data

            2.3.2.3 Aligned Offset Block

            An aligned offset block is identical to the verbatim block except for the presence of the aligned offset
            tree preceding the other trees.

            Entry                                             Comments                    Size
            ----------------------------------------------------------------------------------
            Aligned offset tree                               8 elements, 3 bits each     24 bits
            Pretree for first 256 elements of main tree       20 elements, 4 bits each    80 bits
            Path lengths of first 256 elements of main tree   Encoded using pretree       Variable
            Pretree for remainder of main tree                20 elements, 4 bits each    80 bits
            Path lengths of remaining elements of main tree   Encoded using pretree       Variable
            Pretree for length tree                           20 elements, 4 bits each    80 bits
            Path lengths of elements in length tree           Encoded using pretree       Variable
            Token sequence (matches and literals)             Specified in section 2.6    Variable
        */
    }

    /// <summary>
    /// Following the generic block header, an uncompressed block begins with 1 to 16 bits of zero padding
    /// to align the bit buffer on a 16-bit boundary. At this point, the bitstream ends and a byte stream
    /// begins. Following the zero padding, new 32-bit values for R0, R1, and R2 are output in little-endian
    /// form, followed by the uncompressed data bytes themselves. Finally, if the uncompressed data length
    /// is odd, one extra byte of zero padding is encoded to realign the following bitstream.
    ///
    /// Then the bitstream of byte-swapped 16-bit integers resumes for the next Block Type field (if there
    /// are subsequent blocks).
    ///
    /// The decoded R0, R1, and R2 values are used as initial repeated offset values to decode the
    /// subsequent compressed block if present.
    /// </summary>
    public class MSCABLZXUncompressedBlock
    {
        /// <summary>
        /// Generic block header
        /// </summary>
        public MSCABLZXBlockHeader Header;

        /// <summary>
        /// Padding to align following field on 16-bit boundary
        /// </summary>
        /// <remarks>Bits have a value of zero</remarks>
        public ushort PaddingBits;

        /// <summary>
        /// Least significant to most significant byte (little-endian DWORD ([MS-DTYP]))
        /// </summary>
        /// <remarks>Encoded directly in the byte stream, not in the bitstream of byte-swapped 16-bit words</remarks>
        public uint R0;

        /// <summary>
        /// Least significant to most significant byte (little-endian DWORD)
        /// </summary>
        /// <remarks>Encoded directly in the byte stream, not in the bitstream of byte-swapped 16-bit words</remarks>
        public uint R1;

        /// <summary>
        /// Least significant to most significant byte (little-endian DWORD)
        /// </summary>
        /// <remarks>Encoded directly in the byte stream, not in the bitstream of byte-swapped 16-bit words</remarks>
        public uint R2;

        /// <summary>
        /// Can use the direct memcpy function, as specified in [IEEE1003.1]
        /// </summary>
        /// <remarks>Encoded directly in the byte stream, not in the bitstream of byte-swapped 16-bit words</remarks>
        public byte[] RawDataBytes;

        /// <summary>
        /// Only if uncompressed size is odd
        /// </summary>
        public byte AlignmentByte;
    }

    /// <summary>
    /// The fields of a verbatim block that follow the generic block header
    /// </summary>
    public class MSCABLZXVerbatimBlock
    {
        /// <summary>
        /// Generic block header
        /// </summary>
        public MSCABLZXBlockHeader Header;

        /// <summary>
        /// Pretree for first 256 elements of main tree
        /// </summary>
        /// <remarks>20 elements, 4 bits each</remarks>
        public byte[] PretreeFirst256;

        /// <summary>
        /// Path lengths of first 256 elements of main tree
        /// </summary>
        /// <remarks>Encoded using pretree</remarks>
        public int[] PathLengthsFirst256;

        /// <summary>
        /// Pretree for remainder of main tree
        /// </summary>
        /// <remarks>20 elements, 4 bits each</remarks>
        public byte[] PretreeRemainder;

        /// <summary>
        /// Path lengths of remaining elements of main tree
        /// </summary>
        /// <remarks>Encoded using pretree</remarks>
        public int[] PathLengthsRemainder;

        /// <summary>
        /// Pretree for length tree
        /// </summary>
        /// <remarks>20 elements, 4 bits each</remarks>
        public byte[] PretreeLengthTree;

        /// <summary>
        /// Path lengths of elements in length tree
        /// </summary>
        /// <remarks>Encoded using pretree</remarks>
        public int[] PathLengthsLengthTree;

        // Entry                                             Comments                    Size
        // ---------------------------------------------------------------------------------------
        // Token sequence (matches and literals)             Specified in section 2.6    Variable
    }

    /// <summary>
    /// An aligned offset block is identical to the verbatim block except for the presence of the aligned offset
    /// tree preceding the other trees.
    /// </summary>
    public class MSCABLZXAlignedOffsetBlock
    {
        /// <summary>
        /// Generic block header
        /// </summary>
        public MSCABLZXBlockHeader Header;

        /// <summary>
        /// Aligned offset tree
        /// </summary>
        /// <remarks>8 elements, 3 bits each</remarks>
        public byte[] AlignedOffsetTree;

        /// <summary>
        /// Pretree for first 256 elements of main tree
        /// </summary>
        /// <remarks>20 elements, 4 bits each</remarks>
        public byte[] PretreeFirst256;

        /// <summary>
        /// Path lengths of first 256 elements of main tree
        /// </summary>
        /// <remarks>Encoded using pretree</remarks>
        public int[] PathLengthsFirst256;

        /// <summary>
        /// Pretree for remainder of main tree
        /// </summary>
        /// <remarks>20 elements, 4 bits each</remarks>
        public byte[] PretreeRemainder;

        /// <summary>
        /// Path lengths of remaining elements of main tree
        /// </summary>
        /// <remarks>Encoded using pretree</remarks>
        public int[] PathLengthsRemainder;

        /// <summary>
        /// Pretree for length tree
        /// </summary>
        /// <remarks>20 elements, 4 bits each</remarks>
        public byte[] PretreeLengthTree;

        /// <summary>
        /// Path lengths of elements in length tree
        /// </summary>
        /// <remarks>Encoded using pretree</remarks>
        public int[] PathLengthsLengthTree;

        // Entry                                             Comments                    Size
        // ---------------------------------------------------------------------------------------
        // Token sequence (matches and literals)             Specified in section 2.6    Variable
    }
}