diff --git a/BurnOutSharp.Models/MicrosoftCabinet/Enums.cs b/BurnOutSharp.Models/MicrosoftCabinet/Enums.cs index c8eb5ea6..1c9effd0 100644 --- a/BurnOutSharp.Models/MicrosoftCabinet/Enums.cs +++ b/BurnOutSharp.Models/MicrosoftCabinet/Enums.cs @@ -30,6 +30,29 @@ namespace BurnOutSharp.Models.MicrosoftCabinet TYPE_LZX = 0x0003, } + public enum DeflateCompressionType : byte + { + /// + /// no compression + /// + NoCompression = 0b00, + + /// + /// Compressed with fixed Huffman codes + /// + FixedHuffman = 0b01, + + /// + /// Compressed with dynamic Huffman codes + /// + DynamicHuffman = 0b10, + + /// + /// Reserved (error) + /// + Reserved = 0b11, + } + [Flags] public enum FileAttributes : ushort { diff --git a/BurnOutSharp.Models/MicrosoftCabinet/IMSZIPBlockData.cs b/BurnOutSharp.Models/MicrosoftCabinet/IMSZIPBlockData.cs new file mode 100644 index 00000000..6c038108 --- /dev/null +++ b/BurnOutSharp.Models/MicrosoftCabinet/IMSZIPBlockData.cs @@ -0,0 +1,8 @@ +namespace BurnOutSharp.Models.MicrosoftCabinet +{ + /// + /// Empty interface defining block types + /// + /// + public interface IMSZIPBlockData { } +} \ No newline at end of file diff --git a/BurnOutSharp.Models/MicrosoftCabinet/MSZIPBlock.cs b/BurnOutSharp.Models/MicrosoftCabinet/MSZIPBlock.cs new file mode 100644 index 00000000..f7ec4037 --- /dev/null +++ b/BurnOutSharp.Models/MicrosoftCabinet/MSZIPBlock.cs @@ -0,0 +1,31 @@ +namespace BurnOutSharp.Models.MicrosoftCabinet +{ + /// + /// Each MSZIP block MUST consist of a 2-byte MSZIP signature and one or more RFC 1951 blocks. The + /// 2-byte MSZIP signature MUST consist of the bytes 0x43 and 0x4B. The MSZIP signature MUST be + /// the first 2 bytes in the MSZIP block.The MSZIP signature is shown in the following packet diagram. + /// + /// + public class MSZIPBlock + { + /// + /// 'CK' + /// + public ushort Signature; + + /// + /// Each MSZIP block is the result of a single deflate compression operation, as defined in [RFC1951]. + /// The compressor that performs the compression operation MUST generate one or more RFC 1951 + /// blocks, as defined in [RFC1951]. The number, deflation mode, and type of RFC 1951 blocks in each + /// MSZIP block is determined by the compressor, as defined in [RFC1951]. The last RFC 1951 block in + /// each MSZIP block MUST be marked as the "end" of the stream(1), as defined by [RFC1951] + /// section 3.2.3. Decoding trees MUST be discarded after each RFC 1951 block, but the history buffer + /// MUST be maintained.Each MSZIP block MUST represent no more than 32 KB of uncompressed data. + /// + /// The maximum compressed size of each MSZIP block is 32 KB + 12 bytes. This enables the MSZIP + /// block to contain 32 KB of data split between two noncompressed RFC 1951 blocks, each of which + /// has a value of BTYPE = 00. + /// + public byte[] Data; + } +} \ No newline at end of file diff --git a/BurnOutSharp.Models/MicrosoftCabinet/MSZIPCompressedBlock.cs b/BurnOutSharp.Models/MicrosoftCabinet/MSZIPCompressedBlock.cs new file mode 100644 index 00000000..a94271ca --- /dev/null +++ b/BurnOutSharp.Models/MicrosoftCabinet/MSZIPCompressedBlock.cs @@ -0,0 +1,19 @@ +namespace BurnOutSharp.Models.MicrosoftCabinet +{ + /// + /// Base class for compressed blocks + /// + /// + public abstract class MSZIPCompressedBlock : IMSZIPBlockData + { + /// + /// Huffman code lengths for the literal / length alphabet + /// + public abstract int[] LiteralLengths { get; set; } + + /// + /// Huffman distance codes for the literal / length alphabet + /// + public abstract int[] DistanceCodes { get; set; } + } +} \ No newline at end of file diff --git a/BurnOutSharp.Models/MicrosoftCabinet/MSZIPDeflateBlock.cs b/BurnOutSharp.Models/MicrosoftCabinet/MSZIPDeflateBlock.cs new file mode 100644 index 00000000..35e872fd --- /dev/null +++ b/BurnOutSharp.Models/MicrosoftCabinet/MSZIPDeflateBlock.cs @@ -0,0 +1,23 @@ +namespace BurnOutSharp.Models.MicrosoftCabinet +{ + /// + public class MSZIPDeflateBlock + { + /// + /// Set if and only if this is the last block of the data set. + /// + /// Bit 0 + public bool BFINAL { get; set; } + + /// + /// Specifies how the data are compressed + /// + /// Bits 1-2 + public DeflateCompressionType BTYPE { get; set; } + + /// + /// Block data as defined by the compression type + /// + public IMSZIPBlockData BlockData { get; set; } + } +} \ No newline at end of file diff --git a/BurnOutSharp.Models/MicrosoftCabinet/MSZIPDynamicHuffmanCompressedBlock.cs b/BurnOutSharp.Models/MicrosoftCabinet/MSZIPDynamicHuffmanCompressedBlock.cs new file mode 100644 index 00000000..10bb3324 --- /dev/null +++ b/BurnOutSharp.Models/MicrosoftCabinet/MSZIPDynamicHuffmanCompressedBlock.cs @@ -0,0 +1,15 @@ +namespace BurnOutSharp.Models.MicrosoftCabinet +{ + /// + /// Compression with dynamic Huffman codes (BTYPE=10) + /// + /// + public class MSZIPDynamicHuffmanCompressedBlock : MSZIPCompressedBlock + { + /// + public override int[] LiteralLengths { get; set; } + + /// + public override int[] DistanceCodes { get; set; } + } +} \ No newline at end of file diff --git a/BurnOutSharp.Models/MicrosoftCabinet/MSZIPFixedHuffmanCompressedBlock.cs b/BurnOutSharp.Models/MicrosoftCabinet/MSZIPFixedHuffmanCompressedBlock.cs new file mode 100644 index 00000000..bb6e7265 --- /dev/null +++ b/BurnOutSharp.Models/MicrosoftCabinet/MSZIPFixedHuffmanCompressedBlock.cs @@ -0,0 +1,90 @@ +using System; + +namespace BurnOutSharp.Models.MicrosoftCabinet +{ + /// + /// Compression with fixed Huffman codes (BTYPE=01) + /// + /// + /// + public class MSZIPFixedHuffmanCompressedBlock : MSZIPCompressedBlock + { + #region Properties + + /// + public override int[] LiteralLengths + { + get + { + // If we have cached lengths, use those + if (_literalLengths != null) + return _literalLengths; + + // Otherwise, build it from scratch + _literalLengths = new int[288]; + + // Literal Value 0 - 143, 8 bits + for (int i = 0; i < 144; i++) + _literalLengths[i] = 8; + + // Literal Value 144 - 255, 9 bits + for (int i = 144; i < 256; i++) + _literalLengths[i] = 9; + + // Literal Value 256 - 279, 7 bits + for (int i = 256; i < 280; i++) + _literalLengths[i] = 7; + + // Literal Value 280 - 287, 8 bits + for (int i = 280; i < 288; i++) + _literalLengths[i] = 8; + + return _literalLengths; + } + set + { + throw new FieldAccessException(); + } + } + + /// + public override int[] DistanceCodes + { + get + { + // If we have cached distances, use those + if (_distanceCodes != null) + return _distanceCodes; + + // Otherwise, build it from scratch + _distanceCodes = new int[32]; + + // Fixed length, 5 bits + for (int i = 0; i < 32; i++) + _distanceCodes[i] = 5; + + return _distanceCodes; + } + set + { + throw new FieldAccessException(); + } + } + + #endregion + + #region Instance Variables + + /// + /// Huffman code lengths for the literal / length alphabet + /// + private int[] _literalLengths = null; + + /// + /// Huffman distance codes for the literal / length alphabet + /// + private int[] _distanceCodes = null; + + #endregion + } +} \ No newline at end of file diff --git a/BurnOutSharp.Models/MicrosoftCabinet/MSZIPNonCompressedBlock.cs b/BurnOutSharp.Models/MicrosoftCabinet/MSZIPNonCompressedBlock.cs new file mode 100644 index 00000000..76649468 --- /dev/null +++ b/BurnOutSharp.Models/MicrosoftCabinet/MSZIPNonCompressedBlock.cs @@ -0,0 +1,26 @@ +namespace BurnOutSharp.Models.MicrosoftCabinet +{ + /// + /// Non-compressed blocks (BTYPE=00) + /// + /// + public class MSZIPNonCompressedBlock : IMSZIPBlockData + { + /// + /// The number of data bytes in the block + /// + /// Bytes 0-1 + public ushort LEN; + + /// + /// The one's complement of LEN + /// + /// Bytes 2-3 + public ushort NLEN; + + /// + /// bytes of literal data + /// + public byte[] Data; + } +} \ No newline at end of file diff --git a/BurnOutSharp/FileType/MicrosoftCAB.MSZIP.cs b/BurnOutSharp/FileType/MicrosoftCAB.MSZIP.cs index 1bd29fb3..fdcf7cf2 100644 --- a/BurnOutSharp/FileType/MicrosoftCAB.MSZIP.cs +++ b/BurnOutSharp/FileType/MicrosoftCAB.MSZIP.cs @@ -1,112 +1,540 @@ using System; using System.Collections; using System.Collections.Generic; +using BurnOutSharp.Models.MicrosoftCabinet; using BurnOutSharp.Utilities; /// /// namespace BurnOutSharp.FileType { - /// - /// Each MSZIP block MUST consist of a 2-byte MSZIP signature and one or more RFC 1951 blocks. The - /// 2-byte MSZIP signature MUST consist of the bytes 0x43 and 0x4B. The MSZIP signature MUST be - /// the first 2 bytes in the MSZIP block.The MSZIP signature is shown in the following packet diagram. - /// - public class MSZIPBlock + public static class MSZIPBlockBuilder + { + public static MSZIPBlock Create(byte[] data) + { + if (data == null) + return null; + + MSZIPBlock block = new MSZIPBlock(); + int offset = 0; + + block.Signature = data.ReadUInt16(ref offset); + if (block.Signature != 0x4B43) + return null; + + block.Data = data.ReadBytes(ref offset, data.Length - 2); + + return block; + } + } + + public static class MSZIPDeflateBlockBuilder + { + public static MSZIPDeflateBlock Create(ulong data) + { + MSZIPDeflateBlock deflateBlock = new MSZIPDeflateBlock(); + + deflateBlock.BFINAL = (data & 0b100) != 0; + deflateBlock.BTYPE = (DeflateCompressionType)(data & 0b011); + + return deflateBlock; + } + } + + public static class MSZIPDynamicHuffmanCompressedBlockBuilder + { + public static MSZIPDynamicHuffmanCompressedBlock Create(MSZIPDeflateStream stream) + { + MSZIPDynamicHuffmanCompressedBlock dynamicHuffmanCompressedBlock = new MSZIPDynamicHuffmanCompressedBlock(); + + // # of Literal/Length codes - 257 + ulong HLIT = stream.ReadBitsLSB(5) + 257; + + // # of Distance codes - 1 + ulong HDIST = stream.ReadBitsLSB(5) + 1; + + // HCLEN, # of Code Length codes - 4 + ulong HCLEN = stream.ReadBitsLSB(5) + 4; + + // (HCLEN + 4) x 3 bits: code lengths for the code length + // alphabet given just above + // + // These code lengths are interpreted as 3-bit integers + // (0-7); as above, a code length of 0 means the + // corresponding symbol (literal/ length or distance code + // length) is not used. + int[] codeLengthAlphabet = new int[19]; + for (ulong i = 0; i < HCLEN; i++) + codeLengthAlphabet[MSZIPDeflate.BitLengthOrder[i]] = (int)stream.ReadBitsLSB(3); + + for (ulong i = HCLEN; i < 19; i++) + codeLengthAlphabet[MSZIPDeflate.BitLengthOrder[i]] = 0; + + // Code length Huffman code + int[] codeLengthHuffmanCode = MSZIPDeflate.CreateTable(codeLengthAlphabet); + + // HLIT + 257 code lengths for the literal/length alphabet, + // encoded using the code length Huffman code + dynamicHuffmanCompressedBlock.LiteralLengths = BuildHuffmanTree(stream, HLIT, codeLengthHuffmanCode); + + // HDIST + 1 code lengths for the distance alphabet, + // encoded using the code length Huffman code + dynamicHuffmanCompressedBlock.DistanceCodes = BuildHuffmanTree(stream, HDIST, codeLengthHuffmanCode); + + return dynamicHuffmanCompressedBlock; + } + + /// + /// The alphabet for code lengths is as follows + /// + private static int[] BuildHuffmanTree(MSZIPDeflateStream stream, ulong codeCount, int[] codeLengths) + { + // Setup the huffman tree + int[] tree = new int[codeCount]; + + // Setup the loop variables + int lastCode = 0, repeatLength = 0; + for (ulong i = 0; i < codeCount; i++) + { + int code = codeLengths[(int)stream.ReadBitsLSB(7)]; + + // Represent code lengths of 0 - 15 + if (code > 0 && code <= 15) + { + lastCode = code; + tree[i] = code; + } + + // Copy the previous code length 3 - 6 times. + // The next 2 bits indicate repeat length (0 = 3, ... , 3 = 6) + // Example: Codes 8, 16 (+2 bits 11), 16 (+2 bits 10) will expand to 12 code lengths of 8 (1 + 6 + 5) + else if (code == 16) + { + repeatLength = (int)stream.ReadBitsLSB(2); + repeatLength += 2; + code = lastCode; + } + + // Repeat a code length of 0 for 3 - 10 times. + // (3 bits of length) + else if (code == 17) + { + repeatLength = (int)stream.ReadBitsLSB(3); + repeatLength += 3; + code = 0; + } + + // Repeat a code length of 0 for 11 - 138 times + // (7 bits of length) + else if (code == 18) + { + repeatLength = (int)stream.ReadBitsLSB(7); + repeatLength += 11; + code = 0; + } + + // Everything else + else + { + throw new ArgumentOutOfRangeException(); + } + + // If we had a repeat length + for (; repeatLength > 0; repeatLength--) + { + tree[i++] = code; + } + } + + return tree; + } + } + + public static class MSZIPNonCompressedBlockBuilder + { + public static MSZIPNonCompressedBlock Create(byte[] data) + { + // If we have invalid header data + if (data == null || data.Length < 4) + throw new ArgumentException(); + + MSZIPNonCompressedBlock nonCompressedBlock = new MSZIPNonCompressedBlock(); + int offset = 0; + + nonCompressedBlock.LEN = data.ReadUInt16(ref offset); + nonCompressedBlock.NLEN = data.ReadUInt16(ref offset); + // TODO: Confirm NLEN is 1's compliment of LEN + + return nonCompressedBlock; + } + } + + #region Deflate Implementation + + /// + public class MSZIPDeflate { #region Constants /// - /// Human-readable signature + /// Maximum Huffman code bit count /// - public static readonly string SignatureString = "CK"; - - /// - /// Signature as an unsigned Int16 value - /// - public const ushort SignatureValue = 0x4B43; - - /// - /// Signature as a byte array - /// - public static readonly byte[] SignatureBytes = new byte[] { 0x43, 0x4B }; + public const int MAX_BITS = 16; #endregion #region Properties /// - /// 'CB' + /// Match lengths for literal codes 257..285 /// - public ushort Signature { get; private set; } - - /// - /// Each MSZIP block is the result of a single deflate compression operation, as defined in [RFC1951]. - /// The compressor that performs the compression operation MUST generate one or more RFC 1951 - /// blocks, as defined in [RFC1951]. The number, deflation mode, and type of RFC 1951 blocks in each - /// MSZIP block is determined by the compressor, as defined in [RFC1951]. The last RFC 1951 block in - /// each MSZIP block MUST be marked as the "end" of the stream(1), as defined by[RFC1951] - /// section 3.2.3. Decoding trees MUST be discarded after each RFC 1951 block, but the history buffer - /// MUST be maintained.Each MSZIP block MUST represent no more than 32 KB of uncompressed data. - /// - /// The maximum compressed size of each MSZIP block is 32 KB + 12 bytes.This enables the MSZIP - /// block to contain 32 KB of data split between two noncompressed RFC 1951 blocks, each of which - /// has a value of BTYPE = 00. - /// - public byte[] Data { get; private set; } - - #endregion - - #region Serialization - - public static MSZIPBlock Deserialize(byte[] data) + /// Each value here is the lower bound for lengths represented + public static Dictionary LiteralLengths { - if (data == null) - return null; + get + { + // If we have cached length mappings, use those + if (_literalLengths != null) + return _literalLengths; - MSZIPBlock block = new MSZIPBlock(); - int dataPtr = 0; + // Otherwise, build it from scratch + _literalLengths = new Dictionary + { + [257] = 3, + [258] = 4, + [259] = 5, + [260] = 6, + [261] = 7, + [262] = 8, + [263] = 9, + [264] = 10, + [265] = 11, // 11,12 + [266] = 13, // 13,14 + [267] = 15, // 15,16 + [268] = 17, // 17,18 + [269] = 19, // 19-22 + [270] = 23, // 23-26 + [271] = 27, // 27-30 + [272] = 31, // 31-34 + [273] = 35, // 35-42 + [274] = 43, // 43-50 + [275] = 51, // 51-58 + [276] = 59, // 59-66 + [277] = 67, // 67-82 + [278] = 83, // 83-98 + [279] = 99, // 99-114 + [280] = 115, // 115-130 + [281] = 131, // 131-162 + [282] = 163, // 163-194 + [283] = 195, // 195-226 + [284] = 227, // 227-257 + [285] = 258, + }; - block.Signature = data.ReadUInt16(ref dataPtr); - if (block.Signature != SignatureValue) - return null; - - block.Data = data.ReadBytes(ref dataPtr, data.Length - 2); - - return block; + return _literalLengths; + } } + /// + /// Extra bits for literal codes 257..285 + /// + public static Dictionary LiteralExtraBits + { + get + { + // If we have cached bit mappings, use those + if (_literalExtraBits != null) + return _literalExtraBits; + + // Otherwise, build it from scratch + _literalExtraBits = new Dictionary(); + + // Literal Value 257 - 264, 0 bits + for (int i = 257; i < 265; i++) + _literalExtraBits[i] = 0; + + // Literal Value 265 - 268, 1 bit + for (int i = 265; i < 269; i++) + _literalExtraBits[i] = 1; + + // Literal Value 269 - 272, 2 bits + for (int i = 269; i < 273; i++) + _literalExtraBits[i] = 2; + + // Literal Value 273 - 276, 3 bits + for (int i = 273; i < 277; i++) + _literalExtraBits[i] = 3; + + // Literal Value 277 - 280, 4 bits + for (int i = 277; i < 281; i++) + _literalExtraBits[i] = 4; + + // Literal Value 281 - 284, 5 bits + for (int i = 281; i < 285; i++) + _literalExtraBits[i] = 5; + + // Literal Value 285, 0 bits + _literalExtraBits[285] = 0; + + return _literalExtraBits; + } + } + + /// + /// Match offsets for distance codes 0..29 + /// + /// Each value here is the lower bound for lengths represented + public static readonly int[] DistanceOffsets = new int[30] + { + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, + 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, + 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, + }; + + /// + /// Extra bits for distance codes 0..29 + /// + public static readonly int[] DistanceExtraBits = new int[30] + { + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, + 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, + }; + + /// + /// The order of the bit length Huffman code lengths + /// + public static readonly int[] BitLengthOrder = new int[19] + { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15, + }; + #endregion + + #region Instance Variables + + /// + /// Match lengths for literal codes 257..285 + /// + private static Dictionary _literalLengths = null; + + /// + /// Extra bits for literal codes 257..285 + /// + private static Dictionary _literalExtraBits = null; + + #endregion + + /// + /// The decoding algorithm for the actual data + /// + public static void Decode(MSZIPDeflateStream data) + { + // Create the output byte array + List decodedBytes = new List(); + + // Create the loop variable block + MSZIPDeflateBlock block; + + do + { + ulong header = data.ReadBitsLSB(3); + block = MSZIPDeflateBlockBuilder.Create(header); + + // We should never get a reserved block + if (block.BTYPE == DeflateCompressionType.Reserved) + throw new Exception(); + + // If stored with no compression + if (block.BTYPE == DeflateCompressionType.NoCompression) + { + // Skip any remaining bits in current partially processed byte + data.DiscardToByteBoundary(); + + // Read LEN and NLEN + byte[] nonCompressedHeader = data.ReadBytesLSB(4); + block.BlockData = MSZIPNonCompressedBlockBuilder.Create(nonCompressedHeader); + + // Copy LEN bytes of data to output + ushort length = ((MSZIPNonCompressedBlock)block.BlockData).LEN; + ((MSZIPNonCompressedBlock)block.BlockData).Data = data.ReadBytesLSB(length); + decodedBytes.AddRange(((MSZIPNonCompressedBlock)block.BlockData).Data); + } + + // Otherwise + else + { + // If compressed with dynamic Huffman codes + // read representation of code trees + block.BlockData = block.BTYPE == DeflateCompressionType.DynamicHuffman + ? (IMSZIPBlockData)MSZIPDynamicHuffmanCompressedBlockBuilder.Create(data) + : (IMSZIPBlockData)new MSZIPFixedHuffmanCompressedBlock(); + + var compressedBlock = (block.BlockData as MSZIPCompressedBlock); + + // 9 bits per entry, 288 max symbols + int[] literalDecodeTable = CreateTable(compressedBlock.LiteralLengths); + + // 6 bits per entry, 32 max symbols + int[] distanceDecodeTable = CreateTable(compressedBlock.DistanceCodes); + + // Loop until end of block code recognized + while (true) + { + // Decode literal/length value from input stream + int symbol = literalDecodeTable[data.ReadBitsLSB(9)]; + + // Copy value (literal byte) to output stream + if (symbol < 256) + { + decodedBytes.Add((byte)symbol); + } + // End of block (256) + else if (symbol == 256) + { + break; + } + else + { + // Decode distance from input stream + ulong length = data.ReadBitsLSB(LiteralExtraBits[symbol]); + length += (ulong)LiteralLengths[symbol]; + + int code = distanceDecodeTable[length]; + + ulong distance = data.ReadBitsLSB(DistanceExtraBits[code]); + distance += (ulong)DistanceOffsets[code]; + + + // Move backwards distance bytes in the output + // stream, and copy length bytes from this + // position to the output stream. + } + } + } + } while (!block.BFINAL); + + /* + Note that a duplicated string reference may refer to a string + in a previous block; i.e., the backward distance may cross one + or more block boundaries. However a distance cannot refer past + the beginning of the output stream. (An application using a + preset dictionary might discard part of the output stream; a + distance can refer to that part of the output stream anyway) + Note also that the referenced string may overlap the current + position; for example, if the last 2 bytes decoded have values + X and Y, a string reference with + adds X,Y,X,Y,X to the output stream. + */ + } + + /// + /// Given this rule, we can define the Huffman code for an alphabet + /// just by giving the bit lengths of the codes for each symbol of + /// the alphabet in order; this is sufficient to determine the + /// actual codes. In our example, the code is completely defined + /// by the sequence of bit lengths (2, 1, 3, 3). The following + /// algorithm generates the codes as integers, intended to be read + /// from most- to least-significant bit. The code lengths are + /// initially in tree[I].Len; the codes are produced in + /// tree[I].Code. + /// + public static void CreateTable(MSZIPCompressedBlock tree) + { + // Count the number of codes for each code length. Let + // bl_count[N] be the number of codes of length N, N >= 1. + var bl_count = new Dictionary(); + for (int i = 0; i < tree.LiteralLengths.Length; i++) + { + if (!bl_count.ContainsKey(tree.LiteralLengths[i])) + bl_count[tree.LiteralLengths[i]] = 0; + + bl_count[tree.LiteralLengths[i]]++; + } + + // Find the numerical value of the smallest code for each + // code length: + var next_code = new Dictionary(); + int code = 0; + bl_count[0] = 0; + for (int bits = 1; bits <= MAX_BITS; bits++) + { + code = (code + bl_count[bits - 1]) << 1; + next_code[bits] = code; + } + + // Assign numerical values to all codes, using consecutive + // values for all codes of the same length with the base + // values determined at step 2. Codes that are never used + // (which have a bit length of zero) must not be assigned a + // value. + for (int n = 0; n <= tree.LiteralLengths.Length; n++) + { + int len = tree.LiteralLengths[n]; + if (len != 0) + { + tree.DistanceCodes[n] = next_code[len]; + next_code[len]++; + } + } + } + + /// + /// Given this rule, we can define the Huffman code for an alphabet + /// just by giving the bit lengths of the codes for each symbol of + /// the alphabet in order; this is sufficient to determine the + /// actual codes. In our example, the code is completely defined + /// by the sequence of bit lengths (2, 1, 3, 3). The following + /// algorithm generates the codes as integers, intended to be read + /// from most- to least-significant bit. The code lengths are + /// initially in tree[I].Len; the codes are produced in + /// tree[I].Code. + /// + public static int[] CreateTable(int[] lengths) + { + // Count the number of codes for each code length. Let + // bl_count[N] be the number of codes of length N, N >= 1. + var bl_count = new Dictionary(); + for (int i = 0; i < lengths.Length; i++) + { + if (!bl_count.ContainsKey(lengths[i])) + bl_count[lengths[i]] = 0; + + bl_count[lengths[i]]++; + } + + // Find the numerical value of the smallest code for each + // code length: + var next_code = new Dictionary(); + int code = 0; + bl_count[0] = 0; + for (int bits = 1; bits <= MAX_BITS; bits++) + { + code = (code + bl_count[bits - 1]) << 1; + next_code[bits] = code; + } + + // Assign numerical values to all codes, using consecutive + // values for all codes of the same length with the base + // values determined at step 2. Codes that are never used + // (which have a bit length of zero) must not be assigned a + // value. + int[] distances = new int[lengths.Length]; + for (int n = 0; n <= lengths.Length; n++) + { + int len = lengths[n]; + if (len != 0) + { + distances[n] = next_code[len]; + next_code[len]++; + } + } + + return distances; + } } - #region Deflate Implementation - - /// - /// How the data are compressed - /// - public enum MSZIPDeflateCompressionType : byte - { - /// - /// no compression - /// - NoCompression = 0b00, - - /// - /// Compressed with fixed Huffman codes - /// - FixedHuffman = 0b01, - - /// - /// Compressed with dynamic Huffman codes - /// - DynamicHuffman = 0b10, - - /// - /// Reserved (error) - /// - Reserved = 0b11, - } - + /// public class MSZIPDeflateStream { #region Instance Variables @@ -499,661 +927,5 @@ namespace BurnOutSharp.FileType } } - public class MSZIPDeflate - { - #region Constants - - /// - /// Maximum Huffman code bit count - /// - public const int MAX_BITS = 16; - - #endregion - - #region Properties - - /// - /// Match lengths for literal codes 257..285 - /// - /// Each value here is the lower bound for lengths represented - public static Dictionary LiteralLengths - { - get - { - // If we have cached length mappings, use those - if (_literalLengths != null) - return _literalLengths; - - // Otherwise, build it from scratch - _literalLengths = new Dictionary - { - [257] = 3, - [258] = 4, - [259] = 5, - [260] = 6, - [261] = 7, - [262] = 8, - [263] = 9, - [264] = 10, - [265] = 11, // 11,12 - [266] = 13, // 13,14 - [267] = 15, // 15,16 - [268] = 17, // 17,18 - [269] = 19, // 19-22 - [270] = 23, // 23-26 - [271] = 27, // 27-30 - [272] = 31, // 31-34 - [273] = 35, // 35-42 - [274] = 43, // 43-50 - [275] = 51, // 51-58 - [276] = 59, // 59-66 - [277] = 67, // 67-82 - [278] = 83, // 83-98 - [279] = 99, // 99-114 - [280] = 115, // 115-130 - [281] = 131, // 131-162 - [282] = 163, // 163-194 - [283] = 195, // 195-226 - [284] = 227, // 227-257 - [285] = 258, - }; - - return _literalLengths; - } - } - - /// - /// Extra bits for literal codes 257..285 - /// - public static Dictionary LiteralExtraBits - { - get - { - // If we have cached bit mappings, use those - if (_literalExtraBits != null) - return _literalExtraBits; - - // Otherwise, build it from scratch - _literalExtraBits = new Dictionary(); - - // Literal Value 257 - 264, 0 bits - for (int i = 257; i < 265; i++) - _literalExtraBits[i] = 0; - - // Literal Value 265 - 268, 1 bit - for (int i = 265; i < 269; i++) - _literalExtraBits[i] = 1; - - // Literal Value 269 - 272, 2 bits - for (int i = 269; i < 273; i++) - _literalExtraBits[i] = 2; - - // Literal Value 273 - 276, 3 bits - for (int i = 273; i < 277; i++) - _literalExtraBits[i] = 3; - - // Literal Value 277 - 280, 4 bits - for (int i = 277; i < 281; i++) - _literalExtraBits[i] = 4; - - // Literal Value 281 - 284, 5 bits - for (int i = 281; i < 285; i++) - _literalExtraBits[i] = 5; - - // Literal Value 285, 0 bits - _literalExtraBits[285] = 0; - - return _literalExtraBits; - } - } - - /// - /// Match offsets for distance codes 0..29 - /// - /// Each value here is the lower bound for lengths represented - public static readonly int[] DistanceOffsets = new int[30] - { - 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, - 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, - 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, - }; - - /// - /// Extra bits for distance codes 0..29 - /// - public static readonly int[] DistanceExtraBits = new int[30] - { - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, - 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, - 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, - }; - - /// - /// The order of the bit length Huffman code lengths - /// - public static readonly int[] BitLengthOrder = new int[19] - { - 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15, - }; - - #endregion - - #region Instance Variables - - /// - /// Match lengths for literal codes 257..285 - /// - private static Dictionary _literalLengths = null; - - /// - /// Extra bits for literal codes 257..285 - /// - private static Dictionary _literalExtraBits = null; - - #endregion - - /// - /// The decoding algorithm for the actual data - /// - public static void Decode(MSZIPDeflateStream data) - { - // Create the output byte array - List decodedBytes = new List(); - - // Create the loop variable block - MSZIPDeflateBlock block; - - do - { - ulong header = data.ReadBitsLSB(3); - block = new MSZIPDeflateBlock(header); - - // We should never get a reserved block - if (block.BTYPE == MSZIPDeflateCompressionType.Reserved) - throw new Exception(); - - // If stored with no compression - if (block.BTYPE == MSZIPDeflateCompressionType.NoCompression) - { - // Skip any remaining bits in current partially processed byte - data.DiscardToByteBoundary(); - - // Read LEN and NLEN - byte[] nonCompressedHeader = data.ReadBytesLSB(4); - block.BlockData = new MSZIPNonCompressedBlock(nonCompressedHeader); - - // Copy LEN bytes of data to output - ushort length = ((MSZIPNonCompressedBlock)block.BlockData).LEN; - ((MSZIPNonCompressedBlock)block.BlockData).Data = data.ReadBytesLSB(length); - decodedBytes.AddRange(((MSZIPNonCompressedBlock)block.BlockData).Data); - } - - // Otherwise - else - { - // If compressed with dynamic Huffman codes - // read representation of code trees - block.BlockData = block.BTYPE == MSZIPDeflateCompressionType.DynamicHuffman - ? (IMSZIPBlockData)new MSZIPDynamicHuffmanCompressedBlock(data) - : (IMSZIPBlockData)new MSZIPFixedHuffmanCompressedBlock(); - - var compressedBlock = (block.BlockData as MSZIPCompressedBlock); - - // 9 bits per entry, 288 max symbols - int[] literalDecodeTable = CreateTable(compressedBlock.LiteralLengths); - - // 6 bits per entry, 32 max symbols - int[] distanceDecodeTable = CreateTable(compressedBlock.DistanceCodes); - - // Loop until end of block code recognized - while (true) - { - // Decode literal/length value from input stream - int symbol = literalDecodeTable[data.ReadBitsLSB(9)]; - - // Copy value (literal byte) to output stream - if (symbol < 256) - { - decodedBytes.Add((byte)symbol); - } - // End of block (256) - else if (symbol == 256) - { - break; - } - else - { - // Decode distance from input stream - ulong length = data.ReadBitsLSB(LiteralExtraBits[symbol]); - length += (ulong)LiteralLengths[symbol]; - - int code = distanceDecodeTable[length]; - - ulong distance = data.ReadBitsLSB(DistanceExtraBits[code]); - distance += (ulong)DistanceOffsets[code]; - - - // Move backwards distance bytes in the output - // stream, and copy length bytes from this - // position to the output stream. - } - } - } - } while (!block.BFINAL); - - /* - Note that a duplicated string reference may refer to a string - in a previous block; i.e., the backward distance may cross one - or more block boundaries. However a distance cannot refer past - the beginning of the output stream. (An application using a - preset dictionary might discard part of the output stream; a - distance can refer to that part of the output stream anyway) - Note also that the referenced string may overlap the current - position; for example, if the last 2 bytes decoded have values - X and Y, a string reference with - adds X,Y,X,Y,X to the output stream. - */ - } - - /// - /// Given this rule, we can define the Huffman code for an alphabet - /// just by giving the bit lengths of the codes for each symbol of - /// the alphabet in order; this is sufficient to determine the - /// actual codes. In our example, the code is completely defined - /// by the sequence of bit lengths (2, 1, 3, 3). The following - /// algorithm generates the codes as integers, intended to be read - /// from most- to least-significant bit. The code lengths are - /// initially in tree[I].Len; the codes are produced in - /// tree[I].Code. - /// - public static void CreateTable(MSZIPCompressedBlock tree) - { - // Count the number of codes for each code length. Let - // bl_count[N] be the number of codes of length N, N >= 1. - var bl_count = new Dictionary(); - for (int i = 0; i < tree.LiteralLengths.Length; i++) - { - if (!bl_count.ContainsKey(tree.LiteralLengths[i])) - bl_count[tree.LiteralLengths[i]] = 0; - - bl_count[tree.LiteralLengths[i]]++; - } - - // Find the numerical value of the smallest code for each - // code length: - var next_code = new Dictionary(); - int code = 0; - bl_count[0] = 0; - for (int bits = 1; bits <= MAX_BITS; bits++) - { - code = (code + bl_count[bits - 1]) << 1; - next_code[bits] = code; - } - - // Assign numerical values to all codes, using consecutive - // values for all codes of the same length with the base - // values determined at step 2. Codes that are never used - // (which have a bit length of zero) must not be assigned a - // value. - for (int n = 0; n <= tree.LiteralLengths.Length; n++) - { - int len = tree.LiteralLengths[n]; - if (len != 0) - { - tree.DistanceCodes[n] = next_code[len]; - next_code[len]++; - } - } - } - - /// - /// Given this rule, we can define the Huffman code for an alphabet - /// just by giving the bit lengths of the codes for each symbol of - /// the alphabet in order; this is sufficient to determine the - /// actual codes. In our example, the code is completely defined - /// by the sequence of bit lengths (2, 1, 3, 3). The following - /// algorithm generates the codes as integers, intended to be read - /// from most- to least-significant bit. The code lengths are - /// initially in tree[I].Len; the codes are produced in - /// tree[I].Code. - /// - public static int[] CreateTable(int[] lengths) - { - // Count the number of codes for each code length. Let - // bl_count[N] be the number of codes of length N, N >= 1. - var bl_count = new Dictionary(); - for (int i = 0; i < lengths.Length; i++) - { - if (!bl_count.ContainsKey(lengths[i])) - bl_count[lengths[i]] = 0; - - bl_count[lengths[i]]++; - } - - // Find the numerical value of the smallest code for each - // code length: - var next_code = new Dictionary(); - int code = 0; - bl_count[0] = 0; - for (int bits = 1; bits <= MAX_BITS; bits++) - { - code = (code + bl_count[bits - 1]) << 1; - next_code[bits] = code; - } - - // Assign numerical values to all codes, using consecutive - // values for all codes of the same length with the base - // values determined at step 2. Codes that are never used - // (which have a bit length of zero) must not be assigned a - // value. - int[] distances = new int[lengths.Length]; - for (int n = 0; n <= lengths.Length; n++) - { - int len = lengths[n]; - if (len != 0) - { - distances[n] = next_code[len]; - next_code[len]++; - } - } - - return distances; - } - } - - public class MSZIPDeflateBlock - { - #region Properties - - /// - /// Set if and only if this is the last block of the data set. - /// - /// Bit 0 - public bool BFINAL { get; set; } - - /// - /// Specifies how the data are compressed - /// - /// Bits 1-2 - public MSZIPDeflateCompressionType BTYPE { get; set; } - - /// - /// Block data as defined by the compression type - /// - public IMSZIPBlockData BlockData { get; set; } - - #endregion - - /// - /// Constructor - /// - public MSZIPDeflateBlock(ulong header) - { - BFINAL = (header & 0b100) != 0; - BTYPE = (MSZIPDeflateCompressionType)(header & 0b011); - } - } - - /// - /// Empty interface defining block types - /// - public interface IMSZIPBlockData { } - - /// - /// Non-compressed blocks (BTYPE=00) - /// - public class MSZIPNonCompressedBlock : IMSZIPBlockData - { - #region Properties - - /// - /// The number of data bytes in the block - /// - /// Bytes 0-1 - public ushort LEN { get; set; } - - /// - /// The one's complement of LEN - /// - /// Bytes 2-3 - public ushort NLEN { get; set; } - - /// - /// bytes of literal data - /// - public byte[] Data { get; set; } - - #endregion - - /// - /// Constructor - /// - public MSZIPNonCompressedBlock(byte[] header) - { - // If we have invalid header data - if (header == null || header.Length < 4) - throw new ArgumentException(); - - int offset = 0; - LEN = header.ReadUInt16(ref offset); - NLEN = header.ReadUInt16(ref offset); - - // TODO: Confirm NLEN is 1's compliment of LEN - } - } - - /// - /// Base class for compressed blocks - /// - public abstract class MSZIPCompressedBlock : IMSZIPBlockData - { - /// - /// Huffman code lengths for the literal / length alphabet - /// - public abstract int[] LiteralLengths { get; } - - /// - /// Huffman distance codes for the literal / length alphabet - /// - public abstract int[] DistanceCodes { get; } - } - - /// - /// Compression with fixed Huffman codes (BTYPE=01) - /// - public class MSZIPFixedHuffmanCompressedBlock : MSZIPCompressedBlock - { - #region Properties - - /// - public override int[] LiteralLengths - { - get - { - // If we have cached lengths, use those - if (_literalLengths != null) - return _literalLengths; - - // Otherwise, build it from scratch - _literalLengths = new int[288]; - - // Literal Value 0 - 143, 8 bits - for (int i = 0; i < 144; i++) - _literalLengths[i] = 8; - - // Literal Value 144 - 255, 9 bits - for (int i = 144; i < 256; i++) - _literalLengths[i] = 9; - - // Literal Value 256 - 279, 7 bits - for (int i = 256; i < 280; i++) - _literalLengths[i] = 7; - - // Literal Value 280 - 287, 8 bits - for (int i = 280; i < 288; i++) - _literalLengths[i] = 8; - - return _literalLengths; - } - } - - /// - public override int[] DistanceCodes - { - get - { - // If we have cached distances, use those - if (_distanceCodes != null) - return _distanceCodes; - - // Otherwise, build it from scratch - _distanceCodes = new int[32]; - - // Fixed length, 5 bits - for (int i = 0; i < 32; i++) - _distanceCodes[i] = 5; - - return _distanceCodes; - } - } - - #endregion - - #region Instance Variables - - /// - /// Huffman code lengths for the literal / length alphabet - /// - private int[] _literalLengths = null; - - /// - /// Huffman distance codes for the literal / length alphabet - /// - private int[] _distanceCodes = null; - - #endregion - } - - /// - /// Compression with dynamic Huffman codes (BTYPE=10) - /// - public class MSZIPDynamicHuffmanCompressedBlock : MSZIPCompressedBlock - { - #region Properties - - /// - public override int[] LiteralLengths { get; } = new int[19]; - - /// - public override int[] DistanceCodes { get; } = new int[19]; - - #endregion - - /// - /// Constructor - /// - public MSZIPDynamicHuffmanCompressedBlock(MSZIPDeflateStream stream) - { - // # of Literal/Length codes - 257 - ulong HLIT = stream.ReadBitsLSB(5) + 257; - - // # of Distance codes - 1 - ulong HDIST = stream.ReadBitsLSB(5) + 1; - - // HCLEN, # of Code Length codes - 4 - ulong HCLEN = stream.ReadBitsLSB(5) + 4; - - // (HCLEN + 4) x 3 bits: code lengths for the code length - // alphabet given just above - // - // These code lengths are interpreted as 3-bit integers - // (0-7); as above, a code length of 0 means the - // corresponding symbol (literal/ length or distance code - // length) is not used. - int[] codeLengthAlphabet = new int[19]; - for (ulong i = 0; i < HCLEN; i++) - codeLengthAlphabet[MSZIPDeflate.BitLengthOrder[i]] = (int)stream.ReadBitsLSB(3); - - for (ulong i = HCLEN; i < 19; i++) - codeLengthAlphabet[MSZIPDeflate.BitLengthOrder[i]] = 0; - - // Code length Huffman code - int[] codeLengthHuffmanCode = MSZIPDeflate.CreateTable(codeLengthAlphabet); - - // HLIT + 257 code lengths for the literal/length alphabet, - // encoded using the code length Huffman code - this.LiteralLengths = BuildHuffmanTree(stream, HLIT, codeLengthHuffmanCode); - - // HDIST + 1 code lengths for the distance alphabet, - // encoded using the code length Huffman code - this.DistanceCodes = BuildHuffmanTree(stream, HDIST, codeLengthHuffmanCode); - } - - /// - /// The alphabet for code lengths is as follows - /// - private int[] BuildHuffmanTree(MSZIPDeflateStream stream, ulong codeCount, int[] codeLengths) - { - // Setup the huffman tree - int[] tree = new int[codeCount]; - - // Setup the loop variables - int lastCode = 0, repeatLength = 0; - for (ulong i = 0; i < codeCount; i++) - { - int code = codeLengths[(int)stream.ReadBitsLSB(7)]; - - // Represent code lengths of 0 - 15 - if (code > 0 && code <= 15) - { - lastCode = code; - tree[i] = code; - } - - // Copy the previous code length 3 - 6 times. - // The next 2 bits indicate repeat length (0 = 3, ... , 3 = 6) - // Example: Codes 8, 16 (+2 bits 11), 16 (+2 bits 10) will expand to 12 code lengths of 8 (1 + 6 + 5) - else if (code == 16) - { - repeatLength = (int)stream.ReadBitsLSB(2); - repeatLength += 2; - code = lastCode; - } - - // Repeat a code length of 0 for 3 - 10 times. - // (3 bits of length) - else if (code == 17) - { - repeatLength = (int)stream.ReadBitsLSB(3); - repeatLength += 3; - code = 0; - } - - // Repeat a code length of 0 for 11 - 138 times - // (7 bits of length) - else if (code == 18) - { - repeatLength = (int)stream.ReadBitsLSB(7); - repeatLength += 11; - code = 0; - } - - // Everything else - else - { - throw new ArgumentOutOfRangeException(); - } - - // If we had a repeat length - for (; repeatLength > 0; repeatLength--) - { - tree[i++] = code; - } - } - - return tree; - } - } - #endregion }