diff --git a/FORMATS.md b/FORMATS.md index 7350b25e..50e88d97 100644 --- a/FORMATS.md +++ b/FORMATS.md @@ -24,7 +24,7 @@ 1. SOLID Rars are only supported in the RarReader API. 2. Zip format supports pkware and WinzipAES encryption. However, encrypted LZMA is not supported. Zip64 reading/writing is supported but only with seekable streams as the Zip spec doesn't support Zip64 data in post data descriptors. Deflate64 is only supported for reading. See [Zip Format Notes](#zip-format-notes) for details on multi-volume archives and streaming behavior. 3. The Tar format requires a file size in the header. If no size is specified to the TarWriter and the stream is not seekable, then an exception will be thrown. -4. The 7Zip format doesn't allow for reading as a forward-only stream so 7Zip is only supported through the Archive API +4. The 7Zip format doesn't allow for reading as a forward-only stream so 7Zip is only supported through the Archive API. See [7Zip Format Notes](#7zip-format-notes) for details on async extraction behavior. 5. LZip has no support for extra data like the file name or timestamp. There is a default filename used when looking at the entry Key on the archive. ### Zip Format Notes @@ -32,6 +32,18 @@ - Multi-volume/split ZIP archives require ZipArchive (seekable streams) as ZipReader cannot seek across volume files. - ZipReader processes entries from LocalEntry headers (which include directory entries ending with `/`) and intentionally skips DirectoryEntry headers from the central directory, as they are redundant in streaming mode - all entry data comes from LocalEntry headers which ZipReader has already processed. +### 7Zip Format Notes + +- **Async Extraction Performance**: When using async extraction methods (e.g., `ExtractAllEntries()` with `MoveToNextEntryAsync()`), each file creates its own decompression stream to avoid state corruption in the LZMA decoder. 
This is less efficient than synchronous extraction, which can reuse a single decompression stream for multiple files in the same folder. + + **Performance Impact**: For archives with many small files in the same compression folder, async extraction will be slower than synchronous extraction because it must: + 1. Create a new LZMA decoder for each file + 2. Skip through the decompressed data to reach each file's starting position + + **Recommendation**: For best performance with 7Zip archives, use synchronous extraction methods (`MoveToNextEntry()` and `WriteEntryToDirectory()`) when possible. Use async methods only when you need to avoid blocking the thread (e.g., in UI applications or async-only contexts). + + **Technical Details**: 7Zip archives group files into "folders" (compression units), where all files in a folder share one continuous LZMA-compressed stream. The LZMA decoder maintains internal state (dictionary window, decoder positions) that assumes sequential, non-interruptible processing. Async operations can yield control during awaits, which would corrupt this shared state. To avoid this, async extraction creates a fresh decoder stream for each file. Note that the 7Zip entry stream currently services `ReadAsync` by calling the synchronous `Read` internally (a workaround for defects in the LZMA decoder's async path), so the decompression work itself still runs synchronously on the calling thread even when the async methods are used. + ## Compression Streams For those who want to directly compress/decompress bits. The single file formats are represented here as well. However, BZip2, LZip and XZ have no metadata (GZip has a little) so using them without something like a Tar file makes little sense.
diff --git a/src/SharpCompress/Archives/SevenZip/SevenZipArchive.cs b/src/SharpCompress/Archives/SevenZip/SevenZipArchive.cs index 27d47d23..d9b5ba1a 100644 --- a/src/SharpCompress/Archives/SevenZip/SevenZipArchive.cs +++ b/src/SharpCompress/Archives/SevenZip/SevenZipArchive.cs @@ -2,6 +2,8 @@ using System; using System.Collections.Generic; using System.IO; using System.Linq; +using System.Threading; +using System.Threading.Tasks; using SharpCompress.Common; using SharpCompress.Common.SevenZip; using SharpCompress.Compressors.LZMA.Utilites; @@ -213,9 +215,7 @@ public class SevenZipArchive : AbstractArchive { private readonly SevenZipArchive _archive; - private CFolder? _currentFolder; - private Stream? _currentStream; - private CFileItem? _currentItem; + private SevenZipEntry? _currentEntry; internal SevenZipReader(ReaderOptions readerOptions, SevenZipArchive archive) : base(readerOptions, ArchiveType.SevenZip) => this._archive = archive; @@ -228,40 +228,135 @@ public class SevenZipArchive : AbstractArchive x.IsDirectory)) { + _currentEntry = dir; yield return dir; } - foreach ( - var group in entries.Where(x => !x.IsDirectory).GroupBy(x => x.FilePart.Folder) - ) + // For non-directory entries, yield them without creating shared streams + // Each call to GetEntryStream() will create a fresh decompression stream + // to avoid state corruption issues with async operations + foreach (var entry in entries.Where(x => !x.IsDirectory)) { - _currentFolder = group.Key; - if (group.Key is null) - { - _currentStream = Stream.Null; - } - else - { - _currentStream = _archive._database?.GetFolderStream( - stream, - _currentFolder, - new PasswordProvider(Options.Password) - ); - } - foreach (var entry in group) - { - _currentItem = entry.FilePart.Header; - yield return entry; - } + _currentEntry = entry; + yield return entry; } } - protected override EntryStream GetEntryStream() => - CreateEntryStream( - new ReadOnlySubStream( - _currentStream.NotNull("currentStream is not 
null"), - _currentItem?.Size ?? 0 - ) - ); + protected override EntryStream GetEntryStream() + { + // Create a fresh decompression stream for each file (no state sharing). + // However, the LZMA decoder has bugs in its async implementation that cause + // state corruption even on fresh streams. The SyncOnlyStream wrapper + // works around these bugs by forcing async operations to use sync equivalents. + // + // TODO: Fix the LZMA decoder async bugs (in LzmaStream, Decoder, OutWindow) + // so this wrapper is no longer necessary. + var entry = _currentEntry.NotNull("currentEntry is not null"); + if (entry.IsDirectory) + { + return CreateEntryStream(Stream.Null); + } + return CreateEntryStream(new SyncOnlyStream(entry.FilePart.GetCompressedStream())); + } + } + + /// <summary> + /// WORKAROUND: Forces async operations to use synchronous equivalents. + /// This is necessary because the LZMA decoder has bugs in its async implementation + /// that cause state corruption (IndexOutOfRangeException, DataErrorException). + /// + /// The proper fix would be to repair the LZMA decoder's async methods + /// (LzmaStream.ReadAsync, Decoder.CodeAsync, OutWindow async operations), + /// but that requires deep changes to the decoder state machine.
+ /// </summary> + private sealed class SyncOnlyStream : Stream + { + private readonly Stream _baseStream; + + public SyncOnlyStream(Stream baseStream) => _baseStream = baseStream; + + public override bool CanRead => _baseStream.CanRead; + public override bool CanSeek => _baseStream.CanSeek; + public override bool CanWrite => _baseStream.CanWrite; + public override long Length => _baseStream.Length; + public override long Position + { + get => _baseStream.Position; + set => _baseStream.Position = value; + } + + public override void Flush() => _baseStream.Flush(); + + public override int Read(byte[] buffer, int offset, int count) => + _baseStream.Read(buffer, offset, count); + + public override long Seek(long offset, SeekOrigin origin) => + _baseStream.Seek(offset, origin); + + public override void SetLength(long value) => _baseStream.SetLength(value); + + public override void Write(byte[] buffer, int offset, int count) => + _baseStream.Write(buffer, offset, count); + + // Force async operations to use sync equivalents to avoid LZMA decoder bugs + public override Task<int> ReadAsync( + byte[] buffer, + int offset, + int count, + CancellationToken cancellationToken + ) + { + cancellationToken.ThrowIfCancellationRequested(); + return Task.FromResult(_baseStream.Read(buffer, offset, count)); + } + + public override Task WriteAsync( + byte[] buffer, + int offset, + int count, + CancellationToken cancellationToken + ) + { + cancellationToken.ThrowIfCancellationRequested(); + _baseStream.Write(buffer, offset, count); + return Task.CompletedTask; + } + + public override Task FlushAsync(CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + _baseStream.Flush(); + return Task.CompletedTask; + } + +#if !NETFRAMEWORK && !NETSTANDARD2_0 + public override ValueTask<int> ReadAsync( + Memory<byte> buffer, + CancellationToken cancellationToken = default + ) + { + cancellationToken.ThrowIfCancellationRequested(); + return new ValueTask<int>(_baseStream.Read(buffer.Span)); +
} + + public override ValueTask WriteAsync( + ReadOnlyMemory<byte> buffer, + CancellationToken cancellationToken = default + ) + { + cancellationToken.ThrowIfCancellationRequested(); + _baseStream.Write(buffer.Span); + return default; // already-completed ValueTask; ValueTask.CompletedTask needs .NET 5+ but this #if also admits netstandard2.1 + } +#endif + + protected override void Dispose(bool disposing) + { + if (disposing) + { + _baseStream.Dispose(); + } + base.Dispose(disposing); + } } private class PasswordProvider : IPasswordProvider diff --git a/src/SharpCompress/Compressors/LZMA/LzmaStream.cs b/src/SharpCompress/Compressors/LZMA/LzmaStream.cs index eaef5fd3..e9d5877f 100644 --- a/src/SharpCompress/Compressors/LZMA/LzmaStream.cs +++ b/src/SharpCompress/Compressors/LZMA/LzmaStream.cs @@ -428,7 +428,9 @@ public class LzmaStream : Stream, IStreamStack private async Task DecodeChunkHeaderAsync(CancellationToken cancellationToken = default) { var controlBuffer = new byte[1]; - await _inputStream.ReadAsync(controlBuffer, 0, 1, cancellationToken).ConfigureAwait(false); + await _inputStream + .ReadExactlyAsync(controlBuffer, 0, 1, cancellationToken) + .ConfigureAwait(false); var control = controlBuffer[0]; _inputPosition++; @@ -455,11 +457,15 @@ public class LzmaStream : Stream, IStreamStack _availableBytes = (control & 0x1F) << 16; var buffer = new byte[2]; - await _inputStream.ReadAsync(buffer, 0, 2, cancellationToken).ConfigureAwait(false); + await _inputStream + .ReadExactlyAsync(buffer, 0, 2, cancellationToken) + .ConfigureAwait(false); _availableBytes += (buffer[0] << 8) + buffer[1] + 1; _inputPosition += 2; - await _inputStream.ReadAsync(buffer, 0, 2, cancellationToken).ConfigureAwait(false); + await _inputStream + .ReadExactlyAsync(buffer, 0, 2, cancellationToken) + .ConfigureAwait(false); _rangeDecoderLimit = (buffer[0] << 8) + buffer[1] + 1; _inputPosition += 2; @@ -467,7 +473,7 @@ public class LzmaStream : Stream, IStreamStack { _needProps = false; await _inputStream - .ReadAsync(controlBuffer, 0, 1, cancellationToken) +
.ReadExactlyAsync(controlBuffer, 0, 1, cancellationToken) .ConfigureAwait(false); Properties[0] = controlBuffer[0]; _inputPosition++; @@ -495,7 +501,9 @@ public class LzmaStream : Stream, IStreamStack { _uncompressedChunk = true; var buffer = new byte[2]; - await _inputStream.ReadAsync(buffer, 0, 2, cancellationToken).ConfigureAwait(false); + await _inputStream + .ReadExactlyAsync(buffer, 0, 2, cancellationToken) + .ConfigureAwait(false); _availableBytes = (buffer[0] << 8) + buffer[1] + 1; _inputPosition += 2; } diff --git a/src/SharpCompress/Polyfills/StreamExtensions.cs b/src/SharpCompress/Polyfills/StreamExtensions.cs index ab118a4f..f00b274a 100644 --- a/src/SharpCompress/Polyfills/StreamExtensions.cs +++ b/src/SharpCompress/Polyfills/StreamExtensions.cs @@ -3,6 +3,8 @@ using System; using System.Buffers; using System.IO; +using System.Threading; +using System.Threading.Tasks; namespace SharpCompress; @@ -41,6 +43,32 @@ internal static class StreamExtensions ArrayPool<byte>.Shared.Return(temp); } } + + /// <summary> + /// Reads exactly <paramref name="count"/> bytes into <paramref name="buffer"/> starting at <paramref name="offset"/>, looping until short reads add up to the full amount. + /// </summary> + /// <exception cref="EndOfStreamException">The stream ended before <paramref name="count"/> bytes were read.</exception> + internal static async Task ReadExactlyAsync( + this Stream stream, + byte[] buffer, + int offset, + int count, + CancellationToken cancellationToken + ) + { + var totalRead = 0; + while (totalRead < count) + { + var read = await stream + .ReadAsync(buffer, offset + totalRead, count - totalRead, cancellationToken) + .ConfigureAwait(false); + if (read == 0) + { + throw new EndOfStreamException(); + } + totalRead += read; + } + } } #endif diff --git a/tests/SharpCompress.Test/ExtractAll.cs b/tests/SharpCompress.Test/ExtractAll.cs new file mode 100644 index 00000000..3e8b7d7a --- /dev/null +++ b/tests/SharpCompress.Test/ExtractAll.cs @@ -0,0 +1,50 @@ +using System.IO; +using System.Linq; +using System.Threading.Tasks; +using SharpCompress.Archives; +using SharpCompress.Common; +using SharpCompress.Readers; +using Xunit; + +namespace SharpCompress.Test; + +public class ExtractAllTests : TestBase +{ + [Theory] + [InlineData("Zip.deflate.zip")] + [InlineData("Rar5.rar")] +
[InlineData("Rar.rar")] + [InlineData("Rar.solid.rar")] + [InlineData("7Zip.solid.7z")] + [InlineData("7Zip.nonsolid.7z")] + [InlineData("7Zip.LZMA.7z")] + public async Task ExtractAllEntriesAsync(string archivePath) + { + var testArchive = Path.Combine(TEST_ARCHIVES_PATH, archivePath); + var options = new ExtractionOptions() { ExtractFullPath = true, Overwrite = true }; + + using var archive = ArchiveFactory.Open(testArchive); + await archive.WriteToDirectoryAsync(SCRATCH_FILES_PATH, options); + // Guard against a silent no-op: extraction must have produced some output. + Assert.NotEmpty(Directory.EnumerateFileSystemEntries(SCRATCH_FILES_PATH)); + } + + [Theory] + [InlineData("Zip.deflate.zip")] + [InlineData("Rar5.rar")] + [InlineData("Rar.rar")] + [InlineData("Rar.solid.rar")] + [InlineData("7Zip.solid.7z")] + [InlineData("7Zip.nonsolid.7z")] + [InlineData("7Zip.LZMA.7z")] + public void ExtractAllEntriesSync(string archivePath) + { + var testArchive = Path.Combine(TEST_ARCHIVES_PATH, archivePath); + var options = new ExtractionOptions() { ExtractFullPath = true, Overwrite = true }; + + using var archive = ArchiveFactory.Open(testArchive); + archive.WriteToDirectory(SCRATCH_FILES_PATH, options); + // Guard against a silent no-op: extraction must have produced some output. + Assert.NotEmpty(Directory.EnumerateFileSystemEntries(SCRATCH_FILES_PATH)); + } +}