Merge pull request #1076 from adamhathcock/adam/extract-all-test

add extract all test
This commit is contained in:
Adam Hathcock
2025-12-19 14:58:33 +00:00
committed by GitHub
5 changed files with 222 additions and 37 deletions

View File

@@ -24,7 +24,7 @@
1. SOLID Rars are only supported in the RarReader API.
2. Zip format supports pkware and WinzipAES encryption. However, encrypted LZMA is not supported. Zip64 reading/writing is supported but only with seekable streams as the Zip spec doesn't support Zip64 data in post data descriptors. Deflate64 is only supported for reading. See [Zip Format Notes](#zip-format-notes) for details on multi-volume archives and streaming behavior.
3. The Tar format requires a file size in the header. If no size is specified to the TarWriter and the stream is not seekable, then an exception will be thrown.
4. The 7Zip format doesn't allow for reading as a forward-only stream so 7Zip is only supported through the Archive API
4. The 7Zip format doesn't allow for reading as a forward-only stream so 7Zip is only supported through the Archive API. See [7Zip Format Notes](#7zip-format-notes) for details on async extraction behavior.
5. LZip has no support for extra data like the file name or timestamp. There is a default filename used when looking at the entry Key on the archive.
### Zip Format Notes
@@ -32,6 +32,18 @@
- Multi-volume/split ZIP archives require ZipArchive (seekable streams) as ZipReader cannot seek across volume files.
- ZipReader processes entries from LocalEntry headers (which include directory entries ending with `/`) and intentionally skips DirectoryEntry headers from the central directory, as they are redundant in streaming mode - all entry data comes from LocalEntry headers which ZipReader has already processed.
### 7Zip Format Notes
- **Async Extraction Performance**: When using async extraction methods (e.g., `ExtractAllEntries()` with `MoveToNextEntryAsync()`), each file creates its own decompression stream to avoid state corruption in the LZMA decoder. This is less efficient than synchronous extraction, which can reuse a single decompression stream for multiple files in the same folder.
**Performance Impact**: For archives with many small files in the same compression folder, async extraction will be slower than synchronous extraction because it must:
1. Create a new LZMA decoder for each file
2. Skip through the decompressed data to reach each file's starting position
**Recommendation**: For best performance with 7Zip archives, use synchronous extraction methods (`MoveToNextEntry()` and `WriteEntryToDirectory()`) when possible. Use async methods only when you need to avoid blocking the thread (e.g., in UI applications or async-only contexts).
**Technical Details**: 7Zip archives group files into "folders" (compression units), where all files in a folder share one continuous LZMA-compressed stream. The LZMA decoder maintains internal state (dictionary window, decoder positions) that assumes sequential, non-interruptible processing. Async operations can yield control during awaits, which would corrupt this shared state. To avoid this, async extraction creates a fresh decoder stream for each file.
## Compression Streams
For those who want to directly compress/decompress bits. The single file formats are represented here as well. However, BZip2, LZip and XZ have no metadata (GZip has a little) so using them without something like a Tar file makes little sense.

View File

@@ -2,6 +2,8 @@ using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using SharpCompress.Common;
using SharpCompress.Common.SevenZip;
using SharpCompress.Compressors.LZMA.Utilites;
@@ -213,9 +215,7 @@ public class SevenZipArchive : AbstractArchive<SevenZipArchiveEntry, SevenZipVol
private sealed class SevenZipReader : AbstractReader<SevenZipEntry, SevenZipVolume>
{
private readonly SevenZipArchive _archive;
private CFolder? _currentFolder;
private Stream? _currentStream;
private CFileItem? _currentItem;
private SevenZipEntry? _currentEntry;
internal SevenZipReader(ReaderOptions readerOptions, SevenZipArchive archive)
: base(readerOptions, ArchiveType.SevenZip) => this._archive = archive;
@@ -228,40 +228,135 @@ public class SevenZipArchive : AbstractArchive<SevenZipArchiveEntry, SevenZipVol
stream.Position = 0;
foreach (var dir in entries.Where(x => x.IsDirectory))
{
_currentEntry = dir;
yield return dir;
}
foreach (
var group in entries.Where(x => !x.IsDirectory).GroupBy(x => x.FilePart.Folder)
)
// For non-directory entries, yield them without creating shared streams
// Each call to GetEntryStream() will create a fresh decompression stream
// to avoid state corruption issues with async operations
foreach (var entry in entries.Where(x => !x.IsDirectory))
{
_currentFolder = group.Key;
if (group.Key is null)
{
_currentStream = Stream.Null;
}
else
{
_currentStream = _archive._database?.GetFolderStream(
stream,
_currentFolder,
new PasswordProvider(Options.Password)
);
}
foreach (var entry in group)
{
_currentItem = entry.FilePart.Header;
yield return entry;
}
_currentEntry = entry;
yield return entry;
}
}
protected override EntryStream GetEntryStream() =>
CreateEntryStream(
new ReadOnlySubStream(
_currentStream.NotNull("currentStream is not null"),
_currentItem?.Size ?? 0
)
);
protected override EntryStream GetEntryStream()
{
// Create a fresh decompression stream for each file (no state sharing).
// However, the LZMA decoder has bugs in its async implementation that cause
// state corruption even on fresh streams. The SyncOnlyStream wrapper
// works around these bugs by forcing async operations to use sync equivalents.
//
// TODO: Fix the LZMA decoder async bugs (in LzmaStream, Decoder, OutWindow)
// so this wrapper is no longer necessary.
var entry = _currentEntry.NotNull("currentEntry is not null");
if (entry.IsDirectory)
{
return CreateEntryStream(Stream.Null);
}
return CreateEntryStream(new SyncOnlyStream(entry.FilePart.GetCompressedStream()));
}
}
/// <summary>
/// WORKAROUND: Forces async operations to use synchronous equivalents.
/// This is necessary because the LZMA decoder has bugs in its async implementation
/// that cause state corruption (IndexOutOfRangeException, DataErrorException).
///
/// The proper fix would be to repair the LZMA decoder's async methods
/// (LzmaStream.ReadAsync, Decoder.CodeAsync, OutWindow async operations),
/// but that requires deep changes to the decoder state machine.
/// </summary>
private sealed class SyncOnlyStream : Stream
{
private readonly Stream _baseStream;
public SyncOnlyStream(Stream baseStream) => _baseStream = baseStream;
public override bool CanRead => _baseStream.CanRead;
public override bool CanSeek => _baseStream.CanSeek;
public override bool CanWrite => _baseStream.CanWrite;
public override long Length => _baseStream.Length;
public override long Position
{
get => _baseStream.Position;
set => _baseStream.Position = value;
}
public override void Flush() => _baseStream.Flush();
public override int Read(byte[] buffer, int offset, int count) =>
_baseStream.Read(buffer, offset, count);
public override long Seek(long offset, SeekOrigin origin) =>
_baseStream.Seek(offset, origin);
public override void SetLength(long value) => _baseStream.SetLength(value);
public override void Write(byte[] buffer, int offset, int count) =>
_baseStream.Write(buffer, offset, count);
// Force async operations to use sync equivalents to avoid LZMA decoder bugs
public override Task<int> ReadAsync(
byte[] buffer,
int offset,
int count,
CancellationToken cancellationToken
)
{
cancellationToken.ThrowIfCancellationRequested();
return Task.FromResult(_baseStream.Read(buffer, offset, count));
}
public override Task WriteAsync(
byte[] buffer,
int offset,
int count,
CancellationToken cancellationToken
)
{
cancellationToken.ThrowIfCancellationRequested();
_baseStream.Write(buffer, offset, count);
return Task.CompletedTask;
}
public override Task FlushAsync(CancellationToken cancellationToken)
{
cancellationToken.ThrowIfCancellationRequested();
_baseStream.Flush();
return Task.CompletedTask;
}
#if !NETFRAMEWORK && !NETSTANDARD2_0
public override ValueTask<int> ReadAsync(
Memory<byte> buffer,
CancellationToken cancellationToken = default
)
{
cancellationToken.ThrowIfCancellationRequested();
return new ValueTask<int>(_baseStream.Read(buffer.Span));
}
public override ValueTask WriteAsync(
ReadOnlyMemory<byte> buffer,
CancellationToken cancellationToken = default
)
{
cancellationToken.ThrowIfCancellationRequested();
_baseStream.Write(buffer.Span);
return ValueTask.CompletedTask;
}
#endif
protected override void Dispose(bool disposing)
{
if (disposing)
{
_baseStream.Dispose();
}
base.Dispose(disposing);
}
}
private class PasswordProvider : IPasswordProvider

View File

@@ -428,7 +428,9 @@ public class LzmaStream : Stream, IStreamStack
private async Task DecodeChunkHeaderAsync(CancellationToken cancellationToken = default)
{
var controlBuffer = new byte[1];
await _inputStream.ReadAsync(controlBuffer, 0, 1, cancellationToken).ConfigureAwait(false);
await _inputStream
.ReadExactlyAsync(controlBuffer, 0, 1, cancellationToken)
.ConfigureAwait(false);
var control = controlBuffer[0];
_inputPosition++;
@@ -455,11 +457,15 @@ public class LzmaStream : Stream, IStreamStack
_availableBytes = (control & 0x1F) << 16;
var buffer = new byte[2];
await _inputStream.ReadAsync(buffer, 0, 2, cancellationToken).ConfigureAwait(false);
await _inputStream
.ReadExactlyAsync(buffer, 0, 2, cancellationToken)
.ConfigureAwait(false);
_availableBytes += (buffer[0] << 8) + buffer[1] + 1;
_inputPosition += 2;
await _inputStream.ReadAsync(buffer, 0, 2, cancellationToken).ConfigureAwait(false);
await _inputStream
.ReadExactlyAsync(buffer, 0, 2, cancellationToken)
.ConfigureAwait(false);
_rangeDecoderLimit = (buffer[0] << 8) + buffer[1] + 1;
_inputPosition += 2;
@@ -467,7 +473,7 @@ public class LzmaStream : Stream, IStreamStack
{
_needProps = false;
await _inputStream
.ReadAsync(controlBuffer, 0, 1, cancellationToken)
.ReadExactlyAsync(controlBuffer, 0, 1, cancellationToken)
.ConfigureAwait(false);
Properties[0] = controlBuffer[0];
_inputPosition++;
@@ -495,7 +501,9 @@ public class LzmaStream : Stream, IStreamStack
{
_uncompressedChunk = true;
var buffer = new byte[2];
await _inputStream.ReadAsync(buffer, 0, 2, cancellationToken).ConfigureAwait(false);
await _inputStream
.ReadExactlyAsync(buffer, 0, 2, cancellationToken)
.ConfigureAwait(false);
_availableBytes = (buffer[0] << 8) + buffer[1] + 1;
_inputPosition += 2;
}

View File

@@ -3,6 +3,8 @@
using System;
using System.Buffers;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
namespace SharpCompress;
@@ -41,6 +43,28 @@ internal static class StreamExtensions
ArrayPool<byte>.Shared.Return(temp);
}
}
internal static async Task ReadExactlyAsync(
this Stream stream,
byte[] buffer,
int offset,
int count,
CancellationToken cancellationToken
)
{
var totalRead = 0;
while (totalRead < count)
{
var read = await stream
.ReadAsync(buffer, offset + totalRead, count - totalRead, cancellationToken)
.ConfigureAwait(false);
if (read == 0)
{
throw new EndOfStreamException();
}
totalRead += read;
}
}
}
#endif

View File

@@ -0,0 +1,46 @@
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using SharpCompress.Archives;
using SharpCompress.Common;
using SharpCompress.Readers;
using Xunit;
namespace SharpCompress.Test;
public class ExtractAllTests : TestBase
{
[Theory]
[InlineData("Zip.deflate.zip")]
[InlineData("Rar5.rar")]
[InlineData("Rar.rar")]
[InlineData("Rar.solid.rar")]
[InlineData("7Zip.solid.7z")]
[InlineData("7Zip.nonsolid.7z")]
[InlineData("7Zip.LZMA.7z")]
public async Task ExtractAllEntriesAsync(string archivePath)
{
var testArchive = Path.Combine(TEST_ARCHIVES_PATH, archivePath);
var options = new ExtractionOptions() { ExtractFullPath = true, Overwrite = true };
using var archive = ArchiveFactory.Open(testArchive);
await archive.WriteToDirectoryAsync(SCRATCH_FILES_PATH, options);
}
[Theory]
[InlineData("Zip.deflate.zip")]
[InlineData("Rar5.rar")]
[InlineData("Rar.rar")]
[InlineData("Rar.solid.rar")]
[InlineData("7Zip.solid.7z")]
[InlineData("7Zip.nonsolid.7z")]
[InlineData("7Zip.LZMA.7z")]
public void ExtractAllEntriesSync(string archivePath)
{
var testArchive = Path.Combine(TEST_ARCHIVES_PATH, archivePath);
var options = new ExtractionOptions() { ExtractFullPath = true, Overwrite = true };
using var archive = ArchiveFactory.Open(testArchive);
archive.WriteToDirectory(SCRATCH_FILES_PATH, options);
}
}