mirror of
https://github.com/adamhathcock/sharpcompress.git
synced 2026-02-13 05:25:00 +00:00
Merge pull request #1076 from adamhathcock/adam/extract-all-test
add extract all test
This commit is contained in:
14
FORMATS.md
14
FORMATS.md
@@ -24,7 +24,7 @@
|
||||
1. SOLID Rars are only supported in the RarReader API.
|
||||
2. Zip format supports pkware and WinzipAES encryption. However, encrypted LZMA is not supported. Zip64 reading/writing is supported but only with seekable streams as the Zip spec doesn't support Zip64 data in post data descriptors. Deflate64 is only supported for reading. See [Zip Format Notes](#zip-format-notes) for details on multi-volume archives and streaming behavior.
|
||||
3. The Tar format requires a file size in the header. If no size is specified to the TarWriter and the stream is not seekable, then an exception will be thrown.
|
||||
4. The 7Zip format doesn't allow for reading as a forward-only stream, so 7Zip is only supported through the Archive API.
|
||||
4. The 7Zip format doesn't allow for reading as a forward-only stream so 7Zip is only supported through the Archive API. See [7Zip Format Notes](#7zip-format-notes) for details on async extraction behavior.
|
||||
5. LZip has no support for extra data like the file name or timestamp. There is a default filename used when looking at the entry Key on the archive.
|
||||
|
||||
### Zip Format Notes
|
||||
@@ -32,6 +32,18 @@
|
||||
- Multi-volume/split ZIP archives require ZipArchive (seekable streams) as ZipReader cannot seek across volume files.
|
||||
- ZipReader processes entries from LocalEntry headers (which include directory entries ending with `/`) and intentionally skips DirectoryEntry headers from the central directory, as they are redundant in streaming mode - all entry data comes from LocalEntry headers which ZipReader has already processed.
|
||||
|
||||
### 7Zip Format Notes
|
||||
|
||||
- **Async Extraction Performance**: When using async extraction methods (e.g., `ExtractAllEntries()` with `MoveToNextEntryAsync()`), each file creates its own decompression stream to avoid state corruption in the LZMA decoder. This is less efficient than synchronous extraction, which can reuse a single decompression stream for multiple files in the same folder.
|
||||
|
||||
**Performance Impact**: For archives with many small files in the same compression folder, async extraction will be slower than synchronous extraction because it must:
|
||||
1. Create a new LZMA decoder for each file
|
||||
2. Skip through the decompressed data to reach each file's starting position
|
||||
|
||||
**Recommendation**: For best performance with 7Zip archives, use synchronous extraction methods (`MoveToNextEntry()` and `WriteEntryToDirectory()`) when possible. Use async methods only when you need to avoid blocking the thread (e.g., in UI applications or async-only contexts).
|
||||
|
||||
**Technical Details**: 7Zip archives group files into "folders" (compression units), where all files in a folder share one continuous LZMA-compressed stream. The LZMA decoder maintains internal state (dictionary window, decoder positions) that assumes sequential, non-interruptible processing. Async operations can yield control during awaits, which would corrupt this shared state. To avoid this, async extraction creates a fresh decoder stream for each file.
|
||||
|
||||
## Compression Streams
|
||||
|
||||
These streams are for those who want to compress/decompress bits directly. The single-file formats are represented here as well. However, BZip2, LZip and XZ have no metadata (GZip has a little), so using them without something like a Tar file makes little sense.
|
||||
|
||||
@@ -2,6 +2,8 @@ using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using SharpCompress.Common;
|
||||
using SharpCompress.Common.SevenZip;
|
||||
using SharpCompress.Compressors.LZMA.Utilites;
|
||||
@@ -213,9 +215,7 @@ public class SevenZipArchive : AbstractArchive<SevenZipArchiveEntry, SevenZipVol
|
||||
private sealed class SevenZipReader : AbstractReader<SevenZipEntry, SevenZipVolume>
|
||||
{
|
||||
private readonly SevenZipArchive _archive;
|
||||
private CFolder? _currentFolder;
|
||||
private Stream? _currentStream;
|
||||
private CFileItem? _currentItem;
|
||||
private SevenZipEntry? _currentEntry;
|
||||
|
||||
internal SevenZipReader(ReaderOptions readerOptions, SevenZipArchive archive)
|
||||
: base(readerOptions, ArchiveType.SevenZip) => this._archive = archive;
|
||||
@@ -228,40 +228,135 @@ public class SevenZipArchive : AbstractArchive<SevenZipArchiveEntry, SevenZipVol
|
||||
stream.Position = 0;
|
||||
foreach (var dir in entries.Where(x => x.IsDirectory))
|
||||
{
|
||||
_currentEntry = dir;
|
||||
yield return dir;
|
||||
}
|
||||
foreach (
|
||||
var group in entries.Where(x => !x.IsDirectory).GroupBy(x => x.FilePart.Folder)
|
||||
)
|
||||
// For non-directory entries, yield them without creating shared streams
|
||||
// Each call to GetEntryStream() will create a fresh decompression stream
|
||||
// to avoid state corruption issues with async operations
|
||||
foreach (var entry in entries.Where(x => !x.IsDirectory))
|
||||
{
|
||||
_currentFolder = group.Key;
|
||||
if (group.Key is null)
|
||||
{
|
||||
_currentStream = Stream.Null;
|
||||
}
|
||||
else
|
||||
{
|
||||
_currentStream = _archive._database?.GetFolderStream(
|
||||
stream,
|
||||
_currentFolder,
|
||||
new PasswordProvider(Options.Password)
|
||||
);
|
||||
}
|
||||
foreach (var entry in group)
|
||||
{
|
||||
_currentItem = entry.FilePart.Header;
|
||||
yield return entry;
|
||||
}
|
||||
_currentEntry = entry;
|
||||
yield return entry;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the entry stream for the current item by exposing a size-limited,
/// read-only window over the current folder stream.
/// </summary>
/// <returns>The <see cref="EntryStream"/> produced by <c>CreateEntryStream</c>.</returns>
protected override EntryStream GetEntryStream()
{
    // The folder stream must already have been assigned; NotNull enforces that.
    var folderStream = _currentStream.NotNull("currentStream is not null");

    // When no current item is set, the window length falls back to zero.
    var entryLength = _currentItem?.Size ?? 0;

    return CreateEntryStream(new ReadOnlySubStream(folderStream, entryLength));
}
|
||||
/// <summary>
/// Builds the entry stream for the entry most recently yielded by the reader.
/// </summary>
/// <returns>
/// An <see cref="EntryStream"/> over <see cref="Stream.Null"/> for directory entries,
/// otherwise over a fresh decompression stream for the entry wrapped in
/// <c>SyncOnlyStream</c>.
/// </returns>
protected override EntryStream GetEntryStream()
{
    var current = _currentEntry.NotNull("currentEntry is not null");

    // Every file gets its own decompression stream, so no decoder state is shared
    // between entries. Even so, the LZMA decoder's async implementation has bugs
    // that corrupt state on fresh streams; SyncOnlyStream sidesteps them by routing
    // async calls to their synchronous equivalents.
    //
    // TODO: Fix the LZMA decoder async bugs (in LzmaStream, Decoder, OutWindow)
    // so this wrapper is no longer necessary.
    Stream payload = current.IsDirectory
        ? Stream.Null
        : new SyncOnlyStream(current.FilePart.GetCompressedStream());

    return CreateEntryStream(payload);
}
|
||||
}
|
||||
|
||||
/// <summary>
/// WORKAROUND: a pass-through stream whose asynchronous members are served by the
/// synchronous members of the wrapped stream.
/// This is necessary because the LZMA decoder has bugs in its async implementation
/// that cause state corruption (IndexOutOfRangeException, DataErrorException).
///
/// The proper fix would be to repair the LZMA decoder's async methods
/// (LzmaStream.ReadAsync, Decoder.CodeAsync, OutWindow async operations),
/// but that requires deep changes to the decoder state machine.
/// </summary>
/// <remarks>
/// The async overrides never throw synchronously for cancellation or I/O failures:
/// a cancelled token yields a canceled task and an exception from the underlying
/// synchronous call yields a faulted task, which is what callers of an
/// <c>async</c> implementation would observe.
/// </remarks>
private sealed class SyncOnlyStream : Stream
{
    private readonly Stream _baseStream;

    /// <summary>Wraps <paramref name="baseStream"/>; the wrapper owns and disposes it.</summary>
    /// <param name="baseStream">The stream all operations are forwarded to.</param>
    public SyncOnlyStream(Stream baseStream) => _baseStream = baseStream;

    public override bool CanRead => _baseStream.CanRead;

    public override bool CanSeek => _baseStream.CanSeek;

    public override bool CanWrite => _baseStream.CanWrite;

    public override long Length => _baseStream.Length;

    public override long Position
    {
        get => _baseStream.Position;
        set => _baseStream.Position = value;
    }

    public override void Flush() => _baseStream.Flush();

    public override int Read(byte[] buffer, int offset, int count) =>
        _baseStream.Read(buffer, offset, count);

    public override long Seek(long offset, SeekOrigin origin) =>
        _baseStream.Seek(offset, origin);

    public override void SetLength(long value) => _baseStream.SetLength(value);

    public override void Write(byte[] buffer, int offset, int count) =>
        _baseStream.Write(buffer, offset, count);

    // Force async operations to use sync equivalents to avoid LZMA decoder bugs

    /// <summary>Performs a synchronous read and returns the outcome as a completed task.</summary>
    /// <returns>
    /// A task holding the byte count, a canceled task when
    /// <paramref name="cancellationToken"/> is already cancelled, or a faulted task
    /// when the underlying read throws.
    /// </returns>
    public override Task<int> ReadAsync(
        byte[] buffer,
        int offset,
        int count,
        CancellationToken cancellationToken
    )
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return Task.FromCanceled<int>(cancellationToken);
        }

        try
        {
            return Task.FromResult(_baseStream.Read(buffer, offset, count));
        }
        catch (Exception ex)
        {
            // Surface the failure through the task, as an async method would.
            return Task.FromException<int>(ex);
        }
    }

    /// <summary>Performs a synchronous write and returns the outcome as a completed task.</summary>
    public override Task WriteAsync(
        byte[] buffer,
        int offset,
        int count,
        CancellationToken cancellationToken
    )
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return Task.FromCanceled(cancellationToken);
        }

        try
        {
            _baseStream.Write(buffer, offset, count);
            return Task.CompletedTask;
        }
        catch (Exception ex)
        {
            return Task.FromException(ex);
        }
    }

    /// <summary>Performs a synchronous flush and returns the outcome as a completed task.</summary>
    public override Task FlushAsync(CancellationToken cancellationToken)
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return Task.FromCanceled(cancellationToken);
        }

        try
        {
            _baseStream.Flush();
            return Task.CompletedTask;
        }
        catch (Exception ex)
        {
            return Task.FromException(ex);
        }
    }

#if !NETFRAMEWORK && !NETSTANDARD2_0
    /// <summary>Span-based counterpart of the array <c>ReadAsync</c> override.</summary>
    public override ValueTask<int> ReadAsync(
        Memory<byte> buffer,
        CancellationToken cancellationToken = default
    )
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return new ValueTask<int>(Task.FromCanceled<int>(cancellationToken));
        }

        try
        {
            return new ValueTask<int>(_baseStream.Read(buffer.Span));
        }
        catch (Exception ex)
        {
            return new ValueTask<int>(Task.FromException<int>(ex));
        }
    }

    /// <summary>Span-based counterpart of the array <c>WriteAsync</c> override.</summary>
    public override ValueTask WriteAsync(
        ReadOnlyMemory<byte> buffer,
        CancellationToken cancellationToken = default
    )
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return new ValueTask(Task.FromCanceled(cancellationToken));
        }

        try
        {
            _baseStream.Write(buffer.Span);

            // default(ValueTask) is an already-completed ValueTask and, unlike
            // ValueTask.CompletedTask, exists on every target this branch compiles for.
            return default;
        }
        catch (Exception ex)
        {
            return new ValueTask(Task.FromException(ex));
        }
    }
#endif

    /// <summary>Disposes the wrapped stream together with this wrapper.</summary>
    protected override void Dispose(bool disposing)
    {
        if (disposing)
        {
            _baseStream.Dispose();
        }
        base.Dispose(disposing);
    }
}
|
||||
|
||||
private class PasswordProvider : IPasswordProvider
|
||||
|
||||
@@ -428,7 +428,9 @@ public class LzmaStream : Stream, IStreamStack
|
||||
private async Task DecodeChunkHeaderAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
var controlBuffer = new byte[1];
|
||||
await _inputStream.ReadAsync(controlBuffer, 0, 1, cancellationToken).ConfigureAwait(false);
|
||||
await _inputStream
|
||||
.ReadExactlyAsync(controlBuffer, 0, 1, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
var control = controlBuffer[0];
|
||||
_inputPosition++;
|
||||
|
||||
@@ -455,11 +457,15 @@ public class LzmaStream : Stream, IStreamStack
|
||||
|
||||
_availableBytes = (control & 0x1F) << 16;
|
||||
var buffer = new byte[2];
|
||||
await _inputStream.ReadAsync(buffer, 0, 2, cancellationToken).ConfigureAwait(false);
|
||||
await _inputStream
|
||||
.ReadExactlyAsync(buffer, 0, 2, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
_availableBytes += (buffer[0] << 8) + buffer[1] + 1;
|
||||
_inputPosition += 2;
|
||||
|
||||
await _inputStream.ReadAsync(buffer, 0, 2, cancellationToken).ConfigureAwait(false);
|
||||
await _inputStream
|
||||
.ReadExactlyAsync(buffer, 0, 2, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
_rangeDecoderLimit = (buffer[0] << 8) + buffer[1] + 1;
|
||||
_inputPosition += 2;
|
||||
|
||||
@@ -467,7 +473,7 @@ public class LzmaStream : Stream, IStreamStack
|
||||
{
|
||||
_needProps = false;
|
||||
await _inputStream
|
||||
.ReadAsync(controlBuffer, 0, 1, cancellationToken)
|
||||
.ReadExactlyAsync(controlBuffer, 0, 1, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
Properties[0] = controlBuffer[0];
|
||||
_inputPosition++;
|
||||
@@ -495,7 +501,9 @@ public class LzmaStream : Stream, IStreamStack
|
||||
{
|
||||
_uncompressedChunk = true;
|
||||
var buffer = new byte[2];
|
||||
await _inputStream.ReadAsync(buffer, 0, 2, cancellationToken).ConfigureAwait(false);
|
||||
await _inputStream
|
||||
.ReadExactlyAsync(buffer, 0, 2, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
_availableBytes = (buffer[0] << 8) + buffer[1] + 1;
|
||||
_inputPosition += 2;
|
||||
}
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
using System;
|
||||
using System.Buffers;
|
||||
using System.IO;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace SharpCompress;
|
||||
|
||||
@@ -41,6 +43,28 @@ internal static class StreamExtensions
|
||||
ArrayPool<byte>.Shared.Return(temp);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Asynchronously fills <paramref name="count"/> bytes of <paramref name="buffer"/>,
/// starting at <paramref name="offset"/>, issuing as many reads as required.
/// </summary>
/// <param name="stream">The stream to read from.</param>
/// <param name="buffer">The destination array.</param>
/// <param name="offset">Index in <paramref name="buffer"/> of the first byte to write.</param>
/// <param name="count">Exact number of bytes to read.</param>
/// <param name="cancellationToken">Token forwarded to every underlying read.</param>
/// <exception cref="EndOfStreamException">
/// The stream reported end-of-data (a read returned 0) before
/// <paramref name="count"/> bytes were obtained.
/// </exception>
internal static async Task ReadExactlyAsync(
    this Stream stream,
    byte[] buffer,
    int offset,
    int count,
    CancellationToken cancellationToken
)
{
    var cursor = offset;
    var remaining = count;

    while (remaining > 0)
    {
        var received = await stream
            .ReadAsync(buffer, cursor, remaining, cancellationToken)
            .ConfigureAwait(false);

        // A zero-length read means the stream ended short of the requested amount.
        if (received == 0)
        {
            throw new EndOfStreamException();
        }

        cursor += received;
        remaining -= received;
    }
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
46
tests/SharpCompress.Test/ExtractAll.cs
Normal file
46
tests/SharpCompress.Test/ExtractAll.cs
Normal file
@@ -0,0 +1,46 @@
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
using SharpCompress.Archives;
|
||||
using SharpCompress.Common;
|
||||
using SharpCompress.Readers;
|
||||
using Xunit;
|
||||
|
||||
namespace SharpCompress.Test;
|
||||
|
||||
/// <summary>
/// Smoke tests that open a sample archive through <c>ArchiveFactory</c> and extract
/// all of its entries into the scratch directory, once through the asynchronous API
/// and once through the synchronous one. A test passes when extraction completes
/// without throwing.
/// </summary>
public class ExtractAllTests : TestBase
{
    // Full relative paths are recreated and existing files are replaced.
    private static ExtractionOptions BuildOptions() =>
        new() { ExtractFullPath = true, Overwrite = true };

    // Resolves a sample archive file name against the test archive directory.
    private string LocateArchive(string archiveName) =>
        Path.Combine(TEST_ARCHIVES_PATH, archiveName);

    [Theory]
    [InlineData("Zip.deflate.zip")]
    [InlineData("Rar5.rar")]
    [InlineData("Rar.rar")]
    [InlineData("Rar.solid.rar")]
    [InlineData("7Zip.solid.7z")]
    [InlineData("7Zip.nonsolid.7z")]
    [InlineData("7Zip.LZMA.7z")]
    public async Task ExtractAllEntriesAsync(string archivePath)
    {
        var source = LocateArchive(archivePath);
        var extraction = BuildOptions();

        using var opened = ArchiveFactory.Open(source);
        await opened.WriteToDirectoryAsync(SCRATCH_FILES_PATH, extraction);
    }

    [Theory]
    [InlineData("Zip.deflate.zip")]
    [InlineData("Rar5.rar")]
    [InlineData("Rar.rar")]
    [InlineData("Rar.solid.rar")]
    [InlineData("7Zip.solid.7z")]
    [InlineData("7Zip.nonsolid.7z")]
    [InlineData("7Zip.LZMA.7z")]
    public void ExtractAllEntriesSync(string archivePath)
    {
        var source = LocateArchive(archivePath);
        var extraction = BuildOptions();

        using var opened = ArchiveFactory.Open(source);
        opened.WriteToDirectory(SCRATCH_FILES_PATH, extraction);
    }
}
|
||||
Reference in New Issue
Block a user