diff --git a/src/libraries/System.Formats.Tar/src/System.Formats.Tar.csproj b/src/libraries/System.Formats.Tar/src/System.Formats.Tar.csproj index c76c2a5250830a..4851fd6bd9e677 100644 --- a/src/libraries/System.Formats.Tar/src/System.Formats.Tar.csproj +++ b/src/libraries/System.Formats.Tar/src/System.Formats.Tar.csproj @@ -33,6 +33,7 @@ + diff --git a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/GnuSparseStream.cs b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/GnuSparseStream.cs new file mode 100644 index 00000000000000..1eb80d16b717fc --- /dev/null +++ b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/GnuSparseStream.cs @@ -0,0 +1,478 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Buffers; +using System.Buffers.Text; +using System.Diagnostics; +using System.IO; +using System.Threading; +using System.Threading.Tasks; + +namespace System.Formats.Tar +{ + // Stream that wraps the raw data section of a GNU sparse format 1.0 PAX entry and + // expands it to the virtual file size by inserting zeros for sparse holes. + // + // The raw data section layout: + // [sparse map text: numSegs\n, then pairs of offset\n numbytes\n] + // [zero padding to the next 512-byte block boundary] + // [packed non-zero data segments, stored sequentially] + // + // This stream presents a virtual file of size 'realSize' where: + // - regions covered by sparse map segments contain the packed data + // - all other regions (holes) read as zero bytes + internal sealed class GnuSparseStream : Stream + { + // Caps the segment count to prevent excessive memory allocation from malformed archives. + // Each segment entry in the array occupies 16 bytes, so 1M segments = 16 MB. 
+        private const int MaxSparseSegments = 1_000_000;
+
+        // Raw data section of the entry: sparse map text, zero padding, then the packed segments.
+        private readonly Stream _rawStream;
+        private bool _isDisposed;
+        // Size of the expanded (virtual) file exposed through Length.
+        private readonly long _realSize;
+
+        // Sparse map state — initialized lazily on first Read to avoid consuming the raw
+        // stream before TarWriter has a chance to copy the condensed data.
+        // _segments[i] is the (virtual offset, length) of data segment i;
+        // _packedStartOffsets[i] is the offset of that segment's bytes within the packed data.
+        private (long Offset, long Length)[]? _segments;
+        private long[]? _packedStartOffsets;
+
+        private long _virtualPosition; // current position in the virtual (expanded) file
+
+        // Offset within the packed data at which the previous packed read ended.
+        // ReadFromPackedData compares the requested packed offset against this value to decide
+        // how far the raw stream has to be moved before reading.
+        private long _nextPackedOffset;
+
+        // Cached segment index for sequential read optimization.
+        // For typical forward sequential reads, this avoids rescanning the segment list from the start.
+        private int _currentSegmentIndex;
+
+        internal GnuSparseStream(Stream rawStream, long realSize)
+        {
+            _rawStream = rawStream;
+            _realSize = realSize;
+        }
+
+        // Parses the sparse map on first read. Populates _segments and _packedStartOffsets.
+        // Throws InvalidDataException if the sparse map is malformed.
+        private void EnsureInitialized()
+        {
+            if (_segments is not null)
+            {
+                return;
+            }
+
+            // With isAsync: false the parser only performs synchronous reads, so the returned
+            // task is already completed when it is unwrapped here.
+            var segments = ParseSparseMap(isAsync: false, _rawStream, CancellationToken.None).GetAwaiter().GetResult();
+            InitializeFromParsedMap(segments);
+        }
+
+        // Asynchronous counterpart of EnsureInitialized.
+        private async ValueTask EnsureInitializedAsync(CancellationToken cancellationToken)
+        {
+            if (_segments is not null)
+            {
+                return;
+            }
+
+            var segments = await ParseSparseMap(isAsync: true, _rawStream, cancellationToken).ConfigureAwait(false);
+            InitializeFromParsedMap(segments);
+        }
+
+        // Validates the parsed segments (ascending, non-overlapping, inside the virtual file) and
+        // computes, for each segment, where its bytes start within the packed data.
+        // Throws InvalidDataException when a segment is out of order or out of bounds.
+        private void InitializeFromParsedMap((long Offset, long Length)[] segments)
+        {
+            long[] packedStartOffsets = new long[segments.Length];
+            long sum = 0;
+            long previousEnd = 0;
+            for (int i = 0; i < segments.Length; i++)
+            {
+                var (offset, length) = segments[i];
+
+                // Validate segment ordering and bounds. The values come from the archive, so
+                // 'offset + length' must not be evaluated before it is known to fit: compare
+                // length against the space remaining after offset instead, which cannot overflow.
+                if (offset < previousEnd || length < 0 || offset > _realSize || length > _realSize - offset)
+                {
+                    throw new InvalidDataException(SR.TarInvalidNumber);
+                }
+                previousEnd = offset + length;
+
+                packedStartOffsets[i] = sum;
+                try
+                {
+                    sum = checked(sum + length);
+                }
+                catch (OverflowException ex)
+                {
+                    throw new InvalidDataException(SR.TarInvalidNumber, ex);
+                }
+            }
+
+            // Publish the state only after the whole map has been validated.
+            // Assign _segments last — it serves as the initialization flag.
+            _packedStartOffsets = packedStartOffsets;
+            _segments = segments;
+        }
+
+        public override bool CanRead => !_isDisposed;
+        public override bool CanSeek => !_isDisposed && _rawStream.CanSeek;
+        public override bool CanWrite => false;
+
+        // The length of the expanded (virtual) file, not of the condensed data in the archive.
+        public override long Length
+        {
+            get
+            {
+                ThrowIfDisposed();
+                return _realSize;
+            }
+        }
+
+        // Position within the expanded (virtual) file. Setting it requires a seekable raw stream.
+        public override long Position
+        {
+            get
+            {
+                ThrowIfDisposed();
+                return _virtualPosition;
+            }
+            set
+            {
+                ThrowIfDisposed();
+                if (!_rawStream.CanSeek)
+                {
+                    throw new NotSupportedException(SR.IO_NotSupported_UnseekableStream);
+                }
+                ArgumentOutOfRangeException.ThrowIfNegative(value);
+                _virtualPosition = value;
+                _currentSegmentIndex = 0; // Reset segment hint after seek
+            }
+        }
+
+        // Moves the virtual position. Only the virtual position changes here; the raw stream is
+        // repositioned by the next read that touches packed data.
+        public override long Seek(long offset, SeekOrigin origin)
+        {
+            ThrowIfDisposed();
+            if (!_rawStream.CanSeek)
+            {
+                throw new NotSupportedException(SR.IO_NotSupported_UnseekableStream);
+            }
+
+            long newPosition = origin switch
+            {
+                SeekOrigin.Begin => offset,
+                SeekOrigin.Current => _virtualPosition + offset,
+                SeekOrigin.End => _realSize + offset,
+                _ => throw new ArgumentOutOfRangeException(nameof(origin)),
+            };
+
+            if (newPosition < 0)
+            {
+                throw new IOException(SR.IO_SeekBeforeBegin);
+            }
+
+            _virtualPosition = newPosition;
+            _currentSegmentIndex = 0; // Reset segment hint after seek
+            return _virtualPosition;
+        }
+
+        public override int Read(byte[] buffer, int offset, int count)
+        {
+            ValidateBufferArguments(buffer, offset, count);
+            return Read(buffer.AsSpan(offset, count));
+        }
+
+        // Reads expanded file content: zeros for holes, packed bytes for data segments.
+        // Returns after at most one read of the raw stream, so fewer bytes than requested may be
+        // returned even before the end of the virtual file.
+        public override int Read(Span<byte> destination)
+        {
+            ThrowIfDisposed();
+
+            // Same short-circuit as ReadAsync: an empty or at-end read returns 0 without parsing
+            // the sparse map, so it never touches the raw stream.
+            if (destination.IsEmpty || _virtualPosition >= _realSize)
+            {
+                return 0;
+            }
+
+            EnsureInitialized();
+            Debug.Assert(_segments is not null && _packedStartOffsets is not null);
+
+            int toRead = (int)Math.Min(destination.Length, _realSize - _virtualPosition);
+            destination = destination.Slice(0, toRead);
+
+            int totalFilled = 0;
+            while (totalFilled < toRead)
+            {
+                long vPos = _virtualPosition + totalFilled;
+                int segIdx = FindSegmentFromCurrent(vPos);
+
+                if (segIdx < 0)
+                {
+                    // vPos is in a sparse hole — fill with zeros until the next segment or end of file.
+                    long nextSegStart = ~segIdx < _segments.Length ? _segments[~segIdx].Offset : _realSize;
+                    int zeroCount = (int)Math.Min(toRead - totalFilled, nextSegStart - vPos);
+                    destination.Slice(totalFilled, zeroCount).Clear();
+                    totalFilled += zeroCount;
+                }
+                else
+                {
+                    // vPos is within segment segIdx — read from packed data.
+                    var (segOffset, segLength) = _segments[segIdx];
+                    long offsetInSeg = vPos - segOffset;
+                    long remainingInSeg = segLength - offsetInSeg;
+                    int countToRead = (int)Math.Min(toRead - totalFilled, remainingInSeg);
+
+                    long packedOffset = _packedStartOffsets[segIdx] + offsetInSeg;
+                    int bytesRead = ReadFromPackedData(destination.Slice(totalFilled, countToRead), packedOffset);
+                    totalFilled += bytesRead;
+                    break; // Return after an underlying read; caller can call Read again for more.
+                }
+            }
+
+            _virtualPosition += totalFilled;
+            return totalFilled;
+        }
+
+        public override Task<int> ReadAsync(byte[] buffer, int offset, int count, CancellationToken cancellationToken)
+        {
+            if (cancellationToken.IsCancellationRequested)
+            {
+                return Task.FromCanceled<int>(cancellationToken);
+            }
+            ValidateBufferArguments(buffer, offset, count);
+            return ReadAsync(new Memory<byte>(buffer, offset, count), cancellationToken).AsTask();
+        }
+
+        public override ValueTask<int> ReadAsync(Memory<byte> buffer, CancellationToken cancellationToken = default)
+        {
+            if (cancellationToken.IsCancellationRequested)
+            {
+                return ValueTask.FromCanceled<int>(cancellationToken);
+            }
+            ThrowIfDisposed();
+            if (buffer.IsEmpty || _virtualPosition >= _realSize)
+            {
+                return ValueTask.FromResult(0);
+            }
+            return ReadAsyncCore(buffer, cancellationToken);
+        }
+
+        // Asynchronous counterpart of Read(Span<byte>); same hole/segment logic.
+        private async ValueTask<int> ReadAsyncCore(Memory<byte> buffer, CancellationToken cancellationToken)
+        {
+            await EnsureInitializedAsync(cancellationToken).ConfigureAwait(false);
+            Debug.Assert(_segments is not null && _packedStartOffsets is not null);
+
+            int toRead = (int)Math.Min(buffer.Length, _realSize - _virtualPosition);
+            buffer = buffer.Slice(0, toRead);
+
+            int totalFilled = 0;
+            while (totalFilled < toRead)
+            {
+                long vPos = _virtualPosition + totalFilled;
+                int segIdx = FindSegmentFromCurrent(vPos);
+
+                if (segIdx < 0)
+                {
+                    // vPos is in a sparse hole — fill with zeros until the next segment or end of file.
+                    long nextSegStart = ~segIdx < _segments.Length ? _segments[~segIdx].Offset : _realSize;
+                    int zeroCount = (int)Math.Min(toRead - totalFilled, nextSegStart - vPos);
+                    buffer.Slice(totalFilled, zeroCount).Span.Clear();
+                    totalFilled += zeroCount;
+                }
+                else
+                {
+                    // vPos is within segment segIdx — read from packed data.
+                    var (segOffset, segLength) = _segments[segIdx];
+                    long offsetInSeg = vPos - segOffset;
+                    long remainingInSeg = segLength - offsetInSeg;
+                    int countToRead = (int)Math.Min(toRead - totalFilled, remainingInSeg);
+
+                    long packedOffset = _packedStartOffsets[segIdx] + offsetInSeg;
+                    int bytesRead = await ReadFromPackedDataAsync(buffer.Slice(totalFilled, countToRead), packedOffset, cancellationToken).ConfigureAwait(false);
+                    totalFilled += bytesRead;
+                    break; // Return after an underlying read; caller can call ReadAsync again for more.
+                }
+            }
+
+            _virtualPosition += totalFilled;
+            return totalFilled;
+        }
+
+        // Exposes the underlying raw stream for callers that need to access the condensed data.
+        internal Stream BaseStream => _rawStream;
+
+        // Reads from the packed data at the given packedOffset.
+        // After EnsureInitialized, the raw stream is positioned at the start of the packed data and
+        // _nextPackedOffset tracks how far into the packed data we've read.
+        // Returns the number of bytes actually read (may be less than destination.Length).
+        private int ReadFromPackedData(Span<byte> destination, long packedOffset)
+        {
+            // Distance between where the raw stream currently is and where this read must start.
+            long skipBytes = packedOffset - _nextPackedOffset;
+            if (skipBytes < 0 && !_rawStream.CanSeek)
+            {
+                // Going backwards is only possible on a seekable raw stream.
+                throw new InvalidOperationException(SR.IO_NotSupported_UnseekableStream);
+            }
+            if (skipBytes != 0)
+            {
+                TarHelpers.AdvanceStream(_rawStream, skipBytes);
+            }
+            int bytesRead = _rawStream.Read(destination);
+            _nextPackedOffset = packedOffset + bytesRead;
+            return bytesRead;
+        }
+
+        // Asynchronous counterpart of ReadFromPackedData.
+        private async ValueTask<int> ReadFromPackedDataAsync(Memory<byte> destination, long packedOffset, CancellationToken cancellationToken)
+        {
+            long skipBytes = packedOffset - _nextPackedOffset;
+            if (skipBytes < 0 && !_rawStream.CanSeek)
+            {
+                throw new InvalidOperationException(SR.IO_NotSupported_UnseekableStream);
+            }
+            if (skipBytes != 0)
+            {
+                await TarHelpers.AdvanceStreamAsync(_rawStream, skipBytes, cancellationToken).ConfigureAwait(false);
+            }
+            int bytesRead = await _rawStream.ReadAsync(destination, cancellationToken).ConfigureAwait(false);
+            _nextPackedOffset = packedOffset + bytesRead;
+            return bytesRead;
+        }
+
+        // Finds the segment containing virtualPosition using _currentSegmentIndex as a hint for O(1)
+        // sequential reads. Backward seeks must reset _currentSegmentIndex to 0 before calling this
+        // (done in Seek() and Position.set). For strictly forward sequential reads the index only ever
+        // advances, so no reset is needed here.
+        // Returns the segment index if found, or the bitwise complement of the
+        // insertion point (a negative number) if virtualPosition is in a hole.
+        private int FindSegmentFromCurrent(long virtualPosition)
+        {
+            Debug.Assert(_segments is not null);
+
+            // Scan forward from the current cached index (optimal for sequential reads).
+            while (_currentSegmentIndex < _segments.Length)
+            {
+                var (offset, length) = _segments[_currentSegmentIndex];
+                if (virtualPosition < offset)
+                {
+                    // Position is in a hole before the current segment.
+                    return ~_currentSegmentIndex;
+                }
+                if (virtualPosition < offset + length)
+                {
+                    // Position is within the current segment.
+                    return _currentSegmentIndex;
+                }
+                // Position is past this segment; advance to the next.
+                _currentSegmentIndex++;
+            }
+            return ~_segments.Length; // Past all segments.
+        }
+
+        // Parses the sparse map from rawStream (positioned at the start of the data section).
+        // The map format is: numSegments\n, then pairs of offset\n numbytes\n.
+        // After the map text, there is zero-padding to the next 512-byte block boundary,
+        // and then the packed data begins.
+        //
+        // Returns the parsed segments. Throws InvalidDataException when a line is not a decimal
+        // number, is too long, is negative, or the stream ends before the map is complete.
+        private static async Task<(long Offset, long Length)[]> ParseSparseMap(
+            bool isAsync, Stream rawStream, CancellationToken cancellationToken)
+        {
+            // The buffer is 2 * RecordSize (1024 bytes) and each fill reads exactly RecordSize (512)
+            // bytes. This guarantees that the total bytes read is always a multiple of RecordSize,
+            // so the stream is already positioned at the start of the packed data when this method returns.
+            // The rented array may be larger than requested; only the first bufferSize bytes are used.
+            int bufferSize = 2 * TarHelpers.RecordSize;
+            byte[] bytes = ArrayPool<byte>.Shared.Rent(bufferSize);
+
+            try
+            {
+                // bytes[activeStart..availableStart) holds data that was read but not yet parsed.
+                int activeStart = 0;
+                int availableStart = 0;
+
+                // Compact the buffer and read exactly one RecordSize (512) block.
+                // Returns true if bytes were read, false on EOF.
+                async ValueTask<bool> FillBufferAsync()
+                {
+                    int active = availableStart - activeStart;
+                    if (active > 0 && activeStart > 0)
+                    {
+                        bytes.AsSpan(activeStart, active).CopyTo(bytes);
+                    }
+                    activeStart = 0;
+                    availableStart = active;
+
+                    int newBytes = isAsync
+                        ? await rawStream.ReadAtLeastAsync(bytes.AsMemory(availableStart, TarHelpers.RecordSize), TarHelpers.RecordSize, throwOnEndOfStream: false, cancellationToken).ConfigureAwait(false)
+                        : rawStream.ReadAtLeast(bytes.AsSpan(availableStart, TarHelpers.RecordSize), TarHelpers.RecordSize, throwOnEndOfStream: false);
+
+                    availableStart += newBytes;
+                    return newBytes > 0;
+                }
+
+                // Reads a newline-terminated decimal line from the buffer, refilling as needed.
+                // Returns the parsed value. Throws InvalidDataException if the line is malformed.
+                async ValueTask<long> ReadLineAsync()
+                {
+                    while (true)
+                    {
+                        int nlIdx = bytes.AsSpan(activeStart, availableStart - activeStart).IndexOf((byte)'\n');
+                        if (nlIdx >= 0)
+                        {
+                            ReadOnlySpan<byte> span = bytes.AsSpan(activeStart, nlIdx);
+                            if (!Utf8Parser.TryParse(span, out long value, out int consumed) || consumed != span.Length)
+                            {
+                                throw new InvalidDataException(SR.TarInvalidNumber);
+                            }
+                            activeStart += nlIdx + 1;
+                            return value;
+                        }
+
+                        // FillBufferAsync compacts before reading, so the space it needs depends on
+                        // the number of unconsumed bytes, not on availableStart. Testing availableStart
+                        // here would reject valid maps longer than two blocks whenever a line
+                        // straddles a block boundary.
+                        int unconsumed = availableStart - activeStart;
+                        if (unconsumed + TarHelpers.RecordSize > bufferSize)
+                        {
+                            // Not enough room in the buffer for another block-sized fill
+                            // and no newline found: line is too long (malformed).
+                            throw new InvalidDataException(SR.TarInvalidNumber);
+                        }
+
+                        if (!await FillBufferAsync().ConfigureAwait(false))
+                        {
+                            // EOF before newline.
+                            throw new InvalidDataException(SR.TarInvalidNumber);
+                        }
+                    }
+                }
+
+                await FillBufferAsync().ConfigureAwait(false);
+
+                long numSegments = await ReadLineAsync().ConfigureAwait(false);
+                // The unsigned comparison also rejects negative counts.
+                if ((ulong)numSegments > MaxSparseSegments)
+                {
+                    throw new InvalidDataException(SR.TarInvalidNumber);
+                }
+
+                var segments = new (long Offset, long Length)[(int)numSegments];
+                for (int i = 0; i < (int)numSegments; i++)
+                {
+                    long offset = await ReadLineAsync().ConfigureAwait(false);
+                    long length = await ReadLineAsync().ConfigureAwait(false);
+                    if (offset < 0 || length < 0)
+                    {
+                        throw new InvalidDataException(SR.TarInvalidNumber);
+                    }
+                    segments[i] = (offset, length);
+                }
+
+                // Since each FillBuffer call reads exactly RecordSize (512) bytes, the total bytes
+                // read is always a multiple of RecordSize (mapBytesConsumed + padding), so the stream
+                // is already positioned at the start of the packed data.
+                return segments;
+            }
+            finally
+            {
+                ArrayPool<byte>.Shared.Return(bytes);
+            }
+        }
+
+        // Disposing this stream also disposes the wrapped raw stream.
+        protected override void Dispose(bool disposing)
+        {
+            if (disposing && !_isDisposed)
+            {
+                _rawStream.Dispose();
+            }
+            _isDisposed = true;
+            base.Dispose(disposing);
+        }
+
+        // The stream is read-only, so there is nothing to flush.
+        public override void Flush() { }
+
+        public override Task FlushAsync(CancellationToken cancellationToken) =>
+            cancellationToken.IsCancellationRequested ? Task.FromCanceled(cancellationToken) : Task.CompletedTask;
+
+        public override void SetLength(long value) => throw new NotSupportedException(SR.IO_NotSupported_UnwritableStream);
+
+        public override void Write(byte[] buffer, int offset, int count) => throw new NotSupportedException(SR.IO_NotSupported_UnwritableStream);
+
+        private void ThrowIfDisposed() => ObjectDisposedException.ThrowIf(_isDisposed, this);
+    }
+}
diff --git a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarEntry.cs b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarEntry.cs index 6e8382552e4d4b..af0383e80d1dc4 100644 --- a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarEntry.cs +++ b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarEntry.cs @@ -113,7 +113,7 @@ public DateTimeOffset ModificationTime /// When the indicates an entry that can contain data, this property returns the length in bytes of such data. /// /// The entry type that commonly contains data is (or in the format). Other uncommon entry types that can also contain data are: , , and . - public long Length => _header._dataStream != null ? _header._dataStream.Length : _header._size; + public long Length => _header._gnuSparseDataStream?.Length ?? (_header._dataStream is not null ? _header._dataStream.Length : _header._size); /// /// When the indicates a or a , this property returns the link target path of such link. @@ -252,7 +252,7 @@ public Task ExtractToFileAsync(string destinationFileName, bool overwrite, Cance /// An I/O problem occurred. public Stream? DataStream { - get => _header._dataStream; + get => (Stream?)_header._gnuSparseDataStream ?? _header._dataStream; set { if (!IsDataStreamSetterSupported()) @@ -275,6 +275,8 @@ public Stream? 
DataStream _readerOfOrigin = null; } + _header._gnuSparseDataStream?.Dispose(); + _header._gnuSparseDataStream = null; _header._dataStream?.Dispose(); _header._dataStream = value; diff --git a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.Read.cs b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.Read.cs index c39df542aa88d0..6cb4fa1637c1ff 100644 --- a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.Read.cs +++ b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.Read.cs @@ -144,6 +144,32 @@ internal void ReplaceNormalAttributesWithExtended(Dictionary? di _size = size; } + // GNU sparse format 1.0 (encoded via PAX) uses RegularFile type flag ('0') and stores sparse metadata in + // PAX extended attributes. Process all GNU sparse 1.0 attributes together in this block. + if (_typeFlag is TarEntryType.RegularFile or TarEntryType.V7RegularFile) + { + // 'GNU.sparse.name' overrides the placeholder path (e.g. 'GNUSparseFile.0/...') in the header's 'path' field. + if (ExtendedAttributes.TryGetValue(PaxEaGnuSparseName, out string? gnuSparseName)) + { + _name = gnuSparseName; + } + + // 'GNU.sparse.realsize' is the expanded (virtual) file size; stored separately from _size so that + // _size retains the archive data section length needed for correct stream positioning. + if (TarHelpers.TryGetStringAsBaseTenLong(ExtendedAttributes, PaxEaGnuSparseRealSize, out long gnuSparseRealSize)) + { + _gnuSparseRealSize = gnuSparseRealSize; + } + + // 'GNU.sparse.major=1' and 'GNU.sparse.minor=0' identify format 1.0, where the data section begins + // with an embedded text-format sparse map followed by the packed non-zero data segments. + if (ExtendedAttributes.TryGetValue(PaxEaGnuSparseMajor, out string? gnuSparseMajor) && gnuSparseMajor == "1" && + ExtendedAttributes.TryGetValue(PaxEaGnuSparseMinor, out string? 
gnuSparseMinor) && gnuSparseMinor == "0") + { + _isGnuSparse10 = true; + } + } + // The 'uid' header field only fits 8 bytes, or the user could've stored an override in the extended attributes if (TarHelpers.TryGetStringAsBaseTenInteger(ExtendedAttributes, PaxEaUid, out int uid)) { @@ -220,6 +246,17 @@ internal void ProcessDataBlock(Stream archiveStream, bool copyData) case TarEntryType.TapeVolume: // Might contain data default: // Unrecognized entry types could potentially have a data section _dataStream = GetDataStream(archiveStream, copyData); + + // GNU sparse format 1.0 PAX entries embed a sparse map at the start of the + // data section. Create a GnuSparseStream wrapper that presents the expanded + // virtual file content. The sparse map is parsed lazily on first Read, so + // _dataStream remains unconsumed here — TarWriter can copy the raw condensed + // data, and AdvanceDataStreamIfNeeded can advance past it normally. + if (_isGnuSparse10 && _gnuSparseRealSize > 0 && _dataStream is not null) + { + _gnuSparseDataStream = new GnuSparseStream(_dataStream, _gnuSparseRealSize); + } + if (_dataStream is SeekableSubReadStream) { TarHelpers.AdvanceStream(archiveStream, _size); @@ -282,6 +319,12 @@ private async Task ProcessDataBlockAsync(Stream archiveStream, bool copyData, Ca case TarEntryType.TapeVolume: // Might contain data default: // Unrecognized entry types could potentially have a data section _dataStream = await GetDataStreamAsync(archiveStream, copyData, _size, cancellationToken).ConfigureAwait(false); + + if (_isGnuSparse10 && _gnuSparseRealSize > 0 && _dataStream is not null) + { + _gnuSparseDataStream = new GnuSparseStream(_dataStream, _gnuSparseRealSize); + } + if (_dataStream is SeekableSubReadStream) { await TarHelpers.AdvanceStreamAsync(archiveStream, _size, cancellationToken).ConfigureAwait(false); diff --git a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.cs 
b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.cs index 35da5b566ac37f..cb6429ea37c6eb 100644 --- a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.cs +++ b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.cs @@ -39,6 +39,12 @@ internal sealed partial class TarHeader private const string PaxEaDevMajor = "devmajor"; private const string PaxEaDevMinor = "devminor"; + // Names of GNU sparse extended attributes (used with GNU sparse format 1.0 encoded via PAX) + private const string PaxEaGnuSparseName = "GNU.sparse.name"; + private const string PaxEaGnuSparseRealSize = "GNU.sparse.realsize"; + private const string PaxEaGnuSparseMajor = "GNU.sparse.major"; + private const string PaxEaGnuSparseMinor = "GNU.sparse.minor"; + internal Stream? _dataStream; internal long _dataOffset; @@ -77,6 +83,21 @@ internal sealed partial class TarHeader private Dictionary? _ea; internal Dictionary ExtendedAttributes => _ea ??= new Dictionary(); + // When a GNU sparse 1.0 PAX entry is read, the real (expanded) file size is stored here. + // This is separate from _size which holds the archive data size and is used for data stream reading. + internal long _gnuSparseRealSize; + + // Set to true when GNU.sparse.major=1 is present in the PAX extended attributes, + // indicating this is a GNU sparse format 1.0 entry whose data section contains an + // embedded sparse map followed by the packed data segments. + internal bool _isGnuSparse10; + + // When _isGnuSparse10 is true, this wraps _dataStream and presents the expanded virtual + // file content. _dataStream remains the raw (condensed) stream so that TarWriter can + // round-trip the original sparse data and AdvanceDataStreamIfNeeded works without + // special-casing. + internal GnuSparseStream? 
_gnuSparseDataStream; + // GNU attributes internal DateTimeOffset _aTime; @@ -106,6 +127,9 @@ internal TarHeader(TarEntryFormat format, TarEntryType typeFlag, TarHeader other _checksum = other._checksum; _linkName = other._linkName; _dataStream = other._dataStream; + _gnuSparseRealSize = other._gnuSparseRealSize; + _isGnuSparse10 = other._isGnuSparse10; + _gnuSparseDataStream = other._gnuSparseDataStream; } internal void AddExtendedAttributes(IEnumerable> existing) diff --git a/src/libraries/System.Formats.Tar/tests/System.Formats.Tar.Tests.csproj b/src/libraries/System.Formats.Tar/tests/System.Formats.Tar.Tests.csproj index 8f85711e7eaf3f..123f26475378b0 100644 --- a/src/libraries/System.Formats.Tar/tests/System.Formats.Tar.Tests.csproj +++ b/src/libraries/System.Formats.Tar/tests/System.Formats.Tar.Tests.csproj @@ -48,6 +48,7 @@ + diff --git a/src/libraries/System.Formats.Tar/tests/TarReader/TarReader.SparseFile.Tests.cs b/src/libraries/System.Formats.Tar/tests/TarReader/TarReader.SparseFile.Tests.cs new file mode 100644 index 00000000000000..a26c70d8c77b00 --- /dev/null +++ b/src/libraries/System.Formats.Tar/tests/TarReader/TarReader.SparseFile.Tests.cs @@ -0,0 +1,588 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.IO; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using Xunit; + +namespace System.Formats.Tar.Tests +{ + /// + /// Tests for GNU sparse format 1.0 (PAX) reading. Since GnuSparseStream is internal, + /// it is exercised through TarReader's public DataStream property using + /// programmatically constructed PAX 1.0 sparse archives. + /// + public class TarReader_SparseFileTests : TarTestsBase + { + // Builds a PAX 1.0 sparse archive in memory and returns a TarEntry whose + // DataStream is a GnuSparseStream. 
segments is an array of (virtualOffset, length)
+        // pairs; packed data for each segment is filled with its 1-based index value.
+        private static (MemoryStream archive, byte[] rawPackedData) BuildSparseArchive(
+            string realName, long realSize,
+            (long Offset, long Length)[] segments)
+        {
+            // Build the sparse map text: numSegs\n, then pairs offset\n length\n
+            var sb = new StringBuilder();
+            sb.Append(segments.Length).Append('\n');
+            foreach (var (off, len) in segments)
+            {
+                sb.Append(off).Append('\n');
+                sb.Append(len).Append('\n');
+            }
+            byte[] mapText = Encoding.ASCII.GetBytes(sb.ToString());
+
+            // Pad to the next 512-byte block boundary, then append placeholder packed data.
+            int padding = (512 - (mapText.Length % 512)) % 512;
+            long totalPackedBytes = 0;
+            foreach (var (_, len) in segments) totalPackedBytes += len;
+
+            byte[] rawSparseData = new byte[mapText.Length + padding + totalPackedBytes];
+            mapText.CopyTo(rawSparseData, 0);
+
+            // Fill each segment's packed data with its 1-based segment index value.
+            int writePos = mapText.Length + padding;
+            for (int i = 0; i < segments.Length; i++)
+            {
+                for (long j = 0; j < segments[i].Length; j++)
+                {
+                    rawSparseData[writePos++] = (byte)(i + 1);
+                }
+            }
+
+            var gnuSparseAttributes = new Dictionary<string, string>
+            {
+                ["GNU.sparse.major"] = "1",
+                ["GNU.sparse.minor"] = "0",
+                ["GNU.sparse.name"] = realName,
+                ["GNU.sparse.realsize"] = realSize.ToString(),
+            };
+
+            string placeholderName = "GNUSparseFile.0/" + realName;
+            var archive = new MemoryStream();
+            using (var writer = new TarWriter(archive, TarEntryFormat.Pax, leaveOpen: true))
+            {
+                var entry = new PaxTarEntry(TarEntryType.RegularFile, placeholderName, gnuSparseAttributes);
+                entry.DataStream = new MemoryStream(rawSparseData);
+                writer.WriteEntry(entry);
+            }
+            archive.Position = 0;
+            // Second tuple element: only the packed data (everything after the map and its padding).
+            return (archive, rawSparseData[(mapText.Length + padding)..]);
+        }
+
+        // Reads the DataStream of the first entry from the given archive and returns it.
+        private static Stream GetSparseDataStream(MemoryStream archiveStream, bool copyData)
+        {
+            archiveStream.Position = 0;
+            using var reader = new TarReader(archiveStream, leaveOpen: true);
+            TarEntry? entry = reader.GetNextEntry(copyData);
+            Assert.NotNull(entry);
+            Assert.NotNull(entry.DataStream);
+            return entry.DataStream;
+        }
+
+        // Builds a raw archive byte array where the sparse map text is injected directly,
+        // bypassing TarWriter validation. Used to construct malformed archives.
+        // No packed data is appended: the data section is just the map text plus padding.
+        private static MemoryStream BuildRawSparseArchive(string sparseMapContent, string realName, long realSize)
+        {
+            byte[] mapBytes = Encoding.ASCII.GetBytes(sparseMapContent);
+            int padding = (512 - (mapBytes.Length % 512)) % 512;
+            byte[] rawData = new byte[mapBytes.Length + padding];
+            mapBytes.CopyTo(rawData, 0);
+
+            var gnuSparseAttributes = new Dictionary<string, string>
+            {
+                ["GNU.sparse.major"] = "1",
+                ["GNU.sparse.minor"] = "0",
+                ["GNU.sparse.name"] = realName,
+                ["GNU.sparse.realsize"] = realSize.ToString(),
+            };
+
+            var archive = new MemoryStream();
+            using (var writer = new TarWriter(archive, TarEntryFormat.Pax, leaveOpen: true))
+            {
+                var entry = new PaxTarEntry(TarEntryType.RegularFile, "GNUSparseFile.0/" + realName, gnuSparseAttributes);
+                entry.DataStream = new MemoryStream(rawData);
+                writer.WriteEntry(entry);
+            }
+            archive.Position = 0;
+            return archive;
+        }
+
+        // Yields every combination of layout x copyData x useAsync for SparseLayout_ExpandsCorrectly.
+        public static IEnumerable<object[]> SparseLayoutTestCases()
+        {
+            // (realSize, segments as flat array [off0, len0, off1, len1, ...], copyData, useAsync)
+            (long, long[])[] layouts =
+            [
+                (512, [0, 512]), // single segment, no holes
+                (1024, [256, 256]), // leading + trailing hole
+                (2048, [0, 256, 512, 256, 1024, 256]), // 3 segments with holes in between
+                (1000, [1000, 0]), // all holes (zero-length segment)
+            ];
+
+            foreach ((long size, long[] layout) in layouts)
+            {
+                foreach (bool copyData in new[] { false, true })
+                {
+                    foreach (bool useAsync in new[] { false, true })
+                    {
+                        yield return new object[] { size, layout, copyData, useAsync };
+                    }
+                }
+            }
+        }
+
+        [Theory]
+        [MemberData(nameof(SparseLayoutTestCases))]
+        public async Task SparseLayout_ExpandsCorrectly(long realSize, long[] segmentPairs, bool copyData, bool useAsync)
+        {
+            var segments = PairsToSegments(segmentPairs);
+            var (archive, _) = BuildSparseArchive("file.bin", realSize, segments);
+
+            using var dataStream = GetSparseDataStream(archive, copyData);
+
+            Assert.Equal(realSize, dataStream.Length);
+            byte[] buf = new byte[(int)realSize];
+            if (useAsync)
+            {
+                await dataStream.ReadExactlyAsync(buf, CancellationToken.None);
+            }
+            else
+            {
+                dataStream.ReadExactly(buf);
+            }
+            VerifyExpandedContent(buf, realSize, segments);
+        }
+
+        [Theory]
+        [InlineData(false)]
+        [InlineData(true)]
+        public void PartialReadsProduceSameResultAsFullRead(bool copyData)
+        {
+            // Read the data in small (13-byte) chunks and verify it matches a full read.
+            var segments = new[] { (0L, 256L), (512L, 256L), (1024L, 256L) };
+            var (archive1, _) = BuildSparseArchive("file.bin", 2048, segments);
+            var (archive2, _) = BuildSparseArchive("file.bin", 2048, segments);
+
+            byte[] fullRead = new byte[2048];
+            using (var s = GetSparseDataStream(archive1, copyData))
+                s.ReadExactly(fullRead);
+
+            byte[] partialRead = new byte[2048];
+            using var chunkedStream = GetSparseDataStream(archive2, copyData);
+            int pos = 0;
+            while (pos < 2048)
+            {
+                int chunk = Math.Min(13, 2048 - pos);
+                int read = chunkedStream.Read(partialRead, pos, chunk);
+                Assert.True(read > 0);
+                pos += read;
+            }
+
+            Assert.Equal(fullRead, partialRead);
+        }
+
+        [Fact]
+        public void SeekableStream_SeekAndRead()
+        {
+            // Build a seekable archive (from MemoryStream) and verify random access.
+            var segments = new[] { (0L, 256L), (512L, 256L), (1024L, 256L) };
+            var (archive, _) = BuildSparseArchive("file.bin", 2048, segments);
+
+            using var dataStream = GetSparseDataStream(archive, copyData: false);
+
+            // The archive is a MemoryStream read with copyData: false, so the data stream is
+            // expected to be seekable. Assert it instead of returning early, which would let
+            // the test pass without exercising any seek.
+            Assert.True(dataStream.CanSeek);
+
+            // Seek to segment 1 (offset 512) and read.
+            dataStream.Seek(512, SeekOrigin.Begin);
+            byte[] buf = new byte[256];
+            dataStream.ReadExactly(buf);
+            Assert.All(buf, b => Assert.Equal(2, b));
+
+            // Seek to segment 0 (offset 0) and read.
+            dataStream.Seek(0, SeekOrigin.Begin);
+            dataStream.ReadExactly(buf);
+            Assert.All(buf, b => Assert.Equal(1, b));
+
+            // Seek into a hole.
+            dataStream.Seek(300, SeekOrigin.Begin);
+            int read = dataStream.Read(buf, 0, 10);
+            Assert.True(read > 0);
+            for (int i = 0; i < read; i++) Assert.Equal(0, buf[i]);
+        }
+
+        [Fact]
+        public void AdvancePastEntry_DoesNotCorruptNextEntry()
+        {
+            // Write two entries in a PAX archive: first a sparse entry, then a regular entry.
+            // Verify that after reading the first entry without consuming its DataStream,
+            // the second entry is still readable with correct content.
+            const string RegularName = "regular.txt";
+            byte[] regularContent = Encoding.UTF8.GetBytes("Hello, world!");
+
+            string sparseMapText = "1\n0\n256\n";
+            byte[] sparseMapBytes = Encoding.ASCII.GetBytes(sparseMapText);
+            byte[] packedData = new byte[256];
+            Array.Fill<byte>(packedData, 0x42);
+            // One 512-byte block for the map (text + padding) followed by 256 bytes of packed data.
+            byte[] rawSparseData = new byte[512 + 256];
+            sparseMapBytes.CopyTo(rawSparseData, 0);
+            packedData.CopyTo(rawSparseData, 512);
+
+            var gnuSparseAttributes = new Dictionary<string, string>
+            {
+                ["GNU.sparse.major"] = "1",
+                ["GNU.sparse.minor"] = "0",
+                ["GNU.sparse.name"] = "sparse.bin",
+                ["GNU.sparse.realsize"] = "256",
+            };
+
+            var archive = new MemoryStream();
+            using (var writer = new TarWriter(archive, TarEntryFormat.Pax, leaveOpen: true))
+            {
+                var sparseEntry = new PaxTarEntry(TarEntryType.RegularFile, "GNUSparseFile.0/sparse.bin", gnuSparseAttributes);
+                sparseEntry.DataStream = new MemoryStream(rawSparseData);
+                writer.WriteEntry(sparseEntry);
+
+                var regularEntry = new PaxTarEntry(TarEntryType.RegularFile, RegularName);
+                regularEntry.DataStream = new MemoryStream(regularContent);
+                writer.WriteEntry(regularEntry);
+            }
+
+            archive.Position = 0;
+            using var reader = new TarReader(archive);
+
+            // Read the sparse entry but don't consume its DataStream.
+            TarEntry? first = reader.GetNextEntry(copyData: false);
+            Assert.NotNull(first);
+            Assert.Equal("sparse.bin", first.Name);
+
+            // Read the next entry without having consumed the sparse DataStream.
+            TarEntry? second = reader.GetNextEntry(copyData: false);
+            Assert.NotNull(second);
+            Assert.Equal(RegularName, second.Name);
+
+            Assert.NotNull(second.DataStream);
+            byte[] buf = new byte[regularContent.Length];
+            second.DataStream.ReadExactly(buf);
+            Assert.Equal(regularContent, buf);
+        }
+
+        // --- Corrupted format tests ---
+
+        // Yields each malformed sparse map twice: once for the sync path, once for the async path.
+        public static IEnumerable<object[]> CorruptedSparseMapTestCases()
+        {
+            string[] corruptedMaps =
+            [
+                "abc\n0\n256\n", // non-numeric segment count
+                "\n0\n256\n", // empty segment count line
+                "1\nabc\n256\n", // non-numeric offset
+                "1\n0\nabc\n", // non-numeric length
+                "1\n-1\n256\n", // negative offset
+                "1\n0\n-1\n", // negative length
+                "1\n0\n", // truncated: missing length line
+                "1\n", // truncated: missing offset and length lines
+                "1\n0\n2048\n", // segment extends past realSize
+                "2\n256\n256\n0\n256\n", // segments not in ascending order
+                "2\n" + new string('A', 512) + "\n256\n", // line exceeding buffer capacity
+            ];
+
+            foreach (string map in corruptedMaps)
+            {
+                yield return new object[] { map, false };
+                yield return new object[] { map, true };
+            }
+        }
+
+        [Theory]
+        [MemberData(nameof(CorruptedSparseMapTestCases))]
+        public async Task CorruptedSparseMap_InvalidDataException(string sparseMapContent, bool useAsync)
+        {
+            var archive = BuildRawSparseArchive(sparseMapContent, "file.bin", 1024);
+            using var reader = new TarReader(archive);
+            TarEntry? entry = useAsync
+                ? await reader.GetNextEntryAsync(copyData: false)
+                : reader.GetNextEntry(copyData: false);
+            Assert.NotNull(entry);
+            Assert.NotNull(entry.DataStream);
+
+            // The sparse map is parsed lazily, so the failure surfaces on the first non-empty read
+            // rather than when the entry is returned.
+            if (useAsync)
+            {
+                await Assert.ThrowsAsync<InvalidDataException>(async () => await entry.DataStream.ReadAsync(new byte[1]));
+            }
+            else
+            {
+                Assert.Throws<InvalidDataException>(() => entry.DataStream.ReadByte());
+            }
+        }
+
+        [Theory]
+        [InlineData(false, false)] // missing minor
+        [InlineData(true, false)]
+        [InlineData(false, true)] // wrong major
+        [InlineData(true, true)]
+        public void WrongSparseVersion_EntryReadAsNormal(bool copyData, bool wrongMajor)
+        {
+            byte[] content = Encoding.ASCII.GetBytes("plain content");
+
+            var attributes = new Dictionary<string, string>
+            {
+                ["GNU.sparse.name"] = "real.bin",
+                ["GNU.sparse.realsize"] = "1024",
+            };
+
+            if (wrongMajor)
+            {
+                attributes["GNU.sparse.major"] = "2";
+                attributes["GNU.sparse.minor"] = "0";
+            }
+            else
+            {
+                attributes["GNU.sparse.major"] = "1";
+                // minor intentionally omitted
+            }
+
+            var archive = new MemoryStream();
+            using (var writer = new TarWriter(archive, TarEntryFormat.Pax, leaveOpen: true))
+            {
+                var entry = new PaxTarEntry(TarEntryType.RegularFile, "GNUSparseFile.0/real.bin", attributes);
+                entry.DataStream = new MemoryStream(content);
+                writer.WriteEntry(entry);
+            }
+            archive.Position = 0;
+
+            using var reader = new TarReader(archive);
+            TarEntry? e = reader.GetNextEntry(copyData);
+            Assert.NotNull(e);
+            // The name override still applies; only the sparse expansion is skipped, so the
+            // data stream yields the stored bytes unchanged.
+            Assert.Equal("real.bin", e.Name);
+            Assert.NotNull(e.DataStream);
+            byte[] buf = new byte[content.Length];
+            e.DataStream.ReadExactly(buf);
+            Assert.Equal(content, buf);
+        }
+
+        [Theory]
+        [InlineData(false)]
+        [InlineData(true)]
+        public void GnuSparse10Pax_NilSparseData(bool copyData)
+        {
+            // pax-nil-sparse-data: one segment (offset=0, length=1000), realsize=1000, no holes.
+            // The packed data is 1000 bytes of "0123456789" repeating.
+ using MemoryStream archiveStream = GetTarMemoryStream(CompressionMethod.Uncompressed, "golang_tar", "pax-nil-sparse-data"); + using TarReader reader = new TarReader(archiveStream); + + TarEntry? entry = reader.GetNextEntry(copyData); + Assert.NotNull(entry); + + Assert.Equal(TarEntryType.RegularFile, entry.EntryType); + Assert.Equal("sparse.db", entry.Name); + Assert.Equal(1000, entry.Length); + Assert.NotNull(entry.DataStream); + Assert.Equal(1000, entry.DataStream.Length); + + byte[] content = new byte[1000]; + entry.DataStream.ReadExactly(content); + + for (int i = 0; i < 1000; i++) + { + Assert.Equal((byte)'0' + (i % 10), content[i]); + } + + Assert.Null(reader.GetNextEntry()); + } + + [Theory] + [InlineData(false)] + [InlineData(true)] + public void GnuSparse10Pax_NilSparseHole(bool copyData) + { + // pax-nil-sparse-hole: one segment (offset=1000, length=0), realsize=1000, all zeros. + using MemoryStream archiveStream = GetTarMemoryStream(CompressionMethod.Uncompressed, "golang_tar", "pax-nil-sparse-hole"); + using TarReader reader = new TarReader(archiveStream); + + TarEntry? 
entry = reader.GetNextEntry(copyData); + Assert.NotNull(entry); + + Assert.Equal(TarEntryType.RegularFile, entry.EntryType); + Assert.Equal("sparse.db", entry.Name); + Assert.Equal(1000, entry.Length); + Assert.NotNull(entry.DataStream); + Assert.Equal(1000, entry.DataStream.Length); + + byte[] content = new byte[1000]; + entry.DataStream.ReadExactly(content); + + Assert.All(content, b => Assert.Equal(0, b)); + Assert.Null(reader.GetNextEntry()); + } + + [Theory] + [InlineData(false, true)] + [InlineData(true, true)] + [InlineData(false, false)] + [InlineData(true, false)] + public void CopySparseEntryToNewArchive_PreservesExpandedContent(bool copyData, bool seekableSource) + { + const string RealName = "realfile.txt"; + const long RealSize = 2048; + const long Seg0Offset = 256, Seg0Length = 256; + const long Seg1Offset = 768, Seg1Length = 256; + + byte[] packedData0 = new byte[Seg0Length]; + Array.Fill(packedData0, 0x42); + byte[] packedData1 = new byte[Seg1Length]; + Array.Fill(packedData1, 0x43); + + byte[] mapText = Encoding.ASCII.GetBytes("2\n256\n256\n768\n256\n"); + byte[] rawSparseData = new byte[512 + Seg0Length + Seg1Length]; + mapText.CopyTo(rawSparseData, 0); + packedData0.CopyTo(rawSparseData, 512); + packedData1.CopyTo(rawSparseData, 512 + (int)Seg0Length); + + var gnuSparseAttributes = new Dictionary + { + ["GNU.sparse.major"] = "1", + ["GNU.sparse.minor"] = "0", + ["GNU.sparse.name"] = RealName, + ["GNU.sparse.realsize"] = RealSize.ToString(), + }; + + using var sourceArchive = new MemoryStream(); + using (var writer = new TarWriter(sourceArchive, TarEntryFormat.Pax, leaveOpen: true)) + { + var entry = new PaxTarEntry(TarEntryType.RegularFile, "GNUSparseFile.0/" + RealName, gnuSparseAttributes); + entry.DataStream = new MemoryStream(rawSparseData); + writer.WriteEntry(entry); + } + + int sourceLength = (int)sourceArchive.Length; + sourceArchive.Position = 0; + + // Copy the sparse entry directly to a new archive. 
+ using var destArchive = new MemoryStream(); + Stream readerStream = seekableSource + ? sourceArchive + : new NonSeekableStream(sourceArchive); + using (var reader = new TarReader(readerStream)) + { + TarEntry readEntry = reader.GetNextEntry(copyData); + Assert.NotNull(readEntry); + Assert.Equal(RealName, readEntry.Name); + Assert.Equal(RealSize, readEntry.Length); + + using var writer2 = new TarWriter(destArchive, TarEntryFormat.Pax, leaveOpen: true); + writer2.WriteEntry(readEntry); + } + + // Re-read the destination archive and verify the sparse entry round-trips correctly. + Assert.Equal(sourceLength, destArchive.Length); + + destArchive.Position = 0; + using var reader2 = new TarReader(destArchive); + TarEntry copiedEntry = reader2.GetNextEntry(); + Assert.NotNull(copiedEntry); + Assert.Equal(RealName, copiedEntry.Name); + Assert.Equal(RealSize, copiedEntry.Length); + Assert.NotNull(copiedEntry.DataStream); + + byte[] content = new byte[RealSize]; + copiedEntry.DataStream.ReadExactly(content); + + for (int i = 0; i < Seg0Offset; i++) Assert.Equal(0, content[i]); + for (int i = (int)Seg0Offset; i < (int)(Seg0Offset + Seg0Length); i++) Assert.Equal(0x42, content[i]); + for (int i = (int)(Seg0Offset + Seg0Length); i < Seg1Offset; i++) Assert.Equal(0, content[i]); + for (int i = (int)Seg1Offset; i < (int)(Seg1Offset + Seg1Length); i++) Assert.Equal(0x43, content[i]); + for (int i = (int)(Seg1Offset + Seg1Length); i < RealSize; i++) Assert.Equal(0, content[i]); + + Assert.Null(reader2.GetNextEntry()); + } + + [Theory] + [InlineData(false)] + [InlineData(true)] + public void GnuSparse10Pax_SparseBig_NameAndLength(bool copyData) + { + // pax-sparse-big: 6 segments scattered across a 60 GB virtual file, realsize=60000000000. + using MemoryStream archiveStream = GetTarMemoryStream(CompressionMethod.Uncompressed, "golang_tar", "pax-sparse-big"); + using TarReader reader = new TarReader(archiveStream); + + TarEntry? 
entry = reader.GetNextEntry(copyData); + Assert.NotNull(entry); + + Assert.Equal(TarEntryType.RegularFile, entry.EntryType); + Assert.Equal("pax-sparse", entry.Name); + Assert.Equal(60000000000L, entry.Length); + Assert.NotNull(entry.DataStream); + Assert.Equal(60000000000L, entry.DataStream.Length); + } + + private static (long Offset, long Length)[] PairsToSegments(long[] pairs) + { + var segments = new (long Offset, long Length)[pairs.Length / 2]; + for (int i = 0; i < segments.Length; i++) + { + segments[i] = (pairs[i * 2], pairs[i * 2 + 1]); + } + + return segments; + } + + // Verifies that expanded content has zeros in holes and the correct fill byte + // (1-based segment index) in data segments, matching BuildSparseArchive's convention. + private static void VerifyExpandedContent(byte[] buf, long realSize, (long Offset, long Length)[] segments) + { + int pos = 0; + for (int s = 0; s < segments.Length; s++) + { + var (offset, length) = segments[s]; + // Hole before this segment + for (int i = pos; i < (int)offset; i++) + { + Assert.Equal(0, buf[i]); + } + // Segment data (BuildSparseArchive fills with 1-based index) + byte expected = (byte)(s + 1); + for (int i = (int)offset; i < (int)(offset + length); i++) + { + Assert.Equal(expected, buf[i]); + } + pos = (int)(offset + length); + } + // Trailing hole + for (int i = pos; i < (int)realSize; i++) + { + Assert.Equal(0, buf[i]); + } + } + } + + // Minimal non-seekable stream wrapper for testing. + // Unlike WrappedStream, this overrides Read(Span) to avoid the extra buffer copy + // in Stream.Read(Span) that can cause issues with SubReadStream position tracking. 
+ internal sealed class NonSeekableStream : Stream + { + private readonly Stream _inner; + + public NonSeekableStream(Stream inner) => _inner = inner; + + public override bool CanRead => _inner.CanRead; + public override bool CanSeek => false; + public override bool CanWrite => false; + public override long Length => throw new NotSupportedException(); + public override long Position { get => throw new NotSupportedException(); set => throw new NotSupportedException(); } + + public override int Read(byte[] buffer, int offset, int count) => _inner.Read(buffer, offset, count); + public override int Read(Span buffer) => _inner.Read(buffer); + public override Task ReadAsync(byte[] buffer, int offset, int count, CancellationToken ct) => _inner.ReadAsync(buffer, offset, count, ct); + public override ValueTask ReadAsync(Memory buffer, CancellationToken ct = default) => _inner.ReadAsync(buffer, ct); + + public override void Flush() { } + public override long Seek(long offset, SeekOrigin origin) => throw new NotSupportedException(); + public override void SetLength(long value) => throw new NotSupportedException(); + public override void Write(byte[] buffer, int offset, int count) => throw new NotSupportedException(); + } +}