diff --git a/src/libraries/System.Formats.Tar/src/System.Formats.Tar.csproj b/src/libraries/System.Formats.Tar/src/System.Formats.Tar.csproj
index c76c2a5250830a..4851fd6bd9e677 100644
--- a/src/libraries/System.Formats.Tar/src/System.Formats.Tar.csproj
+++ b/src/libraries/System.Formats.Tar/src/System.Formats.Tar.csproj
@@ -33,6 +33,7 @@
+    <Compile Include="System\Formats\Tar\GnuSparseStream.cs" />
diff --git a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/GnuSparseStream.cs b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/GnuSparseStream.cs
new file mode 100644
index 00000000000000..1eb80d16b717fc
--- /dev/null
+++ b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/GnuSparseStream.cs
@@ -0,0 +1,478 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Buffers;
+using System.Buffers.Text;
+using System.Diagnostics;
+using System.IO;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace System.Formats.Tar
+{
+ // Stream that wraps the raw data section of a GNU sparse format 1.0 PAX entry and
+ // expands it to the virtual file size by inserting zeros for sparse holes.
+ //
+ // The raw data section layout:
+ // [sparse map text: numSegs\n, then pairs of offset\n numbytes\n]
+ // [zero padding to the next 512-byte block boundary]
+ // [packed non-zero data segments, stored sequentially]
+ //
+ // This stream presents a virtual file of size 'realSize' where:
+ // - regions covered by sparse map segments contain the packed data
+ // - all other regions (holes) read as zero bytes
+    internal sealed class GnuSparseStream : Stream
+    {
+        // Caps the segment count to prevent excessive memory allocation from malformed archives.
+        // Each segment entry in the array occupies 16 bytes, so 1M segments = 16 MB.
+        private const int MaxSparseSegments = 1_000_000;
+
+        private readonly Stream _rawStream;
+        private bool _isDisposed;
+        private readonly long _realSize;
+
+        // Sparse map state — initialized lazily on first Read to avoid consuming the raw
+        // stream before TarWriter has a chance to copy the condensed data.
+        private (long Offset, long Length)[]? _segments;
+        private long[]? _packedStartOffsets;
+
+        private long _virtualPosition; // current position in the virtual (expanded) file
+
+        // For non-seekable streams: tracks how many bytes of packed data have been consumed
+        // so we can skip forward when there are holes between segments.
+        private long _nextPackedOffset;
+
+        // Cached segment index for sequential read optimization.
+        // For typical forward sequential reads, this avoids repeated binary searches.
+        private int _currentSegmentIndex;
+
+        internal GnuSparseStream(Stream rawStream, long realSize)
+        {
+            _rawStream = rawStream;
+            _realSize = realSize;
+        }
+
+        // Parses the sparse map on first read. Populates _segments and _packedStartOffsets.
+        // Throws InvalidDataException if the sparse map is malformed.
+        private void EnsureInitialized()
+        {
+            if (_segments is not null)
+            {
+                return;
+            }
+
+            // Safe to block here: with isAsync=false, ParseSparseMap performs only synchronous
+            // reads, so the returned task is already completed when we observe it.
+            (long Offset, long Length)[] segments = ParseSparseMap(isAsync: false, _rawStream, CancellationToken.None).GetAwaiter().GetResult();
+            InitializeFromParsedMap(segments);
+        }
+
+        private async ValueTask EnsureInitializedAsync(CancellationToken cancellationToken)
+        {
+            if (_segments is not null)
+            {
+                return;
+            }
+
+            (long Offset, long Length)[] segments = await ParseSparseMap(isAsync: true, _rawStream, cancellationToken).ConfigureAwait(false);
+            InitializeFromParsedMap(segments);
+        }
+
+        // Validates the parsed segments (ordered, non-overlapping, within _realSize) and
+        // precomputes each segment's start offset within the packed data region.
+        private void InitializeFromParsedMap((long Offset, long Length)[] segments)
+        {
+            _packedStartOffsets = new long[segments.Length];
+            long sum = 0;
+            long previousEnd = 0;
+            for (int i = 0; i < segments.Length; i++)
+            {
+                var (offset, length) = segments[i];
+
+                // Validate segment ordering and bounds.
+                if (offset < previousEnd || offset + length > _realSize)
+                {
+                    throw new InvalidDataException(SR.TarInvalidNumber);
+                }
+                previousEnd = offset + length;
+
+                _packedStartOffsets[i] = sum;
+                try
+                {
+                    sum = checked(sum + length);
+                }
+                catch (OverflowException ex)
+                {
+                    throw new InvalidDataException(SR.TarInvalidNumber, ex);
+                }
+            }
+            // Assign _segments last — it serves as the initialization flag.
+            _segments = segments;
+        }
+
+        public override bool CanRead => !_isDisposed;
+        public override bool CanSeek => !_isDisposed && _rawStream.CanSeek;
+        public override bool CanWrite => false;
+
+        public override long Length
+        {
+            get
+            {
+                ThrowIfDisposed();
+                return _realSize;
+            }
+        }
+
+        public override long Position
+        {
+            get
+            {
+                ThrowIfDisposed();
+                return _virtualPosition;
+            }
+            set
+            {
+                ThrowIfDisposed();
+                if (!_rawStream.CanSeek)
+                {
+                    throw new NotSupportedException(SR.IO_NotSupported_UnseekableStream);
+                }
+                ArgumentOutOfRangeException.ThrowIfNegative(value);
+                _virtualPosition = value;
+                _currentSegmentIndex = 0; // Reset segment hint after seek
+            }
+        }
+
+        public override long Seek(long offset, SeekOrigin origin)
+        {
+            ThrowIfDisposed();
+            if (!_rawStream.CanSeek)
+            {
+                throw new NotSupportedException(SR.IO_NotSupported_UnseekableStream);
+            }
+
+            long newPosition = origin switch
+            {
+                SeekOrigin.Begin => offset,
+                SeekOrigin.Current => _virtualPosition + offset,
+                SeekOrigin.End => _realSize + offset,
+                _ => throw new ArgumentOutOfRangeException(nameof(origin)),
+            };
+
+            if (newPosition < 0)
+            {
+                throw new IOException(SR.IO_SeekBeforeBegin);
+            }
+
+            _virtualPosition = newPosition;
+            _currentSegmentIndex = 0; // Reset segment hint after seek
+            return _virtualPosition;
+        }
+
+        public override int Read(byte[] buffer, int offset, int count)
+        {
+            ValidateBufferArguments(buffer, offset, count);
+            return Read(buffer.AsSpan(offset, count));
+        }
+
+        public override int Read(Span<byte> destination)
+        {
+            ThrowIfDisposed();
+            EnsureInitialized();
+            Debug.Assert(_segments is not null && _packedStartOffsets is not null);
+
+            if (destination.IsEmpty || _virtualPosition >= _realSize)
+            {
+                return 0;
+            }
+
+            int toRead = (int)Math.Min(destination.Length, _realSize - _virtualPosition);
+            destination = destination.Slice(0, toRead);
+
+            int totalFilled = 0;
+            while (totalFilled < toRead)
+            {
+                long vPos = _virtualPosition + totalFilled;
+                int segIdx = FindSegmentFromCurrent(vPos);
+
+                if (segIdx < 0)
+                {
+                    // vPos is in a sparse hole — fill with zeros until the next segment or end of file.
+                    long nextSegStart = ~segIdx < _segments.Length ? _segments[~segIdx].Offset : _realSize;
+                    int zeroCount = (int)Math.Min(toRead - totalFilled, nextSegStart - vPos);
+                    destination.Slice(totalFilled, zeroCount).Clear();
+                    totalFilled += zeroCount;
+                }
+                else
+                {
+                    // vPos is within segment segIdx — read from packed data.
+                    var (segOffset, segLength) = _segments[segIdx];
+                    long offsetInSeg = vPos - segOffset;
+                    long remainingInSeg = segLength - offsetInSeg;
+                    int countToRead = (int)Math.Min(toRead - totalFilled, remainingInSeg);
+
+                    long packedOffset = _packedStartOffsets[segIdx] + offsetInSeg;
+                    int bytesRead = ReadFromPackedData(destination.Slice(totalFilled, countToRead), packedOffset);
+                    totalFilled += bytesRead;
+                    break; // Return after an underlying read; caller can call Read again for more.
+                }
+            }
+
+            _virtualPosition += totalFilled;
+            return totalFilled;
+        }
+
+        public override Task<int> ReadAsync(byte[] buffer, int offset, int count, CancellationToken cancellationToken)
+        {
+            if (cancellationToken.IsCancellationRequested)
+            {
+                return Task.FromCanceled<int>(cancellationToken);
+            }
+            ValidateBufferArguments(buffer, offset, count);
+            return ReadAsync(new Memory<byte>(buffer, offset, count), cancellationToken).AsTask();
+        }
+
+        public override ValueTask<int> ReadAsync(Memory<byte> buffer, CancellationToken cancellationToken = default)
+        {
+            if (cancellationToken.IsCancellationRequested)
+            {
+                return ValueTask.FromCanceled<int>(cancellationToken);
+            }
+            ThrowIfDisposed();
+            if (buffer.IsEmpty || _virtualPosition >= _realSize)
+            {
+                return ValueTask.FromResult(0);
+            }
+            return ReadAsyncCore(buffer, cancellationToken);
+        }
+
+        private async ValueTask<int> ReadAsyncCore(Memory<byte> buffer, CancellationToken cancellationToken)
+        {
+            await EnsureInitializedAsync(cancellationToken).ConfigureAwait(false);
+            Debug.Assert(_segments is not null && _packedStartOffsets is not null);
+
+            int toRead = (int)Math.Min(buffer.Length, _realSize - _virtualPosition);
+            buffer = buffer.Slice(0, toRead);
+
+            int totalFilled = 0;
+            while (totalFilled < toRead)
+            {
+                long vPos = _virtualPosition + totalFilled;
+                int segIdx = FindSegmentFromCurrent(vPos);
+
+                if (segIdx < 0)
+                {
+                    // vPos is in a sparse hole — fill with zeros until the next segment or end of file.
+                    long nextSegStart = ~segIdx < _segments.Length ? _segments[~segIdx].Offset : _realSize;
+                    int zeroCount = (int)Math.Min(toRead - totalFilled, nextSegStart - vPos);
+                    buffer.Slice(totalFilled, zeroCount).Span.Clear();
+                    totalFilled += zeroCount;
+                }
+                else
+                {
+                    var (segOffset, segLength) = _segments[segIdx];
+                    long offsetInSeg = vPos - segOffset;
+                    long remainingInSeg = segLength - offsetInSeg;
+                    int countToRead = (int)Math.Min(toRead - totalFilled, remainingInSeg);
+
+                    long packedOffset = _packedStartOffsets[segIdx] + offsetInSeg;
+                    int bytesRead = await ReadFromPackedDataAsync(buffer.Slice(totalFilled, countToRead), packedOffset, cancellationToken).ConfigureAwait(false);
+                    totalFilled += bytesRead;
+                    break; // Return after an underlying read; caller can call ReadAsync again for more.
+                }
+            }
+
+            _virtualPosition += totalFilled;
+            return totalFilled;
+        }
+
+        // Exposes the underlying raw stream for callers that need to access the condensed data.
+        internal Stream BaseStream => _rawStream;
+
+        // Reads from the packed data at the given packedOffset.
+        // After EnsureInitialized, the raw stream is positioned at the start of the packed
+        // data and _nextPackedOffset tracks how far into the packed data we've read.
+        // Returns the number of bytes actually read (may be less than destination.Length).
+        private int ReadFromPackedData(Span<byte> destination, long packedOffset)
+        {
+            long skipBytes = packedOffset - _nextPackedOffset;
+            if (skipBytes < 0 && !_rawStream.CanSeek)
+            {
+                // Backward movement within the packed data requires a seekable raw stream.
+                throw new InvalidOperationException(SR.IO_NotSupported_UnseekableStream);
+            }
+            if (skipBytes != 0)
+            {
+                TarHelpers.AdvanceStream(_rawStream, skipBytes);
+            }
+            int bytesRead = _rawStream.Read(destination);
+            _nextPackedOffset = packedOffset + bytesRead;
+            return bytesRead;
+        }
+
+        private async ValueTask<int> ReadFromPackedDataAsync(Memory<byte> destination, long packedOffset, CancellationToken cancellationToken)
+        {
+            long skipBytes = packedOffset - _nextPackedOffset;
+            if (skipBytes < 0 && !_rawStream.CanSeek)
+            {
+                // Backward movement within the packed data requires a seekable raw stream.
+                throw new InvalidOperationException(SR.IO_NotSupported_UnseekableStream);
+            }
+            if (skipBytes != 0)
+            {
+                await TarHelpers.AdvanceStreamAsync(_rawStream, skipBytes, cancellationToken).ConfigureAwait(false);
+            }
+            int bytesRead = await _rawStream.ReadAsync(destination, cancellationToken).ConfigureAwait(false);
+            _nextPackedOffset = packedOffset + bytesRead;
+            return bytesRead;
+        }
+
+        // Finds the segment containing virtualPosition using _currentSegmentIndex as a hint for O(1)
+        // sequential reads. Backward seeks must reset _currentSegmentIndex to 0 before calling this
+        // (done in Seek() and Position.set). For strictly forward sequential reads the index only ever
+        // advances, so no reset is needed here.
+        // Returns the segment index if found, or the bitwise complement of the
+        // insertion point (a negative number) if virtualPosition is in a hole.
+        private int FindSegmentFromCurrent(long virtualPosition)
+        {
+            Debug.Assert(_segments is not null);
+
+            // Scan forward from the current cached index (optimal for sequential reads).
+            while (_currentSegmentIndex < _segments.Length)
+            {
+                var (offset, length) = _segments[_currentSegmentIndex];
+                if (virtualPosition < offset)
+                {
+                    // Position is in a hole before the current segment.
+                    return ~_currentSegmentIndex;
+                }
+                if (virtualPosition < offset + length)
+                {
+                    // Position is within the current segment.
+                    return _currentSegmentIndex;
+                }
+                // Position is past this segment; advance to the next.
+                _currentSegmentIndex++;
+            }
+            return ~_segments.Length; // Past all segments.
+        }
+
+        // Parses the sparse map from rawStream (positioned at the start of the data section).
+        // The map format is: numSegments\n, then pairs of offset\n numbytes\n.
+        // After the map text, there is zero-padding to the next 512-byte block boundary,
+        // and then the packed data begins.
+        //
+        // Returns the parsed segments.
+        private static async Task<(long Offset, long Length)[]> ParseSparseMap(
+            bool isAsync, Stream rawStream, CancellationToken cancellationToken)
+        {
+            // The buffer is 2 * RecordSize (1024 bytes) and each fill reads exactly RecordSize (512)
+            // bytes. This guarantees that the total bytes read is always a multiple of RecordSize,
+            // so the stream is already positioned at the start of the packed data when this method returns.
+            int bufferSize = 2 * TarHelpers.RecordSize;
+            byte[] bytes = ArrayPool<byte>.Shared.Rent(bufferSize);
+
+            try
+            {
+                int activeStart = 0;
+                int availableStart = 0;
+
+                // Compact the buffer and read exactly one RecordSize (512) block.
+                // Returns true if bytes were read, false on EOF.
+                async ValueTask<bool> FillBufferAsync()
+                {
+                    int active = availableStart - activeStart;
+                    if (active > 0 && activeStart > 0)
+                    {
+                        bytes.AsSpan(activeStart, active).CopyTo(bytes);
+                    }
+                    activeStart = 0;
+                    availableStart = active;
+
+                    int newBytes = isAsync
+                        ? await rawStream.ReadAtLeastAsync(bytes.AsMemory(availableStart, TarHelpers.RecordSize), TarHelpers.RecordSize, throwOnEndOfStream: false, cancellationToken).ConfigureAwait(false)
+                        : rawStream.ReadAtLeast(bytes.AsSpan(availableStart, TarHelpers.RecordSize), TarHelpers.RecordSize, throwOnEndOfStream: false);
+
+                    availableStart += newBytes;
+                    return newBytes > 0;
+                }
+
+                // Reads a newline-terminated decimal line from the buffer, refilling as needed.
+                // Returns the parsed value. Throws InvalidDataException if the line is malformed.
+                async ValueTask<long> ReadLineAsync()
+                {
+                    while (true)
+                    {
+                        int nlIdx = bytes.AsSpan(activeStart, availableStart - activeStart).IndexOf((byte)'\n');
+                        if (nlIdx >= 0)
+                        {
+                            ReadOnlySpan<byte> span = bytes.AsSpan(activeStart, nlIdx);
+                            if (!Utf8Parser.TryParse(span, out long value, out int consumed) || consumed != span.Length)
+                            {
+                                throw new InvalidDataException(SR.TarInvalidNumber);
+                            }
+                            activeStart += nlIdx + 1;
+                            return value;
+                        }
+
+                        if (availableStart + TarHelpers.RecordSize > bufferSize)
+                        {
+                            // Not enough room in the buffer for another block-sized fill
+                            // and no newline found: line is too long (malformed).
+                            throw new InvalidDataException(SR.TarInvalidNumber);
+                        }
+
+                        if (!await FillBufferAsync().ConfigureAwait(false))
+                        {
+                            // EOF before newline.
+                            throw new InvalidDataException(SR.TarInvalidNumber);
+                        }
+                    }
+                }
+
+                await FillBufferAsync().ConfigureAwait(false);
+
+                long numSegments = await ReadLineAsync().ConfigureAwait(false);
+                // The ulong cast rejects negative counts as well as oversized ones.
+                if ((ulong)numSegments > MaxSparseSegments)
+                {
+                    throw new InvalidDataException(SR.TarInvalidNumber);
+                }
+
+                var segments = new (long Offset, long Length)[(int)numSegments];
+                for (int i = 0; i < (int)numSegments; i++)
+                {
+                    long offset = await ReadLineAsync().ConfigureAwait(false);
+                    long length = await ReadLineAsync().ConfigureAwait(false);
+                    if (offset < 0 || length < 0)
+                    {
+                        throw new InvalidDataException(SR.TarInvalidNumber);
+                    }
+                    segments[i] = (offset, length);
+                }
+
+                // Since each FillBuffer call reads exactly RecordSize (512) bytes, the total bytes
+                // read is always a multiple of RecordSize (mapBytesConsumed + padding), so the stream
+                // is already positioned at the start of the packed data.
+                return segments;
+            }
+            finally
+            {
+                ArrayPool<byte>.Shared.Return(bytes);
+            }
+        }
+
+        protected override void Dispose(bool disposing)
+        {
+            if (disposing && !_isDisposed)
+            {
+                _rawStream.Dispose();
+            }
+            _isDisposed = true;
+            base.Dispose(disposing);
+        }
+
+        public override void Flush() { }
+
+        public override Task FlushAsync(CancellationToken cancellationToken) =>
+            cancellationToken.IsCancellationRequested ? Task.FromCanceled(cancellationToken) : Task.CompletedTask;
+
+        public override void SetLength(long value) => throw new NotSupportedException(SR.IO_NotSupported_UnwritableStream);
+
+        public override void Write(byte[] buffer, int offset, int count) => throw new NotSupportedException(SR.IO_NotSupported_UnwritableStream);
+
+        private void ThrowIfDisposed() => ObjectDisposedException.ThrowIf(_isDisposed, this);
+    }
+}
diff --git a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarEntry.cs b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarEntry.cs
index 6e8382552e4d4b..af0383e80d1dc4 100644
--- a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarEntry.cs
+++ b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarEntry.cs
@@ -113,7 +113,7 @@ public DateTimeOffset ModificationTime
    /// <summary>When the <see cref="EntryType"/> indicates an entry that can contain data, this property returns the length in bytes of such data.</summary>
    /// <remarks>The entry type that commonly contains data is <see cref="TarEntryType.RegularFile"/> (or <see cref="TarEntryType.V7RegularFile"/> in the <see cref="TarEntryFormat.V7"/> format). Other uncommon entry types that can also contain data are: <see cref="TarEntryType.ContiguousFile"/>, <see cref="TarEntryType.DirectoryList"/> and <see cref="TarEntryType.MultiVolume"/>.</remarks>
- public long Length => _header._dataStream != null ? _header._dataStream.Length : _header._size;
+ public long Length => _header._gnuSparseDataStream?.Length ?? (_header._dataStream is not null ? _header._dataStream.Length : _header._size);
    /// <summary>When the <see cref="EntryType"/> indicates a <see cref="TarEntryType.SymbolicLink"/> or a <see cref="TarEntryType.HardLink"/>, this property returns the link target path of such link.</summary>
@@ -252,7 +252,7 @@ public Task ExtractToFileAsync(string destinationFileName, bool overwrite, Cance
    /// <exception cref="IOException">An I/O problem occurred.</exception>
public Stream? DataStream
{
- get => _header._dataStream;
+ get => (Stream?)_header._gnuSparseDataStream ?? _header._dataStream;
set
{
if (!IsDataStreamSetterSupported())
@@ -275,6 +275,8 @@ public Stream? DataStream
_readerOfOrigin = null;
}
+ _header._gnuSparseDataStream?.Dispose();
+ _header._gnuSparseDataStream = null;
_header._dataStream?.Dispose();
_header._dataStream = value;
diff --git a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.Read.cs b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.Read.cs
index c39df542aa88d0..6cb4fa1637c1ff 100644
--- a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.Read.cs
+++ b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.Read.cs
@@ -144,6 +144,32 @@ internal void ReplaceNormalAttributesWithExtended(Dictionary<string, string>? di
_size = size;
}
+ // GNU sparse format 1.0 (encoded via PAX) uses RegularFile type flag ('0') and stores sparse metadata in
+ // PAX extended attributes. Process all GNU sparse 1.0 attributes together in this block.
+ if (_typeFlag is TarEntryType.RegularFile or TarEntryType.V7RegularFile)
+ {
+ // 'GNU.sparse.name' overrides the placeholder path (e.g. 'GNUSparseFile.0/...') in the header's 'path' field.
+ if (ExtendedAttributes.TryGetValue(PaxEaGnuSparseName, out string? gnuSparseName))
+ {
+ _name = gnuSparseName;
+ }
+
+ // 'GNU.sparse.realsize' is the expanded (virtual) file size; stored separately from _size so that
+ // _size retains the archive data section length needed for correct stream positioning.
+ if (TarHelpers.TryGetStringAsBaseTenLong(ExtendedAttributes, PaxEaGnuSparseRealSize, out long gnuSparseRealSize))
+ {
+ _gnuSparseRealSize = gnuSparseRealSize;
+ }
+
+ // 'GNU.sparse.major=1' and 'GNU.sparse.minor=0' identify format 1.0, where the data section begins
+ // with an embedded text-format sparse map followed by the packed non-zero data segments.
+ if (ExtendedAttributes.TryGetValue(PaxEaGnuSparseMajor, out string? gnuSparseMajor) && gnuSparseMajor == "1" &&
+ ExtendedAttributes.TryGetValue(PaxEaGnuSparseMinor, out string? gnuSparseMinor) && gnuSparseMinor == "0")
+ {
+ _isGnuSparse10 = true;
+ }
+ }
+
// The 'uid' header field only fits 8 bytes, or the user could've stored an override in the extended attributes
if (TarHelpers.TryGetStringAsBaseTenInteger(ExtendedAttributes, PaxEaUid, out int uid))
{
@@ -220,6 +246,17 @@ internal void ProcessDataBlock(Stream archiveStream, bool copyData)
case TarEntryType.TapeVolume: // Might contain data
default: // Unrecognized entry types could potentially have a data section
_dataStream = GetDataStream(archiveStream, copyData);
+
+ // GNU sparse format 1.0 PAX entries embed a sparse map at the start of the
+ // data section. Create a GnuSparseStream wrapper that presents the expanded
+ // virtual file content. The sparse map is parsed lazily on first Read, so
+ // _dataStream remains unconsumed here — TarWriter can copy the raw condensed
+ // data, and AdvanceDataStreamIfNeeded can advance past it normally.
+ if (_isGnuSparse10 && _gnuSparseRealSize > 0 && _dataStream is not null)
+ {
+ _gnuSparseDataStream = new GnuSparseStream(_dataStream, _gnuSparseRealSize);
+ }
+
if (_dataStream is SeekableSubReadStream)
{
TarHelpers.AdvanceStream(archiveStream, _size);
@@ -282,6 +319,12 @@ private async Task ProcessDataBlockAsync(Stream archiveStream, bool copyData, Ca
case TarEntryType.TapeVolume: // Might contain data
default: // Unrecognized entry types could potentially have a data section
_dataStream = await GetDataStreamAsync(archiveStream, copyData, _size, cancellationToken).ConfigureAwait(false);
+
+ if (_isGnuSparse10 && _gnuSparseRealSize > 0 && _dataStream is not null)
+ {
+ _gnuSparseDataStream = new GnuSparseStream(_dataStream, _gnuSparseRealSize);
+ }
+
if (_dataStream is SeekableSubReadStream)
{
await TarHelpers.AdvanceStreamAsync(archiveStream, _size, cancellationToken).ConfigureAwait(false);
diff --git a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.cs b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.cs
index 35da5b566ac37f..cb6429ea37c6eb 100644
--- a/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.cs
+++ b/src/libraries/System.Formats.Tar/src/System/Formats/Tar/TarHeader.cs
@@ -39,6 +39,12 @@ internal sealed partial class TarHeader
private const string PaxEaDevMajor = "devmajor";
private const string PaxEaDevMinor = "devminor";
+ // Names of GNU sparse extended attributes (used with GNU sparse format 1.0 encoded via PAX)
+ private const string PaxEaGnuSparseName = "GNU.sparse.name";
+ private const string PaxEaGnuSparseRealSize = "GNU.sparse.realsize";
+ private const string PaxEaGnuSparseMajor = "GNU.sparse.major";
+ private const string PaxEaGnuSparseMinor = "GNU.sparse.minor";
+
internal Stream? _dataStream;
internal long _dataOffset;
@@ -77,6 +83,21 @@ internal sealed partial class TarHeader
         private Dictionary<string, string>? _ea;
         internal Dictionary<string, string> ExtendedAttributes => _ea ??= new Dictionary<string, string>();
+ // When a GNU sparse 1.0 PAX entry is read, the real (expanded) file size is stored here.
+ // This is separate from _size which holds the archive data size and is used for data stream reading.
+ internal long _gnuSparseRealSize;
+
+ // Set to true when GNU.sparse.major=1 is present in the PAX extended attributes,
+ // indicating this is a GNU sparse format 1.0 entry whose data section contains an
+ // embedded sparse map followed by the packed data segments.
+ internal bool _isGnuSparse10;
+
+ // When _isGnuSparse10 is true, this wraps _dataStream and presents the expanded virtual
+ // file content. _dataStream remains the raw (condensed) stream so that TarWriter can
+ // round-trip the original sparse data and AdvanceDataStreamIfNeeded works without
+ // special-casing.
+ internal GnuSparseStream? _gnuSparseDataStream;
+
// GNU attributes
internal DateTimeOffset _aTime;
@@ -106,6 +127,9 @@ internal TarHeader(TarEntryFormat format, TarEntryType typeFlag, TarHeader other
_checksum = other._checksum;
_linkName = other._linkName;
_dataStream = other._dataStream;
+ _gnuSparseRealSize = other._gnuSparseRealSize;
+ _isGnuSparse10 = other._isGnuSparse10;
+ _gnuSparseDataStream = other._gnuSparseDataStream;
}
     internal void AddExtendedAttributes(IEnumerable<KeyValuePair<string, string>> existing)
diff --git a/src/libraries/System.Formats.Tar/tests/System.Formats.Tar.Tests.csproj b/src/libraries/System.Formats.Tar/tests/System.Formats.Tar.Tests.csproj
index 8f85711e7eaf3f..123f26475378b0 100644
--- a/src/libraries/System.Formats.Tar/tests/System.Formats.Tar.Tests.csproj
+++ b/src/libraries/System.Formats.Tar/tests/System.Formats.Tar.Tests.csproj
@@ -48,6 +48,7 @@
+    <Compile Include="TarReader\TarReader.SparseFile.Tests.cs" />
diff --git a/src/libraries/System.Formats.Tar/tests/TarReader/TarReader.SparseFile.Tests.cs b/src/libraries/System.Formats.Tar/tests/TarReader/TarReader.SparseFile.Tests.cs
new file mode 100644
index 00000000000000..a26c70d8c77b00
--- /dev/null
+++ b/src/libraries/System.Formats.Tar/tests/TarReader/TarReader.SparseFile.Tests.cs
@@ -0,0 +1,588 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+using System.Threading;
+using System.Threading.Tasks;
+using Xunit;
+
+namespace System.Formats.Tar.Tests
+{
+ ///
+ /// Tests for GNU sparse format 1.0 (PAX) reading. Since GnuSparseStream is internal,
+ /// it is exercised through TarReader's public DataStream property using
+ /// programmatically constructed PAX 1.0 sparse archives.
+ ///
+ public class TarReader_SparseFileTests : TarTestsBase
+ {
+        // Builds a PAX 1.0 sparse archive in memory and returns a TarEntry whose
+        // DataStream is a GnuSparseStream. segments is an array of (virtualOffset, length)
+        // pairs; packed data for each segment is filled with its 1-based index value.
+        // Returns the rewound archive stream and the raw packed-data bytes (map and
+        // padding excluded) for later comparison.
+        private static (MemoryStream archive, byte[] rawPackedData) BuildSparseArchive(
+            string realName, long realSize,
+            (long Offset, long Length)[] segments)
+        {
+            // Build the sparse map text: numSegs\n, then pairs offset\n length\n
+            var sb = new StringBuilder();
+            sb.Append(segments.Length).Append('\n');
+            foreach (var (off, len) in segments)
+            {
+                sb.Append(off).Append('\n');
+                sb.Append(len).Append('\n');
+            }
+            byte[] mapText = Encoding.ASCII.GetBytes(sb.ToString());
+
+            // Pad to the next 512-byte block boundary, then append placeholder packed data.
+            int padding = (512 - (mapText.Length % 512)) % 512;
+            long totalPackedBytes = 0;
+            foreach (var (_, len) in segments) totalPackedBytes += len;
+
+            byte[] rawSparseData = new byte[mapText.Length + padding + totalPackedBytes];
+            mapText.CopyTo(rawSparseData, 0);
+
+            // Fill each segment's packed data with its 1-based segment index value.
+            int writePos = mapText.Length + padding;
+            for (int i = 0; i < segments.Length; i++)
+            {
+                for (long j = 0; j < segments[i].Length; j++)
+                {
+                    rawSparseData[writePos++] = (byte)(i + 1);
+                }
+            }
+
+            var gnuSparseAttributes = new Dictionary<string, string>
+            {
+                ["GNU.sparse.major"] = "1",
+                ["GNU.sparse.minor"] = "0",
+                ["GNU.sparse.name"] = realName,
+                ["GNU.sparse.realsize"] = realSize.ToString(),
+            };
+
+            string placeholderName = "GNUSparseFile.0/" + realName;
+            var archive = new MemoryStream();
+            using (var writer = new TarWriter(archive, TarEntryFormat.Pax, leaveOpen: true))
+            {
+                var entry = new PaxTarEntry(TarEntryType.RegularFile, placeholderName, gnuSparseAttributes);
+                entry.DataStream = new MemoryStream(rawSparseData);
+                writer.WriteEntry(entry);
+            }
+            archive.Position = 0;
+            return (archive, rawSparseData[(mapText.Length + padding)..]);
+        }
+
+        // Reads the DataStream of the first entry from the given archive and returns it.
+        // NOTE(review): the TarReader is disposed (end of the using scope) before the
+        // returned stream is consumed by the caller; this assumes the entry's DataStream
+        // remains usable after reader disposal (copyData=true, or a seekable archive with
+        // leaveOpen: true) — confirm against TarReader's dispose semantics.
+        private static Stream GetSparseDataStream(MemoryStream archiveStream, bool copyData)
+        {
+            archiveStream.Position = 0;
+            using var reader = new TarReader(archiveStream, leaveOpen: true);
+            TarEntry? entry = reader.GetNextEntry(copyData);
+            Assert.NotNull(entry);
+            Assert.NotNull(entry.DataStream);
+            return entry.DataStream;
+        }
+
+        // Builds a raw archive byte array where the sparse map text is injected directly,
+        // bypassing TarWriter validation. Used to construct malformed archives.
+        // Returns the rewound archive stream.
+        private static MemoryStream BuildRawSparseArchive(string sparseMapContent, string realName, long realSize)
+        {
+            byte[] mapBytes = Encoding.ASCII.GetBytes(sparseMapContent);
+            // Zero-pad the injected map text to the next 512-byte block boundary, matching
+            // the GNU sparse 1.0 data-section layout.
+            int padding = (512 - (mapBytes.Length % 512)) % 512;
+            byte[] rawData = new byte[mapBytes.Length + padding];
+            mapBytes.CopyTo(rawData, 0);
+
+            var gnuSparseAttributes = new Dictionary<string, string>
+            {
+                ["GNU.sparse.major"] = "1",
+                ["GNU.sparse.minor"] = "0",
+                ["GNU.sparse.name"] = realName,
+                ["GNU.sparse.realsize"] = realSize.ToString(),
+            };
+
+            var archive = new MemoryStream();
+            using (var writer = new TarWriter(archive, TarEntryFormat.Pax, leaveOpen: true))
+            {
+                var entry = new PaxTarEntry(TarEntryType.RegularFile, "GNUSparseFile.0/" + realName, gnuSparseAttributes);
+                entry.DataStream = new MemoryStream(rawData);
+                writer.WriteEntry(entry);
+            }
+            archive.Position = 0;
+            return archive;
+        }
+
+ public static IEnumerable