diff --git a/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataBuilderTests.cs b/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataBuilderTests.cs
index 6665314d7317..4c20e9d6c66b 100644
--- a/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataBuilderTests.cs
+++ b/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataBuilderTests.cs
@@ -492,7 +492,16 @@ public void GetOrAddDocumentName2()
Assert.Equal(@"a/", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n6))));
Assert.Equal(@"/", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n7))));
Assert.Equal(@"\\", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n8))));
- Assert.Equal("\uFFFd\uFFFd", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n9))));
+ if (PlatformDetection.IsNetCore)
+ {
+ Assert.Equal("\uFFFD\uFFFD\uFFFD", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n9))));
+ }
+ else
+ {
+ // Versions of .NET prior to Core 3.0 didn't follow Unicode recommendations for U+FFFD substitution,
+ // so they sometimes emitted too few replacement chars.
+ Assert.Equal("\uFFFD\uFFFD", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n9))));
+ }
Assert.Equal("\0", mdReader.GetString(MetadataTokens.DocumentNameBlobHandle(MetadataTokens.GetHeapOffset(n10))));
}
}
diff --git a/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataRootBuilderTests.cs b/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataRootBuilderTests.cs
index b6c05bce30ee..09220310e483 100644
--- a/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataRootBuilderTests.cs
+++ b/src/System.Reflection.Metadata/tests/Metadata/Ecma335/MetadataRootBuilderTests.cs
@@ -377,11 +377,24 @@ public void MetadataVersion()
0x08, 0x00, 0x00, 0x00,
// padded version:
+ // [ E1 88 B4 ] -> U+1234
+ // [ ED ] -> invalid (ED cannot be followed by A0) -> U+FFFD
+ // [ A0 ] -> invalid (not ASCII, not valid leading byte) -> U+FFFD
+ // [ 80 ] -> invalid (not ASCII, not valid leading byte) -> U+FFFD
0xE1, 0x88, 0xB4, 0xED, 0xA0, 0x80, 0x00, 0x00,
}, builder.Slice(12, -132));
// the default decoder replaces bad byte sequences by U+FFFD
- Assert.Equal("\u1234\ufffd\ufffd", ReadVersion(builder));
+ if (PlatformDetection.IsNetCore)
+ {
+ Assert.Equal("\u1234\ufffd\ufffd\ufffd", ReadVersion(builder));
+ }
+ else
+ {
+ // Versions of .NET prior to Core 3.0 didn't follow Unicode recommendations for U+FFFD substitution,
+ // so they sometimes emitted too few replacement chars.
+ Assert.Equal("\u1234\ufffd\ufffd", ReadVersion(builder));
+ }
}
}
}
diff --git a/src/System.Runtime/tests/System.Runtime.Tests.csproj b/src/System.Runtime/tests/System.Runtime.Tests.csproj
index 4b97b787c420..812406b043d9 100644
--- a/src/System.Runtime/tests/System.Runtime.Tests.csproj
+++ b/src/System.Runtime/tests/System.Runtime.Tests.csproj
@@ -287,9 +287,11 @@
+
+
@@ -338,4 +340,4 @@
-
+
\ No newline at end of file
diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.netcoreapp.cs
new file mode 100644
index 000000000000..fd87b575b18b
--- /dev/null
+++ b/src/System.Runtime/tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.netcoreapp.cs
@@ -0,0 +1,255 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Globalization;
+using System.Linq;
+using System.Reflection;
+using System.Runtime.InteropServices;
+using Xunit;
+
+namespace System.Text.Unicode.Tests
+{
+ public partial class Utf16UtilityTests
+ {
+ private unsafe delegate char* GetPointerToFirstInvalidCharDel(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment);
+ private static readonly Lazy _getPointerToFirstInvalidCharFn = CreateGetPointerToFirstInvalidCharFn();
+
+ // Well-formed inputs: the validator is expected to consume the whole buffer (expected index -1).
+ // Each row is (input, expected rune count, expected UTF-8 byte count). Valid non-ASCII scalars are
+ // written as C# escapes; "<XXXX>" is the four-hex-digit placeholder that ProcessInput expands.
+ // NOTE(review): the non-ASCII inputs were rebuilt from the expected counts of each row (the rows
+ // previously repeated "" and "X" with conflicting expectations) - confirm against the intended data.
+ [Theory]
+ [InlineData("", 0, 0)] // empty string is OK
+ [InlineData("X", 1, 1)]
+ [InlineData("XY", 2, 2)]
+ [InlineData("XYZ", 3, 3)]
+ [InlineData("\u00E9", 1, 2)] // U+00E9 LATIN SMALL LETTER E WITH ACUTE, 2 UTF-8 bytes
+ [InlineData("X\u00E9", 2, 3)]
+ [InlineData("\u00E9X", 2, 3)]
+ [InlineData("\u20AC", 1, 3)] // U+20AC EURO SIGN, 3 UTF-8 bytes
+ [InlineData("\U0001F600", 1, 4)] // U+1F600 GRINNING FACE, one surrogate pair, 4 UTF-8 bytes
+ [InlineData("X\U0001F600Z", 3, 6)]
+ [InlineData("X<0000>Z", 3, 3)] // null chars are allowed
+ public void GetIndexOfFirstInvalidUtf16Sequence_WithSmallValidBuffers(string unprocessedInput, int expectedRuneCount, int expectedUtf8ByteCount)
+ {
+     GetIndexOfFirstInvalidUtf16Sequence_Test_Core(unprocessedInput, -1 /* expectedIdxOfFirstInvalidChar */, expectedRuneCount, expectedUtf8ByteCount);
+ }
+
+ // Malformed inputs: each row is (input, index of the first invalid char, rune count and UTF-8 byte
+ // count of the valid data preceding that index). Lone surrogates cannot be written directly as
+ // inline data, so they use the "<XXXX>" four-hex-digit placeholder that ProcessInput expands.
+ // NOTE(review): the surrogate inputs were rebuilt from each row's comment and expected counts (the
+ // rows previously repeated "" with conflicting expectations) - confirm against the intended data.
+ [Theory]
+ [InlineData("<DC00>", 0, 0, 0)] // standalone low surrogate (at beginning of sequence)
+ [InlineData("X<DC00>", 1, 1, 1)] // standalone low surrogate (preceded by valid ASCII data)
+ [InlineData("\u20AC<DC00>", 1, 1, 3)] // standalone low surrogate (preceded by valid non-ASCII data)
+ [InlineData("<D800>", 0, 0, 0)] // standalone high surrogate (missing follow-up low surrogate)
+ [InlineData("<D800>Y", 0, 0, 0)] // standalone high surrogate (followed by ASCII char)
+ [InlineData("<D800><D800>", 0, 0, 0)] // standalone high surrogate (followed by high surrogate)
+ [InlineData("<D800>\u00E9", 0, 0, 0)] // standalone high surrogate (followed by valid non-ASCII char)
+ [InlineData("<DC00><DC00>", 0, 0, 0)] // standalone low surrogate (not preceded by a high surrogate; followed by a low surrogate)
+ [InlineData("<DC00><D800>", 0, 0, 0)] // standalone low surrogate (not preceded by a high surrogate; followed by a high surrogate)
+ [InlineData("<D800><DC00><DC00>", 2, 1, 4)] // standalone low surrogate (preceded by a valid surrogate pair)
+ [InlineData("<DBFF><DFFF><DFFF>", 2, 1, 4)] // standalone low surrogate (preceded by a valid surrogate pair)
+ [InlineData("<D800><DC00><0000><DC00>", 3, 2, 5)] // standalone low surrogate (preceded by a valid null char)
+ public void GetIndexOfFirstInvalidUtf16Sequence_WithSmallInvalidBuffers(string unprocessedInput, int idxOfFirstInvalidChar, int expectedRuneCount, int expectedUtf8ByteCount)
+ {
+     GetIndexOfFirstInvalidUtf16Sequence_Test_Core(unprocessedInput, idxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+ }
+
+ // Walks a single 128-char buffer through a series of in-place edits and re-validates the whole
+ // buffer after each one. Every step builds on the previous state, so the order of the steps matters.
+ [Fact]
+ public void GetIndexOfFirstInvalidUtf16Sequence_WithInvalidSurrogateSequences()
+ {
+     char[] buffer = new string('x', 128).ToCharArray();
+
+     // Step 1: pure ASCII - one rune and one UTF-8 byte per char, nothing invalid.
+     Check(firstInvalidIdx: -1, runes: 128, utf8Bytes: 128);
+
+     // Step 2: a valid surrogate pair at the very beginning.
+     buffer[0] = '\uD800';
+     buffer[1] = '\uDFFF';
+     Check(firstInvalidIdx: -1, runes: 127, utf8Bytes: 130);
+
+     // Step 3: a second valid surrogate pair close to the end.
+     buffer[124] = '\uD800';
+     buffer[125] = '\uDFFF';
+     Check(firstInvalidIdx: -1, runes: 126, utf8Bytes: 132);
+
+     // Step 4: a lone surrogate in the *very* last slot - first a high one, then a low one.
+     buffer[127] = '\uD800';
+     Check(firstInvalidIdx: 127, runes: 125, utf8Bytes: 131);
+
+     buffer[127] = '\uDFFF';
+     Check(firstInvalidIdx: 127, runes: 125, utf8Bytes: 131);
+
+     // Step 5: give that trailing low surrogate a matching high surrogate so the tail is valid again.
+     buffer[126] = '\uD800';
+     Check(firstInvalidIdx: -1, runes: 125, utf8Bytes: 134);
+
+     // Step 6: a malformed surrogate run in the middle (straddles a vector boundary):
+     // 2-byte char, high, high, low, low.
+     buffer[12] = '\u0080';
+     buffer[13] = '\uD800';
+     buffer[14] = '\uD800';
+     buffer[15] = '\uDFFF';
+     buffer[16] = '\uDFFF';
+     Check(firstInvalidIdx: 13, runes: 12, utf8Bytes: 16);
+
+     // Step 7: repair that run so it reads high, low, high, low.
+     buffer[14] = '\uDC00';
+     buffer[15] = '\uDBFF';
+     Check(firstInvalidIdx: -1, runes: 123, utf8Bytes: 139);
+
+     // Step 8: break the pair that is split across the vector boundary by replacing its low
+     // surrogate with ASCII (buffer[15] is still a high surrogate).
+     buffer[16] = 'x';
+     Check(firstInvalidIdx: 15, runes: 13, utf8Bytes: 20);
+
+     // Validates the current contents of the shared buffer against the given expectations.
+     void Check(int firstInvalidIdx, int runes, long utf8Bytes)
+     {
+         GetIndexOfFirstInvalidUtf16Sequence_Test_Core(buffer, expectedRetVal: firstInvalidIdx, expectedRuneCount: runes, expectedUtf8ByteCount: utf8Bytes);
+     }
+ }
+
+ /// <summary>
+ /// Expands <paramref name="unprocessedInput"/> with <c>ProcessInput</c> and validates it five times:
+ /// as-is, behind a 128-char ASCII prefix, and then with the start of that prefix progressively
+ /// overwritten by 2-byte chars, 3-byte chars and surrogate pairs. The expectations are adjusted
+ /// before each pass to account for the chars that were added or overwritten.
+ /// </summary>
+ /// <param name="unprocessedInput">Test input, possibly containing placeholders understood by <c>ProcessInput</c>.</param>
+ /// <param name="expectedIdxOfFirstInvalidChar">Expected index of the first invalid char in the expanded input, or -1 for none.</param>
+ /// <param name="expectedRuneCount">Expected rune count for the expanded input.</param>
+ /// <param name="expectedUtf8ByteCount">Expected UTF-8 byte count for the expanded input.</param>
+ private static void GetIndexOfFirstInvalidUtf16Sequence_Test_Core(string unprocessedInput, int expectedIdxOfFirstInvalidChar, int expectedRuneCount, long expectedUtf8ByteCount)
+ {
+     const int AsciiPrefixLength = 128;
+
+     char[] chars = ProcessInput(unprocessedInput).ToCharArray();
+
+     // Pass 1: the input exactly as given.
+     Verify();
+
+     // Pass 2: the same input behind a run of ASCII chars (to test the call to ASCIIUtility at
+     // method entry). Every prefix char is one rune and one UTF-8 byte, and shifts the index of
+     // any invalid char by one.
+     chars = (new string('x', AsciiPrefixLength) + new string(chars)).ToCharArray();
+
+     if (expectedIdxOfFirstInvalidChar >= 0)
+     {
+         expectedIdxOfFirstInvalidChar += AsciiPrefixLength;
+     }
+     expectedRuneCount += AsciiPrefixLength;
+     expectedUtf8ByteCount += AsciiPrefixLength;
+
+     Verify();
+
+     // Pass 3: overwrite chars [0..7] with alternating 2-byte and 3-byte UTF-8 sequences so the
+     // vectorized code paths see them. Four chars gain 1 byte and four gain 2 bytes: +12 in total.
+     for (int i = 0; i < 8; i++)
+     {
+         chars[i] = (i % 2 == 0) ? '\u0080' : '\u0800';
+     }
+     expectedUtf8ByteCount += 12;
+
+     Verify();
+
+     // Pass 4: continue the alternating pattern over chars [8..12], then add a surrogate pair and
+     // an ASCII char. That is +1 +2 +1 +2 +1 bytes for [8..12] and +2 for the pair (two chars become
+     // one 4-byte rune), so +9 bytes and one rune fewer.
+     for (int i = 8; i <= 12; i++)
+     {
+         chars[i] = (i % 2 == 0) ? '\u0080' : '\u0800';
+     }
+     chars[13] = '\uD800'; // high surrogate
+     chars[14] = '\uDC00'; // low surrogate
+     chars[15] = 'z'; // ASCII char
+
+     expectedRuneCount--;
+     expectedUtf8ByteCount += 9;
+
+     Verify();
+
+     // Pass 5: place another surrogate pair across the vector boundary (chars 15 and 16) so that it
+     // is not inadvertently treated as two standalone surrogates. Two ASCII chars become one
+     // 4-byte rune: +2 bytes and one rune fewer.
+     chars[15] = '\uDBFF'; // high surrogate
+     chars[16] = '\uDFFF'; // low surrogate
+
+     expectedRuneCount--;
+     expectedUtf8ByteCount += 2;
+
+     Verify();
+
+     // Validates the current buffer against the current (already adjusted) expectations.
+     void Verify()
+     {
+         GetIndexOfFirstInvalidUtf16Sequence_Test_Core(chars, expectedIdxOfFirstInvalidChar, expectedRuneCount, expectedUtf8ByteCount);
+     }
+ }
+
+ /// <summary>
+ /// Runs the validation routine held by <c>_getPointerToFirstInvalidCharFn</c> over
+ /// <paramref name="input"/> and asserts on the position it stops at and on the two count
+ /// adjustments it reports.
+ /// </summary>
+ /// <param name="input">The UTF-16 code units to validate.</param>
+ /// <param name="expectedRetVal">Expected index of the first invalid char, or -1 when the routine is expected to stop at the end of the input.</param>
+ /// <param name="expectedRuneCount">Expected value of (chars consumed + scalar count adjustment).</param>
+ /// <param name="expectedUtf8ByteCount">Expected value of (chars consumed + UTF-8 code unit count adjustment).</param>
+ private static unsafe void GetIndexOfFirstInvalidUtf16Sequence_Test_Core(char[] input, int expectedRetVal, int expectedRuneCount, long expectedUtf8ByteCount)
+ {
+     // Arrange
+
+     using BoundedMemory boundedMemory = BoundedMemory.AllocateFromExistingData(input);
+     boundedMemory.MakeReadonly();
+
+     // Act
+
+     int actualRetVal;
+     long actualUtf8CodeUnitCount;
+     int actualRuneCount;
+
+     fixed (char* pInputBuffer = &MemoryMarshal.GetReference(boundedMemory.Span))
+     {
+         char* pFirstInvalidChar = _getPointerToFirstInvalidCharFn.Value(pInputBuffer, input.Length, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment);
+
+         // The cast to ulong turns a negative difference into a huge value, so this one comparison
+         // rejects pointers both before the start and past the end of the buffer.
+         long ptrDiff = pFirstInvalidChar - pInputBuffer;
+         Assert.True((ulong)ptrDiff <= (uint)input.Length, "ptrDiff was outside expected range.");
+
+         Assert.True(utf8CodeUnitCountAdjustment >= 0, "UTF-8 code unit count adjustment must be non-negative.");
+         Assert.True(scalarCountAdjustment <= 0, "Scalar count adjustment must be 0 or negative.");
+
+         // A pointer at the very end of the buffer means no invalid char was found.
+         actualRetVal = (ptrDiff == input.Length) ? -1 : (int)ptrDiff;
+
+         // The last two 'out' parameters are:
+         // a) The number to be added to the "chars processed" return value to come up with the total UTF-8 code unit count, and
+         // b) The number to be added to the "total UTF-16 code unit count" value to come up with the total scalar count.
+
+         actualUtf8CodeUnitCount = ptrDiff + utf8CodeUnitCountAdjustment;
+         actualRuneCount = (int)ptrDiff + scalarCountAdjustment;
+     }
+
+     // Assert (xUnit takes the expected value first, then the actual value)
+
+     Assert.Equal(expectedRetVal, actualRetVal);
+     Assert.Equal(expectedRuneCount, actualRuneCount);
+     Assert.Equal(expectedUtf8ByteCount, actualUtf8CodeUnitCount);
+ }
+
+ /// <summary>
+ /// Creates the lazy whose value is a delegate bound to the static <c>GetPointerToFirstInvalidChar</c>
+ /// method of <c>System.Text.Unicode.Utf16Utility</c>. The type is looked up by name, via reflection,
+ /// in the assembly that defines <see cref="Utf8"/>; the lookup runs when the lazy is first evaluated.
+ /// </summary>
+ /// <returns>A lazy that yields the bound delegate, or throws if the type or method cannot be found.</returns>
+ private static Lazy CreateGetPointerToFirstInvalidCharFn()
+ {
+     return new Lazy(() =>
+     {
+         Type utf16UtilityType = typeof(Utf8).Assembly.GetType("System.Text.Unicode.Utf16Utility");
+
+         if (utf16UtilityType is null)
+         {
+             throw new Exception("Couldn't find Utf16Utility type in System.Private.CoreLib.");
+         }
+
+         // Public and NonPublic are both requested so the lookup works whatever the method's accessibility.
+         MethodInfo methodInfo = utf16UtilityType.GetMethod("GetPointerToFirstInvalidChar", BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic);
+
+         if (methodInfo is null)
+         {
+             throw new Exception("Couldn't find GetPointerToFirstInvalidChar method on Utf16Utility.");
+         }
+
+         return (GetPointerToFirstInvalidCharDel)methodInfo.CreateDelegate(typeof(GetPointerToFirstInvalidCharDel));
+     });
+ }
+
+ /// <summary>
+ /// Expands the placeholder syntax used by the inline test data into the UTF-16 text it stands for.
+ /// </summary>
+ /// <param name="input">Test input that may contain named placeholders or four-hex-digit placeholders of the form &lt;XXXX&gt;.</param>
+ /// <returns>The input with every placeholder replaced by the char(s) it denotes.</returns>
+ private static string ProcessInput(string input)
+ {
+     // Named placeholders for well-formed scalars. string.Replace throws ArgumentException for an
+     // empty search string, so each replacement needs a non-empty token. These must be expanded
+     // before the hex loop below, because the tokens themselves start with '<'.
+     // NOTE(review): the token spellings are reconstructed - confirm against the inline data that uses them.
+     input = input.Replace("<EACUTE>", "\u00E9", StringComparison.Ordinal); // U+00E9 LATIN SMALL LETTER E WITH ACUTE
+     input = input.Replace("<EURO>", "\u20AC", StringComparison.Ordinal); // U+20AC EURO SIGN
+     input = input.Replace("<GRIN>", "\U0001F600", StringComparison.Ordinal); // U+1F600 GRINNING FACE
+
+     // Replace <ABCD> with \uABCD. This allows us to flow potentially malformed
+     // UTF-16 strings through Xunit. (The unit testing framework gets angry when
+     // we try putting invalid UTF-16 data as inline test data.)
+
+     int idx;
+     while ((idx = input.IndexOf('<')) >= 0)
+     {
+         // A placeholder is 6 chars long: '<', four hex digits, '>'.
+         input = input[..idx] + (char)ushort.Parse(input.Substring(idx + 1, 4), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture) + input[idx + 6..];
+     }
+
+     return input;
+ }
+ }
+}
diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs
index 18ceedc2f832..5432da089bdb 100644
--- a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs
+++ b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToBytes.netcoreapp.cs
@@ -119,6 +119,34 @@ public void ToBytes_WithLargeValidBuffers(string utf16Input)
expectedNumCharsRead: expectedNumCharsConsumed,
expectedUtf8Transcoding: concatenatedUtf8);
}
+
+ // now throw lots of ASCII data at the beginning so that we exercise the vectorized code paths
+
+ utf16Input = new string('x', 64) + utf16Input;
+ concatenatedUtf8 = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+ ToBytes_Test_Core(
+ utf16Input: utf16Input,
+ destinationSize: concatenatedUtf8.Length,
+ replaceInvalidSequences: false,
+ isFinalChunk: true,
+ expectedOperationStatus: OperationStatus.Done,
+ expectedNumCharsRead: utf16Input.Length,
+ expectedUtf8Transcoding: concatenatedUtf8);
+
+ // now throw some non-ASCII data at the beginning so that we *don't* exercise the vectorized code paths
+
+ utf16Input = WOMAN_CARTWHEELING_MEDSKIN_UTF16 + utf16Input[64..];
+ concatenatedUtf8 = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+ ToBytes_Test_Core(
+ utf16Input: utf16Input,
+ destinationSize: concatenatedUtf8.Length,
+ replaceInvalidSequences: false,
+ isFinalChunk: true,
+ expectedOperationStatus: OperationStatus.Done,
+ expectedNumCharsRead: utf16Input.Length,
+ expectedUtf8Transcoding: concatenatedUtf8);
}
[Theory]
@@ -162,6 +190,18 @@ public void ToBytes_WithInvalidSurrogates(string utf16Input, int expectedNumChar
expectedOperationStatus: OperationStatus.InvalidData,
expectedNumCharsRead: expectedNumCharsConsumed,
expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
+
+ // Now try the tests again with a larger buffer.
+ // This ensures that running out of destination space wasn't the reason we failed.
+
+ ToBytes_Test_Core(
+ utf16Input: utf16Input,
+ destinationSize: (expectedUtf8TranscodingHex.Length) / 2 + 16,
+ replaceInvalidSequences: false,
+ isFinalChunk: false,
+ expectedOperationStatus: OperationStatus.InvalidData,
+ expectedNumCharsRead: expectedNumCharsConsumed,
+ expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
}
[Theory]
diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs
index 6dda95dffc10..cb3933891ce0 100644
--- a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs
+++ b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.ToChars.netcoreapp.cs
@@ -42,6 +42,18 @@ public void ToChars_WithSmallInvalidBuffers(string utf8HexInput, int expectedNum
expectedOperationStatus: OperationStatus.InvalidData,
expectedNumBytesRead: expectedNumBytesConsumed,
expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+ // Now try the tests again with a larger buffer.
+ // This ensures that running out of destination space wasn't the reason we failed.
+
+ ToChars_Test_Core(
+ utf8Input: DecodeHex(utf8HexInput),
+ destinationSize: expectedUtf16Transcoding.Length + 16,
+ replaceInvalidSequences: false,
+ isFinalChunk: false,
+ expectedOperationStatus: OperationStatus.InvalidData,
+ expectedNumBytesRead: expectedNumBytesConsumed,
+ expectedUtf16Transcoding: expectedUtf16Transcoding);
}
[Theory]
@@ -74,6 +86,18 @@ public void ToChars_WithVariousIncompleteBuffers(string utf8HexInput, int expect
expectedOperationStatus: OperationStatus.NeedMoreData,
expectedNumBytesRead: expectedNumBytesConsumed,
expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+ // Now try the tests again with a larger buffer.
+ // This ensures that running out of destination space wasn't the reason we failed.
+
+ ToChars_Test_Core(
+ utf8Input: DecodeHex(utf8HexInput),
+ destinationSize: expectedUtf16Transcoding.Length + 16,
+ replaceInvalidSequences: false,
+ isFinalChunk: false,
+ expectedOperationStatus: OperationStatus.NeedMoreData,
+ expectedNumBytesRead: expectedNumBytesConsumed,
+ expectedUtf16Transcoding: expectedUtf16Transcoding);
}
[Theory]
@@ -104,7 +128,7 @@ public void ToChars_WithVariousIncompleteBuffers(string utf8HexInput, int expect
[InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // 2x 3-byte sequences + 4x 2-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
[InlineData(GRINNING_FACE_UTF16 + GRINNING_FACE_UTF16)] // 2x 4-byte sequences, exercises 4-byte sequence processing
[InlineData(GRINNING_FACE_UTF16 + "@AB")] // single 4-byte sequence + 3 ASCII bytes, exercises 4-byte sequence processing and draining logic
- [InlineData("\U0001F938\U0001F3FD\u200D\u2640\uFE0F")] // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE, exercising switching between multiple sequence lengths
+ [InlineData(WOMAN_CARTWHEELING_MEDSKIN_UTF16)] // exercises switching between multiple sequence lengths
public void ToChars_ValidBuffers(string utf16Input)
{
// We're going to run the tests with destination buffer lengths ranging from 0 all the way
@@ -162,6 +186,34 @@ public void ToChars_ValidBuffers(string utf16Input)
expectedNumBytesRead: expectedNumBytesConsumed,
expectedUtf16Transcoding: concatenatedUtf16);
}
+
+ // now throw lots of ASCII data at the beginning so that we exercise the vectorized code paths
+
+ utf16Input = new string('x', 64) + utf16Input;
+ utf8Input = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+ ToChars_Test_Core(
+ utf8Input: utf8Input,
+ destinationSize: utf16Input.Length,
+ replaceInvalidSequences: false,
+ isFinalChunk: true,
+ expectedOperationStatus: OperationStatus.Done,
+ expectedNumBytesRead: utf8Input.Length,
+ expectedUtf16Transcoding: utf16Input);
+
+ // now throw some non-ASCII data at the beginning so that we *don't* exercise the vectorized code paths
+
+ utf16Input = WOMAN_CARTWHEELING_MEDSKIN_UTF16 + utf16Input[64..];
+ utf8Input = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+ ToChars_Test_Core(
+ utf8Input: utf8Input,
+ destinationSize: utf16Input.Length,
+ replaceInvalidSequences: false,
+ isFinalChunk: true,
+ expectedOperationStatus: OperationStatus.Done,
+ expectedNumBytesRead: utf8Input.Length,
+ expectedUtf16Transcoding: utf16Input);
}
[Theory]
@@ -182,6 +234,7 @@ public void ToChars_ValidBuffers(string utf16Input)
[InlineData("3031" + "E17F80" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD
[InlineData("3031" + "E1C080" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Improperly terminated 3-byte sequence at start of DWORD
[InlineData("3031" + "EDA080" + EURO_SYMBOL_UTF8 + EURO_SYMBOL_UTF8, 2, "01")] // Surrogate 3-byte sequence at start of DWORD
+ [InlineData("3031" + "E69C88" + "E59B" + "E69C88", 5, "01\u6708")] // Incomplete 3-byte sequence surrounded by valid 3-byte sequences
[InlineData("3031" + "F5808080", 2, "01")] // [ F5 ] is always invalid
[InlineData("3031" + "F6808080", 2, "01")] // [ F6 ] is always invalid
[InlineData("3031" + "F7808080", 2, "01")] // [ F7 ] is always invalid
@@ -208,6 +261,18 @@ public void ToChars_WithLargeInvalidBuffers(string utf8HexInput, int expectedNum
expectedOperationStatus: OperationStatus.InvalidData,
expectedNumBytesRead: expectedNumBytesConsumed,
expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+ // Now try the tests again with a larger buffer.
+ // This ensures that running out of destination space wasn't the reason we failed.
+
+ ToChars_Test_Core(
+ utf8Input: DecodeHex(utf8HexInput),
+ destinationSize: expectedUtf16Transcoding.Length + 16,
+ replaceInvalidSequences: false,
+ isFinalChunk: false,
+ expectedOperationStatus: OperationStatus.InvalidData,
+ expectedNumBytesRead: expectedNumBytesConsumed,
+ expectedUtf16Transcoding: expectedUtf16Transcoding);
}
[Theory]
diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs
index 087235a81b74..f57c769c3697 100644
--- a/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs
+++ b/src/System.Runtime/tests/System/Text/Unicode/Utf8Tests.netcoreapp.cs
@@ -33,7 +33,9 @@ public partial class Utf8Tests
private const string GRINNING_FACE_UTF8 = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes
private const string GRINNING_FACE_UTF16 = "\U0001F600";
-
+
+ private const string WOMAN_CARTWHEELING_MEDSKIN_UTF16 = "\U0001F938\U0001F3FD\u200D\u2640\uFE0F"; // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE
+
// All valid scalars [ U+0000 .. U+D7FF ] and [ U+E000 .. U+10FFFF ].
private static readonly IEnumerable s_allValidScalars = Enumerable.Range(0x0000, 0xD800).Concat(Enumerable.Range(0xE000, 0x110000 - 0xE000)).Select(value => new Rune(value));
@@ -59,7 +61,7 @@ static Utf8Tests()
* COMMON UTILITIES FOR UNIT TESTS
*/
- private static byte[] DecodeHex(ReadOnlySpan inputHex)
+ public static byte[] DecodeHex(ReadOnlySpan inputHex)
{
Assert.True(Regex.IsMatch(inputHex.ToString(), "^([0-9a-fA-F]{2})*$"), "Input must be an even number of hex characters.");
@@ -74,7 +76,7 @@ private static byte[] DecodeHex(ReadOnlySpan inputHex)
// !! IMPORTANT !!
// Don't delete this implementation, as we use it as a reference to make sure the framework's
// transcoding logic is correct.
- private static byte[] ToUtf8(Rune rune)
+ public static byte[] ToUtf8(Rune rune)
{
Assert.True(Rune.IsValid(rune.Value), $"Rune with value U+{(uint)rune.Value:X4} is not well-formed.");
diff --git a/src/System.Runtime/tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.netcoreapp.cs b/src/System.Runtime/tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.netcoreapp.cs
new file mode 100644
index 000000000000..899faa86ce3d
--- /dev/null
+++ b/src/System.Runtime/tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.netcoreapp.cs
@@ -0,0 +1,417 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Buffers;
+using System.Linq;
+using System.Reflection;
+using System.Runtime.InteropServices;
+using Xunit;
+
+namespace System.Text.Unicode.Tests
+{
+ public partial class Utf8UtilityTests
+ {
+ private unsafe delegate byte* GetPointerToFirstInvalidByteDel(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); // signature of the reflected Utf8Utility.GetPointerToFirstInvalidByte method
+ private static readonly Lazy<GetPointerToFirstInvalidByteDel> _getPointerToFirstInvalidByteFn = CreateGetPointerToFirstInvalidByteFn(); // reflection lookup is deferred until the delegate is first needed
+
+ private const string X = "58"; // U+0058 LATIN CAPITAL LETTER X, 1 byte
+ private const string Y = "59"; // U+0059 LATIN CAPITAL LETTER Y, 1 byte
+ private const string Z = "5A"; // U+005A LATIN CAPITAL LETTER Z, 1 byte
+ private const string E_ACUTE = "C3A9"; // U+00E9 LATIN SMALL LETTER E WITH ACUTE, 2 bytes
+ private const string EURO_SYMBOL = "E282AC"; // U+20AC EURO SIGN, 3 bytes
+ private const string GRINNING_FACE = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes
+
+ [Theory]
+ [InlineData("", 0, 0)] // empty string is OK
+ [InlineData(X, 1, 0)]
+ [InlineData(X + Y, 2, 0)]
+ [InlineData(X + Y + Z, 3, 0)]
+ [InlineData(E_ACUTE, 1, 0)]
+ [InlineData(X + E_ACUTE, 2, 0)]
+ [InlineData(E_ACUTE + X, 2, 0)]
+ [InlineData(EURO_SYMBOL, 1, 0)]
+ public void GetIndexOfFirstInvalidUtf8Sequence_WithSmallValidBuffers(string input, int expectedRuneCount, int expectedSurrogatePairCount)
+ {
+     // Targets the "slow processing" tail of GetIndexOfFirstInvalidUtf8Sequence, which handles
+     // fewer than 4 bytes. 'input' is hex text (two characters per byte), hence the cap of 6.
+     const int ExpectedRetValForValidInput = -1;
+
+     Assert.InRange(input.Length, 0, 6);
+     GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, ExpectedRetValForValidInput, expectedRuneCount, expectedSurrogatePairCount);
+ }
+
+ [Theory]
+ [InlineData("80", 0, 0, 0)] // sequence cannot begin with continuation character
+ [InlineData("8182", 0, 0, 0)] // sequence cannot begin with continuation character
+ [InlineData("838485", 0, 0, 0)] // sequence cannot begin with continuation character
+ [InlineData(X + "80", 1, 1, 0)] // sequence cannot begin with continuation character
+ [InlineData(X + "8182", 1, 1, 0)] // sequence cannot begin with continuation character
+ [InlineData("C0", 0, 0, 0)] // [ C0 ] is always invalid
+ [InlineData("C080", 0, 0, 0)] // [ C0 ] is always invalid
+ [InlineData("C08081", 0, 0, 0)] // [ C0 ] is always invalid
+ [InlineData(X + "C1", 1, 1, 0)] // [ C1 ] is always invalid
+ [InlineData(X + "C180", 1, 1, 0)] // [ C1 ] is always invalid
+ [InlineData("C2", 0, 0, 0)] // [ C2 ] is improperly terminated
+ [InlineData(X + "C27F", 1, 1, 0)] // [ C2 ] is improperly terminated
+ [InlineData(X + "E282", 1, 1, 0)] // [ E2 82 ] is improperly terminated
+ [InlineData("E2827F", 0, 0, 0)] // [ E2 82 ] is improperly terminated
+ [InlineData("E09F80", 0, 0, 0)] // [ E0 9F ... ] is overlong
+ [InlineData("E0C080", 0, 0, 0)] // [ E0 ] is improperly terminated
+ [InlineData("ED7F80", 0, 0, 0)] // [ ED ] is improperly terminated
+ [InlineData("EDA080", 0, 0, 0)] // [ ED A0 ... ] is surrogate
+ public void GetIndexOfFirstInvalidUtf8Sequence_WithSmallInvalidBuffers(string input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
+ {
+     // Targets the "slow processing" tail of GetIndexOfFirstInvalidUtf8Sequence, which handles
+     // fewer than 4 bytes. 'input' is hex text (two characters per byte), hence the cap of 6.
+     int hexCharCount = input.Length;
+
+     Assert.InRange(hexCharCount, 0, 6);
+     GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, expectedRetVal, expectedRuneCount, expectedSurrogatePairCount);
+ }
+
+ [Theory]
+ [InlineData(E_ACUTE + "21222324" + "303132333435363738393A3B3C3D3E3F", 21, 0)] // Loop unrolling at end of buffer
+ [InlineData(E_ACUTE + "21222324" + "303132333435363738393A3B3C3D3E3F" + "3031323334353637" + E_ACUTE + "38393A3B3C3D3E3F", 38, 0)] // Loop unrolling interrupted by non-ASCII
+ [InlineData("212223" + E_ACUTE + "30313233", 8, 0)] // 3 ASCII bytes followed by non-ASCII
+ [InlineData("2122" + E_ACUTE + "30313233", 7, 0)] // 2 ASCII bytes followed by non-ASCII
+ [InlineData("21" + E_ACUTE + "30313233", 6, 0)] // 1 ASCII byte followed by non-ASCII
+ [InlineData(E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE, 4, 0)] // 4x 2-byte sequences, exercises optimization code path in 2-byte sequence processing
+ [InlineData(E_ACUTE + E_ACUTE + E_ACUTE + "5051", 5, 0)] // 3x 2-byte sequences + 2 ASCII bytes, exercises optimization code path in 2-byte sequence processing
+ [InlineData(E_ACUTE + "5051", 3, 0)] // single 2-byte sequence + 2 trailing ASCII bytes, exercises draining logic in 2-byte sequence processing
+ [InlineData(E_ACUTE + "50" + E_ACUTE + "304050", 6, 0)] // single 2-byte sequences + 1 trailing ASCII byte + 2-byte sequence, exercises draining logic in 2-byte sequence processing
+ [InlineData(EURO_SYMBOL + "20", 2, 0)] // single 3-byte sequence + 1 trailing ASCII byte, exercises draining logic in 3-byte sequence processing
+ [InlineData(EURO_SYMBOL + "203040", 4, 0)] // single 3-byte sequence + 3 trailing ASCII byte, exercises draining logic and "running out of data" logic in 3-byte sequence processing
+ [InlineData(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL, 3, 0)] // 3x 3-byte sequences, exercises "stay within 3-byte loop" logic in 3-byte sequence processing
+ [InlineData(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL, 4, 0)] // 4x 3-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+ [InlineData(EURO_SYMBOL + EURO_SYMBOL + EURO_SYMBOL + E_ACUTE, 4, 0)] // 3x 3-byte sequences + single 2-byte sequence, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+ [InlineData(EURO_SYMBOL + EURO_SYMBOL + E_ACUTE + E_ACUTE + E_ACUTE + E_ACUTE, 6, 0)] // 2x 3-byte sequences + 4x 2-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+ [InlineData(GRINNING_FACE + GRINNING_FACE, 2, 2)] // 2x 4-byte sequences, exercises 4-byte sequence processing
+ [InlineData(GRINNING_FACE + "303132", 4, 1)] // single 4-byte sequence + 3 ASCII bytes, exercises 4-byte sequence processing and draining logic
+ [InlineData("F09FA4B8" + "F09F8FBD" + "E2808D" + "E29980" + "EFB88F", 5, 2)] // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE, exercising switching between multiple sequence lengths
+ public void GetIndexOfFirstInvalidUtf8Sequence_WithLargeValidBuffers(string input, int expectedRuneCount, int expectedSurrogatePairCount)
+ {
+     // Targets the "fast processing" main loop of GetIndexOfFirstInvalidUtf8Sequence, so inputs
+     // should be >= 4 bytes. 'input' is hex text (two characters per byte), hence the floor of 8.
+     const int ExpectedRetValForValidInput = -1;
+
+     Assert.True(input.Length >= 8);
+     GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, ExpectedRetValForValidInput, expectedRuneCount, expectedSurrogatePairCount);
+ }
+
+ [Theory]
+ [InlineData("3031" + "80" + "202122232425", 2, 2, 0)] // Continuation character at start of sequence should match no bitmask
+ [InlineData("3031" + "C080" + "2021222324", 2, 2, 0)] // Overlong 2-byte sequence at start of DWORD
+ [InlineData("3031" + "C180" + "2021222324", 2, 2, 0)] // Overlong 2-byte sequence at start of DWORD
+ [InlineData("C280" + "C180", 2, 1, 0)] // Overlong 2-byte sequence at end of DWORD
+ [InlineData("C27F" + "C280", 0, 0, 0)] // Improperly terminated 2-byte sequence at start of DWORD
+ [InlineData("C2C0" + "C280", 0, 0, 0)] // Improperly terminated 2-byte sequence at start of DWORD
+ [InlineData("C280" + "C27F", 2, 1, 0)] // Improperly terminated 2-byte sequence at end of DWORD
+ [InlineData("C280" + "C2C0", 2, 1, 0)] // Improperly terminated 2-byte sequence at end of DWORD
+ [InlineData("C280" + "C280" + "80203040", 4, 2, 0)] // Continuation character at start of sequence, within "stay in 2-byte processing" optimization
+ [InlineData("C280" + "C280" + "C180" + "C280", 4, 2, 0)] // Overlong 2-byte sequence at start of DWORD, within "stay in 2-byte processing" optimization
+ [InlineData("C280" + "C280" + "C280" + "C180", 6, 3, 0)] // Overlong 2-byte sequence at end of DWORD, within "stay in 2-byte processing" optimization
+ [InlineData("3031" + "E09F80" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Overlong 3-byte sequence at start of DWORD
+ [InlineData("3031" + "E07F80" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD
+ [InlineData("3031" + "E0C080" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD
+ [InlineData("3031" + "E17F80" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD
+ [InlineData("3031" + "E1C080" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Improperly terminated 3-byte sequence at start of DWORD
+ [InlineData("3031" + "EDA080" + EURO_SYMBOL + EURO_SYMBOL, 2, 2, 0)] // Surrogate 3-byte sequence at start of DWORD
+ [InlineData("3031" + "E69C88" + "E59B" + "E69C88", 5, 3, 0)] // Incomplete 3-byte sequence surrounded by valid 3-byte sequences
+ [InlineData("3031" + "F5808080", 2, 2, 0)] // [ F5 ] is always invalid
+ [InlineData("3031" + "F6808080", 2, 2, 0)] // [ F6 ] is always invalid
+ [InlineData("3031" + "F7808080", 2, 2, 0)] // [ F7 ] is always invalid
+ [InlineData("3031" + "F8808080", 2, 2, 0)] // [ F8 ] is always invalid
+ [InlineData("3031" + "F9808080", 2, 2, 0)] // [ F9 ] is always invalid
+ [InlineData("3031" + "FA808080", 2, 2, 0)] // [ FA ] is always invalid
+ [InlineData("3031" + "FB808080", 2, 2, 0)] // [ FB ] is always invalid
+ [InlineData("3031" + "FC808080", 2, 2, 0)] // [ FC ] is always invalid
+ [InlineData("3031" + "FD808080", 2, 2, 0)] // [ FD ] is always invalid
+ [InlineData("3031" + "FE808080", 2, 2, 0)] // [ FE ] is always invalid
+ [InlineData("3031" + "FF808080", 2, 2, 0)] // [ FF ] is always invalid
+ public void GetIndexOfFirstInvalidUtf8Sequence_WithLargeInvalidBuffers(string input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
+ {
+     // Targets the "fast processing" main loop of GetIndexOfFirstInvalidUtf8Sequence, so inputs
+     // should be >= 4 bytes. 'input' is hex text (two characters per byte), hence the floor of 8.
+     bool hasAtLeastFourBytes = input.Length >= 8;
+
+     Assert.True(hasAtLeastFourBytes);
+     GetIndexOfFirstInvalidUtf8Sequence_Test_Core(input, expectedRetVal, expectedRuneCount, expectedSurrogatePairCount);
+ }
+
+ [Fact]
+ public void GetIndexOfFirstInvalidUtf8Sequence_WithOverlongTwoByteSequences_ReturnsInvalid()
+ {
+     // [ C0 ] and [ C1 ] are never valid bytes; each indicates an overlong 2-byte sequence.
+     // For both of them, check that [ lead ] [ 00..FF ] is treated as invalid.
+     // Every [ C0 xx ] combination is exercised before any [ C1 xx ] combination.
+
+     byte[] alwaysInvalidLeadBytes = { 0xC0, 0xC1 };
+
+     foreach (byte leadByte in alwaysInvalidLeadBytes)
+     {
+         // Sweep every possible second byte: [ lead ] [ 00..FF ].
+
+         for (int secondByte = 0x00; secondByte <= 0xFF; secondByte++)
+         {
+             AssertIsInvalidTwoByteSequence(new byte[] { leadByte, (byte)secondByte });
+         }
+     }
+ }
+
+ [Fact]
+ public void GetIndexOfFirstInvalidUtf8Sequence_WithImproperlyTerminatedTwoByteSequences_ReturnsInvalid()
+ {
+     // Test [ C2..DF ] [ 00..7F ] and [ C2..DF ] [ C0..FF ]: a 2-byte lead followed by a non-continuation byte.
+     // The upper bound is inclusive so that [ DF ], the last lead byte in the range, is exercised too.
+     for (int i = 0xC2; i <= 0xDF; i++)
+     {
+         for (int j = 0; j < 0x80; j++) // second byte below the continuation range [ 80..BF ]
+         {
+             AssertIsInvalidTwoByteSequence(new byte[] { (byte)i, (byte)j });
+         }
+         for (int j = 0xC0; j < 0x100; j++) // second byte above the continuation range [ 80..BF ]
+         {
+             AssertIsInvalidTwoByteSequence(new byte[] { (byte)i, (byte)j });
+         }
+     }
+ }
+
+ [Fact]
+ public void GetIndexOfFirstInvalidUtf8Sequence_WithOverlongThreeByteSequences_ReturnsInvalid()
+ {
+     // [ E0 ] [ 80..9F ] [ 80..BF ] is an overlong 3-byte sequence. The sweep starts at 00,
+     // so second bytes below the continuation range are checked as well.
+     foreach (int secondByte in Enumerable.Range(0x00, 0xA0)) // yields 00..9F
+     {
+         AssertIsInvalidThreeByteSequence(new byte[] { 0xE0, (byte)secondByte, 0x80 });
+     }
+ }
+
+ [Fact]
+ public void GetIndexOfFirstInvalidUtf8Sequence_WithSurrogateThreeByteSequences_ReturnsInvalid()
+ {
+     // [ ED ] [ A0..BF ] [ 80..BF ] is a surrogate 3-byte sequence. The sweep continues through FF,
+     // so second bytes above the continuation range are checked as well.
+     foreach (int secondByte in Enumerable.Range(0xA0, 0x100 - 0xA0)) // yields A0..FF
+     {
+         AssertIsInvalidThreeByteSequence(new byte[] { 0xED, (byte)secondByte, 0x80 });
+     }
+ }
+
+ [Fact]
+ public void GetIndexOfFirstInvalidUtf8Sequence_WithImproperlyTerminatedThreeByteSequence_ReturnsInvalid()
+ {
+     // [ E0..EF ] [ 80..BF ] [ !(80..BF) ] is an improperly terminated 3-byte sequence.
+     // Both '9F' and 'A0' are used as the second byte to make sure at least one of them
+     // isn't caught by the overlong / surrogate checks.
+
+     byte[] secondBytes = { 0x9F, 0xA0 };
+     int[] nonContinuationBytes = Enumerable.Range(0x00, 0x80).Concat(Enumerable.Range(0xC0, 0x40)).ToArray(); // 00..7F, then C0..FF
+
+     for (int leadByte = 0xE0; leadByte <= 0xEF; leadByte++)
+     {
+         foreach (int thirdByte in nonContinuationBytes)
+         {
+             foreach (byte secondByte in secondBytes)
+             {
+                 AssertIsInvalidThreeByteSequence(new byte[] { (byte)leadByte, secondByte, (byte)thirdByte });
+             }
+         }
+     }
+ }
+
+ [Fact]
+ public void GetIndexOfFirstInvalidUtf8Sequence_WithOverlongFourByteSequences_ReturnsInvalid()
+ {
+     // [ F0 ] [ 80..8F ] [ 80..BF ] [ 80..BF ] is an overlong 4-byte sequence. The sweep starts at 00,
+     // so second bytes below the continuation range are checked as well.
+     for (byte secondByte = 0x00; secondByte < 0x90; secondByte++)
+     {
+         AssertIsInvalidFourByteSequence(new byte[] { 0xF0, secondByte, 0x80, 0x80 });
+     }
+ }
+
+ [Fact]
+ public void GetIndexOfFirstInvalidUtf8Sequence_WithOutOfRangeFourByteSequences_ReturnsInvalid()
+ {
+     // [ F4 ] [ 90..BF ] [ 80..BF ] [ 80..BF ] is an out-of-range 4-byte sequence. The sweep continues
+     // through FF, so second bytes above the continuation range are checked as well.
+     for (int secondByte = 0x90; secondByte <= 0xFF; secondByte++)
+     {
+         AssertIsInvalidFourByteSequence(new byte[] { 0xF4, (byte)secondByte, 0x80, 0x80 });
+     }
+ }
+
+ [Fact]
+ public void GetIndexOfFirstInvalidUtf8Sequence_WithInvalidFourByteSequence_ReturnsInvalid()
+ {
+     // [ F0..F4 ] [ !(80..BF) ] [ !(80..BF) ] [ !(80..BF) ] is improperly terminated 4-byte sequence
+     // Each call below places a single non-continuation byte 'j' in the second, third, or fourth position.
+     for (int i = 0xF0; i < 0xF5; i++)
+     {
+         for (int j = 0x00; j < 0x80; j++)
+         {
+             AssertIsInvalidFourByteSequence(new byte[] { (byte)i, (byte)j, 0x80, 0x80 });
+
+             // Use both '8F' and '90' to make sure at least one isn't caught by overlong ([ F0 8F ]) / out-of-range ([ F4 90 ]) checks
+             AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x8F, (byte)j, 0x80 });
+             AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x90, (byte)j, 0x80 });
+
+             AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x8F, 0x80, (byte)j });
+             AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x90, 0x80, (byte)j });
+         }
+         for (int j = 0xC0; j < 0x100; j++)
+         {
+             AssertIsInvalidFourByteSequence(new byte[] { (byte)i, (byte)j, 0x80, 0x80 });
+
+             // Use both '8F' and '90' to make sure at least one isn't caught by overlong ([ F0 8F ]) / out-of-range ([ F4 90 ]) checks
+             AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x8F, (byte)j, 0x80 });
+             AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x90, (byte)j, 0x80 });
+
+             AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x8F, 0x80, (byte)j });
+             AssertIsInvalidFourByteSequence(new byte[] { (byte)i, 0x90, 0x80, (byte)j });
+         }
+     }
+ }
+
+ private static void AssertIsInvalidTwoByteSequence(byte[] invalidSequence)
+ {
+     Assert.Equal(2, invalidSequence.Length);
+     byte[] eAcute = Utf8Tests.DecodeHex(E_ACUTE); // known-good 2-byte sequence used as padding
+     byte[] Join(params byte[][] parts) => parts.SelectMany(part => part).ToArray();
+
+     // Invalid sequence at start of first DWORD
+     GetIndexOfFirstInvalidUtf8Sequence_Test_Core(Join(invalidSequence, invalidSequence, eAcute), 0, 0, 0);
+
+     // Invalid sequence at end of first DWORD
+     GetIndexOfFirstInvalidUtf8Sequence_Test_Core(Join(eAcute, invalidSequence, eAcute), 2, 1, 0);
+
+     // Same checks again, but with extra valid data up front so that we're inside one of
+     // the 2-byte processing "hot loop" code paths.
+
+     // Invalid sequence at start of next DWORD
+     GetIndexOfFirstInvalidUtf8Sequence_Test_Core(Join(eAcute, eAcute, invalidSequence, eAcute), 4, 2, 0);
+
+     // Invalid sequence at end of next DWORD
+     GetIndexOfFirstInvalidUtf8Sequence_Test_Core(Join(eAcute, eAcute, eAcute, invalidSequence, eAcute), 6, 3, 0);
+ }
+
+ private static void AssertIsInvalidThreeByteSequence(byte[] invalidSequence)
+ {
+     Assert.Equal(3, invalidSequence.Length);
+
+     byte[] euroSign = Utf8Tests.DecodeHex(EURO_SYMBOL);
+
+     // Invalid sequence at start of first DWORD
+     byte[] toTest = invalidSequence.Concat(invalidSequence).Concat(euroSign).ToArray();
+     GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 0, 0, 0);
+
+     // Same check with 1, 2, then 3 valid sequences up front so that we're inside one of the 3-byte processing "hot loop" code paths.
+     // 1 prefix: straddling first and second DWORDs. 2 prefixes: straddling second and third DWORDs.
+     // 3 prefixes: at end of third DWORD. Each prefix adds 3 bytes and 1 rune to the expected values.
+
+     for (int goodPrefixCount = 1; goodPrefixCount <= 3; goodPrefixCount++)
+     {
+         byte[] goodPrefix = Enumerable.Repeat(euroSign, goodPrefixCount).SelectMany(bytes => bytes).ToArray();
+         toTest = goodPrefix.Concat(invalidSequence).Concat(euroSign).ToArray();
+         GetIndexOfFirstInvalidUtf8Sequence_Test_Core(toTest, 3 * goodPrefixCount, goodPrefixCount, 0);
+     }
+ }
+
+ private static void AssertIsInvalidFourByteSequence(byte[] invalidSequence)
+ {
+     Assert.Equal(4, invalidSequence.Length);
+     byte[] grinningFace = Utf8Tests.DecodeHex(GRINNING_FACE); // known-good 4-byte sequence used as padding
+
+     // Invalid data first: expect failure at offset 0 with no runes or surrogate pairs counted.
+     byte[][] invalidFirst = { invalidSequence, invalidSequence, grinningFace };
+     GetIndexOfFirstInvalidUtf8Sequence_Test_Core(invalidFirst.SelectMany(bytes => bytes).ToArray(), 0, 0, 0);
+     // One valid 4-byte sequence first: expect failure at offset 4 after 1 rune and 1 surrogate pair.
+     byte[][] invalidSecond = { grinningFace, invalidSequence, grinningFace };
+     GetIndexOfFirstInvalidUtf8Sequence_Test_Core(invalidSecond.SelectMany(bytes => bytes).ToArray(), 4, 1, 1);
+ }
+
+ private static void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(string inputHex, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
+ {
+     byte[] payload = Utf8Tests.DecodeHex(inputHex);
+
+     // Three passes over the same payload, each with a different number of ASCII 'x' bytes in front:
+     //   0   - run the test normally
+     //   128 - a bunch of ASCII data at the beginning (to exercise the vectorized code paths)
+     //   135 - a few more ASCII bytes at the beginning (to test that offsets are properly handled)
+     foreach (int asciiPrefixLength in new[] { 0, 128, 135 })
+     {
+         byte[] padded = Enumerable.Repeat((byte)'x', asciiPrefixLength).Concat(payload).ToArray();
+         int shiftedRetVal = (expectedRetVal < 0) ? expectedRetVal : (expectedRetVal + asciiPrefixLength); // negative means "no invalid data" and is not an offset
+         GetIndexOfFirstInvalidUtf8Sequence_Test_Core(padded, shiftedRetVal, expectedRuneCount + asciiPrefixLength, expectedSurrogatePairCount);
+     }
+ }
+
+ private static unsafe void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(byte[] input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
+ {
+     // Arrange
+     // The method under test reads from a BoundedMemory<byte> built from 'input' and then marked read-only.
+     using BoundedMemory<byte> boundedMemory = BoundedMemory.AllocateFromExistingData(input);
+     boundedMemory.MakeReadonly();
+
+     // Act
+
+     int actualRetVal;
+     int actualSurrogatePairCount;
+     int actualRuneCount;
+
+     fixed (byte* pInputBuffer = &MemoryMarshal.GetReference(boundedMemory.Span))
+     {
+         byte* pFirstInvalidByte = _getPointerToFirstInvalidByteFn.Value(pInputBuffer, input.Length, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment);
+         // The returned pointer must land inside the buffer or exactly at its end.
+         long ptrDiff = pFirstInvalidByte - pInputBuffer;
+         Assert.True((ulong)ptrDiff <= (uint)input.Length, "ptrDiff was outside expected range.");
+
+         Assert.True(utf16CodeUnitCountAdjustment <= 0, "UTF-16 code unit count adjustment must be 0 or negative.");
+         Assert.True(scalarCountAdjustment <= 0, "Scalar count adjustment must be 0 or negative.");
+         // A pointer to the end of the buffer means no invalid data was found; that is reported as -1.
+         actualRetVal = (ptrDiff == input.Length) ? -1 : (int)ptrDiff;
+
+         // The last two 'out' parameters are:
+         // a) The number to be added to the "bytes processed" return value to come up with the total UTF-16 code unit count, and
+         // b) The number to be added to the "total UTF-16 code unit count" value to come up with the total scalar count.
+
+         int totalUtf16CodeUnitCount = (int)ptrDiff + utf16CodeUnitCountAdjustment;
+         actualRuneCount = totalUtf16CodeUnitCount + scalarCountAdjustment;
+
+         // Surrogate pair count is number of UTF-16 code units less the number of scalars.
+
+         actualSurrogatePairCount = totalUtf16CodeUnitCount - actualRuneCount;
+     }
+
+     // Assert
+
+     Assert.Equal(expectedRetVal, actualRetVal);
+     Assert.Equal(expectedRuneCount, actualRuneCount);
+     Assert.Equal(expectedSurrogatePairCount, actualSurrogatePairCount);
+ }
+
+ private static Lazy<GetPointerToFirstInvalidByteDel> CreateGetPointerToFirstInvalidByteFn()
+ {
+     return new Lazy<GetPointerToFirstInvalidByteDel>(() =>
+     {
+         Type utf8UtilityType = typeof(Utf8).Assembly.GetType("System.Text.Unicode.Utf8Utility");
+         // Utf8Utility is looked up by name in the assembly that defines Utf8, so it may be missing.
+         if (utf8UtilityType is null)
+         {
+             throw new Exception("Couldn't find Utf8Utility type in System.Private.CoreLib.");
+         }
+         // Search static methods of any visibility.
+         MethodInfo methodInfo = utf8UtilityType.GetMethod("GetPointerToFirstInvalidByte", BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic);
+
+         if (methodInfo is null)
+         {
+             throw new Exception("Couldn't find GetPointerToFirstInvalidByte method on Utf8Utility.");
+         }
+         // Bind the reflected method to the strongly-typed delegate used by the tests.
+         return (GetPointerToFirstInvalidByteDel)methodInfo.CreateDelegate(typeof(GetPointerToFirstInvalidByteDel));
+     });
+ }
+ }
+}
diff --git a/src/System.Text.Encoding/tests/NegativeEncodingTests.cs b/src/System.Text.Encoding/tests/NegativeEncodingTests.cs
index 2718f6f5287c..e5ae72d3cf40 100644
--- a/src/System.Text.Encoding/tests/NegativeEncodingTests.cs
+++ b/src/System.Text.Encoding/tests/NegativeEncodingTests.cs
@@ -45,7 +45,14 @@ public static IEnumerable