diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/PInvokeMarshal.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/PInvokeMarshal.cs index d4fe3590513067..17107b23ad1c60 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/PInvokeMarshal.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/InteropServices/PInvokeMarshal.cs @@ -10,6 +10,8 @@ using Internal.Runtime.Augments; using Internal.Runtime.CompilerHelpers; using Internal.Runtime.CompilerServices; +using System.Text; +using System.Buffers; namespace System.Runtime.InteropServices { @@ -502,17 +504,7 @@ public static unsafe char AnsiCharToWideChar(byte nativeValue) internal static unsafe byte* StringToAnsiString(char* pManaged, int lenUnicode, byte* pNative, bool terminateWithNull, bool bestFit, bool throwOnUnmappableChar) { - bool allAscii = true; - - for (int i = 0; i < lenUnicode; i++) - { - if (pManaged[i] >= 128) - { - allAscii = false; - break; - } - } - + bool allAscii = Ascii.IsValid(new ReadOnlySpan(pManaged, lenUnicode)); int length; if (allAscii) // If all ASCII, map one UNICODE character to one ANSI char @@ -530,17 +522,8 @@ public static unsafe char AnsiCharToWideChar(byte nativeValue) } if (allAscii) // ASCII conversion { - byte* pDst = pNative; - char* pSrc = pManaged; - - while (lenUnicode > 0) - { - unchecked - { - *pDst++ = (byte)(*pSrc++); - lenUnicode--; - } - } + OperationStatus conversionStatus = Ascii.FromUtf16(new ReadOnlySpan(pManaged, length), new Span(pNative, length), out _); + Debug.Assert(conversionStatus == OperationStatus.Done); } else // Let OS convert { @@ -566,26 +549,9 @@ public static unsafe char AnsiCharToWideChar(byte nativeValue) /// private static unsafe bool CalculateStringLength(byte* pchBuffer, out int ansiBufferLen, out int unicodeBufferLen) { - ansiBufferLen = 0; - - bool allAscii = true; - - { - byte* p = pchBuffer; - byte b = 
*p++; - - while (b != 0) - { - if (b >= 128) - { - allAscii = false; - } - - ansiBufferLen++; - - b = *p++; - } - } + ReadOnlySpan span = MemoryMarshal.CreateReadOnlySpanFromNullTerminated(pchBuffer); + ansiBufferLen = span.Length; + bool allAscii = Ascii.IsValid(span); if (allAscii) { diff --git a/src/libraries/Common/tests/System/Net/Http/Http2LoopbackConnection.cs b/src/libraries/Common/tests/System/Net/Http/Http2LoopbackConnection.cs index 6263483807c76a..ed5af827da852a 100644 --- a/src/libraries/Common/tests/System/Net/Http/Http2LoopbackConnection.cs +++ b/src/libraries/Common/tests/System/Net/Http/Http2LoopbackConnection.cs @@ -91,7 +91,7 @@ private async Task ReadPrefixAsync() throw new Exception("Connection stream closed while attempting to read connection preface."); } - if (Text.Encoding.ASCII.GetString(_prefix).Contains("HTTP/1.1")) + if (_prefix.AsSpan().IndexOf("HTTP/1.1"u8) >= 0) { // Tests that use HttpAgnosticLoopbackServer will attempt to send an HTTP/1.1 request to an HTTP/2 server. // This is invalid and we should terminate the connection. diff --git a/src/libraries/System.Net.Http/src/System/Net/Http/Headers/ContentDispositionHeaderValue.cs b/src/libraries/System.Net.Http/src/System/Net/Http/Headers/ContentDispositionHeaderValue.cs index 105a58e93869df..a350942d9c9e45 100644 --- a/src/libraries/System.Net.Http/src/System/Net/Http/Headers/ContentDispositionHeaderValue.cs +++ b/src/libraries/System.Net.Http/src/System/Net/Http/Headers/ContentDispositionHeaderValue.cs @@ -422,7 +422,7 @@ private static string EncodeAndQuoteMime(string input) throw new ArgumentException(SR.Format(CultureInfo.InvariantCulture, SR.net_http_headers_invalid_value, input)); } - else if (HeaderUtilities.ContainsNonAscii(result)) + else if (!Ascii.IsValid(result)) { needsQuotes = true; // Encoded data must always be quoted, the equals signs are invalid in tokens. 
result = EncodeMime(result); // =?utf-8?B?asdfasdfaesdf?= diff --git a/src/libraries/System.Net.Http/src/System/Net/Http/Headers/HeaderUtilities.cs b/src/libraries/System.Net.Http/src/System/Net/Http/Headers/HeaderUtilities.cs index 711c8a23225fc0..c57d1b573fffb8 100644 --- a/src/libraries/System.Net.Http/src/System/Net/Http/Headers/HeaderUtilities.cs +++ b/src/libraries/System.Net.Http/src/System/Net/Http/Headers/HeaderUtilities.cs @@ -63,20 +63,6 @@ internal static void SetQuality(UnvalidatedObjectCollection 0x7f) - { - return true; - } - } - return false; - } - // Encode a string using RFC 5987 encoding. // encoding'lang'PercentEncodedSpecials internal static string Encode5987(string input) diff --git a/src/libraries/System.Net.Http/src/System/Net/Http/SocketsHttpHandler/AuthenticationHelper.Digest.cs b/src/libraries/System.Net.Http/src/System/Net/Http/SocketsHttpHandler/AuthenticationHelper.Digest.cs index 5e5a7c741aa735..89bd8dc802770c 100644 --- a/src/libraries/System.Net.Http/src/System/Net/Http/SocketsHttpHandler/AuthenticationHelper.Digest.cs +++ b/src/libraries/System.Net.Http/src/System/Net/Http/SocketsHttpHandler/AuthenticationHelper.Digest.cs @@ -88,7 +88,7 @@ internal static partial class AuthenticationHelper } else { - if (HeaderUtilities.ContainsNonAscii(credential.UserName)) + if (!Ascii.IsValid(credential.UserName)) { string usernameStar = HeaderUtilities.Encode5987(credential.UserName); sb.AppendKeyValue(UsernameStar, usernameStar, includeQuotes: false); diff --git a/src/libraries/System.Net.Http/src/System/Net/Http/SocketsHttpHandler/HttpConnection.cs b/src/libraries/System.Net.Http/src/System/Net/Http/SocketsHttpHandler/HttpConnection.cs index 6092a57decb303..7b2897c1a26253 100644 --- a/src/libraries/System.Net.Http/src/System/Net/Http/SocketsHttpHandler/HttpConnection.cs +++ b/src/libraries/System.Net.Http/src/System/Net/Http/SocketsHttpHandler/HttpConnection.cs @@ -1582,12 +1582,10 @@ private Task WriteAsciiStringAsync(string s, bool async) 
int offset = _writeOffset; if (s.Length <= _writeBuffer.Length - offset) { - byte[] writeBuffer = _writeBuffer; - foreach (char c in s) - { - writeBuffer[offset++] = (byte)c; - } - _writeOffset = offset; + OperationStatus operationStatus = Ascii.FromUtf16(s, _writeBuffer.AsSpan(offset), out int bytesWritten); + Debug.Assert(operationStatus == OperationStatus.Done); + _writeOffset = offset + bytesWritten; + return Task.CompletedTask; } @@ -1598,14 +1596,14 @@ private Task WriteAsciiStringAsync(string s, bool async) private async Task WriteStringAsyncSlow(string s, bool async) { + if (!Ascii.IsValid(s)) + { + throw new HttpRequestException(SR.net_http_request_invalid_char_encoding); + } + for (int i = 0; i < s.Length; i++) { - char c = s[i]; - if ((c & 0xFF80) != 0) - { - throw new HttpRequestException(SR.net_http_request_invalid_char_encoding); - } - await WriteByteAsync((byte)c, async).ConfigureAwait(false); + await WriteByteAsync((byte)s[i], async).ConfigureAwait(false); } } diff --git a/src/libraries/System.Net.HttpListener/src/System.Net.HttpListener.csproj b/src/libraries/System.Net.HttpListener/src/System.Net.HttpListener.csproj index b0e790da1d6c7d..a72b0e81ef495d 100644 --- a/src/libraries/System.Net.HttpListener/src/System.Net.HttpListener.csproj +++ b/src/libraries/System.Net.HttpListener/src/System.Net.HttpListener.csproj @@ -71,8 +71,6 @@ Link="Common\System\Net\CookieFields.cs" /> - + { + if (state.uriPrefix[state.j] == ':') + { + state.uriPrefix.CopyTo(destination); + } + else + { + int indexOfNextCopy = state.j; + state.uriPrefix.AsSpan(0, indexOfNextCopy).CopyTo(destination); + + if (state.i == 7) + { + ":80".CopyTo(destination.Slice(indexOfNextCopy)); + indexOfNextCopy += 3; + } + else + { + ":443".CopyTo(destination.Slice(indexOfNextCopy)); + indexOfNextCopy += 4; + } + + state.uriPrefix.AsSpan(state.j).CopyTo(destination.Slice(indexOfNextCopy)); + } + + int toLowerLength = destination.IndexOf(':'); + if (toLowerLength < 0) + { + toLowerLength = 
destination.Length; + } + + OperationStatus operationStatus = Ascii.ToLowerInPlace(destination.Slice(0, toLowerLength), out _); + Debug.Assert(operationStatus == OperationStatus.Done); + }); + } } internal bool ContainsPrefix(string uriPrefix) => _uriPrefixes.Contains(uriPrefix); diff --git a/src/libraries/System.Net.Mail/src/System/Net/Mail/DomainLiteralReader.cs b/src/libraries/System.Net.Mail/src/System/Net/Mail/DomainLiteralReader.cs index 2f03cf45b458c8..884b5c123bfcff 100644 --- a/src/libraries/System.Net.Mail/src/System/Net/Mail/DomainLiteralReader.cs +++ b/src/libraries/System.Net.Mail/src/System/Net/Mail/DomainLiteralReader.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Text; using System.Diagnostics; using System.Net.Mime; @@ -70,7 +71,7 @@ internal static bool TryReadReverse(string data, int index, out int outIndex, bo return true; } // Check for invalid characters - else if (data[index] > MailBnfHelper.Ascii7bitMaxValue || !MailBnfHelper.Dtext[data[index]]) + else if (!Ascii.IsValid(data[index]) || !MailBnfHelper.Dtext[data[index]]) { if (throwExceptionIfFail) { diff --git a/src/libraries/System.Net.Mail/src/System/Net/Mail/DotAtomReader.cs b/src/libraries/System.Net.Mail/src/System/Net/Mail/DotAtomReader.cs index f50b9946f4fe15..b032299b85352d 100644 --- a/src/libraries/System.Net.Mail/src/System/Net/Mail/DotAtomReader.cs +++ b/src/libraries/System.Net.Mail/src/System/Net/Mail/DotAtomReader.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Text; using System.Diagnostics; using System.Net.Mime; @@ -43,7 +44,7 @@ internal static bool TryReadReverse(string data, int index, out int outIndex, bo // Scan for the first invalid chars (including whitespace) for (; 0 <= index; index--) { - if (data[index] <= MailBnfHelper.Ascii7bitMaxValue // Any Unicode allowed + if (Ascii.IsValid(data[index]) // Only ASCII is validated; any non-ASCII (Unicode) char is allowed && (data[index] != MailBnfHelper.Dot && !MailBnfHelper.Atext[data[index]])) // Invalid char { break; diff --git a/src/libraries/System.Net.Mail/src/System/Net/Mail/MailBnfHelper.cs b/src/libraries/System.Net.Mail/src/System/Net/Mail/MailBnfHelper.cs index be5ca6c69511d0..9ce43f271bbf87 100644 --- a/src/libraries/System.Net.Mail/src/System/Net/Mail/MailBnfHelper.cs +++ b/src/libraries/System.Net.Mail/src/System/Net/Mail/MailBnfHelper.cs @@ -26,7 +26,6 @@ internal static class MailBnfHelper // characters allowed inside of comments internal static readonly bool[] Ctext = CreateCharactersAllowedInComments(); - internal const int Ascii7bitMaxValue = 127; internal const char Quote = '\"'; internal const char Space = ' '; internal const char Tab = '\t'; @@ -226,11 +225,11 @@ internal static void ValidateHeaderName(string data) { //if data contains Unicode and Unicode is permitted, then //it is valid in a quoted string in a header. 
- if (data[offset] <= Ascii7bitMaxValue && !Qtext[data[offset]]) + if (Ascii.IsValid(data[offset]) && !Qtext[data[offset]]) throw new FormatException(SR.Format(SR.MailHeaderFieldInvalidCharacter, data[offset])); } //not permitting Unicode, in which case Unicode is a formatting error - else if (data[offset] > Ascii7bitMaxValue || !Qtext[data[offset]]) + else if (!Ascii.IsValid(data[offset]) || !Qtext[data[offset]]) { throw new FormatException(SR.Format(SR.MailHeaderFieldInvalidCharacter, data[offset])); } @@ -256,7 +255,7 @@ internal static string ReadToken(string data, ref int offset) int start = offset; for (; offset < data.Length; offset++) { - if (data[offset] > Ascii7bitMaxValue) + if (!Ascii.IsValid(data[offset])) { throw new FormatException(SR.Format(SR.MailHeaderFieldInvalidCharacter, data[offset])); } @@ -367,7 +366,7 @@ internal static void GetTokenOrQuotedString(string data, StringBuilder builder, private static bool CheckForUnicode(char ch, bool allowUnicode) { - if (ch < Ascii7bitMaxValue) + if (Ascii.IsValid(ch)) { return false; } diff --git a/src/libraries/System.Net.Mail/src/System/Net/Mail/QuotedPairReader.cs b/src/libraries/System.Net.Mail/src/System/Net/Mail/QuotedPairReader.cs index 34079edf51c8f7..af55831d78d41a 100644 --- a/src/libraries/System.Net.Mail/src/System/Net/Mail/QuotedPairReader.cs +++ b/src/libraries/System.Net.Mail/src/System/Net/Mail/QuotedPairReader.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Text; using System.Diagnostics; using System.Net.Mime; @@ -52,7 +53,7 @@ internal static bool TryCountQuotedChars(string data, int index, bool permitUnic } else { - if (!permitUnicodeEscaping && data[index] > MailBnfHelper.Ascii7bitMaxValue) + if (!permitUnicodeEscaping && !Ascii.IsValid(data[index])) { if (throwExceptionIfFail) { diff --git a/src/libraries/System.Net.Mail/src/System/Net/Mail/QuotedStringFormatReader.cs b/src/libraries/System.Net.Mail/src/System/Net/Mail/QuotedStringFormatReader.cs index e12d731640074d..495a3ad5abc2e6 100644 --- a/src/libraries/System.Net.Mail/src/System/Net/Mail/QuotedStringFormatReader.cs +++ b/src/libraries/System.Net.Mail/src/System/Net/Mail/QuotedStringFormatReader.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Text; using System.Diagnostics; using System.Net.Mime; @@ -185,7 +186,7 @@ internal static bool TryReadReverseUnQuoted(string data, int index, bool permitU // non-whitespace control characters as well as all remaining ASCII chars except backslash and double quote. 
private static bool IsValidQtext(bool allowUnicode, char ch) { - if (ch > MailBnfHelper.Ascii7bitMaxValue) + if (!Ascii.IsValid(ch)) { return allowUnicode; } diff --git a/src/libraries/System.Net.Mail/src/System/Net/Mail/SmtpClient.cs b/src/libraries/System.Net.Mail/src/System/Net/Mail/SmtpClient.cs index d323c244dda87a..5c85943f2c7a3f 100644 --- a/src/libraries/System.Net.Mail/src/System/Net/Mail/SmtpClient.cs +++ b/src/libraries/System.Net.Mail/src/System/Net/Mail/SmtpClient.cs @@ -143,7 +143,7 @@ private void Initialize() for (int i = 0; i < clientDomainRaw.Length; i++) { ch = clientDomainRaw[i]; - if ((ushort)ch <= 0x7F) + if (Ascii.IsValid(ch)) sb.Append(ch); } if (sb.Length > 0) diff --git a/src/libraries/System.Net.Mail/src/System/Net/Mail/WhitespaceReader.cs b/src/libraries/System.Net.Mail/src/System/Net/Mail/WhitespaceReader.cs index f8b4c0a27d5c34..b4f4382ccb8bdb 100644 --- a/src/libraries/System.Net.Mail/src/System/Net/Mail/WhitespaceReader.cs +++ b/src/libraries/System.Net.Mail/src/System/Net/Mail/WhitespaceReader.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Text; using System.Diagnostics; using System.Net.Mime; @@ -166,7 +167,7 @@ internal static bool TryReadCfwsReverse(string data, int index, out int outIndex } // Check for valid characters within comments. Allow Unicode, as we won't transmit any comments. 
else if (commentDepth > 0 - && (data[index] > MailBnfHelper.Ascii7bitMaxValue || MailBnfHelper.Ctext[data[index]])) + && (!Ascii.IsValid(data[index]) || MailBnfHelper.Ctext[data[index]])) { index--; } diff --git a/src/libraries/System.Net.Mail/src/System/Net/Mime/MimeBasePart.cs b/src/libraries/System.Net.Mail/src/System/Net/Mime/MimeBasePart.cs index 68c3cd2f523796..e0776147cb11c4 100644 --- a/src/libraries/System.Net.Mail/src/System/Net/Mime/MimeBasePart.cs +++ b/src/libraries/System.Net.Mail/src/System/Net/Mime/MimeBasePart.cs @@ -110,18 +110,7 @@ internal static bool IsAscii(string value, bool permitCROrLF) { ArgumentNullException.ThrowIfNull(value); - foreach (char c in value) - { - if (c > 0x7f) - { - return false; - } - if (!permitCROrLF && (c == '\r' || c == '\n')) - { - return false; - } - } - return true; + return Ascii.IsValid(value) && (permitCROrLF || value.AsSpan().IndexOfAny('\r', '\n') < 0); } internal string? ContentID diff --git a/src/libraries/System.Net.WebClient/src/System.Net.WebClient.csproj b/src/libraries/System.Net.WebClient/src/System.Net.WebClient.csproj index fdbfd8f017d684..f87bfb74c63e7e 100644 --- a/src/libraries/System.Net.WebClient/src/System.Net.WebClient.csproj +++ b/src/libraries/System.Net.WebClient/src/System.Net.WebClient.csproj @@ -33,6 +33,7 @@ + diff --git a/src/libraries/System.Net.WebClient/src/System/Net/WebClient.cs b/src/libraries/System.Net.WebClient/src/System/Net/WebClient.cs index 0a893f002c75f0..37ea6293e8b50a 100644 --- a/src/libraries/System.Net.WebClient/src/System/Net/WebClient.cs +++ b/src/libraries/System.Net.WebClient/src/System/Net/WebClient.cs @@ -1,11 +1,11 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Buffers; using System.Collections.Specialized; using System.ComponentModel; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; -using System.Globalization; using System.IO; using System.Net.Cache; using System.Security; @@ -507,7 +507,12 @@ private void OpenFileInternal( "Content-Type: " + contentType + "\r\n" + "\r\n"; formHeaderBytes = Encoding.UTF8.GetBytes(formHeader); - boundaryBytes = Encoding.ASCII.GetBytes("\r\n--" + boundary + "--\r\n"); + + boundaryBytes = new byte["\r\n--".Length + boundary.Length + "--\r\n".Length]; + "\r\n--"u8.CopyTo(boundaryBytes); + "--\r\n"u8.CopyTo(boundaryBytes.AsSpan("\r\n--".Length + boundary.Length)); + OperationStatus conversionStatus = Ascii.FromUtf16(boundary, boundaryBytes.AsSpan("\r\n--".Length), out _); + Debug.Assert(conversionStatus == OperationStatus.Done); } else { diff --git a/src/libraries/System.Private.CoreLib/src/Resources/Strings.resx b/src/libraries/System.Private.CoreLib/src/Resources/Strings.resx index 2b8bf16f6e4333..5db230d54a926b 100644 --- a/src/libraries/System.Private.CoreLib/src/Resources/Strings.resx +++ b/src/libraries/System.Private.CoreLib/src/Resources/Strings.resx @@ -250,6 +250,9 @@ Only one of the following binding flags can be set: BindingFlags.SetProperty, BindingFlags.PutDispProperty, BindingFlags.PutRefDispProperty. + + Text must not contain non-ASCII characters. + Cannot specify both CreateInstance and another access type. 
diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems index bbb2871fe815c8..30ed91834a9c0b 100644 --- a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems +++ b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems @@ -1041,9 +1041,13 @@ + + + + + + - - diff --git a/src/libraries/System.Private.CoreLib/src/System/Convert.cs b/src/libraries/System.Private.CoreLib/src/System/Convert.cs index 4ddd917956e192..6899925d140b39 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Convert.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Convert.cs @@ -2463,7 +2463,7 @@ private static unsafe void ToBase64CharsLargeNoLineBreaks(ReadOnlySpan byt OperationStatus status = Base64.EncodeToUtf8(bytes, MemoryMarshal.AsBytes(chars), out _, out int bytesWritten); Debug.Assert(status == OperationStatus.Done && charLengthRequired == bytesWritten); - // Now widen the ASCII bytes in-place to chars (if the vectorized ASCIIUtility.WidenAsciiToUtf16 is ever updated + // Now widen the ASCII bytes in-place to chars (if the vectorized Ascii.WidenAsciiToUtf16 is ever updated // to support in-place updates, it should be used here instead). Since the base64 bytes are all valid ASCII, the byte // data is guaranteed to be 1/2 as long as the char data, and we can widen in-place. 
ref ushort dest = ref Unsafe.As(ref MemoryMarshal.GetReference(chars)); @@ -2514,7 +2514,7 @@ private static unsafe void ToBase64CharsLargeNoLineBreaks(ReadOnlySpan byt { dest = ref Unsafe.Subtract(ref dest, 4); src = ref Unsafe.Subtract(ref src, 4); - ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref Unsafe.As(ref dest), Unsafe.ReadUnaligned(ref src)); + Ascii.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref Unsafe.As(ref dest), Unsafe.ReadUnaligned(ref src)); } // The length produced by Base64 encoding is always a multiple of 4, so we don't need to handle diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.cs index 2afcacad561e0b..1499ce4450f042 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.cs @@ -1,11 +1,11 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Buffers; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using System.Runtime.Intrinsics; using System.Runtime.Serialization; using System.Text; using System.Text.Unicode; @@ -192,205 +192,50 @@ private unsafe char ChangeCase(char c, bool toUpper) internal void ChangeCaseToLower(ReadOnlySpan source, Span destination) { Debug.Assert(destination.Length >= source.Length); - ChangeCaseCommon(ref MemoryMarshal.GetReference(source), ref MemoryMarshal.GetReference(destination), source.Length); + ChangeCaseCommon(source, destination); } [MethodImpl(MethodImplOptions.AggressiveInlining)] internal void ChangeCaseToUpper(ReadOnlySpan source, Span destination) { Debug.Assert(destination.Length >= source.Length); - ChangeCaseCommon(ref MemoryMarshal.GetReference(source), ref MemoryMarshal.GetReference(destination), source.Length); + ChangeCaseCommon(source, destination); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private void ChangeCaseCommon(ReadOnlySpan source, Span destination) where TConversion : struct + private unsafe void ChangeCaseCommon(ReadOnlySpan source, Span destination) where TConversion : struct { - Debug.Assert(destination.Length >= source.Length); - ChangeCaseCommon(ref MemoryMarshal.GetReference(source), ref MemoryMarshal.GetReference(destination), source.Length); - } - - private unsafe void ChangeCaseCommon_Vector128(ref char source, ref char destination, int charCount) - where TConversion : struct - { - Debug.Assert(charCount >= Vector128.Count); - Debug.Assert(Vector128.IsHardwareAccelerated); + Debug.Assert(!GlobalizationMode.Invariant); + Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion)); - // JIT will treat this as a constant in release builds - bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); - nuint i = 0; - if (!IsAsciiCasingSameAsInvariant) + if 
(source.IsEmpty) { - goto NON_ASCII; + return; } - ref ushort src = ref Unsafe.As(ref source); - ref ushort dst = ref Unsafe.As(ref destination); + bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); // JIT will treat this as a constant in release builds + int charsConsumed = 0; - nuint lengthU = (nuint)charCount; - nuint lengthToExamine = lengthU - (nuint)Vector128.Count; - do + if (IsAsciiCasingSameAsInvariant) { - Vector128 vec = Vector128.LoadUnsafe(ref src, i); - if (!Utf16Utility.AllCharsInVector128AreAscii(vec)) - { - goto NON_ASCII; - } - vec = toUpper ? - Utf16Utility.Vector128AsciiToUppercase(vec) : - Utf16Utility.Vector128AsciiToLowercase(vec); - vec.StoreUnsafe(ref dst, i); - - i += (nuint)Vector128.Count; - } while (i <= lengthToExamine); + OperationStatus operationStatus = toUpper + ? Ascii.ToUpper(source, destination, out charsConsumed) + : Ascii.ToLower(source, destination, out charsConsumed); - Debug.Assert(i <= lengthU); - - // Handle trailing elements - if (i < lengthU) - { - nuint trailingElements = lengthU - (nuint)Vector128.Count; - Vector128 vec = Vector128.LoadUnsafe(ref src, trailingElements); - if (!Utf16Utility.AllCharsInVector128AreAscii(vec)) + if (operationStatus != OperationStatus.InvalidData) { - goto NON_ASCII; + Debug.Assert(operationStatus == OperationStatus.Done); + return; } - vec = toUpper ? 
- Utf16Utility.Vector128AsciiToUppercase(vec) : - Utf16Utility.Vector128AsciiToLowercase(vec); - vec.StoreUnsafe(ref dst, trailingElements); } - return; - NON_ASCII: - // We encountered non-ASCII data and therefore can't perform invariant case conversion; - // Fallback to ICU/NLS - ChangeCaseCommon_Scalar( - ref Unsafe.Add(ref source, i), - ref Unsafe.Add(ref destination, i), - charCount - (int)i); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private unsafe void ChangeCaseCommon(ref char source, ref char destination, int charCount) - where TConversion : struct - { - if (!Vector128.IsHardwareAccelerated || charCount < Vector128.Count) - { - ChangeCaseCommon_Scalar(ref source, ref destination, charCount); - } - else + fixed (char* pSource = &MemoryMarshal.GetReference(source)) + fixed (char* pDestination = &MemoryMarshal.GetReference(destination)) { - ChangeCaseCommon_Vector128(ref source, ref destination, charCount); + ChangeCaseCore(pSource + charsConsumed, source.Length - charsConsumed, pDestination + charsConsumed, destination.Length - charsConsumed, toUpper); } } - private unsafe void ChangeCaseCommon_Scalar(ref char source, ref char destination, int charCount) where TConversion : struct - { - Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion)); - bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); // JIT will treat this as a constant in release builds - - Debug.Assert(!GlobalizationMode.Invariant); - Debug.Assert(charCount >= 0); - - if (charCount == 0) - { - goto Return; - } - - fixed (char* pSource = &source) - fixed (char* pDestination = &destination) - { - nuint currIdx = 0; // in chars - - if (IsAsciiCasingSameAsInvariant) - { - // Read 4 chars (two 32-bit integers) at a time - - if (charCount >= 4) - { - nuint lastIndexWhereCanReadFourChars = (uint)charCount - 4; - do - { - // This is a mostly branchless case change routine. 
Generally speaking, we assume that the majority - // of input is ASCII, so the 'if' checks below should normally evaluate to false. However, within - // the ASCII data, we expect that characters of either case might be about equally distributed, so - // we want the case change operation itself to be branchless. This gives optimal performance in the - // common case. We also expect that developers aren't passing very long (16+ character) strings into - // this method, so we won't bother vectorizing until data shows us that it's worthwhile to do so. - - uint tempValue = Unsafe.ReadUnaligned(pSource + currIdx); - if (!Utf16Utility.AllCharsInUInt32AreAscii(tempValue)) - { - goto NonAscii; - } - tempValue = (toUpper) ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(tempValue) : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(tempValue); - Unsafe.WriteUnaligned(pDestination + currIdx, tempValue); - - tempValue = Unsafe.ReadUnaligned(pSource + currIdx + 2); - if (!Utf16Utility.AllCharsInUInt32AreAscii(tempValue)) - { - goto NonAsciiSkipTwoChars; - } - tempValue = (toUpper) ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(tempValue) : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(tempValue); - Unsafe.WriteUnaligned(pDestination + currIdx + 2, tempValue); - currIdx += 4; - } while (currIdx <= lastIndexWhereCanReadFourChars); - - // At this point, there are fewer than 4 characters remaining to convert. - Debug.Assert((uint)charCount - currIdx < 4); - } - - // If there are 2 or 3 characters left to convert, we'll convert 2 of them now. - if ((charCount & 2) != 0) - { - uint tempValue = Unsafe.ReadUnaligned(pSource + currIdx); - if (!Utf16Utility.AllCharsInUInt32AreAscii(tempValue)) - { - goto NonAscii; - } - tempValue = (toUpper) ? 
Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(tempValue) : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(tempValue); - Unsafe.WriteUnaligned(pDestination + currIdx, tempValue); - currIdx += 2; - } - - // If there's a single character left to convert, do it now. - if ((charCount & 1) != 0) - { - uint tempValue = pSource[currIdx]; - if (tempValue > 0x7Fu) - { - goto NonAscii; - } - tempValue = (toUpper) ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(tempValue) : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(tempValue); - pDestination[currIdx] = (char)tempValue; - } - - // And we're finished! - - goto Return; - - // If we reached this point, we found non-ASCII data. - // Fall back down the p/invoke code path. - - NonAsciiSkipTwoChars: - currIdx += 2; - - NonAscii: - Debug.Assert(currIdx < (uint)charCount, "We somehow read past the end of the buffer."); - charCount -= (int)currIdx; - } - - // We encountered non-ASCII data and therefore can't perform invariant case conversion; or the requested culture - // has a case conversion that's different from the invariant culture, even for ASCII data (e.g., tr-TR converts - // 'i' (U+0069) to Latin Capital Letter I With Dot Above (U+0130)). - - ChangeCaseCore(pSource + currIdx, charCount, pDestination + currIdx, charCount, toUpper); - } - - Return: - return; - } - private unsafe string ChangeCaseCommon(string source) where TConversion : struct { Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion)); diff --git a/src/libraries/System.Private.CoreLib/src/System/String.cs b/src/libraries/System.Private.CoreLib/src/System/String.cs index 971f1bbbef9fa5..acb38741b98446 100644 --- a/src/libraries/System.Private.CoreLib/src/System/String.cs +++ b/src/libraries/System.Private.CoreLib/src/System/String.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. 
using System.Buffers; +using System.Buffers.Text; using System.Collections; using System.Collections.Generic; using System.ComponentModel; @@ -673,7 +674,7 @@ public bool IsNormalized() public bool IsNormalized(NormalizationForm normalizationForm) { - if (this.IsAscii()) + if (Ascii.IsValid(this)) { // If its ASCII && one of the 4 main forms, then its already normalized if (normalizationForm == NormalizationForm.FormC || @@ -692,7 +693,7 @@ public string Normalize() public string Normalize(NormalizationForm normalizationForm) { - if (this.IsAscii()) + if (Ascii.IsValid(this)) { // If its ASCII && one of the 4 main forms, then its already normalized if (normalizationForm == NormalizationForm.FormC || @@ -704,14 +705,6 @@ public string Normalize(NormalizationForm normalizationForm) return Normalization.Normalize(this, normalizationForm); } - private unsafe bool IsAscii() - { - fixed (char* str = &_firstChar) - { - return ASCIIUtility.GetIndexOfFirstNonAsciiChar(str, (uint)Length) == (uint)Length; - } - } - // Gets the character at a specified position. // [IndexerName("Chars")] diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs index ebf07d971271d9..7c07d363252d95 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. 
using System.Buffers; +using System.Buffers.Text; using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -188,11 +189,12 @@ private protected sealed override unsafe int GetByteCountFast(char* pChars, int if (!(fallback is EncoderReplacementFallback replacementFallback && replacementFallback.MaxCharCount == 1 - && replacementFallback.DefaultString[0] <= 0x7F)) + && Ascii.IsValid(replacementFallback.DefaultString[0]))) { // Unrecognized fallback mechanism - count chars manually. - byteCount = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pChars, (uint)charsLength); + int firstNonAsciiIndex = Ascii.GetIndexOfFirstNonAsciiChar(new ReadOnlySpan(pChars, charsLength)); + byteCount = firstNonAsciiIndex < 0 ? charsLength : firstNonAsciiIndex; } charsConsumed = byteCount; @@ -353,10 +355,8 @@ private unsafe int GetBytesCommon(char* pChars, int charCount, byte* pBytes, int [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetBytesCommon private protected sealed override unsafe int GetBytesFast(char* pChars, int charsLength, byte* pBytes, int bytesLength, out int charsConsumed) { - int bytesWritten = (int)ASCIIUtility.NarrowUtf16ToAscii(pChars, pBytes, (uint)Math.Min(charsLength, bytesLength)); - - charsConsumed = bytesWritten; - return bytesWritten; + Ascii.FromUtf16(new ReadOnlySpan(pChars, charsLength), new Span(pBytes, bytesLength), out charsConsumed); + return charsConsumed; } private protected sealed override unsafe int GetBytesWithFallback(ReadOnlySpan chars, int originalCharsLength, Span bytes, int originalBytesLength, EncoderNLS? encoder) @@ -367,29 +367,26 @@ private protected sealed override unsafe int GetBytesWithFallback(ReadOnlySpan(pBytes, bytesLength)); + charCount = indexOfFirstNonAscii < 0 ? 
bytesLength : indexOfFirstNonAscii; } bytesConsumed = charCount; @@ -629,10 +627,8 @@ private unsafe int GetCharsCommon(byte* pBytes, int byteCount, char* pChars, int [MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharsCommon private protected sealed override unsafe int GetCharsFast(byte* pBytes, int bytesLength, char* pChars, int charsLength, out int bytesConsumed) { - int charsWritten = (int)ASCIIUtility.WidenAsciiToUtf16(pBytes, pChars, (uint)Math.Min(bytesLength, charsLength)); - - bytesConsumed = charsWritten; - return charsWritten; + Ascii.ToUtf16(new ReadOnlySpan(pBytes, bytesLength), new Span(pChars, charsLength), out bytesConsumed); + return bytesConsumed; } private protected sealed override unsafe int GetCharsWithFallback(ReadOnlySpan bytes, int originalBytesLength, Span chars, int originalCharsLength, DecoderNLS? decoder) @@ -649,22 +645,19 @@ private protected sealed override unsafe int GetCharsWithFallback(ReadOnlySpan byte if (!bytes.IsEmpty) { byte b = bytes[0]; - if (b <= 0x7F) + if (Ascii.IsValid(b)) { // ASCII byte diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs new file mode 100644 index 00000000000000..66b8c6e0fde783 --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.CaseConversion.cs @@ -0,0 +1,605 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Buffers; +using System.Diagnostics; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using System.Text.Unicode; + +namespace System.Text +{ + public static partial class Ascii + { + /// + /// Copies text from a source buffer to a destination buffer, converting + /// ASCII letters to uppercase during the copy. + /// + /// The source buffer from which ASCII text is read. + /// The destination buffer to which uppercase text is written. + /// The number of bytes actually written to . It's the same as the number of bytes actually read from . + /// An describing the result of the operation. + /// In-place conversion is prohibited, please use for that. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static OperationStatus ToUpper(ReadOnlySpan source, Span destination, out int bytesWritten) + => ChangeCase(source, destination, out bytesWritten); + + /// + /// Copies text from a source buffer to a destination buffer, converting + /// ASCII letters to uppercase during the copy. + /// + /// The source buffer from which ASCII text is read. + /// The destination buffer to which uppercase text is written. + /// The number of characters actually written to . It's the same as the number of characters actually read from . + /// An describing the result of the operation. + /// In-place conversion is prohibited, please use for that. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static OperationStatus ToUpper(ReadOnlySpan source, Span destination, out int charsWritten) + => ChangeCase(MemoryMarshal.Cast(source), MemoryMarshal.Cast(destination), out charsWritten); + + /// + /// Copies text from a source buffer to a destination buffer, converting + /// ASCII letters to uppercase during the copy. + /// + /// The source buffer from which ASCII text is read. 
+ /// The destination buffer to which uppercase text is written. + /// The number of characters actually written to . It's the same as the number of bytes actually read from . + /// An describing the result of the operation. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static OperationStatus ToUpper(ReadOnlySpan source, Span destination, out int charsWritten) + => ChangeCase(source, MemoryMarshal.Cast(destination), out charsWritten); + + /// + /// Copies text from a source buffer to a destination buffer, converting + /// ASCII letters to uppercase during the copy. + /// + /// The source buffer from which ASCII text is read. + /// The destination buffer to which uppercase text is written. + /// The number of bytes actually written to . It's the same as the number of characters actually read from . + /// An describing the result of the operation. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static OperationStatus ToUpper(ReadOnlySpan source, Span destination, out int bytesWritten) + => ChangeCase(MemoryMarshal.Cast(source), destination, out bytesWritten); + + /// + /// Copies text from a source buffer to a destination buffer, converting + /// ASCII letters to lowercase during the copy. + /// + /// The source buffer from which ASCII text is read. + /// The destination buffer to which lowercase text is written. + /// The number of bytes actually written to . It's the same as the number of bytes actually read from . + /// An describing the result of the operation. + /// In-place conversion is prohibited, please use for that. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static OperationStatus ToLower(ReadOnlySpan source, Span destination, out int bytesWritten) + => ChangeCase(source, destination, out bytesWritten); + + /// + /// Copies text from a source buffer to a destination buffer, converting + /// ASCII letters to lowercase during the copy. + /// + /// The source buffer from which ASCII text is read. 
+ /// The destination buffer to which lowercase text is written. + /// The number of characters actually written to . It's the same as the number of characters actually read from . + /// An describing the result of the operation. + /// In-place conversion is prohibited, please use for that. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static OperationStatus ToLower(ReadOnlySpan source, Span destination, out int charsWritten) + => ChangeCase(MemoryMarshal.Cast(source), MemoryMarshal.Cast(destination), out charsWritten); + + /// + /// Copies text from a source buffer to a destination buffer, converting + /// ASCII letters to lowercase during the copy. + /// + /// The source buffer from which ASCII text is read. + /// The destination buffer to which lowercase text is written. + /// The number of characters actually written to . It's the same as the number of bytes actually read from . + /// An describing the result of the operation. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static OperationStatus ToLower(ReadOnlySpan source, Span destination, out int charsWritten) + => ChangeCase(source, MemoryMarshal.Cast(destination), out charsWritten); + + /// + /// Copies text from a source buffer to a destination buffer, converting + /// ASCII letters to lowercase during the copy. + /// + /// The source buffer from which ASCII text is read. + /// The destination buffer to which lowercase text is written. + /// The number of bytes actually written to . It's the same as the number of characters actually read from . + /// An describing the result of the operation. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static OperationStatus ToLower(ReadOnlySpan source, Span destination, out int bytesWritten) + => ChangeCase(MemoryMarshal.Cast(source), destination, out bytesWritten); + + /// + /// Performs in-place lowercase conversion. + /// + /// The ASCII text buffer. + /// The number of processed bytes.
+ /// An describing the result of the operation. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static OperationStatus ToLowerInPlace(Span value, out int bytesWritten) + => ChangeCase(value, out bytesWritten); + + /// + /// Performs in-place lowercase conversion. + /// + /// The ASCII text buffer. + /// The number of processed characters. + /// An describing the result of the operation. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static OperationStatus ToLowerInPlace(Span value, out int charsWritten) + => ChangeCase(MemoryMarshal.Cast(value), out charsWritten); + + /// + /// Performs in-place uppercase conversion. + /// + /// The ASCII text buffer. + /// The number of processed bytes. + /// An describing the result of the operation. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static OperationStatus ToUpperInPlace(Span value, out int bytesWritten) + => ChangeCase(value, out bytesWritten); + + /// + /// Performs in-place uppercase conversion. + /// + /// The ASCII text buffer. + /// The number of processed characters. + /// An describing the result of the operation.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static OperationStatus ToUpperInPlace(Span value, out int charsWritten) + => ChangeCase(MemoryMarshal.Cast(value), out charsWritten); + + private static unsafe OperationStatus ChangeCase(ReadOnlySpan source, Span destination, out int destinationElementsWritten) + where TFrom : unmanaged, IBinaryInteger + where TTo : unmanaged, IBinaryInteger + where TCasing : struct + { + if (MemoryMarshal.AsBytes(source).Overlaps(MemoryMarshal.AsBytes(destination))) + { + throw new InvalidOperationException(SR.InvalidOperation_SpanOverlappedOperation); + } + + nuint numElementsToConvert; + OperationStatus statusToReturnOnSuccess; + + if (source.Length <= destination.Length) + { + numElementsToConvert = (uint)source.Length; + statusToReturnOnSuccess = OperationStatus.Done; + } + else + { + numElementsToConvert = (uint)destination.Length; + statusToReturnOnSuccess = OperationStatus.DestinationTooSmall; + } + + fixed (TFrom* pSource = &MemoryMarshal.GetReference(source)) + fixed (TTo* pDestination = &MemoryMarshal.GetReference(destination)) + { + nuint numElementsActuallyConverted = ChangeCase(pSource, pDestination, numElementsToConvert); + Debug.Assert(numElementsActuallyConverted <= numElementsToConvert); + + destinationElementsWritten = (int)numElementsActuallyConverted; + return (numElementsToConvert == numElementsActuallyConverted) ? statusToReturnOnSuccess : OperationStatus.InvalidData; + } + } + + private static unsafe OperationStatus ChangeCase(Span buffer, out int elementsWritten) + where T : unmanaged, IBinaryInteger + where TCasing : struct + { + fixed (T* pBuffer = &MemoryMarshal.GetReference(buffer)) + { + nuint numElementsActuallyConverted = ChangeCase(pBuffer, pBuffer, (nuint)buffer.Length); + Debug.Assert(numElementsActuallyConverted <= (nuint)buffer.Length); + + elementsWritten = (int)numElementsActuallyConverted; + return elementsWritten == buffer.Length ? 
OperationStatus.Done : OperationStatus.InvalidData; + } + } + + private static unsafe nuint ChangeCase(TFrom* pSrc, TTo* pDest, nuint elementCount) + where TFrom : unmanaged, IBinaryInteger + where TTo : unmanaged, IBinaryInteger + where TCasing : struct + { + Debug.Assert(typeof(TFrom) == typeof(byte) || typeof(TFrom) == typeof(ushort)); + Debug.Assert(typeof(TTo) == typeof(byte) || typeof(TTo) == typeof(ushort)); + Debug.Assert(typeof(TCasing) == typeof(ToUpperConversion) || typeof(TCasing) == typeof(ToLowerConversion)); + + bool sourceIsAscii = (sizeof(TFrom) == 1); // JIT turns this into a const + bool destIsAscii = (sizeof(TTo) == 1); // JIT turns this into a const + bool conversionIsWidening = sourceIsAscii && !destIsAscii; // JIT turns this into a const + bool conversionIsNarrowing = !sourceIsAscii && destIsAscii; // JIT turns this into a const + bool conversionIsWidthPreserving = typeof(TFrom) == typeof(TTo); // JIT turns this into a const + bool conversionIsToUpper = (typeof(TCasing) == typeof(ToUpperConversion)); // JIT turns this into a const + uint numInputElementsToConsumeEachVectorizedLoopIteration = (uint)(sizeof(Vector128) / sizeof(TFrom)); // JIT turns this into a const + + nuint i = 0; + + // The only situation we can't easily optimize is non-hardware-accelerated + // widening or narrowing. In this case, fall back to a naive element-by-element + // loop. + + if (!conversionIsWidthPreserving && !Vector128.IsHardwareAccelerated) + { + goto DrainRemaining; + } + + // Process the input as a series of 128-bit blocks. + + if (Vector128.IsHardwareAccelerated && elementCount >= numInputElementsToConsumeEachVectorizedLoopIteration) + { + // Unaligned read and check for non-ASCII data. + + Vector128 srcVector = Vector128.LoadUnsafe(ref *pSrc); + if (VectorContainsAnyNonAsciiData(srcVector)) + { + goto Drain64; + } + + // Now find matching characters and perform case conversion. 
+ // Basically, the (A <= value && value <= Z) check is converted to: + // (value - CONST) <= (Z - A), but using signed instead of unsigned arithmetic. + + TFrom SourceSignedMinValue = TFrom.CreateTruncating(1 << (8 * sizeof(TFrom) - 1)); + Vector128 subtractionVector = Vector128.Create(conversionIsToUpper ? (SourceSignedMinValue + TFrom.CreateTruncating('a')) : (SourceSignedMinValue + TFrom.CreateTruncating('A'))); + Vector128 comparisionVector = Vector128.Create(SourceSignedMinValue + TFrom.CreateTruncating(26 /* A..Z or a..z */)); + Vector128 caseConversionVector = Vector128.Create(TFrom.CreateTruncating(0x20)); // works both directions + + Vector128 matches = SignedLessThan((srcVector - subtractionVector), comparisionVector); + srcVector ^= (matches & caseConversionVector); + + // Now write to the destination. + + ChangeWidthAndWriteTo(srcVector, pDest, 0); + + // Now that the first conversion is out of the way, calculate how + // many elements we should skip in order to have future writes be + // aligned. + + uint expectedWriteAlignment = numInputElementsToConsumeEachVectorizedLoopIteration * (uint)sizeof(TTo); // JIT turns this into a const + i = numInputElementsToConsumeEachVectorizedLoopIteration - ((uint)pDest % expectedWriteAlignment) / (uint)sizeof(TTo); + Debug.Assert((nuint)(&pDest[i]) % expectedWriteAlignment == 0, "Destination buffer wasn't properly aligned!"); + + // Future iterations of this loop will be aligned, + // except for the last iteration. + + while (true) + { + Debug.Assert(i <= elementCount, "We overran a buffer somewhere."); + + if ((elementCount - i) < numInputElementsToConsumeEachVectorizedLoopIteration) + { + // If we're about to enter the final iteration of the loop, back up so that + // we can read one unaligned block. If we've already consumed all the data, + // jump straight to the end. 
+ + if (i == elementCount) + { + goto Return; + } + + i = elementCount - numInputElementsToConsumeEachVectorizedLoopIteration; + } + + // Unaligned read & check for non-ASCII data. + + srcVector = Vector128.LoadUnsafe(ref *pSrc, i); + if (VectorContainsAnyNonAsciiData(srcVector)) + { + goto Drain64; + } + + // Now find matching characters and perform case conversion. + + matches = SignedLessThan((srcVector - subtractionVector), comparisionVector); + srcVector ^= (matches & caseConversionVector); + + // Now write to the destination. + // We expect this write to be aligned except for the last run through the loop. + + ChangeWidthAndWriteTo(srcVector, pDest, i); + i += numInputElementsToConsumeEachVectorizedLoopIteration; + } + } + + Drain64: + + // Attempt to process blocks of 64 input bits. + + if (IntPtr.Size >= 8 && (elementCount - i) >= (nuint)(8 / sizeof(TFrom))) + { + ulong nextBlockAsUInt64 = Unsafe.ReadUnaligned(&pSrc[i]); + if (sourceIsAscii) + { + if (!Utf8Utility.AllBytesInUInt64AreAscii(nextBlockAsUInt64)) + { + goto Drain32; + } + nextBlockAsUInt64 = (conversionIsToUpper) + ? Utf8Utility.ConvertAllAsciiBytesInUInt64ToUppercase(nextBlockAsUInt64) + : Utf8Utility.ConvertAllAsciiBytesInUInt64ToLowercase(nextBlockAsUInt64); + } + else + { + if (!Utf16Utility.AllCharsInUInt64AreAscii(nextBlockAsUInt64)) + { + goto Drain32; + } + nextBlockAsUInt64 = (conversionIsToUpper) + ? 
Utf16Utility.ConvertAllAsciiCharsInUInt64ToUppercase(nextBlockAsUInt64) + : Utf16Utility.ConvertAllAsciiCharsInUInt64ToLowercase(nextBlockAsUInt64); + } + + if (conversionIsWidthPreserving) + { + Unsafe.WriteUnaligned(&pDest[i], nextBlockAsUInt64); + } + else + { + Debug.Assert(Vector128.IsHardwareAccelerated); + + Vector128 blockAsVectorOfUInt64 = Vector128.CreateScalarUnsafe(nextBlockAsUInt64); + if (conversionIsWidening) + { + Vector128.StoreUnsafe(Vector128.WidenLower(blockAsVectorOfUInt64.AsByte()), ref *(ushort*)pDest, i); + } + else + { + Vector128 blockAsVectorOfUInt16 = blockAsVectorOfUInt64.AsUInt16(); + Vector128 narrowedBlock = Vector128.Narrow(blockAsVectorOfUInt16, blockAsVectorOfUInt16).AsUInt32(); + Unsafe.WriteUnaligned(&pDest[i], narrowedBlock.ToScalar()); + } + } + + i += (nuint)(8 / sizeof(TFrom)); + + // If vectorization is not accelerated, turn this into a while loop. + + if (!Vector128.IsHardwareAccelerated) + { + goto Drain64; + } + } + + Drain32: + + // Attempt to process blocks of 32 input bits. + + if ((elementCount - i) >= (nuint)(4 / sizeof(TFrom))) + { + uint nextBlockAsUInt32 = Unsafe.ReadUnaligned(&pSrc[i]); + if (sourceIsAscii) + { + if (!Utf8Utility.AllBytesInUInt32AreAscii(nextBlockAsUInt32)) + { + goto DrainRemaining; + } + nextBlockAsUInt32 = (conversionIsToUpper) + ? Utf8Utility.ConvertAllAsciiBytesInUInt32ToUppercase(nextBlockAsUInt32) + : Utf8Utility.ConvertAllAsciiBytesInUInt32ToLowercase(nextBlockAsUInt32); + } + else + { + if (!Utf16Utility.AllCharsInUInt32AreAscii(nextBlockAsUInt32)) + { + goto DrainRemaining; + } + nextBlockAsUInt32 = (conversionIsToUpper) + ? 
Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(nextBlockAsUInt32) + : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(nextBlockAsUInt32); + } + + if (conversionIsWidthPreserving) + { + Unsafe.WriteUnaligned(&pDest[i], nextBlockAsUInt32); + } + else + { + Debug.Assert(Vector128.IsHardwareAccelerated); + + Vector128 blockAsVectorOfUInt32 = Vector128.CreateScalarUnsafe(nextBlockAsUInt32); + if (conversionIsWidening) + { + Vector128 widenedBlock = Vector128.WidenLower(blockAsVectorOfUInt32.AsByte()).AsUInt64(); + Unsafe.WriteUnaligned(&pDest[i], widenedBlock.ToScalar()); + } + else + { + Vector128 blockAsVectorOfUInt16 = blockAsVectorOfUInt32.AsUInt16(); + Vector128 narrowedBlock = Vector128.Narrow(blockAsVectorOfUInt16, blockAsVectorOfUInt16).AsUInt16(); + Unsafe.WriteUnaligned(&pDest[i], narrowedBlock.ToScalar()); + } + } + + i += (nuint)(4 / sizeof(TFrom)); + + // If vectorization is not accelerated or we're on 32-bit, + // turn this into a while loop. + + if (IntPtr.Size < 8 || !Vector128.IsHardwareAccelerated) + { + goto Drain32; + } + } + + DrainRemaining: + + // Process single elements at a time. 
+ + for (; i < elementCount; i++) + { + uint element = uint.CreateTruncating(pSrc[i]); + if (!UnicodeUtility.IsAsciiCodePoint(element)) + { + break; + } + + if (conversionIsToUpper) + { + if (UnicodeUtility.IsInRangeInclusive(element, 'a', 'z')) + { + element -= 0x20u; // lowercase to uppercase + } + } + else + { + if (UnicodeUtility.IsInRangeInclusive(element, 'A', 'Z')) + { + element += 0x20u; // uppercase to lowercase + } + } + pDest[i] = TTo.CreateTruncating(element); + } + + Return: + + return i; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe bool VectorContainsAnyNonAsciiData(Vector128 vector) + where T : unmanaged + { + if (sizeof(T) == 1) + { + if (vector.ExtractMostSignificantBits() != 0) { return true; } + } + else if (sizeof(T) == 2) + { + if (VectorContainsNonAsciiChar(vector.AsUInt16())) + { + return true; + } + } + else + { + Debug.Fail("Unknown types provided."); + throw new NotSupportedException(); + } + + return false; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void Widen8To16AndAndWriteTo(Vector128 narrowVector, char* pDest, nuint destOffset) + { + if (Vector256.IsHardwareAccelerated) + { + Vector256 wide = Vector256.WidenLower(narrowVector.ToVector256Unsafe()); + wide.StoreUnsafe(ref *(ushort*)pDest, destOffset); + } + else + { + Vector128.WidenLower(narrowVector).StoreUnsafe(ref *(ushort*)pDest, destOffset); + Vector128.WidenUpper(narrowVector).StoreUnsafe(ref *(ushort*)pDest, destOffset + 8); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void Narrow16To8AndAndWriteTo(Vector128 wideVector, byte* pDest, nuint destOffset) + { + Vector128 narrow = Vector128.Narrow(wideVector, wideVector); + + if (Sse2.IsSupported) + { + // MOVQ is supported even on x86, unaligned accesses allowed + Sse2.StoreScalar((ulong*)(pDest + destOffset), narrow.AsUInt64()); + } + else if (Vector64.IsHardwareAccelerated) + { + narrow.GetLower().StoreUnsafe(ref 
*pDest, destOffset); + } + else + { + Unsafe.WriteUnaligned(pDest + destOffset, narrow.AsUInt64().ToScalar()); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void ChangeWidthAndWriteTo(Vector128 vector, TTo* pDest, nuint elementOffset) + where TFrom : unmanaged + where TTo : unmanaged + { + if (sizeof(TFrom) == sizeof(TTo)) + { + // no width change needed + Vector128.StoreUnsafe(vector.As(), ref *pDest, elementOffset); + } + else if (sizeof(TFrom) == 1 && sizeof(TTo) == 2) + { + // widening operation required + if (Vector256.IsHardwareAccelerated) + { + Vector256 wide = Vector256.WidenLower(vector.AsByte().ToVector256Unsafe()); + Vector256.StoreUnsafe(wide, ref *(ushort*)pDest, elementOffset); + } + else + { + Vector128.StoreUnsafe(Vector128.WidenLower(vector.AsByte()), ref *(ushort*)pDest, elementOffset); + Vector128.StoreUnsafe(Vector128.WidenUpper(vector.AsByte()), ref *(ushort*)pDest, elementOffset + 8); + } + } + else if (sizeof(TFrom) == 2 && sizeof(TTo) == 1) + { + // narrowing operation required + // since we know data is all-ASCII, special-case SSE2 to avoid unneeded PAND in Narrow call + Vector128 narrow = (Sse2.IsSupported) + ? 
Sse2.PackUnsignedSaturate(vector.AsInt16(), vector.AsInt16()) + : Vector128.Narrow(vector.AsUInt16(), vector.AsUInt16()); + narrow.GetLower().StoreUnsafe(ref *(byte*)pDest, elementOffset); + } + else + { + Debug.Fail("Unknown types."); + throw new NotSupportedException(); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe Vector128 SignedLessThan(Vector128 left, Vector128 right) + where T : unmanaged + { + if (sizeof(T) == 1) + { + return Vector128.LessThan(left.AsSByte(), right.AsSByte()).As(); + } + else if (sizeof(T) == 2) + { + return Vector128.LessThan(left.AsInt16(), right.AsInt16()).As(); + } + else + { + throw new NotSupportedException(); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe Vector128 NarrowOrWidenLowerVectorUnsigned(Vector128 vector) + where TFrom : unmanaged + where TTo : unmanaged + { + if (sizeof(TFrom) == 1 && sizeof(TTo) == 2) + { + return Vector128.WidenLower(vector.AsByte()).As(); + } + else if (sizeof(TFrom) == 2 && sizeof(TTo) == 1) + { + return Vector128.Narrow(vector.AsUInt16(), vector.AsUInt16()).As(); + } + else + { + throw new NotSupportedException(); + } + } + + private struct ToUpperConversion { } + private struct ToLowerConversion { } + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Transcoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Transcoding.cs new file mode 100644 index 00000000000000..0952598f5e0ec8 --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Transcoding.cs @@ -0,0 +1,82 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Buffers; +using System.Diagnostics; +using System.Runtime.InteropServices; + +namespace System.Text +{ + public static partial class Ascii + { + /// + /// Copies text from a source buffer to a destination buffer, converting + /// from ASCII to UTF-16 during the copy. + /// + /// The source buffer from which ASCII text is read. + /// The destination buffer to which UTF-16 text is written. + /// The number of chars actually written to . It's the same as the number of bytes actually read from + /// An describing the result of the operation. + public static unsafe OperationStatus ToUtf16(ReadOnlySpan source, Span destination, out int charsWritten) + { + nuint numElementsToConvert; + OperationStatus statusToReturnOnSuccess; + + if (source.Length <= destination.Length) + { + numElementsToConvert = (uint)source.Length; + statusToReturnOnSuccess = OperationStatus.Done; + } + else + { + numElementsToConvert = (uint)destination.Length; + statusToReturnOnSuccess = OperationStatus.DestinationTooSmall; + } + + fixed (byte* pSource = &MemoryMarshal.GetReference(source)) + fixed (char* pDestination = &MemoryMarshal.GetReference(destination)) + { + nuint numElementsActuallyConverted = WidenAsciiToUtf16(pSource, pDestination, numElementsToConvert); + Debug.Assert(numElementsActuallyConverted <= numElementsToConvert); + + charsWritten = (int)numElementsActuallyConverted; + return (numElementsToConvert == numElementsActuallyConverted) ? statusToReturnOnSuccess : OperationStatus.InvalidData; + } + } + + /// + /// Copies text from a source buffer to a destination buffer, converting + /// from UTF-16 to ASCII during the copy. + /// + /// The source buffer from which UTF-16 text is read. + /// The destination buffer to which ASCII text is written. + /// The number of bytes actually written to . It's the same as the number of chars actually read from . + /// An describing the result of the operation. 
+ public static unsafe OperationStatus FromUtf16(ReadOnlySpan source, Span destination, out int bytesWritten) + { + nuint numElementsToConvert; + OperationStatus statusToReturnOnSuccess; + + if (source.Length <= destination.Length) + { + numElementsToConvert = (uint)source.Length; + statusToReturnOnSuccess = OperationStatus.Done; + } + else + { + numElementsToConvert = (uint)destination.Length; + statusToReturnOnSuccess = OperationStatus.DestinationTooSmall; + } + + fixed (char* pSource = &MemoryMarshal.GetReference(source)) + fixed (byte* pDestination = &MemoryMarshal.GetReference(destination)) + { + nuint numElementsActuallyConverted = NarrowUtf16ToAscii(pSource, pDestination, numElementsToConvert); + Debug.Assert(numElementsActuallyConverted <= numElementsToConvert); + + bytesWritten = (int)numElementsActuallyConverted; + return (numElementsToConvert == numElementsActuallyConverted) ? statusToReturnOnSuccess : OperationStatus.InvalidData; + } + } + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Trimming.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Trimming.cs new file mode 100644 index 00000000000000..f175db0b2d8262 --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Trimming.cs @@ -0,0 +1,80 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Numerics; + +namespace System.Text +{ + public static partial class Ascii + { + /// + /// Trims all leading and trailing ASCII whitespaces from the buffer. + /// + /// The ASCII buffer. + /// The Range of the untrimmed data. + public static Range Trim(ReadOnlySpan value) => TrimHelper(value, TrimType.Both); + + /// + public static Range Trim(ReadOnlySpan value) => TrimHelper(value, TrimType.Both); + + /// + /// Trims all leading ASCII whitespaces from the buffer. + /// + /// The ASCII buffer. + /// The Range of the untrimmed data. 
+ public static Range TrimStart(ReadOnlySpan value) => TrimHelper(value, TrimType.Head); + + /// + public static Range TrimStart(ReadOnlySpan value) => TrimHelper(value, TrimType.Head); + + /// + /// Trims all trailing ASCII whitespaces from the buffer. + /// + /// The ASCII buffer. + /// The Range of the untrimmed data. + public static Range TrimEnd(ReadOnlySpan value) => TrimHelper(value, TrimType.Tail); + + /// + public static Range TrimEnd(ReadOnlySpan value) => TrimHelper(value, TrimType.Tail); + + private static Range TrimHelper(ReadOnlySpan value, TrimType trimType) + where T : unmanaged, IBinaryInteger + { + const uint TrimMask = // bit (c - 1) is set for each ASCII whitespace char c, so 0x20 lands on bit 31 + (1u << (0x09 - 1)) + | (1u << (0x0A - 1)) + | (1u << (0x0B - 1)) + | (1u << (0x0C - 1)) + | (1u << (0x0D - 1)) + | (1u << (0x20 - 1)); + + int start = 0; + if ((trimType & TrimType.Head) != 0) + { + for (; start < value.Length; start++) + { + uint elementValueM1 = uint.CreateTruncating(value[start]) - 1; // subtract before the range check: 0 (NUL) wraps to uint.MaxValue instead of becoming shift count -1, which C# masks to 31 (the space bit) + if ((elementValueM1 > 0x1F) || ((TrimMask & (1u << (int)elementValueM1)) == 0)) + { + break; + } + } + } + + int end = value.Length - 1; + if ((trimType & TrimType.Tail) != 0) + { + for (; start <= end; end--) + { + uint elementValueM1 = uint.CreateTruncating(value[end]) - 1; // same NUL-safe form as the head loop + if ((elementValueM1 > 0x1F) || ((TrimMask & (1u << (int)elementValueM1)) == 0)) + { + break; + } + } + } + + return start..(end + 1); + } + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs similarity index 97% rename from src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.Helpers.cs rename to src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs index 3bb75b1640cb65..30467a1843a32a 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.Helpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs @@ -4,11 +4,10 @@ using System.Diagnostics;
using System.Numerics; using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics.X86; namespace System.Text { - internal static partial class ASCIIUtility + public static partial class Ascii { /// /// A mask which selects only the high bit of each byte of the given . diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs similarity index 99% rename from src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs rename to src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index a32c5c97a05a2e..86818e46a406ee 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -10,8 +10,11 @@ namespace System.Text { - internal static partial class ASCIIUtility + public static partial class Ascii { + /// + /// Returns iff all bytes in are ASCII. + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool AllBytesInUInt64AreAscii(ulong value) { @@ -78,7 +81,7 @@ private static bool FirstCharInUInt32IsAscii(uint value) /// /// An ASCII byte is defined as 0x00 - 0x7F, inclusive. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength) + private static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength) { // If SSE2 is supported, use those specific intrinsics instead of the generic vectorized // code below. This has two benefits: (a) we can take advantage of specific instructions like @@ -614,7 +617,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuff /// /// An ASCII char is defined as 0x0000 - 0x007F, inclusive. 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - public static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bufferLength /* in chars */) + private static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bufferLength /* in chars */) { // If SSE2/ASIMD is supported, use those specific intrinsics instead of the generic vectorized // code below. This has two benefits: (a) we can take advantage of specific instructions like @@ -1149,7 +1152,7 @@ private static void NarrowTwoUtf16CharsToAsciiAndWriteToBuffer(ref byte outputBu /// or once elements have been converted. Returns the total number /// of elements that were able to be converted. /// - public static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount) + private static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount) { nuint currentOffset = 0; @@ -1573,7 +1576,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified(char* pUtf16Buffer, /// or once elements have been converted. Returns the total number /// of elements that were able to be converted. /// - public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount) + private static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount) { // Intrinsified in mono interpreter nuint currentOffset = 0; diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.cs new file mode 100644 index 00000000000000..fb8b9281fcc411 --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.cs @@ -0,0 +1,85 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Diagnostics; +using System.Runtime.InteropServices; + +namespace System.Text +{ + public static partial class Ascii + { + /// + /// Returns the index of the first non-ASCII byte in a buffer. + /// + /// The buffer to scan. + /// The index in where the first non-ASCII + /// byte appears, or -1 if the buffer contains only ASCII bytes. + internal static unsafe int GetIndexOfFirstNonAsciiByte(ReadOnlySpan buffer) + { + if (buffer.IsEmpty) + { + return -1; + } + + nuint bufferLength = (uint)buffer.Length; + fixed (byte* pBuffer = &MemoryMarshal.GetReference(buffer)) + { + nuint idxOfFirstNonAsciiElement = GetIndexOfFirstNonAsciiByte(pBuffer, bufferLength); + Debug.Assert(idxOfFirstNonAsciiElement <= bufferLength); + return (idxOfFirstNonAsciiElement == bufferLength) ? -1 : (int)idxOfFirstNonAsciiElement; + } + } + + /// + /// Returns the index of the first non-ASCII char in a buffer. + /// + /// The buffer to scan. + /// The index in where the first non-ASCII + /// char appears, or -1 if the buffer contains only ASCII char. + internal static unsafe int GetIndexOfFirstNonAsciiChar(ReadOnlySpan buffer) + { + if (buffer.IsEmpty) + { + return -1; + } + + nuint bufferLength = (uint)buffer.Length; + fixed (char* pBuffer = &MemoryMarshal.GetReference(buffer)) + { + nuint idxOfFirstNonAsciiElement = GetIndexOfFirstNonAsciiChar(pBuffer, bufferLength); + Debug.Assert(idxOfFirstNonAsciiElement <= bufferLength); + return (idxOfFirstNonAsciiElement == bufferLength) ? -1 : (int)idxOfFirstNonAsciiElement; + } + } + + /// + /// Determines whether the provided value contains only ASCII bytes. + /// + /// The value to inspect. + /// True if contains only ASCII bytes or is + /// empty; False otherwise. + public static unsafe bool IsValid(ReadOnlySpan value) => GetIndexOfFirstNonAsciiByte(value) < 0; + + /// + /// Determines whether the provided value contains only ASCII chars. + /// + /// The value to inspect. 
+ /// True if contains only ASCII chars or is + /// empty; False otherwise. + public static unsafe bool IsValid(ReadOnlySpan value) => GetIndexOfFirstNonAsciiChar(value) < 0; + + /// + /// Determines whether the provided value is ASCII byte. + /// + /// The value to inspect. + /// True if is ASCII, False otherwise. + public static unsafe bool IsValid(byte value) => value <= 127; + + /// + /// Determines whether the provided value is ASCII char. + /// + /// The value to inspect. + /// True if is ASCII, False otherwise. + public static unsafe bool IsValid(char value) => value <= 127; + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs index dcf91f9d66192c..ae3d8b572aba5f 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.Validation.cs @@ -7,6 +7,7 @@ using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; using System.Numerics; +using System.Buffers.Text; namespace System.Text.Unicode { @@ -28,7 +29,8 @@ internal static unsafe partial class Utf16Utility // First, we'll handle the common case of all-ASCII. If this is able to // consume the entire buffer, we'll skip the remainder of this method's logic. - int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength); + int firstNonAsciiIndex = Ascii.GetIndexOfFirstNonAsciiChar(new ReadOnlySpan(pInputBuffer, inputLength)); + int numAsciiCharsConsumedJustNow = firstNonAsciiIndex < 0 ? 
inputLength : firstNonAsciiIndex; Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength); pInputBuffer += (uint)numAsciiCharsConsumedJustNow; diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs index b8dec4640195e7..6e956bbbcbe3b5 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs @@ -86,6 +86,64 @@ internal static uint ConvertAllAsciiCharsInUInt32ToUppercase(uint value) return value ^ mask; // bit flip lowercase letters [a-z] => [A-Z] } + /// + /// Given a UInt64 that represents four ASCII UTF-16 characters, returns the invariant + /// uppercase representation of those characters. Requires the input value to contain + /// four ASCII UTF-16 characters in machine endianness. + /// + /// + /// This is a branchless implementation. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ulong ConvertAllAsciiCharsInUInt64ToUppercase(ulong value) + { + // ASSUMPTION: Caller has validated that input value is ASCII. 
+ Debug.Assert(AllCharsInUInt64AreAscii(value)); + + // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'a' + ulong lowerIndicator = value + 0x0080_0080_0080_0080ul - 0x0061_0061_0061_0061ul; + + // the 0x80 bit of each word of 'upperIndicator' will be set iff the word has value > 'z' + ulong upperIndicator = value + 0x0080_0080_0080_0080ul - 0x007B_007B_007B_007Bul; + + // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'a' and <= 'z' + ulong combinedIndicator = (lowerIndicator ^ upperIndicator); + + // the 0x20 bit of each word of 'mask' will be set iff the word has value >= 'a' and <= 'z' + ulong mask = (combinedIndicator & 0x0080_0080_0080_0080ul) >> 2; + + return value ^ mask; // bit flip lowercase letters [a-z] => [A-Z] + } + + /// + /// Given a UInt64 that represents four ASCII UTF-16 characters, returns the invariant + /// lowercase representation of those characters. Requires the input value to contain + /// four ASCII UTF-16 characters in machine endianness. + /// + /// + /// This is a branchless implementation. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ulong ConvertAllAsciiCharsInUInt64ToLowercase(ulong value) + { + // ASSUMPTION: Caller has validated that input value is ASCII. 
+ Debug.Assert(AllCharsInUInt64AreAscii(value)); + + // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'A' + ulong lowerIndicator = value + 0x0080_0080_0080_0080ul - 0x0041_0041_0041_0041ul; + + // the 0x80 bit of each word of 'upperIndicator' will be set iff the word has value > 'Z' + ulong upperIndicator = value + 0x0080_0080_0080_0080ul - 0x005B_005B_005B_005Bul; + + // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'A' and <= 'Z' + ulong combinedIndicator = (lowerIndicator ^ upperIndicator); + + // the 0x20 bit of each word of 'mask' will be set iff the word has value >= 'A' and <= 'Z' + ulong mask = (combinedIndicator & 0x0080_0080_0080_0080ul) >> 2; + + return value ^ mask; // bit flip uppercase letters [A-Z] => [a-z] + } + /// /// Given a UInt32 that represents two ASCII UTF-16 characters, returns true iff /// the input contains one or more lowercase ASCII characters. @@ -256,45 +314,5 @@ internal static bool Vector128OrdinalIgnoreCaseAscii(Vector128 vec1, Vec // Compare two lowercased vectors return (lcVec1 ^ lcVec2) == Vector128.Zero; } - - /// - /// Convert Vector128 that represent 8 ASCII UTF-16 characters to lowercase - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static Vector128 Vector128AsciiToLowercase(Vector128 vec) - { - // ASSUMPTION: Caller has validated that input values are ASCII. 
- Debug.Assert(AllCharsInVector128AreAscii(vec)); - - // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'A' - Vector128 lowIndicator1 = Vector128.Create((sbyte)(0x80 - 'A')) + vec.AsSByte(); - - // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'A' and <= 'Z' - Vector128 combIndicator1 = Vector128.LessThan( - Vector128.Create(unchecked((sbyte)(('Z' - 'A') - 0x80))), lowIndicator1); - - // Add the lowercase indicator (0x20 bit) to all A-Z letters - return Vector128.AndNot(Vector128.Create((sbyte)0x20), combIndicator1).AsUInt16() + vec; - } - - /// - /// Convert Vector128 that represent 8 ASCII UTF-16 characters to uppercase - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static Vector128 Vector128AsciiToUppercase(Vector128 vec) - { - // ASSUMPTION: Caller has validated that input values are ASCII. - Debug.Assert(AllCharsInVector128AreAscii(vec)); - - // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'a' - Vector128 lowIndicator1 = Vector128.Create((sbyte)(0x80 - 'a')) + vec.AsSByte(); - - // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'a' and <= 'z' - Vector128 combIndicator1 = Vector128.LessThan( - Vector128.Create(unchecked((sbyte)(('z' - 'a') - 0x80))), lowIndicator1); - - // Drop the lowercase indicator (0x20 bit) from all a-z letters - return vec - Vector128.AndNot(Vector128.Create((sbyte)0x20), combIndicator1).AsUInt16(); - } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs index 77866f028ea0c6..b1c908b15b73ea 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs @@ -2,6 +2,7 @@ // The .NET Foundation 
licenses this file to you under the MIT license. using System.Buffers; +using System.Buffers.Text; using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; @@ -25,27 +26,24 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng Debug.Assert(pOutputBuffer != null || outputCharsRemaining == 0, "Destination length must be zero if destination buffer pointer is null."); // First, try vectorized conversion. + OperationStatus status = Ascii.ToUtf16(new ReadOnlySpan(pInputBuffer, inputLength), new Span(pOutputBuffer, outputCharsRemaining), out int bytesConsumed); - { - nuint numElementsConverted = ASCIIUtility.WidenAsciiToUtf16(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputCharsRemaining)); - - pInputBuffer += numElementsConverted; - pOutputBuffer += numElementsConverted; - - // Quick check - did we just end up consuming the entire input buffer? - // If so, short-circuit the remainder of the method. + pInputBuffer += bytesConsumed; + pOutputBuffer += bytesConsumed; - if ((int)numElementsConverted == inputLength) - { - pInputBufferRemaining = pInputBuffer; - pOutputBufferRemaining = pOutputBuffer; - return OperationStatus.Done; - } + // Quick check - did we just end up consuming the entire input buffer? + // If so, short-circuit the remainder of the method. - inputLength -= (int)numElementsConverted; - outputCharsRemaining -= (int)numElementsConverted; + if (status == OperationStatus.Done) + { + pInputBufferRemaining = pInputBuffer; + pOutputBufferRemaining = pOutputBuffer; + return OperationStatus.Done; } + inputLength -= bytesConsumed; + outputCharsRemaining -= bytesConsumed; + if (inputLength < sizeof(uint)) { goto ProcessInputOfLessThanDWordSize; @@ -74,7 +72,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng #endif // First, check for the common case of all-ASCII bytes. 
- if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)) + if (Ascii.AllBytesInUInt32AreAscii(thisDWord)) { // We read an all-ASCII sequence. @@ -83,7 +81,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng goto ProcessRemainingBytesSlow; // running out of space, but may be able to write some data } - ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord); + Ascii.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord); pInputBuffer += 4; pOutputBuffer += 4; outputCharsRemaining -= 4; @@ -102,15 +100,15 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng thisDWord = Unsafe.ReadUnaligned(pInputBuffer); secondDWord = Unsafe.ReadUnaligned(pInputBuffer + sizeof(uint)); - if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord | secondDWord)) + if (!Ascii.AllBytesInUInt32AreAscii(thisDWord | secondDWord)) { goto LoopTerminatedEarlyDueToNonAsciiData; } pInputBuffer += 8; - ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[0], thisDWord); - ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[4], secondDWord); + Ascii.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[0], thisDWord); + Ascii.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref pOutputBuffer[4], secondDWord); pOutputBuffer += 8; } @@ -121,15 +119,15 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng LoopTerminatedEarlyDueToNonAsciiData: - if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)) + if (Ascii.AllBytesInUInt32AreAscii(thisDWord)) { // The first DWORD contained all-ASCII bytes, so expand it. 
- ASCIIUtility.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord); + Ascii.WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref *pOutputBuffer, thisDWord); // continue the outer loop from the second DWORD - Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(secondDWord)); + Debug.Assert(!Ascii.AllBytesInUInt32AreAscii(secondDWord)); thisDWord = secondDWord; pInputBuffer += 4; @@ -147,7 +145,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng AfterReadDWordSkipAllBytesAsciiCheck: - Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier + Debug.Assert(!Ascii.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier // Next, try stripping off ASCII bytes one at a time. // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above. @@ -848,23 +846,23 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt // First, try vectorized conversion. { - nuint numElementsConverted = ASCIIUtility.NarrowUtf16ToAscii(pInputBuffer, pOutputBuffer, (uint)Math.Min(inputLength, outputBytesRemaining)); + OperationStatus status = Ascii.FromUtf16(new ReadOnlySpan(pInputBuffer, inputLength), new Span(pOutputBuffer, outputBytesRemaining), out int charsConsumed); - pInputBuffer += numElementsConverted; - pOutputBuffer += numElementsConverted; + pInputBuffer += charsConsumed; + pOutputBuffer += charsConsumed; // Quick check - did we just end up consuming the entire input buffer? // If so, short-circuit the remainder of the method. 
- if ((int)numElementsConverted == inputLength) + if (status == OperationStatus.Done) { pInputBufferRemaining = pInputBuffer; pOutputBufferRemaining = pOutputBuffer; return OperationStatus.Done; } - inputLength -= (int)numElementsConverted; - outputBytesRemaining -= (int)numElementsConverted; + inputLength -= charsConsumed; + outputBytesRemaining -= charsConsumed; } if (inputLength < CharsPerDWord) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index 5784cfa136430a..8e08a4a3bdae10 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Buffers.Text; using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; @@ -26,23 +27,19 @@ internal static unsafe partial class Utf8Utility Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null."); // First, try to drain off as many ASCII bytes as we can from the beginning. - + int indexOfFirstNonAscii = Ascii.GetIndexOfFirstNonAsciiByte(new ReadOnlySpan(pInputBuffer, inputLength)); + // Quick check - did we just end up consuming the entire input buffer? + // If so, short-circuit the remainder of the method. + if (indexOfFirstNonAscii < 0) { - nuint numAsciiBytesCounted = ASCIIUtility.GetIndexOfFirstNonAsciiByte(pInputBuffer, (uint)inputLength); - pInputBuffer += numAsciiBytesCounted; - - // Quick check - did we just end up consuming the entire input buffer? - // If so, short-circuit the remainder of the method. 
- - inputLength -= (int)numAsciiBytesCounted; - if (inputLength == 0) - { - utf16CodeUnitCountAdjustment = 0; - scalarCountAdjustment = 0; - return pInputBuffer; - } + utf16CodeUnitCountAdjustment = 0; + scalarCountAdjustment = 0; + return pInputBuffer + inputLength; } + pInputBuffer += indexOfFirstNonAscii; + inputLength -= indexOfFirstNonAscii; + #if DEBUG // Keep these around for final validation at the end of the method. byte* pOriginalInputBuffer = pInputBuffer; @@ -82,7 +79,7 @@ internal static unsafe partial class Utf8Utility // First, check for the common case of all-ASCII bytes. - if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)) + if (Ascii.AllBytesInUInt32AreAscii(thisDWord)) { // We read an all-ASCII sequence. @@ -102,7 +99,7 @@ internal static unsafe partial class Utf8Utility // the read pointer up to the next aligned address. thisDWord = Unsafe.ReadUnaligned(pInputBuffer); - if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)) + if (!Ascii.AllBytesInUInt32AreAscii(thisDWord)) { goto AfterReadDWordSkipAllBytesAsciiCheck; } @@ -156,12 +153,12 @@ internal static unsafe partial class Utf8Utility } else { - if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[0] | ((uint*)pInputBuffer)[1])) + if (!Ascii.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[0] | ((uint*)pInputBuffer)[1])) { goto LoopTerminatedEarlyDueToNonAsciiDataInFirstPair; } - if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[2] | ((uint*)pInputBuffer)[3])) + if (!Ascii.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[2] | ((uint*)pInputBuffer)[3])) { goto LoopTerminatedEarlyDueToNonAsciiDataInSecondPair; } @@ -206,7 +203,7 @@ internal static unsafe partial class Utf8Utility // Let's perform a quick check here to bypass the logic at the beginning of the main loop. 
thisDWord = *(uint*)pInputBuffer; // still aligned here - if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)) + if (Ascii.AllBytesInUInt32AreAscii(thisDWord)) { pInputBuffer += sizeof(uint); // consumed 1 more DWORD thisDWord = *(uint*)pInputBuffer; // still aligned here @@ -220,13 +217,13 @@ internal static unsafe partial class Utf8Utility AfterReadDWordSkipAllBytesAsciiCheck: - Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier + Debug.Assert(!Ascii.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier // Next, try stripping off ASCII bytes one at a time. // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above. { - uint numLeadingAsciiBytes = ASCIIUtility.CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(thisDWord); + uint numLeadingAsciiBytes = Ascii.CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(thisDWord); pInputBuffer += numLeadingAsciiBytes; if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs index d553441c77cdf2..4d46796d0ab8bf 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -42,5 +43,134 @@ public static unsafe int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan u } } + + /// + /// Returns true iff the UInt32 represents four ASCII UTF-8 characters in machine endianness. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool AllBytesInUInt32AreAscii(uint value) => (value & ~0x7F7F_7F7Fu) == 0; + + /// + /// Returns true iff the UInt64 represents eight ASCII UTF-8 characters in machine endianness. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool AllBytesInUInt64AreAscii(ulong value) => (value & ~0x7F7F_7F7F_7F7F_7F7Ful) == 0; + + /// + /// Given a UInt32 that represents four ASCII UTF-8 characters, returns the invariant + /// lowercase representation of those characters. Requires the input value to contain + /// four ASCII UTF-8 characters in machine endianness. + /// + /// + /// This is a branchless implementation. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static uint ConvertAllAsciiBytesInUInt32ToLowercase(uint value) + { + // ASSUMPTION: Caller has validated that input value is ASCII. + Debug.Assert(AllBytesInUInt32AreAscii(value)); + + // the 0x80 bit of each byte of 'lowerIndicator' will be set iff the word has value >= 'A' + uint lowerIndicator = value + 0x8080_8080u - 0x4141_4141u; + + // the 0x80 bit of each byte of 'upperIndicator' will be set iff the word has value > 'Z' + uint upperIndicator = value + 0x8080_8080u - 0x5B5B_5B5Bu; + + // the 0x80 bit of each byte of 'combinedIndicator' will be set iff the word has value >= 'A' and <= 'Z' + uint combinedIndicator = (lowerIndicator ^ upperIndicator); + + // the 0x20 bit of each byte of 'mask' will be set iff the word has value >= 'A' and <= 'Z' + uint mask = (combinedIndicator & 0x8080_8080u) >> 2; + + return value ^ mask; // bit flip uppercase letters [A-Z] => [a-z] + } + + /// + /// Given a UInt32 that represents four ASCII UTF-8 characters, returns the invariant + /// uppercase representation of those characters. Requires the input value to contain + /// four ASCII UTF-8 characters in machine endianness. + /// + /// + /// This is a branchless implementation. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static uint ConvertAllAsciiBytesInUInt32ToUppercase(uint value) + { + // Intrinsified in mono interpreter + // ASSUMPTION: Caller has validated that input value is ASCII. + Debug.Assert(AllBytesInUInt32AreAscii(value)); + + // the 0x80 bit of each byte of 'lowerIndicator' will be set iff the word has value >= 'a' + uint lowerIndicator = value + 0x8080_8080u - 0x6161_6161u; + + // the 0x80 bit of each byte of 'upperIndicator' will be set iff the word has value > 'z' + uint upperIndicator = value + 0x8080_8080u - 0x7B7B_7B7Bu; + + // the 0x80 bit of each byte of 'combinedIndicator' will be set iff the word has value >= 'a' and <= 'z' + uint combinedIndicator = (lowerIndicator ^ upperIndicator); + + // the 0x20 bit of each byte of 'mask' will be set iff the word has value >= 'a' and <= 'z' + uint mask = (combinedIndicator & 0x8080_8080u) >> 2; + + return value ^ mask; // bit flip lowercase letters [a-z] => [A-Z] + } + + /// + /// Given a UInt64 that represents eight ASCII UTF-8 characters, returns the invariant + /// uppercase representation of those characters. Requires the input value to contain + /// eight ASCII UTF-8 characters in machine endianness. + /// + /// + /// This is a branchless implementation. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ulong ConvertAllAsciiBytesInUInt64ToUppercase(ulong value) + { + // ASSUMPTION: Caller has validated that input value is ASCII. 
+ Debug.Assert(AllBytesInUInt64AreAscii(value)); + + // the 0x80 bit of each byte of 'lowerIndicator' will be set iff the word has value >= 'a' + ulong lowerIndicator = value + 0x8080_8080_8080_8080ul - 0x6161_6161_6161_6161ul; + + // the 0x80 bit of each byte of 'upperIndicator' will be set iff the word has value > 'z' + ulong upperIndicator = value + 0x8080_8080_8080_8080ul - 0x7B7B_7B7B_7B7B_7B7Bul; + + // the 0x80 bit of each byte of 'combinedIndicator' will be set iff the word has value >= 'a' and <= 'z' + ulong combinedIndicator = (lowerIndicator ^ upperIndicator); + + // the 0x20 bit of each byte of 'mask' will be set iff the word has value >= 'a' and <= 'z' + ulong mask = (combinedIndicator & 0x8080_8080_8080_8080ul) >> 2; + + return value ^ mask; // bit flip lowercase letters [a-z] => [A-Z] + } + + /// + /// Given a UInt64 that represents eight ASCII UTF-8 characters, returns the invariant + /// lowercase representation of those characters. Requires the input value to contain + /// eight ASCII UTF-8 characters in machine endianness. + /// + /// + /// This is a branchless implementation. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ulong ConvertAllAsciiBytesInUInt64ToLowercase(ulong value) + { + // ASSUMPTION: Caller has validated that input value is ASCII. 
+ Debug.Assert(AllBytesInUInt64AreAscii(value)); + + // the 0x80 bit of each byte of 'lowerIndicator' will be set iff the word has value >= 'A' + ulong lowerIndicator = value + 0x8080_8080_8080_8080ul - 0x4141_4141_4141_4141ul; + + // the 0x80 bit of each byte of 'upperIndicator' will be set iff the word has value > 'Z' + ulong upperIndicator = value + 0x8080_8080_8080_8080ul - 0x5B5B_5B5B_5B5B_5B5Bul; + + // the 0x80 bit of each byte of 'combinedIndicator' will be set iff the word has value >= 'A' and <= 'Z' + ulong combinedIndicator = (lowerIndicator ^ upperIndicator); + + // the 0x20 bit of each byte of 'mask' will be set iff the word has value >= 'A' and <= 'Z' + ulong mask = (combinedIndicator & 0x8080_8080_8080_8080ul) >> 2; + + return value ^ mask; // bit flip uppercase letters [A-Z] => [a-z] + } } } diff --git a/src/libraries/System.Private.Uri/src/System/DomainNameHelper.cs b/src/libraries/System.Private.Uri/src/System/DomainNameHelper.cs index 8cee324b5162dc..2cfe8bb79b3ca1 100644 --- a/src/libraries/System.Private.Uri/src/System/DomainNameHelper.cs +++ b/src/libraries/System.Private.Uri/src/System/DomainNameHelper.cs @@ -50,11 +50,6 @@ internal static class DomainNameHelper private const string Localhost = "localhost"; private const string Loopback = "loopback"; - // TODO https://github.com/dotnet/runtime/issues/28230: Replace once Ascii is available - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsAscii(ReadOnlySpan chars) => - chars.IndexOfAnyExceptInRange((char)0, (char)127) < 0; - internal static string ParseCanonicalName(string str, int start, int end, ref bool loopback) { // Do a quick search for the colon or uppercase letters @@ -151,7 +146,7 @@ public static bool IsValid(ReadOnlySpan hostname, bool iri, bool notImplic if (iri) { ReadOnlySpan label = hostname.Slice(0, labelLength); - if (!IsAscii(label)) + if (!Ascii.IsValid(label)) { // s_iriInvalidAsciiChars confirmed everything in [0, 7F] range. 
// Chars in [80, 9F] range are also invalid, check for them now. @@ -200,7 +195,7 @@ public static string IdnEquivalent(string hostname) { // check if only ascii chars // special case since idnmapping will not lowercase if only ascii present - if (IsAscii(hostname)) + if (Ascii.IsValid(hostname)) { // just lowercase for ascii return hostname.ToLowerInvariant(); @@ -247,7 +242,7 @@ public static bool TryGetUnicodeEquivalent(string hostname, ref ValueStringBuild label = label.Slice(0, dotIndex); } - if (!IsAscii(label)) + if (!Ascii.IsValid(label)) { try { diff --git a/src/libraries/System.Runtime/ref/System.Runtime.cs b/src/libraries/System.Runtime/ref/System.Runtime.cs index 50c7708cdfc74c..46e219694510b9 100644 --- a/src/libraries/System.Runtime/ref/System.Runtime.cs +++ b/src/libraries/System.Runtime/ref/System.Runtime.cs @@ -13818,6 +13818,33 @@ public enum TokenImpersonationLevel } namespace System.Text { + public static class Ascii + { + public static bool IsValid(System.ReadOnlySpan value) { throw null; } + public static bool IsValid(System.ReadOnlySpan value) { throw null; } + public static bool IsValid(byte value) { throw null; } + public static bool IsValid(char value) { throw null; } + public static System.Buffers.OperationStatus ToLower(System.ReadOnlySpan source, System.Span destination, out int bytesWritten) { throw null; } + public static System.Buffers.OperationStatus ToLower(System.ReadOnlySpan source, System.Span destination, out int charsWritten) { throw null; } + public static System.Buffers.OperationStatus ToLower(System.ReadOnlySpan source, System.Span destination, out int charsWritten) { throw null; } + public static System.Buffers.OperationStatus ToLower(System.ReadOnlySpan source, System.Span destination, out int bytesWritten) { throw null; } + public static System.Buffers.OperationStatus ToUpper(System.ReadOnlySpan source, System.Span destination, out int bytesWritten) { throw null; } + public static System.Buffers.OperationStatus 
ToUpper(System.ReadOnlySpan source, System.Span destination, out int charsWritten) { throw null; } + public static System.Buffers.OperationStatus ToUpper(System.ReadOnlySpan source, System.Span destination, out int charsWritten) { throw null; } + public static System.Buffers.OperationStatus ToUpper(System.ReadOnlySpan source, System.Span destination, out int bytesWritten) { throw null; } + public static System.Buffers.OperationStatus ToLowerInPlace(System.Span value, out int bytesWritten) { throw null; } + public static System.Buffers.OperationStatus ToLowerInPlace(System.Span value, out int charsWritten) { throw null; } + public static System.Buffers.OperationStatus ToUpperInPlace(System.Span value, out int bytesWritten) { throw null; } + public static System.Buffers.OperationStatus ToUpperInPlace(System.Span value, out int charsWritten) { throw null; } + public static System.Buffers.OperationStatus FromUtf16(System.ReadOnlySpan source, System.Span destination, out int bytesWritten) { throw null; } + public static System.Buffers.OperationStatus ToUtf16(System.ReadOnlySpan source, System.Span destination, out int charsWritten) { throw null; } + public static System.Range Trim(System.ReadOnlySpan value) { throw null; } + public static System.Range Trim(System.ReadOnlySpan value) { throw null; } + public static System.Range TrimEnd(System.ReadOnlySpan value) { throw null; } + public static System.Range TrimEnd(System.ReadOnlySpan value) { throw null; } + public static System.Range TrimStart(System.ReadOnlySpan value) { throw null; } + public static System.Range TrimStart(System.ReadOnlySpan value) { throw null; } + } public abstract partial class Decoder { protected Decoder() { } diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests.csproj b/src/libraries/System.Runtime/tests/System.Runtime.Tests.csproj index 26483f4ab0cb36..54d3aaacd76050 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests.csproj +++ 
b/src/libraries/System.Runtime/tests/System.Runtime.Tests.csproj @@ -294,7 +294,6 @@ - diff --git a/src/libraries/System.Runtime/tests/System/Text/ASCIIUtilityTests.cs b/src/libraries/System.Runtime/tests/System/Text/ASCIIUtilityTests.cs deleted file mode 100644 index 72ca1707055ab0..00000000000000 --- a/src/libraries/System.Runtime/tests/System/Text/ASCIIUtilityTests.cs +++ /dev/null @@ -1,461 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Buffers; -using System.Diagnostics.CodeAnalysis; -using System.Numerics; -using System.Reflection; -using System.Runtime.InteropServices; -using System.Security.Cryptography; -using Xunit; - -namespace System.Text.Tests -{ - // Since many of the methods we'll be testing are internal, we'll need to invoke - // them via reflection. - public static unsafe class AsciiUtilityTests - { - private const int SizeOfVector128 = 128 / 8; - - // The delegate definitions and members below provide us access to CoreLib's internals. - // We use UIntPtr instead of nuint everywhere here since we don't know what our target arch is. 
- - private delegate UIntPtr FnGetIndexOfFirstNonAsciiByte(byte* pBuffer, UIntPtr bufferLength); - private static readonly UnsafeLazyDelegate _fnGetIndexOfFirstNonAsciiByte = new UnsafeLazyDelegate("GetIndexOfFirstNonAsciiByte"); - - private delegate UIntPtr FnGetIndexOfFirstNonAsciiChar(char* pBuffer, UIntPtr bufferLength); - private static readonly UnsafeLazyDelegate _fnGetIndexOfFirstNonAsciiChar = new UnsafeLazyDelegate("GetIndexOfFirstNonAsciiChar"); - - private delegate UIntPtr FnNarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBuffer, UIntPtr elementCount); - private static readonly UnsafeLazyDelegate _fnNarrowUtf16ToAscii = new UnsafeLazyDelegate("NarrowUtf16ToAscii"); - - private delegate UIntPtr FnWidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buffer, UIntPtr elementCount); - private static readonly UnsafeLazyDelegate _fnWidenAsciiToUtf16 = new UnsafeLazyDelegate("WidenAsciiToUtf16"); - - [Fact] - public static void GetIndexOfFirstNonAsciiByte_EmptyInput_NullReference() - { - Assert.Equal(UIntPtr.Zero, _fnGetIndexOfFirstNonAsciiByte.Delegate(null, UIntPtr.Zero)); - } - - [Fact] - public static void GetIndexOfFirstNonAsciiByte_EmptyInput_NonNullReference() - { - byte b = default; - Assert.Equal(UIntPtr.Zero, _fnGetIndexOfFirstNonAsciiByte.Delegate(&b, UIntPtr.Zero)); - } - - [Fact] - public static void GetIndexOfFirstNonAsciiByte_Vector128InnerLoop() - { - // The purpose of this test is to make sure we're identifying the correct - // vector (of the two that we're reading simultaneously) when performing - // the final ASCII drain at the end of the method once we've broken out - // of the inner loop. - - using (BoundedMemory mem = BoundedMemory.Allocate(1024)) - { - Span bytes = mem.Span; - - for (int i = 0; i < bytes.Length; i++) - { - bytes[i] &= 0x7F; // make sure each byte (of the pre-populated random data) is ASCII - } - - // Two vectors have offsets 0 .. 31. We'll go backward to avoid having to - // re-clear the vector every time. 
- - for (int i = 2 * SizeOfVector128 - 1; i >= 0; i--) - { - bytes[100 + i * 13] = 0x80; // 13 is relatively prime to 32, so it ensures all possible positions are hit - Assert.Equal(100 + i * 13, CallGetIndexOfFirstNonAsciiByte(bytes)); - } - } - } - - [Fact] - public static void GetIndexOfFirstNonAsciiByte_Boundaries() - { - // The purpose of this test is to make sure we're hitting all of the vectorized - // and draining logic correctly both in the SSE2 and in the non-SSE2 enlightened - // code paths. We shouldn't be reading beyond the boundaries we were given. - - // The 5 * Vector test should make sure that we're exercising all possible - // code paths across both implementations. - using (BoundedMemory mem = BoundedMemory.Allocate(5 * Vector.Count)) - { - Span bytes = mem.Span; - - // First, try it with all-ASCII buffers. - - for (int i = 0; i < bytes.Length; i++) - { - bytes[i] &= 0x7F; // make sure each byte (of the pre-populated random data) is ASCII - } - - for (int i = bytes.Length; i >= 0; i--) - { - Assert.Equal(i, CallGetIndexOfFirstNonAsciiByte(bytes.Slice(0, i))); - } - - // Then, try it with non-ASCII bytes. 
- - for (int i = bytes.Length; i >= 1; i--) - { - bytes[i - 1] = 0x80; // set non-ASCII - Assert.Equal(i - 1, CallGetIndexOfFirstNonAsciiByte(bytes.Slice(0, i))); - } - } - } - - [Fact] - public static void GetIndexOfFirstNonAsciiChar_EmptyInput_NullReference() - { - Assert.Equal(UIntPtr.Zero, _fnGetIndexOfFirstNonAsciiChar.Delegate(null, UIntPtr.Zero)); - } - - [Fact] - public static void GetIndexOfFirstNonAsciiChar_EmptyInput_NonNullReference() - { - char c = default; - Assert.Equal(UIntPtr.Zero, _fnGetIndexOfFirstNonAsciiChar.Delegate(&c, UIntPtr.Zero)); - } - - [Fact] - public static void GetIndexOfFirstNonAsciiChar_Vector128InnerLoop() - { - // The purpose of this test is to make sure we're identifying the correct - // vector (of the two that we're reading simultaneously) when performing - // the final ASCII drain at the end of the method once we've broken out - // of the inner loop. - // - // Use U+0123 instead of U+0080 for this test because if our implementation - // uses pminuw / pmovmskb incorrectly, U+0123 will incorrectly show up as ASCII, - // causing our test to produce a false negative. - - using (BoundedMemory mem = BoundedMemory.Allocate(1024)) - { - Span chars = mem.Span; - - for (int i = 0; i < chars.Length; i++) - { - chars[i] &= '\u007F'; // make sure each char (of the pre-populated random data) is ASCII - } - - // Two vectors have offsets 0 .. 31. We'll go backward to avoid having to - // re-clear the vector every time. - - for (int i = 2 * SizeOfVector128 - 1; i >= 0; i--) - { - chars[100 + i * 13] = '\u0123'; // 13 is relatively prime to 32, so it ensures all possible positions are hit - Assert.Equal(100 + i * 13, CallGetIndexOfFirstNonAsciiChar(chars)); - } - } - } - - [Fact] - public static void GetIndexOfFirstNonAsciiChar_Boundaries() - { - // The purpose of this test is to make sure we're hitting all of the vectorized - // and draining logic correctly both in the SSE2 and in the non-SSE2 enlightened - // code paths. 
We shouldn't be reading beyond the boundaries we were given. - // - // The 5 * Vector test should make sure that we're exercising all possible - // code paths across both implementations. The sizeof(char) is because we're - // specifying element count, but underlying implementation reinterpret casts to bytes. - // - // Use U+0123 instead of U+0080 for this test because if our implementation - // uses pminuw / pmovmskb incorrectly, U+0123 will incorrectly show up as ASCII, - // causing our test to produce a false negative. - - using (BoundedMemory mem = BoundedMemory.Allocate(5 * Vector.Count / sizeof(char))) - { - Span chars = mem.Span; - - for (int i = 0; i < chars.Length; i++) - { - chars[i] &= '\u007F'; // make sure each char (of the pre-populated random data) is ASCII - } - - for (int i = chars.Length; i >= 0; i--) - { - Assert.Equal(i, CallGetIndexOfFirstNonAsciiChar(chars.Slice(0, i))); - } - - // Then, try it with non-ASCII bytes. - - for (int i = chars.Length; i >= 1; i--) - { - chars[i - 1] = '\u0123'; // set non-ASCII - Assert.Equal(i - 1, CallGetIndexOfFirstNonAsciiChar(chars.Slice(0, i))); - } - } - } - - [Fact] - public static void WidenAsciiToUtf16_EmptyInput_NullReferences() - { - Assert.Equal(UIntPtr.Zero, _fnWidenAsciiToUtf16.Delegate(null, null, UIntPtr.Zero)); - } - - [Fact] - public static void WidenAsciiToUtf16_EmptyInput_NonNullReference() - { - byte b = default; - char c = default; - Assert.Equal(UIntPtr.Zero, _fnWidenAsciiToUtf16.Delegate(&b, &c, UIntPtr.Zero)); - } - - [Fact] - public static void WidenAsciiToUtf16_AllAsciiInput() - { - using BoundedMemory asciiMem = BoundedMemory.Allocate(128); - using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); - - // Fill source with 00 .. 7F, then trap future writes. - - Span asciiSpan = asciiMem.Span; - for (int i = 0; i < asciiSpan.Length; i++) - { - asciiSpan[i] = (byte)i; - } - asciiMem.MakeReadonly(); - - // We'll write to the UTF-16 span. 
- // We test with a variety of span lengths to test alignment and fallthrough code paths. - - Span utf16Span = utf16Mem.Span; - - for (int i = 0; i < asciiSpan.Length; i++) - { - utf16Span.Clear(); // remove any data from previous iteration - - // First, validate that the workhorse saw the incoming data as all-ASCII. - - Assert.Equal(128 - i, CallWidenAsciiToUtf16(asciiSpan.Slice(i), utf16Span.Slice(i))); - - // Then, validate that the data was transcoded properly. - - for (int j = i; j < 128; j++) - { - Assert.Equal((ushort)asciiSpan[i], (ushort)utf16Span[i]); - } - } - } - - [Fact] - public static void WidenAsciiToUtf16_SomeNonAsciiInput() - { - using BoundedMemory asciiMem = BoundedMemory.Allocate(128); - using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); - - // Fill source with 00 .. 7F, then trap future writes. - - Span asciiSpan = asciiMem.Span; - for (int i = 0; i < asciiSpan.Length; i++) - { - asciiSpan[i] = (byte)i; - } - - // We'll write to the UTF-16 span. - - Span utf16Span = utf16Mem.Span; - - for (int i = asciiSpan.Length - 1; i >= 0; i--) - { - RandomNumberGenerator.Fill(MemoryMarshal.Cast(utf16Span)); // fill with garbage - - // First, keep track of the garbage we wrote to the destination. - // We want to ensure it wasn't overwritten. - - char[] expectedTrailingData = utf16Span.Slice(i).ToArray(); - - // Then, set the desired byte as non-ASCII, then check that the workhorse - // correctly saw the data as non-ASCII. - - asciiSpan[i] |= (byte)0x80; - Assert.Equal(i, CallWidenAsciiToUtf16(asciiSpan, utf16Span)); - - // Next, validate that the ASCII data was transcoded properly. - - for (int j = 0; j < i; j++) - { - Assert.Equal((ushort)asciiSpan[j], (ushort)utf16Span[j]); - } - - // Finally, validate that the trailing data wasn't overwritten with non-ASCII data. 
- - Assert.Equal(expectedTrailingData, utf16Span.Slice(i).ToArray()); - } - } - - [Fact] - public static unsafe void NarrowUtf16ToAscii_EmptyInput_NullReferences() - { - Assert.Equal(UIntPtr.Zero, _fnNarrowUtf16ToAscii.Delegate(null, null, UIntPtr.Zero)); - } - - [Fact] - public static void NarrowUtf16ToAscii_EmptyInput_NonNullReference() - { - char c = default; - byte b = default; - Assert.Equal(UIntPtr.Zero, _fnNarrowUtf16ToAscii.Delegate(&c, &b, UIntPtr.Zero)); - } - - [Fact] - public static void NarrowUtf16ToAscii_AllAsciiInput() - { - using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); - using BoundedMemory asciiMem = BoundedMemory.Allocate(128); - - // Fill source with 00 .. 7F. - - Span utf16Span = utf16Mem.Span; - for (int i = 0; i < utf16Span.Length; i++) - { - utf16Span[i] = (char)i; - } - utf16Mem.MakeReadonly(); - - // We'll write to the ASCII span. - // We test with a variety of span lengths to test alignment and fallthrough code paths. - - Span asciiSpan = asciiMem.Span; - - for (int i = 0; i < utf16Span.Length; i++) - { - asciiSpan.Clear(); // remove any data from previous iteration - - // First, validate that the workhorse saw the incoming data as all-ASCII. - - Assert.Equal(128 - i, CallNarrowUtf16ToAscii(utf16Span.Slice(i), asciiSpan.Slice(i))); - - // Then, validate that the data was transcoded properly. - - for (int j = i; j < 128; j++) - { - Assert.Equal((ushort)utf16Span[i], (ushort)asciiSpan[i]); - } - } - } - - [Fact] - public static void NarrowUtf16ToAscii_SomeNonAsciiInput() - { - using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); - using BoundedMemory asciiMem = BoundedMemory.Allocate(128); - - // Fill source with 00 .. 7F. - - Span utf16Span = utf16Mem.Span; - for (int i = 0; i < utf16Span.Length; i++) - { - utf16Span[i] = (char)i; - } - - // We'll write to the ASCII span. 
- - Span asciiSpan = asciiMem.Span; - - for (int i = utf16Span.Length - 1; i >= 0; i--) - { - RandomNumberGenerator.Fill(asciiSpan); // fill with garbage - - // First, keep track of the garbage we wrote to the destination. - // We want to ensure it wasn't overwritten. - - byte[] expectedTrailingData = asciiSpan.Slice(i).ToArray(); - - // Then, set the desired byte as non-ASCII, then check that the workhorse - // correctly saw the data as non-ASCII. - - utf16Span[i] = '\u0123'; // use U+0123 instead of U+0080 since it catches inappropriate pmovmskb usage - Assert.Equal(i, CallNarrowUtf16ToAscii(utf16Span, asciiSpan)); - - // Next, validate that the ASCII data was transcoded properly. - - for (int j = 0; j < i; j++) - { - Assert.Equal((ushort)utf16Span[j], (ushort)asciiSpan[j]); - } - - // Finally, validate that the trailing data wasn't overwritten with non-ASCII data. - - Assert.Equal(expectedTrailingData, asciiSpan.Slice(i).ToArray()); - } - } - - private static int CallGetIndexOfFirstNonAsciiByte(ReadOnlySpan buffer) - { - fixed (byte* pBuffer = &MemoryMarshal.GetReference(buffer)) - { - // Conversions between UIntPtr <-> int are not checked by default. - return checked((int)_fnGetIndexOfFirstNonAsciiByte.Delegate(pBuffer, (UIntPtr)buffer.Length)); - } - } - - private static int CallGetIndexOfFirstNonAsciiChar(ReadOnlySpan buffer) - { - fixed (char* pBuffer = &MemoryMarshal.GetReference(buffer)) - { - // Conversions between UIntPtr <-> int are not checked by default. - return checked((int)_fnGetIndexOfFirstNonAsciiChar.Delegate(pBuffer, (UIntPtr)buffer.Length)); - } - } - - private static int CallNarrowUtf16ToAscii(ReadOnlySpan utf16, Span ascii) - { - Assert.Equal(utf16.Length, ascii.Length); - - fixed (char* pUtf16 = &MemoryMarshal.GetReference(utf16)) - fixed (byte* pAscii = &MemoryMarshal.GetReference(ascii)) - { - // Conversions between UIntPtr <-> int are not checked by default. 
- return checked((int)_fnNarrowUtf16ToAscii.Delegate(pUtf16, pAscii, (UIntPtr)utf16.Length)); - } - } - - private static int CallWidenAsciiToUtf16(ReadOnlySpan ascii, Span utf16) - { - Assert.Equal(ascii.Length, utf16.Length); - - fixed (byte* pAscii = &MemoryMarshal.GetReference(ascii)) - fixed (char* pUtf16 = &MemoryMarshal.GetReference(utf16)) - { - // Conversions between UIntPtr <-> int are not checked by default. - return checked((int)_fnWidenAsciiToUtf16.Delegate(pAscii, pUtf16, (UIntPtr)ascii.Length)); - } - } - - [return: DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.PublicMethods | DynamicallyAccessedMemberTypes.NonPublicMethods)] - private static Type GetAsciiUtilityType() - { - return Type.GetType("System.Text.ASCIIUtility, System.Private.CoreLib"); - } - - private sealed class UnsafeLazyDelegate where TDelegate : Delegate - { - private readonly Lazy _lazyDelegate; - - public UnsafeLazyDelegate(string methodName) - { - _lazyDelegate = new Lazy(() => - { - Assert.True(typeof(TDelegate).IsSubclassOf(typeof(MulticastDelegate))); - - // Get the MethodInfo for the target method - - MethodInfo methodInfo = GetAsciiUtilityType().GetMethod(methodName, BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Static); - Assert.NotNull(methodInfo); - - // Construct the TDelegate pointing to this method - - return methodInfo.CreateDelegate(); - }); - } - - public TDelegate Delegate => _lazyDelegate.Value; - } - } -} diff --git a/src/libraries/System.Text.Encoding/tests/ASCIIEncoding/ASCIIEncodingDecode.cs b/src/libraries/System.Text.Encoding/tests/ASCIIEncoding/ASCIIEncodingDecode.cs index c9d12101d00a87..1c9df742c6b6cf 100644 --- a/src/libraries/System.Text.Encoding/tests/ASCIIEncoding/ASCIIEncodingDecode.cs +++ b/src/libraries/System.Text.Encoding/tests/ASCIIEncoding/ASCIIEncodingDecode.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Buffers; using System.Collections.Generic; using System.Linq; using Xunit; @@ -37,6 +38,19 @@ public void Decode(byte[] bytes, int index, int count) // Decoding valid bytes should not throw with a DecoderExceptionFallback Encoding exceptionEncoding = Encoding.GetEncoding("ascii", new EncoderReplacementFallback("?"), new DecoderExceptionFallback()); EncodingHelpers.Decode(exceptionEncoding, bytes, index, count, expected); + + char[] actual = new char[expected.Length]; + Assert.Equal(OperationStatus.Done, Ascii.ToUtf16(bytes.AsSpan(index, count), actual, out int charsWritten)); + Assert.Equal(expected.Length, charsWritten); + Assert.Equal(expected, new string(actual.AsSpan(0, charsWritten))); + + if (expected.Length > 1) + { + actual = new char[expected.Length - 1]; + Assert.Equal(OperationStatus.DestinationTooSmall, Ascii.ToUtf16(bytes.AsSpan(index, count), actual, out charsWritten)); + Assert.Equal(expected.Length - 1, charsWritten); + Assert.Equal(expected.Substring(0, expected.Length - 1), new string(actual.AsSpan(0, charsWritten))); + } } public static IEnumerable Decode_InvalidBytes_TestData() @@ -45,17 +59,17 @@ public static IEnumerable Decode_InvalidBytes_TestData() for (int i = 0x80; i <= byte.MaxValue; i++) { byte b = (byte)i; - yield return new object[] { new byte[] { b }, 0, 1 }; - yield return new object[] { new byte[] { 96, b, 97 }, 1, 1 }; - yield return new object[] { new byte[] { 97, b, 97 }, 0, 3 }; + yield return new object[] { new byte[] { b }, 0, 1, 0 }; + yield return new object[] { new byte[] { 96, b, 97 }, 1, 1, 0 }; + yield return new object[] { new byte[] { 97, b, 97 }, 0, 3, 1 }; } - yield return new object[] { new byte[] { 0xC1, 0x41, 0xF0, 0x42 }, 0, 4 }; + yield return new object[] { new byte[] { 0xC1, 0x41, 0xF0, 0x42 }, 0, 4, 0 }; } [Theory] [MemberData(nameof(Decode_InvalidBytes_TestData))] - public void Decode_InvalidBytes(byte[] bytes, int index, int count) + public void Decode_InvalidBytes(byte[] bytes, int index, 
int count, int expectedBytesConsumed) { string expected = GetString(bytes, index, count); EncodingHelpers.Decode(new ASCIIEncoding(), bytes, index, count, expected); @@ -63,6 +77,11 @@ public void Decode_InvalidBytes(byte[] bytes, int index, int count) // Decoding invalid bytes should throw with a DecoderExceptionFallback Encoding exceptionEncoding = Encoding.GetEncoding("ascii", new EncoderReplacementFallback("?"), new DecoderExceptionFallback()); NegativeEncodingTests.Decode_Invalid(exceptionEncoding, bytes, index, count); + + char[] actual = new char[expected.Length]; + Assert.Equal(OperationStatus.InvalidData, Ascii.ToUtf16(bytes.AsSpan(index, count), actual, out int charsWritten)); + Assert.Equal(expectedBytesConsumed, charsWritten); + Assert.Equal(expected.Take(charsWritten).ToArray(), actual.Take(charsWritten).ToArray()); } public static string GetString(byte[] bytes, int index, int count) diff --git a/src/libraries/System.Text.Encoding/tests/ASCIIEncoding/ASCIIEncodingEncode.cs b/src/libraries/System.Text.Encoding/tests/ASCIIEncoding/ASCIIEncodingEncode.cs index b0170dfe532be6..c4bff337ca37a8 100644 --- a/src/libraries/System.Text.Encoding/tests/ASCIIEncoding/ASCIIEncodingEncode.cs +++ b/src/libraries/System.Text.Encoding/tests/ASCIIEncoding/ASCIIEncodingEncode.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
+using System.Buffers; using System.Collections.Generic; using System.Linq; using Xunit; @@ -43,6 +44,19 @@ public void Encode(string source, int index, int count) // Encoding valid chars should not throw with an EncoderExceptionFallback Encoding exceptionEncoding = Encoding.GetEncoding("ascii", new EncoderExceptionFallback(), new DecoderReplacementFallback("?")); EncodingHelpers.Encode(exceptionEncoding, source, index, count, expected); + + byte[] actual = new byte[expected.Length]; + Assert.Equal(OperationStatus.Done , Ascii.FromUtf16(source.AsSpan(index, count), actual, out int bytesWritten)); + Assert.Equal(expected.Length, bytesWritten); + Assert.Equal(expected, actual.Take(bytesWritten).ToArray()); + + if (expected.Length > 1) + { + actual = new byte[expected.Length - 1]; + Assert.Equal(OperationStatus.DestinationTooSmall, Ascii.FromUtf16(source.AsSpan(index, count), actual, out bytesWritten)); + Assert.Equal(expected.Length - 1, bytesWritten); + Assert.Equal(expected.Take(bytesWritten).ToArray(), actual.Take(bytesWritten).ToArray()); + } } public static IEnumerable Encode_InvalidChars_TestData() @@ -51,39 +65,39 @@ public static IEnumerable Encode_InvalidChars_TestData() for (int i = 0x80; i <= 0xFF; i++) { char b = (char)i; - yield return new object[] { b, 0, 1 }; + yield return new object[] { b, 0, 1, 0 }; } // Unicode chars - yield return new object[] { "\u1234\u2345", 0, 2 }; - yield return new object[] { "a\u1234\u2345b", 0, 4 }; + yield return new object[] { "\u1234\u2345", 0, 2, 0 }; + yield return new object[] { "a\u1234\u2345b", 0, 4, 1 }; - yield return new object[] { "\uD800\uDC00", 0, 2 }; - yield return new object[] { "a\uD800\uDC00b", 0, 2 }; + yield return new object[] { "\uD800\uDC00", 0, 2, 0 }; + yield return new object[] { "a\uD800\uDC00b", 0, 2, 1 }; - yield return new object[] { "\uD800\uDC00\u0061\u0CFF", 0, 4 }; + yield return new object[] { "\uD800\uDC00\u0061\u0CFF", 0, 4, 0 }; // Invalid Unicode - yield return new object[] { 
"\uD800", 0, 1 }; // Lone high surrogate - yield return new object[] { "\uDC00", 0, 1 }; // Lone low surrogate - yield return new object[] { "\uD800\uDC00", 0, 1 }; // Surrogate pair out of range - yield return new object[] { "\uD800\uDC00", 1, 1 }; // Surrogate pair out of range + yield return new object[] { "\uD800", 0, 1, 0 }; // Lone high surrogate + yield return new object[] { "\uDC00", 0, 1, 0 }; // Lone low surrogate + yield return new object[] { "\uD800\uDC00", 0, 1, 0 }; // Surrogate pair out of range + yield return new object[] { "\uD800\uDC00", 1, 1, 0 }; // Surrogate pair out of range - yield return new object[] { "\uD800\uD800", 0, 2 }; // High, high - yield return new object[] { "\uDC00\uD800", 0, 2 }; // Low, high - yield return new object[] { "\uDC00\uDC00", 0, 2 }; // Low, low + yield return new object[] { "\uD800\uD800", 0, 2, 0 }; // High, high + yield return new object[] { "\uDC00\uD800", 0, 2, 0 }; // Low, high + yield return new object[] { "\uDC00\uDC00", 0, 2, 0 }; // Low, low - yield return new object[] { "\u0080\u00FF\u0B71\uFFFF\uD800\uDFFF", 0, 6 }; + yield return new object[] { "\u0080\u00FF\u0B71\uFFFF\uD800\uDFFF", 0, 6, 0 }; // High BMP non-chars - yield return new object[] { "\uFFFD", 0, 1 }; - yield return new object[] { "\uFFFE", 0, 1 }; - yield return new object[] { "\uFFFF", 0, 1 }; + yield return new object[] { "\uFFFD", 0, 1, 0 }; + yield return new object[] { "\uFFFE", 0, 1, 0 }; + yield return new object[] { "\uFFFF", 0, 1, 0 }; } [Theory] [MemberData(nameof(Encode_InvalidChars_TestData))] - public void Encode_InvalidChars(string source, int index, int count) + public void Encode_InvalidChars(string source, int index, int count, int expectedCharsConsumed) { byte[] expected = GetBytes(source, index, count); EncodingHelpers.Encode(new ASCIIEncoding(), source, index, count, expected); @@ -91,6 +105,11 @@ public void Encode_InvalidChars(string source, int index, int count) // Encoding invalid chars should throw with an 
EncoderExceptionFallback Encoding exceptionEncoding = Encoding.GetEncoding("ascii", new EncoderExceptionFallback(), new DecoderReplacementFallback("?")); NegativeEncodingTests.Encode_Invalid(exceptionEncoding, source, index, count); + + byte[] actual = new byte[expected.Length]; + Assert.Equal(OperationStatus.InvalidData, Ascii.FromUtf16(source.AsSpan(index, count), actual, out int bytesWritten)); + Assert.Equal(expectedCharsConsumed, bytesWritten); + Assert.Equal(expected.Take(bytesWritten).ToArray(), actual.Take(bytesWritten).ToArray()); } private static byte[] GetBytes(string source, int index, int count) diff --git a/src/libraries/System.Text.Encoding/tests/Ascii/CaseConversionTests.cs b/src/libraries/System.Text.Encoding/tests/Ascii/CaseConversionTests.cs new file mode 100644 index 00000000000000..ceb836ff268a41 --- /dev/null +++ b/src/libraries/System.Text.Encoding/tests/Ascii/CaseConversionTests.cs @@ -0,0 +1,258 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Buffers; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.InteropServices; +using Xunit; + +namespace System.Text.Tests +{ + public static class CaseConversionTests + { + private const byte MaxValidAsciiChar = 127; + + [Fact] + public static void OverlappingBuffers_Throws() + { + byte[] byteBuffer = new byte[10]; + char[] charBuffer = new char[10]; + + // byte -> byte + Assert.Throws(() => Ascii.ToLower(byteBuffer, byteBuffer, out _)); + Assert.Throws(() => Ascii.ToLower(byteBuffer.AsSpan(1, 3), byteBuffer.AsSpan(3, 5), out _)); + Assert.Throws(() => Ascii.ToUpper(byteBuffer, byteBuffer, out _)); + Assert.Throws(() => Ascii.ToUpper(byteBuffer.AsSpan(1, 3), byteBuffer.AsSpan(3, 5), out _)); + // byte -> char + Assert.Throws(() => Ascii.ToLower(byteBuffer, MemoryMarshal.Cast(byteBuffer), out _)); + Assert.Throws(() => Ascii.ToLower(byteBuffer, MemoryMarshal.Cast(byteBuffer).Slice(1, 3), out _)); + Assert.Throws(() => Ascii.ToUpper(byteBuffer, MemoryMarshal.Cast(byteBuffer), out _)); + Assert.Throws(() => Ascii.ToUpper(byteBuffer, MemoryMarshal.Cast(byteBuffer).Slice(1, 3), out _)); + // char -> char + Assert.Throws(() => Ascii.ToLower(charBuffer, charBuffer, out _)); + Assert.Throws(() => Ascii.ToLower(charBuffer.AsSpan(1, 3), charBuffer.AsSpan(3, 5), out _)); + Assert.Throws(() => Ascii.ToUpper(charBuffer, charBuffer, out _)); + Assert.Throws(() => Ascii.ToUpper(charBuffer.AsSpan(1, 3), charBuffer.AsSpan(3, 5), out _)); + // char -> byte + Assert.Throws(() => Ascii.ToLower(charBuffer, MemoryMarshal.Cast(charBuffer), out _)); + Assert.Throws(() => Ascii.ToLower(charBuffer, MemoryMarshal.Cast(charBuffer).Slice(1, 3), out _)); + Assert.Throws(() => Ascii.ToUpper(charBuffer, MemoryMarshal.Cast(charBuffer), out _)); + Assert.Throws(() => Ascii.ToUpper(charBuffer, MemoryMarshal.Cast(charBuffer).Slice(1, 3), out _)); + } + + private static void VerifySingleChar(OperationStatus status, int value, T expected, T actual, int written) 
+ { + Assert.True(typeof(T) == typeof(char) || typeof(T) == typeof(byte)); + + if (value <= MaxValidAsciiChar) + { + Assert.Equal(OperationStatus.Done, status); + Assert.Equal(expected, actual); + Assert.Equal(1, written); + } + else + { + Assert.Equal(OperationStatus.InvalidData, status); + Assert.Equal(default, actual); + Assert.Equal(0, written); + } + } + + [Fact] + public static void SingleByteConversion() + { + byte[] destinationByte = new byte[1]; + char[] destinationChar = new char[1]; + + for (int i = 0; i <= byte.MaxValue; i++) + { + byte expectedToLower = char.IsBetween((char)i, 'A', 'Z') ? (byte)(i - 'A' + 'a') : (byte)i; + byte expectedToUpper = char.IsBetween((char)i, 'a', 'z') ? (byte)(i + 'A' - 'a') : (byte)i; + + byte[] sourceByte = new byte[1] { (byte)i }; + + // byte -> byte + destinationByte[0] = default; + VerifySingleChar(Ascii.ToLower(sourceByte, destinationByte, out int written), i, expectedToLower, destinationByte[0], written); + destinationByte[0] = default; + VerifySingleChar(Ascii.ToUpper(sourceByte, destinationByte, out written), i, expectedToUpper, destinationByte[0], written); + // byte -> char + destinationChar[0] = default; + VerifySingleChar(Ascii.ToLower(sourceByte, destinationChar, out written), i, (char)expectedToLower, destinationChar[0], written); + destinationChar[0] = default; + VerifySingleChar(Ascii.ToUpper(sourceByte, destinationChar, out written), i, (char)expectedToUpper, destinationChar[0], written); + } + } + + [Fact] + public static void SingleCharConversion() + { + char[] sourceChar = new char[1], destinationChar = new char[1]; // this test is "optimized" as it performs a LOT of iterations + byte[] destinationByte = new byte[1]; + + for (int i = 0; i <= char.MaxValue; i++) + { + char expectedLower = char.IsBetween((char)i, 'A', 'Z') ? (char)(i - 'A' + 'a') : (char)i; + char expectedUpper = char.IsBetween((char)i, 'a', 'z') ? 
(char)(i + 'A' - 'a') : (char)i; + + sourceChar[0] = (char)i; + + // char -> char + destinationChar[0] = default; + VerifySingleChar(Ascii.ToLower(sourceChar, destinationChar, out int written), i, expectedLower, destinationChar[0], written); + destinationChar[0] = default; + VerifySingleChar(Ascii.ToUpper(sourceChar, destinationChar, out written), i, expectedUpper, destinationChar[0], written); + // char -> byte + destinationByte[0] = default; + VerifySingleChar(Ascii.ToLower(sourceChar, destinationByte, out written), i, (byte)expectedLower, destinationByte[0], written); + destinationByte[0] = default; + VerifySingleChar(Ascii.ToUpper(sourceChar, destinationByte, out written), i, (byte)expectedUpper, destinationByte[0], written); + } + } + + [Theory] + [InlineData("\u00C0bCDe")] // U+00C0 is not ASCII + [InlineData("\u00E0bCDe")] // U+00E0 is not ASCII + public static void InvalidCharacters(string sourceChars) + { + char[] destinationChars = new char[sourceChars.Length]; + byte[] sourceBytes = System.Text.Encoding.ASCII.GetBytes(sourceChars); + byte[] destinationBytes = new byte[sourceBytes.Length]; + + if (sourceBytes[0] <= MaxValidAsciiChar) + { + sourceBytes[0] = MaxValidAsciiChar + 1; // ensure the first byte is invalid (U+00C0 is mapped to valid ascii char by ASCII.GetBytes) + } + + // char => char + VerifyStatus(Ascii.ToLower(sourceChars, destinationChars, out int written), written); + VerifyStatus(Ascii.ToUpper(sourceChars, destinationChars, out written), written); + // char => byte + VerifyStatus(Ascii.ToLower(sourceChars, destinationBytes, out written), written); + VerifyStatus(Ascii.ToUpper(sourceChars, destinationBytes, out written), written); + // byte => byte + VerifyStatus(Ascii.ToLower(sourceBytes, destinationBytes, out written), written); + VerifyStatus(Ascii.ToUpper(sourceBytes, destinationBytes, out written), written); + // byte => char + VerifyStatus(Ascii.ToLower(sourceBytes, destinationChars, out written), written); + 
VerifyStatus(Ascii.ToUpper(sourceBytes, destinationChars, out written), written); + + // InPlace(byte) + VerifyStatus(Ascii.ToLowerInPlace(sourceBytes, out int processed), processed); + VerifyStatus(Ascii.ToUpperInPlace(sourceBytes, out processed), processed); + // InPlace(char) + VerifyStatus(Ascii.ToLowerInPlace(sourceChars.ToCharArray(), out processed), processed); + VerifyStatus(Ascii.ToUpperInPlace(sourceChars.ToCharArray(), out processed), processed); + + static void VerifyStatus(OperationStatus status, int written) + { + Assert.Equal(OperationStatus.InvalidData, status); + Assert.Equal(0, written); + } + } + + public static IEnumerable MultipleValidCharacterConversion_Arguments + { + get + { + yield return new object[] { "", "", "" }; + yield return new object[] { "Hello", "hello", "HELLO" }; + yield return new object[] { "\rHello\n", "\rhello\n", "\rHELLO\n" }; + yield return new object[] { "\0xyz\0", "\0xyz\0", "\0XYZ\0" }; + yield return new object[] { "\0XYZ\0", "\0xyz\0", "\0XYZ\0" }; + yield return new object[] { "AbCdEFgHIJkLmNoPQRStUVwXyZ", "abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZ" }; + + // exercise all possible code paths + for (int i = 1; i <= MaxValidAsciiChar; i++) + { + char expectedLower = char.IsBetween((char)i, 'A', 'Z') ? (char)(i - 'A' + 'a') : (char)i; + char expectedUpper = char.IsBetween((char)i, 'a', 'z') ? 
(char)(i + 'A' - 'a') : (char)i; + + yield return new object[] { new string((char)i, i), new string(expectedLower, i), new string(expectedUpper, i) }; + } + } + } + + [Theory] + [MemberData(nameof(MultipleValidCharacterConversion_Arguments))] + public static void MultipleValidCharacterConversion(string sourceChars, string expectedLowerChars, string expectedUpperChars) + { + Assert.Equal(sourceChars.Length, expectedLowerChars.Length); + Assert.Equal(expectedLowerChars.Length, expectedUpperChars.Length); + + byte[] sourceBytes = Encoding.ASCII.GetBytes(sourceChars); + byte[] expectedLowerBytes = Encoding.ASCII.GetBytes(expectedLowerChars); + byte[] expectedUpperBytes = Encoding.ASCII.GetBytes(expectedUpperChars); + char[] destinationChars = new char[expectedLowerChars.Length]; + byte[] destinationBytes = new byte[expectedLowerChars.Length]; + + // char -> char + VerifyStatus(Ascii.ToLower(sourceChars, destinationChars, out int written), expectedLowerChars, destinationChars, written); + VerifyStatus(Ascii.ToUpper(sourceChars, destinationChars, out written), expectedUpperChars, destinationChars, written); + // char -> byte + VerifyStatus(Ascii.ToLower(sourceChars, destinationBytes, out written), expectedLowerBytes, destinationBytes, written); + VerifyStatus(Ascii.ToUpper(sourceChars, destinationBytes, out written), expectedUpperBytes, destinationBytes, written); + // byte -> byte + VerifyStatus(Ascii.ToLower(sourceBytes, destinationBytes, out written), expectedLowerBytes, destinationBytes, written); + VerifyStatus(Ascii.ToUpper(sourceBytes, destinationBytes, out written), expectedUpperBytes, destinationBytes, written); + // byte -> char + VerifyStatus(Ascii.ToLower(sourceBytes, destinationChars, out written), expectedLowerChars, destinationChars, written); + VerifyStatus(Ascii.ToUpper(sourceBytes, destinationChars, out written), expectedUpperChars, destinationChars, written); + + // InPlace(byte) + byte[] sourceBytesCopy = sourceBytes.ToArray(); + 
VerifyStatus(Ascii.ToLowerInPlace(sourceBytesCopy, out int processed), expectedLowerBytes, sourceBytesCopy, processed); + sourceBytesCopy = sourceBytes.ToArray(); + VerifyStatus(Ascii.ToUpperInPlace(sourceBytesCopy, out processed), expectedUpperBytes, sourceBytesCopy, processed); + // InPlace(char) + char[] sourceCharsCopy = sourceChars.ToCharArray(); + VerifyStatus(Ascii.ToLowerInPlace(sourceCharsCopy, out processed), expectedLowerChars.ToCharArray(), sourceCharsCopy, processed); + sourceCharsCopy = sourceChars.ToCharArray(); + VerifyStatus(Ascii.ToUpperInPlace(sourceCharsCopy, out processed), expectedUpperChars.ToCharArray(), sourceCharsCopy, processed); + + static void VerifyStatus(OperationStatus status, ReadOnlySpan expected, ReadOnlySpan actual, int written) + { + Assert.Equal(OperationStatus.Done, status); + Assert.Equal(expected.Length, written); + Assert.Equal(expected.ToArray(), actual.ToArray()); + } + } + + [Theory] + [InlineData("Hello", 4, "hell", "HELL")] + [InlineData(" AbC ", 3, " ab", " AB")] + public static void DestinationTooSmall(string sourceChars, int destinationSize, string expectedLowerChars, string expectedUpperChars) + { + Assert.NotEqual(sourceChars.Length, destinationSize); + Assert.Equal(destinationSize, expectedLowerChars.Length); + Assert.Equal(expectedLowerChars.Length, expectedUpperChars.Length); + + byte[] sourceBytes = Encoding.ASCII.GetBytes(sourceChars); + byte[] expectedLowerBytes = Encoding.ASCII.GetBytes(expectedLowerChars); + byte[] expectedUpperBytes = Encoding.ASCII.GetBytes(expectedUpperChars); + char[] destinationChars = new char[destinationSize]; + byte[] destinationBytes = new byte[destinationSize]; + + // char -> char + Verify(Ascii.ToLower(sourceChars, destinationChars, out int written), expectedLowerChars, destinationChars, written); + Verify(Ascii.ToUpper(sourceChars, destinationChars, out written), expectedUpperChars, destinationChars, written); + // char -> byte + Verify(Ascii.ToLower(sourceChars, 
destinationBytes, out written), expectedLowerBytes, destinationBytes, written); + Verify(Ascii.ToUpper(sourceChars, destinationBytes, out written), expectedUpperBytes, destinationBytes, written); + // byte -> byte + Verify(Ascii.ToLower(sourceBytes, destinationBytes, out written), expectedLowerBytes, destinationBytes, written); + Verify(Ascii.ToUpper(sourceBytes, destinationBytes, out written), expectedUpperBytes, destinationBytes, written); + // byte -> char + Verify(Ascii.ToLower(sourceBytes, destinationChars, out written), expectedLowerChars, destinationChars, written); + Verify(Ascii.ToUpper(sourceBytes, destinationChars, out written), expectedUpperChars, destinationChars, written); + + static void Verify(OperationStatus status, ReadOnlySpan expected, ReadOnlySpan actual, int written) + { + Assert.Equal(OperationStatus.DestinationTooSmall, status); + Assert.Equal(actual.Length, written); + Assert.Equal(expected.ToArray(), actual.ToArray()); + } + } + } +} diff --git a/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs b/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs new file mode 100644 index 00000000000000..80a70042abb804 --- /dev/null +++ b/src/libraries/System.Text.Encoding/tests/Ascii/FromUtf16Tests.cs @@ -0,0 +1,103 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Buffers; +using System.Security.Cryptography; +using Xunit; + +namespace System.Text.Tests +{ + public static class FromUtf16Tests + { + [Fact] + public static unsafe void EmptyInputs() + { + Assert.Equal(OperationStatus.Done, Ascii.FromUtf16(ReadOnlySpan.Empty, Span.Empty, out int bytesWritten)); + Assert.Equal(0, bytesWritten); + } + + [Fact] + public static void AllAsciiInput() + { + using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); + using BoundedMemory asciiMem = BoundedMemory.Allocate(128); + + // Fill source with 00 .. 7F. 
+ + Span utf16Span = utf16Mem.Span; + for (int i = 0; i < utf16Span.Length; i++) + { + utf16Span[i] = (char)i; + } + utf16Mem.MakeReadonly(); + + // We'll write to the ASCII span. + // We test with a variety of span lengths to test alignment and fallthrough code paths. + + Span asciiSpan = asciiMem.Span; + + for (int i = 0; i < utf16Span.Length; i++) + { + asciiSpan.Clear(); // remove any data from previous iteration + + // First, validate that the workhorse saw the incoming data as all-ASCII. + Assert.Equal(OperationStatus.Done, Ascii.FromUtf16(utf16Span.Slice(i), asciiSpan.Slice(i), out int bytesWritten)); + Assert.Equal(128 - i, bytesWritten); + + // Then, validate that the data was transcoded properly. + + for (int j = i; j < 128; j++) + { + Assert.Equal((ushort)utf16Span[j], (ushort)asciiSpan[j]); + } + } + } + + [Fact] + public static void SomeNonAsciiInput() + { + using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); + using BoundedMemory asciiMem = BoundedMemory.Allocate(128); + + // Fill source with 00 .. 7F. + + Span utf16Span = utf16Mem.Span; + for (int i = 0; i < utf16Span.Length; i++) + { + utf16Span[i] = (char)i; + } + + // We'll write to the ASCII span. + + Span asciiSpan = asciiMem.Span; + + for (int i = utf16Span.Length - 1; i >= 0; i--) + { + RandomNumberGenerator.Fill(asciiSpan); // fill with garbage + + // First, keep track of the garbage we wrote to the destination. + // We want to ensure it wasn't overwritten. + + byte[] expectedTrailingData = asciiSpan.Slice(i).ToArray(); + + // Then, set the desired byte as non-ASCII, then check that the workhorse + // correctly saw the data as non-ASCII. + + utf16Span[i] = '\u0123'; // use U+0123 instead of U+0080 since it catches inappropriate pmovmskb usage + Assert.Equal(OperationStatus.InvalidData, Ascii.FromUtf16(utf16Span, asciiSpan, out int bytesWritten)); + Assert.Equal(i, bytesWritten); + + // Next, validate that the ASCII data was transcoded properly. 
+ + for (int j = 0; j < i; j++) + { + Assert.Equal((ushort)utf16Span[j], (ushort)asciiSpan[j]); + } + + // Finally, validate that the trailing data wasn't overwritten with non-ASCII data. + + Assert.Equal(expectedTrailingData, asciiSpan.Slice(i).ToArray()); + } + } + } +} diff --git a/src/libraries/System.Text.Encoding/tests/Ascii/IsValidByteTests.cs b/src/libraries/System.Text.Encoding/tests/Ascii/IsValidByteTests.cs new file mode 100644 index 00000000000000..07858beb11ca39 --- /dev/null +++ b/src/libraries/System.Text.Encoding/tests/Ascii/IsValidByteTests.cs @@ -0,0 +1,150 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Buffers; +using System.Collections.Generic; +using System.Linq; +using System.Numerics; +using System.Runtime.Intrinsics; +using Xunit; + +namespace System.Text.Tests +{ + public static class IsValidByteTests + { + private static byte GetNextValidAsciiByte() => (byte)Random.Shared.Next(0, 127 + 1); + private static byte GetNextInvalidAsciiByte() => (byte)Random.Shared.Next(128, 255 + 1); + + [Fact] + public static void EmptyInput_ReturnsTrue() + { + Assert.True(Ascii.IsValid(ReadOnlySpan.Empty)); + } + + private static int[] BufferLengths = new[] { + 1, + Vector128.Count - 1, + Vector128.Count, + Vector128.Count + 1, + Vector256.Count - 1, + Vector256.Count, + Vector256.Count + 1 }; + + public static IEnumerable AsciiOnlyBuffers + { + get + { + yield return new object[] { new byte[] { GetNextValidAsciiByte() } }; + + foreach (int length in BufferLengths) + { + yield return new object[] { Enumerable.Repeat(GetNextValidAsciiByte(), length).ToArray() }; + } + } + } + + [Theory] + [MemberData(nameof(AsciiOnlyBuffers))] + public static void AllAscii_ReturnsTrue(byte[] buffer) + { + Assert.True(Ascii.IsValid(buffer)); + Assert.All(buffer, character => Assert.True(Ascii.IsValid(character))); + } + + public static IEnumerable 
ContainingNonAsciiCharactersBuffers + { + get + { + foreach (int length in BufferLengths) + { + for (int index = 0; index < length; index++) + { + yield return new object[] { index, Create(length, index) }; + } + } + + static byte[] Create(int length, int index) + { + byte[] buffer = Enumerable.Repeat(GetNextValidAsciiByte(), length).ToArray(); + buffer[index] = GetNextInvalidAsciiByte(); + return buffer; + } + } + } + + [Theory] + [MemberData(nameof(ContainingNonAsciiCharactersBuffers))] + public static void NonAsciiAtGivenIndex(int nonAsciiIndex, byte[] buffer) + { + Assert.False(Ascii.IsValid(buffer)); + + for (int i = 0; i < buffer.Length; i++) + { + Assert.Equal(i != nonAsciiIndex, Ascii.IsValid(buffer[i])); + } + } + + [Fact] + public static void Vector128InnerLoop() + { + // The purpose of this test is to make sure we're identifying the correct + // vector (of the two that we're reading simultaneously) when performing + // the final ASCII drain at the end of the method once we've broken out + // of the inner loop. + + using (BoundedMemory mem = BoundedMemory.Allocate(1024)) + { + Span bytes = mem.Span; + + for (int i = 0; i < bytes.Length; i++) + { + bytes[i] &= 0x7F; // make sure each byte (of the pre-populated random data) is ASCII + } + + // Two vectors have offsets 0 .. 31. We'll go backward to avoid having to + // re-clear the vector every time. + + for (int i = 2 * Vector128.Count - 1; i >= 0; i--) + { + bytes[100 + i * 13] = 0x80; // 13 is relatively prime to 32, so it ensures all possible positions are hit + Assert.False(Ascii.IsValid(bytes)); + } + } + } + + [Fact] + public static void Boundaries() + { + // The purpose of this test is to make sure we're hitting all of the vectorized + // and draining logic correctly both in the SSE2 and in the non-SSE2 enlightened + // code paths. We shouldn't be reading beyond the boundaries we were given. 
+ + // The 5 * Vector test should make sure that we're exercising all possible + // code paths across both implementations. + using (BoundedMemory mem = BoundedMemory.Allocate(5 * Vector.Count)) + { + Span bytes = mem.Span; + + // First, try it with all-ASCII buffers. + + for (int i = 0; i < bytes.Length; i++) + { + bytes[i] &= 0x7F; // make sure each byte (of the pre-populated random data) is ASCII + } + + for (int i = bytes.Length; i >= 0; i--) + { + Assert.True(Ascii.IsValid(bytes.Slice(0, i))); + } + + // Then, try it with non-ASCII bytes. + + for (int i = bytes.Length; i >= 1; i--) + { + bytes[i - 1] = 0x80; // set non-ASCII + Assert.False(Ascii.IsValid(bytes.Slice(0, i))); + } + } + } + } +} diff --git a/src/libraries/System.Text.Encoding/tests/Ascii/IsValidCharTests.cs b/src/libraries/System.Text.Encoding/tests/Ascii/IsValidCharTests.cs new file mode 100644 index 00000000000000..fced8f2e873275 --- /dev/null +++ b/src/libraries/System.Text.Encoding/tests/Ascii/IsValidCharTests.cs @@ -0,0 +1,158 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Buffers; +using System.Collections.Generic; +using System.Linq; +using System.Numerics; +using System.Runtime.Intrinsics; +using Xunit; + +namespace System.Text.Tests +{ + public static class IsValidCharTests + { + private static char GetNextValidAsciiChar() => (char)Random.Shared.Next(0, 127 + 1); + private static char GetNextInvalidAsciiChar() => (char)Random.Shared.Next(128, ushort.MaxValue + 1); + + [Fact] + public static void EmptyInput_ReturnsTrue() + { + Assert.True(Ascii.IsValid(ReadOnlySpan.Empty)); + } + + private static int[] BufferLengths = new[] { + 1, + Vector128.Count - 1, + Vector128.Count, + Vector128.Count + 1, + Vector256.Count - 1, + Vector256.Count, + Vector256.Count + 1 }; + + public static IEnumerable AsciiOnlyBuffers + { + get + { + yield return new object[] { new char[] { GetNextValidAsciiChar() } }; + + foreach (int length in BufferLengths) + { + yield return new object[] { Enumerable.Repeat(GetNextValidAsciiChar(), length).ToArray() }; + } + } + } + + [Theory] + [MemberData(nameof(AsciiOnlyBuffers))] + public static void AllAscii_ReturnsTrue(char[] buffer) + { + Assert.True(Ascii.IsValid(buffer)); + Assert.All(buffer, character => Assert.True(Ascii.IsValid(character))); + } + + public static IEnumerable ContainingNonAsciiCharactersBuffers + { + get + { + foreach (int length in BufferLengths) + { + for (int index = 0; index < length; index++) + { + yield return new object[] { index, Create(length, index) }; + } + } + + static char[] Create(int length, int index) + { + char[] buffer = Enumerable.Repeat(GetNextValidAsciiChar(), length).ToArray(); + buffer[index] = GetNextInvalidAsciiChar(); + return buffer; + } + } + } + + [Theory] + [MemberData(nameof(ContainingNonAsciiCharactersBuffers))] + public static void NonAsciiAtGivenIndex(int nonAsciiIndex, char[] buffer) + { + Assert.False(Ascii.IsValid(buffer)); + + for (int i = 0; i < buffer.Length; i++) + { + Assert.Equal(i != nonAsciiIndex, Ascii.IsValid(buffer[i])); + } + } + 
+ [Fact] + public static void Vector128InnerLoop() + { + // The purpose of this test is to make sure we're identifying the correct + // vector (of the two that we're reading simultaneously) when performing + // the final ASCII drain at the end of the method once we've broken out + // of the inner loop. + // + // Use U+0123 instead of U+0080 for this test because if our implementation + // uses pminuw / pmovmskb incorrectly, U+0123 will incorrectly show up as ASCII, + // causing our test to produce a false negative. + + using (BoundedMemory mem = BoundedMemory.Allocate(1024)) + { + Span chars = mem.Span; + + for (int i = 0; i < chars.Length; i++) + { + chars[i] &= '\u007F'; // make sure each char (of the pre-populated random data) is ASCII + } + + // Two vectors have offsets 0 .. 31. We'll go backward to avoid having to + // re-clear the vector every time. + + for (int i = 2 * Vector128.Count - 1; i >= 0; i--) + { + chars[100 + i * 13] = '\u0123'; // 13 is relatively prime to 32, so it ensures all possible positions are hit + Assert.False(Ascii.IsValid(chars)); + } + } + } + + [Fact] + public static void Boundaries() + { + // The purpose of this test is to make sure we're hitting all of the vectorized + // and draining logic correctly both in the SSE2 and in the non-SSE2 enlightened + // code paths. We shouldn't be reading beyond the boundaries we were given. + // + // The 5 * Vector test should make sure that we're exercising all possible + // code paths across both implementations. The sizeof(char) is because we're + // specifying element count, but underlying implementation reinterpret casts to bytes. + // + // Use U+0123 instead of U+0080 for this test because if our implementation + // uses pminuw / pmovmskb incorrectly, U+0123 will incorrectly show up as ASCII, + // causing our test to produce a false negative. 
+ + using (BoundedMemory mem = BoundedMemory.Allocate(5 * Vector.Count / sizeof(char))) + { + Span chars = mem.Span; + + for (int i = 0; i < chars.Length; i++) + { + chars[i] &= '\u007F'; // make sure each char (of the pre-populated random data) is ASCII + } + + for (int i = chars.Length; i >= 0; i--) + { + Assert.True(Ascii.IsValid(chars.Slice(0, i))); + } + + // Then, try it with non-ASCII chars. + + for (int i = chars.Length; i >= 1; i--) + { + chars[i - 1] = '\u0123'; // set non-ASCII + Assert.False(Ascii.IsValid(chars.Slice(0, i))); + } + } + } + } +} diff --git a/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs b/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs new file mode 100644 index 00000000000000..be9c71e14fbb82 --- /dev/null +++ b/src/libraries/System.Text.Encoding/tests/Ascii/ToUtf16Tests.cs @@ -0,0 +1,106 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Buffers; +using System.Runtime.InteropServices; +using System.Security.Cryptography; +using Xunit; + +namespace System.Text.Tests +{ + public static class ToUtf16Tests + { + [Fact] + public static void EmptyInputs() + { + Assert.Equal(OperationStatus.Done, Ascii.ToUtf16(ReadOnlySpan.Empty, Span.Empty, out int charsWritten)); + Assert.Equal(0, charsWritten); + } + + [Fact] + public static void AllAsciiInput() + { + using BoundedMemory asciiMem = BoundedMemory.Allocate(128); + using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); + + // Fill source with 00 .. 7F, then trap future writes. + + Span asciiSpan = asciiMem.Span; + for (int i = 0; i < asciiSpan.Length; i++) + { + asciiSpan[i] = (byte)i; + } + asciiMem.MakeReadonly(); + + // We'll write to the UTF-16 span. + // We test with a variety of span lengths to test alignment and fallthrough code paths. 
+ + Span utf16Span = utf16Mem.Span; + + for (int i = 0; i < asciiSpan.Length; i++) + { + utf16Span.Clear(); // remove any data from previous iteration + + // First, validate that the workhorse saw the incoming data as all-ASCII. + + Assert.Equal(OperationStatus.Done, Ascii.ToUtf16(asciiSpan.Slice(i), utf16Span.Slice(i), out int charsWritten)); + Assert.Equal(128 - i, charsWritten); + + // Then, validate that the data was transcoded properly. + + for (int j = i; j < 128; j++) + { + Assert.Equal((ushort)asciiSpan[j], (ushort)utf16Span[j]); + } + } + } + + [Fact] + public static void SomeNonAsciiInput() + { + using BoundedMemory asciiMem = BoundedMemory.Allocate(128); + using BoundedMemory utf16Mem = BoundedMemory.Allocate(128); + + // Fill source with 00 .. 7F. + + Span asciiSpan = asciiMem.Span; + for (int i = 0; i < asciiSpan.Length; i++) + { + asciiSpan[i] = (byte)i; + } + + // We'll write to the UTF-16 span. + + Span utf16Span = utf16Mem.Span; + + for (int i = asciiSpan.Length - 1; i >= 0; i--) + { + RandomNumberGenerator.Fill(MemoryMarshal.Cast(utf16Span)); // fill with garbage + + // First, keep track of the garbage we wrote to the destination. + // We want to ensure it wasn't overwritten. + + char[] expectedTrailingData = utf16Span.Slice(i).ToArray(); + + // Then, set the desired byte as non-ASCII, then check that the workhorse + // correctly saw the data as non-ASCII. + + asciiSpan[i] |= (byte)0x80; + + Assert.Equal(OperationStatus.InvalidData, Ascii.ToUtf16(asciiSpan, utf16Span, out int charsWritten)); + Assert.Equal(i, charsWritten); + + // Next, validate that the ASCII data was transcoded properly. + + for (int j = 0; j < i; j++) + { + Assert.Equal((ushort)asciiSpan[j], (ushort)utf16Span[j]); + } + + // Finally, validate that the trailing data wasn't overwritten with non-ASCII data. 
+ + Assert.Equal(expectedTrailingData, utf16Span.Slice(i).ToArray()); + } + } + } +} diff --git a/src/libraries/System.Text.Encoding/tests/Ascii/TrimTests.cs b/src/libraries/System.Text.Encoding/tests/Ascii/TrimTests.cs new file mode 100644 index 00000000000000..5873942d87a5dd --- /dev/null +++ b/src/libraries/System.Text.Encoding/tests/Ascii/TrimTests.cs @@ -0,0 +1,117 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using Xunit; + +namespace System.Text.Tests +{ + public static class TrimTests + { + [Fact] + public static void EmptyInput() + { + Assert.Equal(default(Range), Ascii.Trim(ReadOnlySpan.Empty)); + Assert.Equal(default(Range), Ascii.Trim(ReadOnlySpan.Empty)); + Assert.Equal(default(Range), Ascii.TrimStart(ReadOnlySpan.Empty)); + Assert.Equal(default(Range), Ascii.TrimStart(ReadOnlySpan.Empty)); + Assert.Equal(default(Range), Ascii.TrimEnd(ReadOnlySpan.Empty)); + Assert.Equal(default(Range), Ascii.TrimEnd(ReadOnlySpan.Empty)); + } + + [Theory] + [InlineData("1")] + [InlineData("abc")] + [InlineData("a\tb c\rd\ne")] + public static void NothingToTrimNonEmptyInput(string text) + { + ReadOnlySpan bytes = Encoding.ASCII.GetBytes(text); + + Range expected = 0..text.Length; + Assert.Equal(expected, Ascii.Trim(bytes)); + Assert.Equal(expected, Ascii.Trim(text)); + Assert.Equal(expected, Ascii.TrimStart(bytes)); + Assert.Equal(expected, Ascii.TrimStart(text)); + Assert.Equal(expected, Ascii.TrimEnd(bytes)); + Assert.Equal(expected, Ascii.TrimEnd(text)); + } + + [Theory] + [InlineData(" ")] + [InlineData("\t")] + [InlineData("\r")] + [InlineData("\n")] + [InlineData("\r\n")] + [InlineData(" \t\r\n ")] + [InlineData("\n \t \r")] + public static void OnlyWhitespaces(string text) + { + ReadOnlySpan bytes = Encoding.ASCII.GetBytes(text); + + Assert.Equal(text.Length..text.Length, Ascii.Trim(bytes)); + Assert.Equal(text.Length..text.Length, Ascii.Trim(text)); + 
Assert.Equal(text.Length..text.Length, Ascii.TrimStart(bytes)); + Assert.Equal(text.Length..text.Length, Ascii.TrimStart(text)); + // Special-case when the input contains all-whitespace data, since we want to + // return a zero-length slice at the *beginning* of the span, not the end of the span + Assert.Equal(0..0, Ascii.TrimEnd(bytes)); + Assert.Equal(0..0, Ascii.TrimEnd(text)); + } + + [Theory] + [InlineData(" a", 1)] + [InlineData("\tb", 1)] + [InlineData("\rc", 1)] + [InlineData("\nd", 1)] + [InlineData(" \t\r\ne", 4)] + [InlineData(" \t\r\n\n\r\t f", 8)] + public static void StartingWithWhitespace(string text, int leadingWhitespaceCount) + { + ReadOnlySpan bytes = Encoding.ASCII.GetBytes(text); + + Assert.Equal(leadingWhitespaceCount..text.Length, Ascii.TrimStart(bytes)); + Assert.Equal(leadingWhitespaceCount..text.Length, Ascii.TrimStart(text)); + Assert.Equal(leadingWhitespaceCount..text.Length, Ascii.Trim(bytes)); + Assert.Equal(leadingWhitespaceCount..text.Length, Ascii.Trim(text)); + Assert.Equal(0..text.Length, Ascii.TrimEnd(bytes)); + Assert.Equal(0..text.Length, Ascii.TrimEnd(text)); + } + + [Theory] + [InlineData("a ", 1)] + [InlineData("b\t", 1)] + [InlineData("c\r", 1)] + [InlineData("d\n", 1)] + [InlineData("e \t\r\n", 4)] + [InlineData("f \t\r\n\n\r\t ", 8)] + public static void EndingWithWhitespace(string text, int trailingWhitespaceCount) + { + ReadOnlySpan bytes = Encoding.ASCII.GetBytes(text); + + Assert.Equal(0..(text.Length - trailingWhitespaceCount), Ascii.TrimEnd(bytes)); + Assert.Equal(0..(text.Length - trailingWhitespaceCount), Ascii.TrimEnd(text)); + Assert.Equal(0..(text.Length - trailingWhitespaceCount), Ascii.Trim(bytes)); + Assert.Equal(0..(text.Length - trailingWhitespaceCount), Ascii.Trim(text)); + Assert.Equal(0..text.Length, Ascii.TrimStart(bytes)); + Assert.Equal(0..text.Length, Ascii.TrimStart(text)); + } + + [Theory] + [InlineData(" a ", 1, 1)] + [InlineData("\tb\t", 1, 1)] + [InlineData("\rc\r", 1, 1)] + 
[InlineData("\nd\n", 1, 1)] + [InlineData(" \t\r\ne \t\r\n", 4, 4)] + [InlineData(" \t\r\n\n\r\t f \t\r\n\n\r\t ", 8, 8)] + public static void StartingAndEndingWithWhitespace(string text, int leadingWhitespaceCount, int trailingWhitespaceCount) + { + ReadOnlySpan bytes = Encoding.ASCII.GetBytes(text); + + Assert.Equal(leadingWhitespaceCount..text.Length, Ascii.TrimStart(bytes)); + Assert.Equal(leadingWhitespaceCount..text.Length, Ascii.TrimStart(text)); + Assert.Equal(leadingWhitespaceCount..(text.Length - trailingWhitespaceCount), Ascii.Trim(bytes)); + Assert.Equal(leadingWhitespaceCount..(text.Length - trailingWhitespaceCount), Ascii.Trim(text)); + Assert.Equal(0..(text.Length - trailingWhitespaceCount), Ascii.TrimEnd(bytes)); + Assert.Equal(0..(text.Length - trailingWhitespaceCount), Ascii.TrimEnd(text)); + } + } +} diff --git a/src/libraries/System.Text.Encoding/tests/System.Text.Encoding.Tests.csproj b/src/libraries/System.Text.Encoding/tests/System.Text.Encoding.Tests.csproj index 2e59a6b32f4b06..305e53a58290dd 100644 --- a/src/libraries/System.Text.Encoding/tests/System.Text.Encoding.Tests.csproj +++ b/src/libraries/System.Text.Encoding/tests/System.Text.Encoding.Tests.csproj @@ -9,6 +9,12 @@ true + + + + + + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index ea47cb5b4dc3f6..02a61612e6f3cc 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -979,8 +979,9 @@ public static bool ParticipatesInCaseConversion(ReadOnlySpan s) } /// Gets whether the specified span contains only ASCII. 
- public static bool IsAscii(ReadOnlySpan s) // TODO https://github.com/dotnet/runtime/issues/28230: Replace once Ascii is available + public static bool IsAscii(ReadOnlySpan s) { +#if REGEXGENERATOR foreach (char c in s) { if (c >= 128) @@ -990,6 +991,9 @@ public static bool IsAscii(ReadOnlySpan s) // TODO https://github.com/dotn } return true; +#else + return Ascii.IsValid(s); +#endif } /// Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.