From f21116f54b6d37a667d3eecbe0cf35cb14f2bdff Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Thu, 26 May 2022 15:38:48 -0700 Subject: [PATCH 1/4] Improve Encoding.UTF8.GetMaxByte/CharCount perf Also fixes some potential integer overflows in callers --- .../src/System/Text/ValueUtf8Converter.cs | 2 +- .../System/Runtime/InteropServices/Marshal.cs | 4 +- .../src/System/Text/UTF8Encoding.Sealed.cs | 44 +++++++++++++++++++ .../Xml/Xsl/XsltOld/SequentialOutput.cs | 9 +--- .../UTF8EncodingGetMaxByteCount.cs | 14 ++++++ .../UTF8EncodingGetMaxCharCount.cs | 12 +++++ 6 files changed, 75 insertions(+), 10 deletions(-) diff --git a/src/libraries/Common/src/System/Text/ValueUtf8Converter.cs b/src/libraries/Common/src/System/Text/ValueUtf8Converter.cs index 6765ecbaad6a6d..f589aec0cac222 100644 --- a/src/libraries/Common/src/System/Text/ValueUtf8Converter.cs +++ b/src/libraries/Common/src/System/Text/ValueUtf8Converter.cs @@ -23,7 +23,7 @@ public ValueUtf8Converter(Span initialBuffer) public Span ConvertAndTerminateString(ReadOnlySpan value) { - int maxSize = Encoding.UTF8.GetMaxByteCount(value.Length) + 1; + int maxSize = checked(Encoding.UTF8.GetMaxByteCount(value.Length) + 1); if (_bytes.Length < maxSize) { Dispose(); diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/InteropServices/Marshal.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/InteropServices/Marshal.cs index bbc79c19effdc2..907fa5c9b7ed3c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/InteropServices/Marshal.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/InteropServices/Marshal.cs @@ -993,7 +993,7 @@ private static unsafe IntPtr StringToHGlobalUTF8(string? s) int nb = Encoding.UTF8.GetMaxByteCount(s.Length); - IntPtr ptr = AllocHGlobal(nb + 1); + IntPtr ptr = AllocHGlobal(checked(nb + 1)); int nbWritten; byte* pbMem = (byte*)ptr; @@ -1040,7 +1040,7 @@ public static unsafe IntPtr StringToCoTaskMemUTF8(string? s) int nb = Encoding.UTF8.GetMaxByteCount(s.Length); - IntPtr ptr = AllocCoTaskMem(nb + 1); + IntPtr ptr = AllocCoTaskMem(checked(nb + 1)); int nbWritten; byte* pbMem = (byte*)ptr; diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.Sealed.cs b/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.Sealed.cs index 8a0041028df210..c76c5782d4940c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.Sealed.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.Sealed.cs @@ -69,6 +69,50 @@ private unsafe byte[] GetBytesForSmallInput(string s) return new Span(ref *pDestination, bytesWritten).ToArray(); // this overload of Span ctor doesn't validate length } + public override int GetMaxByteCount(int charCount) + { + // This is a specialization of UTF8Encoding.GetMaxByteCount + // with the assumption that the default replacement fallback + // emits 3 fallback bytes ([ EF BF BD ] = '\uFFFD') per + // malformed input char in the worst case. + + if ((uint)charCount > (int.MaxValue / MaxUtf8BytesPerChar) - 1) + { + // Move the throw out of the hot path to allow for inlining. + ThrowArgumentException(charCount); + static void ThrowArgumentException(int charCount) + { + throw new ArgumentOutOfRangeException( + paramName: nameof(charCount), + message: (charCount < 0) ? SR.ArgumentOutOfRange_NeedNonNegNum : SR.ArgumentOutOfRange_GetByteCountOverflow); + } + } + + return (charCount * MaxUtf8BytesPerChar) + MaxUtf8BytesPerChar; + } + + public override int GetMaxCharCount(int byteCount) + { + // This is a specialization of UTF8Encoding.GetMaxCharCount + // with the assumption that the default replacement fallback + // emits one fallback char ('\uFFFD') per malformed input + // byte in the worst case. + + if ((uint)byteCount > int.MaxValue - 1) + { + // Move the throw out of the hot path to allow for inlining. + ThrowArgumentException(byteCount); + static void ThrowArgumentException(int byteCount) + { + throw new ArgumentOutOfRangeException( + paramName: nameof(byteCount), + message: (byteCount < 0) ? SR.ArgumentOutOfRange_NeedNonNegNum : SR.ArgumentOutOfRange_GetCharCountOverflow); + } + } + + return byteCount + 1; + } + public override string GetString(byte[] bytes) { // This method is short and can be inlined, meaning that the null check below diff --git a/src/libraries/System.Private.Xml/src/System/Xml/Xsl/XsltOld/SequentialOutput.cs b/src/libraries/System.Private.Xml/src/System/Xml/Xsl/XsltOld/SequentialOutput.cs index 4ba7cf133088ec..2788536aa85e21 100644 --- a/src/libraries/System.Private.Xml/src/System/Xml/Xsl/XsltOld/SequentialOutput.cs +++ b/src/libraries/System.Private.Xml/src/System/Xml/Xsl/XsltOld/SequentialOutput.cs @@ -72,7 +72,6 @@ internal abstract class SequentialOutput : IRecordOutput // Uri Escaping: private byte[]? _byteBuffer; - private Encoding? _utf8Encoding; [MemberNotNull(nameof(_output))] private void CacheOuptutProps(XsltOutput output) @@ -607,12 +606,8 @@ private void WriteHtmlUri(string value) default: if (127 < ch) { - if (_utf8Encoding == null) - { - _utf8Encoding = Encoding.UTF8; - _byteBuffer = new byte[_utf8Encoding.GetMaxByteCount(1)]; - } - int bytes = _utf8Encoding.GetBytes(value, i - 1, 1, _byteBuffer!, 0); + _byteBuffer = new byte[Encoding.UTF8.GetMaxByteCount(1)]; + int bytes = Encoding.UTF8.GetBytes(value, i - 1, 1, _byteBuffer!, 0); for (int j = 0; j < bytes; j++) { Write("%"); diff --git a/src/libraries/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingGetMaxByteCount.cs b/src/libraries/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingGetMaxByteCount.cs index 6c4c2da47acb8f..7379fa4ae8b6b6 100644 --- a/src/libraries/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingGetMaxByteCount.cs +++ b/src/libraries/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingGetMaxByteCount.cs @@ -15,10 +15,24 @@ public class UTF8EncodingGetMaxByteCount public void GetMaxByteCount(int charCount) { int expected = (charCount + 1) * 3; + Assert.Equal(expected, Encoding.UTF8.GetMaxByteCount(charCount)); Assert.Equal(expected, new UTF8Encoding(true, true).GetMaxByteCount(charCount)); Assert.Equal(expected, new UTF8Encoding(true, false).GetMaxByteCount(charCount)); Assert.Equal(expected, new UTF8Encoding(false, true).GetMaxByteCount(charCount)); Assert.Equal(expected, new UTF8Encoding(false, false).GetMaxByteCount(charCount)); } + + [Theory] + [InlineData(-1)] + [InlineData(int.MinValue)] + [InlineData(-1_000_000_000)] + [InlineData(-1_300_000_000)] // yields positive result when *3 + [InlineData(int.MaxValue / 3)] + [InlineData(int.MaxValue)] + public void GetMaxByteCount_NegativeTests(int charCount) + { + Assert.Throws(nameof(charCount), () => Encoding.UTF8.GetMaxByteCount(charCount)); + Assert.Throws(nameof(charCount), () => new UTF8Encoding().GetMaxByteCount(charCount)); + } } } diff --git a/src/libraries/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingGetMaxCharCount.cs b/src/libraries/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingGetMaxCharCount.cs index 7450d4c698e56e..3b043ae2859bfc 100644 --- a/src/libraries/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingGetMaxCharCount.cs +++ b/src/libraries/System.Text.Encoding/tests/UTF8Encoding/UTF8EncodingGetMaxCharCount.cs @@ -15,10 +15,22 @@ public class UTF8EncodingGetMaxCharCount public void GetMaxCharCount(int byteCount) { int expected = byteCount + 1; + Assert.Equal(expected, Encoding.UTF8.GetMaxCharCount(byteCount)); Assert.Equal(expected, new UTF8Encoding(true, true).GetMaxCharCount(byteCount)); Assert.Equal(expected, new UTF8Encoding(true, false).GetMaxCharCount(byteCount)); Assert.Equal(expected, new UTF8Encoding(false, true).GetMaxCharCount(byteCount)); Assert.Equal(expected, new UTF8Encoding(false, false).GetMaxCharCount(byteCount)); } + + [Theory] + [InlineData(-1)] + [InlineData(int.MinValue)] + [InlineData(-1_000_000_000)] + [InlineData(int.MaxValue)] + public void GetMaxCharCount_NegativeTests(int byteCount) + { + Assert.Throws(nameof(byteCount), () => Encoding.UTF8.GetMaxCharCount(byteCount)); + Assert.Throws(nameof(byteCount), () => new UTF8Encoding().GetMaxCharCount(byteCount)); + } } } From f9eecc88171ebed08c84aaebcfece0eb12edbf5e Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Fri, 27 May 2022 13:18:10 -0700 Subject: [PATCH 2/4] Undo improper optimization --- .../src/System/Xml/Xsl/XsltOld/SequentialOutput.cs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Private.Xml/src/System/Xml/Xsl/XsltOld/SequentialOutput.cs b/src/libraries/System.Private.Xml/src/System/Xml/Xsl/XsltOld/SequentialOutput.cs index 2788536aa85e21..4ba7cf133088ec 100644 --- a/src/libraries/System.Private.Xml/src/System/Xml/Xsl/XsltOld/SequentialOutput.cs +++ b/src/libraries/System.Private.Xml/src/System/Xml/Xsl/XsltOld/SequentialOutput.cs @@ -72,6 +72,7 @@ internal abstract class SequentialOutput : IRecordOutput // Uri Escaping: private byte[]? _byteBuffer; + private Encoding? _utf8Encoding; [MemberNotNull(nameof(_output))] private void CacheOuptutProps(XsltOutput output) @@ -606,8 +607,12 @@ private void WriteHtmlUri(string value) default: if (127 < ch) { - _byteBuffer = new byte[Encoding.UTF8.GetMaxByteCount(1)]; - int bytes = Encoding.UTF8.GetBytes(value, i - 1, 1, _byteBuffer!, 0); + if (_utf8Encoding == null) + { + _utf8Encoding = Encoding.UTF8; + _byteBuffer = new byte[_utf8Encoding.GetMaxByteCount(1)]; + } + int bytes = _utf8Encoding.GetBytes(value, i - 1, 1, _byteBuffer!, 0); for (int j = 0; j < bytes; j++) { Write("%"); From b34ad3ffbee1517197b9bfbb498e7878728095aa Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Tue, 31 May 2022 15:57:51 -0700 Subject: [PATCH 3/4] Update comment in UTF8Encoding --- .../src/System/Text/UTF8Encoding.cs | 37 ++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.cs index 88ec661423ee58..7446d529d07e06 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.cs @@ -761,8 +761,20 @@ public override int GetMaxByteCount(int charCount) throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum); - // Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback - long byteCount = (long)charCount + 1; + // GetMaxByteCount assumes that the caller might have a stateful Encoder instance. If the + // Encoder instance already has a captured high surrogate, then one of two things will + // happen: + // + // - The next char is a low surrogate, at which point the two chars together result in 4 + // UTF-8 bytes in the output; or + // - The next char is not a low surrogate (or the input reaches EOF), at which point the + // standalone captured surrogate will go through the fallback routine. + // + // The second case is the worst-case scenario for expansion, so it's what we use for any + // pessimistic "max byte count" calculation: assume there's a captured surrogate and that + // it must fall back. + + long byteCount = (long)charCount + 1; // +1 to account for captured surrogate, per above if (EncoderFallback.MaxCharCount > 1) byteCount *= EncoderFallback.MaxCharCount; @@ -781,9 +793,24 @@ public override int GetMaxCharCount(int byteCount) if (byteCount < 0) throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum); - - // Figure out our length, 1 char per input byte + 1 char if 1st byte is last byte of 4 byte surrogate pair - long charCount = ((long)byteCount + 1); + + // GetMaxCharCount assumes that the caller might have a stateful Decoder instance. If the + // Decoder instance already has a captured partial UTF-8 subsequence, then one of two + // thngs will happen: + // + // - The next byte(s) won't complete the subsequence but will instead be consumed into + // the Decoder's internal state, resulting in no character output; or + // - The next byte(s) will complete the subsequence, and the previously captured + // subsequence and the next byte(s) will result in 1 - 2 chars output; or + // - The captured subsequence will be treated as a singular ill-formed subsequence, at + // which point the captured subsequence will go through the fallback routine. + // (See The Unicode Standard, Sec. 3.9 for more information on this.) + // + // The third case is the worst-case scenario for expansion, since it means 0 bytes of + // new input could cause any existing captured state to expand via fallback. So it's + // what we'll use for any pessimistic "max char count" calculation. + + long charCount = ((long)byteCount + 1); // +1 to account for captured subsequence, as above // Non-shortest form would fall back, so get max count from fallback. // So would 11... followed by 11..., so you could fall back every byte From 5e4dece781744c366e1df83db0a0dbc3947764d4 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Tue, 31 May 2022 16:08:08 -0700 Subject: [PATCH 4/4] Fix whitespace --- .../System.Private.CoreLib/src/System/Text/UTF8Encoding.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.cs index 7446d529d07e06..7a2c577479d266 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.cs @@ -793,7 +793,7 @@ public override int GetMaxCharCount(int byteCount) if (byteCount < 0) throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum); - + // GetMaxCharCount assumes that the caller might have a stateful Decoder instance. If the // Decoder instance already has a captured partial UTF-8 subsequence, then one of two // thngs will happen: