Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
<Compile Include="System\Text\Unicode\UnicodeRanges.generated.cs" />
</ItemGroup>
<ItemGroup Condition="'$(TargetFramework)' == '$(NetCoreAppCurrent)' or '$(TargetFramework)' == 'netcoreapp3.0'">
<Compile Include="System\Text\Encodings\Web\BitHelper.cs" />
<Compile Include="System\Text\Encodings\Web\Sse2Helper.cs" />
<Compile Include="System\Text\Encodings\Web\Ssse3Helper.cs" />
</ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;

namespace System.Text.Encodings.Web
{
internal static class BitHelper
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int GetIndexOfFirstNeedToEscape(int index)
{
// Found at least one byte that needs to be escaped, figure out the index of
// the first one found that needed to be escaped within the 16 bytes.
Debug.Assert(index > 0 && index <= 65_535);
int tzc = BitOperations.TrailingZeroCount(index);
Debug.Assert(tzc >= 0 && tzc < 16);

return tzc;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
using System.Text.Unicode;

#if NETCOREAPP
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
Expand Down Expand Up @@ -209,7 +208,7 @@ public override unsafe int FindFirstCharacterToEncode(char* text, int textLength
goto AllAllowed;

VectorizedFound:
idx = GetIndexOfFirstNeedToEscape(index);
idx = BitHelper.GetIndexOfFirstNeedToEscape(index);
idx += CalculateIndex(ptr, text);
return idx;

Expand Down Expand Up @@ -329,7 +328,7 @@ public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf
goto AllAllowed;

VectorizedFound:
idx = GetIndexOfFirstNeedToEscape(index);
idx = BitHelper.GetIndexOfFirstNeedToEscape(index);
idx += CalculateIndex(ptr, pValue);
return idx;

Expand Down Expand Up @@ -446,20 +445,6 @@ private static int NeedsEscaping(Vector128<sbyte> sourceValue)
int index = Sse2.MoveMask(mask.AsByte());
return index;
}

// PERF: don't manually inline or call this method in NeedsEscaping
// as the resulting asm won't be great
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int GetIndexOfFirstNeedToEscape(int index)
{
// Found at least one byte that needs to be escaped, figure out the index of
// the first one found that needed to be escaped within the 16 bytes.
Debug.Assert(index > 0 && index <= 65_535);
int tzc = BitOperations.TrailingZeroCount(index);
Debug.Assert(tzc >= 0 && tzc <= 16);

return tzc;
}
#endif
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,20 @@ internal static class Ssse3Helper
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<sbyte> CreateEscapingMask_DefaultJavaScriptEncoderBasicLatin(Vector128<sbyte> sourceValue)
=> CreateEscapingMask(sourceValue, s_bitMaskLookupBasicLatin, s_bitPosLookup, s_nibbleMaskSByte, s_nullMaskSByte);

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<sbyte> CreateEscapingMask(
Vector128<sbyte> sourceValue,
Vector128<sbyte> bitMaskLookup,
Vector128<sbyte> bitPosLookup,
Vector128<sbyte> nibbleMaskSByte,
Vector128<sbyte> nullMaskSByte)
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need this arguments in order to be able to hoist these vectors outside the loop (in TextEncoder) as the JIT won't do it here.

Codegen for DefaultJavaScriptEncoderBasicLatin.FindFirstCharacterToEncode is affected only in minimal way:

G_M60692_IG11:      ;; bbWeight=4
       C4C17A6F27           vmovdqu  xmm4, xmmword ptr [r15]
       C4C159636710         vpacksswb xmm4, xmm4, xmmword ptr [r15+16]
       C5F828E8             vmovaps  xmm5, xmm0
       C5F828F1             vmovaps  xmm6, xmm1
       C5F828FA             vmovaps  xmm7, xmm2
       C57828C3             vmovaps  xmm8, xmm3
       C5B172D404           vpsrld   xmm9, xmm4, 4
       C531DBCF             vpand    xmm9, xmm9, xmm7
       C5D9DBE7             vpand    xmm4, xmm4, xmm7
       C4E25100E4           vpshufb  xmm4, xmm5, xmm4
       C4C24900E9           vpshufb  xmm5, xmm6, xmm9
       C5D1DBE4             vpand    xmm4, xmm5, xmm4
       C4C15964E8           vpcmpgtb xmm5, xmm4, xmm8
       C5B964E4             vpcmpgtb xmm4, xmm8, xmm4
       C5D1EBE4             vpor     xmm4, xmm5, xmm4
       C5F9D7C4             vpmovmskb eax, xmm4
       85C0                 test     eax, eax
       0F851B010000         jne      G_M60692_IG17
       4983C720             add      r15, 32
       4D3BFD               cmp      r15, r13
       76A8                 jbe      SHORT G_M60692_IG11

Those vmovaps aren't needed at all*...
In the benchmarks this is just above noise. For larger inputs it's not measurable, for smaller one (just so that vectorization kicks in):

|         Method |       TestStringData |      Mean |     Error |    StdDev | Ratio |
|--------------- |--------------------- |----------:|----------:|----------:|------:|
|       PR_42073 | (16, (...)dcsv) [26] |  8.911 ns | 0.0505 ns | 0.0447 ns |  1.00 |
| PR_TextEncoder | (16, (...)dcsv) [26] |  9.085 ns | 0.0178 ns | 0.0158 ns |  1.02 |
|                |                      |           |           |           |       |
|       PR_42073 |   (9, -1, rddnegsne) | 10.245 ns | 0.0445 ns | 0.0372 ns |  1.00 |
| PR_TextEncoder |   (9, -1, rddnegsne) | 10.656 ns | 0.0152 ns | 0.0135 ns |  1.04 |

I think this is acceptable (especially if the JIT will be fixed so it doesn't emit these moves...I believe there is already on issue out, but I don't find it rigth now (or is that issue only for scalar-registers?)).
If not code for CreateEscapingMask has to be duplicated.


* if the register allocator would make these actually zero-latency, then there's still the instructions that need to be fetched, decoded, ... and it makes the loop larger.

{
// To check if an input byte needs to be escaped or not, we create bit-mask.
// To check if an input byte needs to be escaped or not, we use a bitmask-lookup.
// Therefore we split the input byte into the low- and high-nibble, which will get
// the row-/column-index in the bit-mask.
// The bit-mask-matrix looks like
// The bitmask-lookup looks like (here for example s_bitMaskLookupBasicLatin):
// high-nibble
// low-nibble 0 1 2 3 4 5 6 7 8 9 A B C D E F
// 0 1 1 0 0 0 0 1 0 1 1 1 1 1 1 1 1
Expand All @@ -42,29 +51,29 @@ public static Vector128<sbyte> CreateEscapingMask_DefaultJavaScriptEncoderBasicL
// can omit them in the bit-mask, thus only high-nibbles in the range 0..7 need
// to be considered, hence the entries in the bit-mask can be of type byte.
//
// In the Bitmask (see above) for each row (= low-nibble) a bit-mask for the
// In the bitmask-lookup for each row (= low-nibble) a bit-mask for the
// high-nibbles (= columns) is created.

Debug.Assert(Ssse3.IsSupported);

Vector128<sbyte> highNibbles = Sse2.And(Sse2.ShiftRightLogical(sourceValue.AsInt32(), 4).AsSByte(), s_nibbleMaskSByte);
Vector128<sbyte> lowNibbles = Sse2.And(sourceValue, s_nibbleMaskSByte);
Vector128<sbyte> highNibbles = Sse2.And(Sse2.ShiftRightLogical(sourceValue.AsInt32(), 4).AsSByte(), nibbleMaskSByte);
Vector128<sbyte> lowNibbles = Sse2.And(sourceValue, nibbleMaskSByte);

Vector128<sbyte> bitMask = Ssse3.Shuffle(s_bitMask, lowNibbles);
Vector128<sbyte> bitPositions = Ssse3.Shuffle(s_bitPosLookup, highNibbles);
Vector128<sbyte> bitMask = Ssse3.Shuffle(bitMaskLookup, lowNibbles);
Vector128<sbyte> bitPositions = Ssse3.Shuffle(bitPosLookup, highNibbles);

Vector128<sbyte> mask = Sse2.And(bitPositions, bitMask);

mask = Sse2.CompareEqual(s_nullMaskSByte, Sse2.CompareEqual(s_nullMaskSByte, mask));
mask = Sse2.CompareEqual(nullMaskSByte, Sse2.CompareEqual(nullMaskSByte, mask));
return mask;
}

private static readonly Vector128<sbyte> s_nibbleMaskSByte = Vector128.Create((sbyte)0xF);
private static readonly Vector128<sbyte> s_nullMaskSByte = Vector128<sbyte>.Zero;
internal static readonly Vector128<sbyte> s_nibbleMaskSByte = Vector128.Create((sbyte)0xF);
internal static readonly Vector128<sbyte> s_nullMaskSByte = Vector128<sbyte>.Zero;

// See comment above in method CreateEscapingMask_DefaultJavaScriptEncoderBasicLatin
// for description of the bit-mask.
private static readonly Vector128<sbyte> s_bitMask = Vector128.Create(
private static readonly Vector128<sbyte> s_bitMaskLookupBasicLatin = Vector128.Create(
0b_01000011, // low-nibble 0
0b_00000011, // low-nibble 1
0b_00000111, // low-nibble 2
Expand Down Expand Up @@ -92,7 +101,7 @@ public static Vector128<sbyte> CreateEscapingMask_DefaultJavaScriptEncoderBasicL
// A bitmask from the Bitmask (above) is created only for values 0..7 (one byte),
// so to avoid a explicit check for values outside 0..7, i.e.
// high nibbles 8..F, we use a bitpos that always results in escaping.
private static readonly Vector128<sbyte> s_bitPosLookup = Vector128.Create(
internal static readonly Vector128<sbyte> s_bitPosLookup = Vector128.Create(
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, // high-nibble 0..7
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF // high-nibble 8..F
).AsSByte();
Expand Down
Loading