From 03e8ccbe9095be28a7c1415218042bbe2d0f27ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Strehovsk=C3=BD?= Date: Fri, 18 Nov 2022 16:38:00 +0900 Subject: [PATCH 1/2] Allow inline relocations in dehydrated data This is a follow-up to #77884. In the original pull request, all relocation targets went into a lookup table. This is not a very efficient way to represent rarely used relocs. In this update, I'm extending the dehydration format to allow representing relocations inline - instead of indirecting through the lookup table, the target immediately follows the instruction. I'm changing the emitter to emit this if there are fewer than 2 references to the reloc. This produces a ~0.5% size saving. It likely also speeds up the decoding at runtime since there's less cache thrashing. On a hello world, the lookup table originally had about 11k entries. With this change, the lookup table only has 1700 entries. --- .../CompilerHelpers/StartupCodeHelpers.cs | 10 ++++ .../Common/Internal/Runtime/DehydratedData.cs | 44 ++---------------- .../DependencyAnalysis/DehydratedDataNode.cs | 46 +++++++++++++++---- 3 files changed, 52 insertions(+), 48 deletions(-) diff --git a/src/coreclr/nativeaot/Common/src/Internal/Runtime/CompilerHelpers/StartupCodeHelpers.cs b/src/coreclr/nativeaot/Common/src/Internal/Runtime/CompilerHelpers/StartupCodeHelpers.cs index eab095b2a119cb..2cb073f2c2d6b8 100644 --- a/src/coreclr/nativeaot/Common/src/Internal/Runtime/CompilerHelpers/StartupCodeHelpers.cs +++ b/src/coreclr/nativeaot/Common/src/Internal/Runtime/CompilerHelpers/StartupCodeHelpers.cs @@ -266,6 +266,16 @@ private static unsafe void RehydrateData(IntPtr dehydratedData, int length) WriteRelPtr32(pDest, ReadRelPtr32(pFixups + payload)); pDest += sizeof(int); break; + case DehydratedDataCommand.InlinePtrReloc: + *(void**)pDest = ReadRelPtr32(pCurrent); + pDest += sizeof(void*); + pCurrent += sizeof(int); + break; + case DehydratedDataCommand.InlineRelPtr32Reloc: + WriteRelPtr32(pDest, 
ReadRelPtr32(pCurrent)); + pDest += sizeof(int); + pCurrent += sizeof(int); + break; } } diff --git a/src/coreclr/tools/Common/Internal/Runtime/DehydratedData.cs b/src/coreclr/tools/Common/Internal/Runtime/DehydratedData.cs index 8124fd8e1cd48f..ee844da0080544 100644 --- a/src/coreclr/tools/Common/Internal/Runtime/DehydratedData.cs +++ b/src/coreclr/tools/Common/Internal/Runtime/DehydratedData.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System; + using Debug = System.Diagnostics.Debug; namespace Internal.Runtime @@ -21,9 +22,11 @@ internal static class DehydratedDataCommand public const byte ZeroFill = 0x01; public const byte RelPtr32Reloc = 0x02; public const byte PtrReloc = 0x03; + public const byte InlineRelPtr32Reloc = 0x04; + public const byte InlinePtrReloc = 0x05; - private const byte DehydratedDataCommandMask = 0x03; - private const int DehydratedDataCommandPayloadShift = 2; + private const byte DehydratedDataCommandMask = 0x07; + private const int DehydratedDataCommandPayloadShift = 3; private const int MaxRawShortPayload = (1 << (8 - DehydratedDataCommandPayloadShift)) - 1; private const int MaxExtraPayloadBytes = 3; @@ -77,42 +80,5 @@ public static int Encode(int command, int commandData, byte[] buffer) return pB + 1; } - -#if false - static void Main() - { - int command, payload; - - byte[] buf = new byte[5]; - Debug.Assert(Encode(1, 0, buf) == 1); - Debug.Assert(buf[0] == 1); - Debug.Assert(D(buf, out command, out payload) == 1 && command == 1 && payload == 0); - Debug.Assert(Encode(1, 1, buf) == 1); - Debug.Assert(buf[0] == (1 | (1 << DehydratedDataCommandPayloadShift))); - Debug.Assert(D(buf, out command, out payload) == 1 && command == 1 && payload == 1); - Debug.Assert(Encode(1, 60, buf) == 1); - Debug.Assert(buf[0] == (1 | (60 << DehydratedDataCommandPayloadShift))); - Debug.Assert(D(buf, out command, out payload) == 1 && command == 1 && payload == 60); - Debug.Assert(Encode(1, 61, buf) == 2); - 
Debug.Assert(buf[0] == (1 | ((MaxShortPayload + 1) << DehydratedDataCommandPayloadShift))); - Debug.Assert(buf[1] == 1); - Debug.Assert(D(buf, out command, out payload) == 2 && command == 1 && payload == 61); - - Debug.Assert(Encode(3, 256, buf) == 2); - Debug.Assert(D(buf, out command, out payload) == 2 && command == 3 && payload == 256); - Debug.Assert(Encode(3, 6500, buf) == 3); - Debug.Assert(D(buf, out command, out payload) == 3 && command == 3 && payload == 6500); - Debug.Assert(Encode(3, 65000, buf) == 3); - Debug.Assert(D(buf, out command, out payload) == 3 && command == 3 && payload == 65000); - Debug.Assert(Encode(3, 100000, buf) == 4); - Debug.Assert(D(buf, out command, out payload) == 4 && command == 3 && payload == 100000); - - static unsafe int D(byte[] bytes, out int command, out int payload) - { - fixed (byte* pBytes = bytes) - return (int)(Decode(pBytes, out command, out payload) - pBytes); - } - } -#endif } } diff --git a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/DehydratedDataNode.cs b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/DehydratedDataNode.cs index a40eb4b407382f..338a48f335f1f3 100644 --- a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/DehydratedDataNode.cs +++ b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/DehydratedDataNode.cs @@ -92,8 +92,24 @@ public override ObjectData GetData(NodeFactory factory, bool relocsOnly = false) // Sort the reloc targets and create reloc lookup table. KeyValuePair[] relocSort = new List>(relocOccurences).ToArray(); Array.Sort(relocSort, (x, y) => y.Value.CompareTo(x.Value)); + int lastProfitableReloc = 0; for (int i = 0; i < relocSort.Length; i++) + { + // Stop when we reach rarely referenced targets. Those will be inlined instead of being indirected + // through the table. Lookup table entry costs 4 bytes, a single reference to a rarely used reloc + // in the lookup table costs about 3 bytes. 
Inline reference to a reloc costs 5 bytes. + // It might be profitable from cache line utilization perspective at runtime to bump this number + // even higher to avoid using the lookup table as much as possible. + if (relocSort[i].Value < 3) + { + lastProfitableReloc = i - 1; + break; + } + relocSort[i] = new KeyValuePair(relocSort[i].Key, i); + } + if (lastProfitableReloc > 0) + Array.Resize(ref relocSort, lastProfitableReloc); var relocs = new Dictionary(relocSort); // Walk all the ObjectDatas and generate the dehydrated instruction stream. @@ -210,17 +226,29 @@ public override ObjectData GetData(NodeFactory factory, bool relocsOnly = false) if (target is ISymbolNodeWithLinkage withLinkage) target = withLinkage.NodeForLinkage(factory); - int targetIndex = relocs[target]; - - int relocCommand = reloc.RelocType switch + if (relocs.TryGetValue(target, out int targetIndex)) { - RelocType.IMAGE_REL_BASED_DIR64 => DehydratedDataCommand.PtrReloc, - RelocType.IMAGE_REL_BASED_RELPTR32 => DehydratedDataCommand.RelPtr32Reloc, - _ => throw new NotSupportedException(), - }; + int relocCommand = reloc.RelocType switch + { + RelocType.IMAGE_REL_BASED_DIR64 => DehydratedDataCommand.PtrReloc, + RelocType.IMAGE_REL_BASED_RELPTR32 => DehydratedDataCommand.RelPtr32Reloc, + _ => throw new NotSupportedException(), + }; - int written = DehydratedDataCommand.Encode(relocCommand, targetIndex, buff); - builder.EmitBytes(buff, 0, written); + int written = DehydratedDataCommand.Encode(relocCommand, targetIndex, buff); + builder.EmitBytes(buff, 0, written); + } + else + { + int relocCommand = reloc.RelocType switch + { + RelocType.IMAGE_REL_BASED_DIR64 => DehydratedDataCommand.InlinePtrReloc, + RelocType.IMAGE_REL_BASED_RELPTR32 => DehydratedDataCommand.InlineRelPtr32Reloc, + _ => throw new NotSupportedException(), + }; + builder.EmitByte(DehydratedDataCommand.EncodeShort(relocCommand, 0)); + builder.EmitReloc(target, RelocType.IMAGE_REL_BASED_RELPTR32); + } } } From 
823463929bd763a5082cd8b69ed2e00cc803880e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Strehovsk=C3=BD?= Date: Fri, 18 Nov 2022 18:00:14 +0900 Subject: [PATCH 2/2] Allow generating runs of inline relocations If multiple relocations immediately follow one another, generate a single command with the payload specifying the number of subsequent relocations. This saves an additional 0.1%. --- .../CompilerHelpers/StartupCodeHelpers.cs | 18 ++++--- .../Common/Internal/Runtime/DehydratedData.cs | 2 +- .../DependencyAnalysis/DehydratedDataNode.cs | 52 ++++++++++++++++++- 3 files changed, 63 insertions(+), 9 deletions(-) diff --git a/src/coreclr/nativeaot/Common/src/Internal/Runtime/CompilerHelpers/StartupCodeHelpers.cs b/src/coreclr/nativeaot/Common/src/Internal/Runtime/CompilerHelpers/StartupCodeHelpers.cs index 2cb073f2c2d6b8..341e6baf0d5b5d 100644 --- a/src/coreclr/nativeaot/Common/src/Internal/Runtime/CompilerHelpers/StartupCodeHelpers.cs +++ b/src/coreclr/nativeaot/Common/src/Internal/Runtime/CompilerHelpers/StartupCodeHelpers.cs @@ -267,14 +267,20 @@ private static unsafe void RehydrateData(IntPtr dehydratedData, int length) pDest += sizeof(int); break; case DehydratedDataCommand.InlinePtrReloc: - *(void**)pDest = ReadRelPtr32(pCurrent); - pDest += sizeof(void*); - pCurrent += sizeof(int); + while (payload-- > 0) + { + *(void**)pDest = ReadRelPtr32(pCurrent); + pDest += sizeof(void*); + pCurrent += sizeof(int); + } break; case DehydratedDataCommand.InlineRelPtr32Reloc: - WriteRelPtr32(pDest, ReadRelPtr32(pCurrent)); - pDest += sizeof(int); - pCurrent += sizeof(int); + while (payload-- > 0) + { + WriteRelPtr32(pDest, ReadRelPtr32(pCurrent)); + pDest += sizeof(int); + pCurrent += sizeof(int); + } break; } } diff --git a/src/coreclr/tools/Common/Internal/Runtime/DehydratedData.cs b/src/coreclr/tools/Common/Internal/Runtime/DehydratedData.cs index ee844da0080544..dee7c2cfdfdd87 100644 --- a/src/coreclr/tools/Common/Internal/Runtime/DehydratedData.cs +++ 
b/src/coreclr/tools/Common/Internal/Runtime/DehydratedData.cs @@ -30,7 +30,7 @@ internal static class DehydratedDataCommand private const int MaxRawShortPayload = (1 << (8 - DehydratedDataCommandPayloadShift)) - 1; private const int MaxExtraPayloadBytes = 3; - private const int MaxShortPayload = MaxRawShortPayload - MaxExtraPayloadBytes; + public const int MaxShortPayload = MaxRawShortPayload - MaxExtraPayloadBytes; public static byte EncodeShort(int command, int commandData) { diff --git a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/DehydratedDataNode.cs b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/DehydratedDataNode.cs index 338a48f335f1f3..eeba7d3c1d1cdf 100644 --- a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/DehydratedDataNode.cs +++ b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/DehydratedDataNode.cs @@ -207,6 +207,8 @@ public override ObjectData GetData(NodeFactory factory, bool relocsOnly = false) // Generate the next relocation if there's any. if (reloc.Target != null) { + Debug.Assert(sourcePosition == reloc.Offset); + #if DEBUG unsafe { @@ -228,6 +230,7 @@ public override ObjectData GetData(NodeFactory factory, bool relocsOnly = false) if (relocs.TryGetValue(target, out int targetIndex)) { + // Reloc goes through the lookup table int relocCommand = reloc.RelocType switch { RelocType.IMAGE_REL_BASED_DIR64 => DehydratedDataCommand.PtrReloc, @@ -240,14 +243,59 @@ public override ObjectData GetData(NodeFactory factory, bool relocsOnly = false) } else { + // Reloc will be generated inline. Check if we can generate a run of inline relocs. + + // Reserve a byte for the command (the command payload will have to fit in this byte too). 
+ ObjectDataBuilder.Reservation reservation = builder.ReserveByte(); + + int numRelocs = 0; + bool hasNextReloc; + do + { + builder.EmitReloc(target, RelocType.IMAGE_REL_BASED_RELPTR32); + numRelocs++; + hasNextReloc = false; + + if (currentReloc < o.Relocs.Length) + { + // If we wouldn't be able to fit this run into the single byte we reserved, stop. + if (numRelocs == DehydratedDataCommand.MaxShortPayload) + break; + + Relocation nextReloc = o.Relocs[currentReloc]; + + // Does the next reloc immediately follow this one? + if (nextReloc.Offset != sourcePosition) + break; + + // Is it of the same type? + if (nextReloc.RelocType != reloc.RelocType) + break; + + ISymbolNode nextTarget = nextReloc.Target; + if (nextTarget is ISymbolNodeWithLinkage nextTargetWithLinkage) + nextTarget = nextTargetWithLinkage.NodeForLinkage(factory); + + // We don't have a short code for it? + if (relocs.ContainsKey(nextTarget)) + break; + + // This relocation is good - we'll generate it as part of the run + sourcePosition += Relocation.GetSize(reloc.RelocType); + hasNextReloc = true; + currentReloc++; + target = nextTarget; + } + } while (hasNextReloc); + + // Now update the byte we reserved with the command to emit for the run int relocCommand = reloc.RelocType switch { RelocType.IMAGE_REL_BASED_DIR64 => DehydratedDataCommand.InlinePtrReloc, RelocType.IMAGE_REL_BASED_RELPTR32 => DehydratedDataCommand.InlineRelPtr32Reloc, _ => throw new NotSupportedException(), }; - builder.EmitByte(DehydratedDataCommand.EncodeShort(relocCommand, 0)); - builder.EmitReloc(target, RelocType.IMAGE_REL_BASED_RELPTR32); + builder.EmitByte(reservation, DehydratedDataCommand.EncodeShort(relocCommand, numRelocs)); } } }