From a52a7752d8cd2e90a3e6fbbf45e69937c6d9f1b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Strehovsk=C3=BD?= Date: Thu, 15 Feb 2024 12:03:37 +0100 Subject: [PATCH] More precise writebarrier for regions Port of #67389 to Native AOT. Adds additional checks to the write barriers so that the GC can do less work. The write barriers get slower, with the expectation that we'll recoup the time during garbage collections. I was hoping to see gains similar to https://github.com/dotnet/runtime/pull/67389#issuecomment-1240955293 for our self-hosted ILC, but wall-clock time got maybe 1% worse instead. However, the GC stats in PerfView look much better: Before: * Total CPU Time: 33,478 msec * Total GC CPU Time: 585 msec * Total Allocs : 776.721 MB * Number of Heaps: 16 * GC CPU MSec/MB Alloc : 0.753 MSec/MB * Total GC Pause: 207.7 msec * % Time paused for Garbage Collection: 2.4% * % CPU Time spent Garbage Collecting: 1.7% After: * Total CPU Time: 33,348 msec * Total GC CPU Time: 179 msec * Total Allocs : 771.313 MB * Number of Heaps: 16 * GC CPU MSec/MB Alloc : 0.232 MSec/MB * Total GC Pause: 195.8 msec * % Time paused for Garbage Collection: 2.3% * % CPU Time spent Garbage Collecting: 0.5% Opening as a draft because we may be able to make these extra checks less expensive (CoreCLR seems to have lots of tricks up its sleeve). We also still need the Linux version.
--- .../nativeaot/Runtime/amd64/AsmMacros.inc | 3 + .../nativeaot/Runtime/amd64/WriteBarriers.asm | 103 ++++++++++++++++++ src/coreclr/nativeaot/Runtime/gcenv.ee.cpp | 6 + .../nativeaot/Runtime/gcheaputilities.cpp | 3 + .../nativeaot/Runtime/gcheaputilities.h | 3 + 5 files changed, 118 insertions(+) diff --git a/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc b/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc index 33089b6643d382..c4f575e1198c10 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc +++ b/src/coreclr/nativeaot/Runtime/amd64/AsmMacros.inc @@ -400,6 +400,9 @@ EXTERN g_highest_address : QWORD EXTERN g_ephemeral_low : QWORD EXTERN g_ephemeral_high : QWORD EXTERN g_card_table : QWORD +EXTERN g_region_shr : BYTE +EXTERN g_region_use_bitwise_write_barrier : BYTE +EXTERN g_region_to_generation_table : QWORD ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES EXTERN g_card_bundle_table : QWORD diff --git a/src/coreclr/nativeaot/Runtime/amd64/WriteBarriers.asm b/src/coreclr/nativeaot/Runtime/amd64/WriteBarriers.asm index 148aa7d1301b1c..049510c46898c0 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/WriteBarriers.asm +++ b/src/coreclr/nativeaot/Runtime/amd64/WriteBarriers.asm @@ -122,6 +122,55 @@ endif cmp REFREG, [g_ephemeral_high] jae &BASENAME&_NoBarrierRequired_&REFREG& + mov r10, REFREG + mov r11, rcx + + mov cl, [g_region_shr] + test cl, cl + je &BASENAME&_SkipCheck_&REFREG& + + ; check if the source is in gen 2 - then it's not an ephemeral pointer + shr r10, cl + add r10, [g_region_to_generation_table] + cmp byte ptr [r10], 82h + je &BASENAME&_NoBarrierRequired_&REFREG& + + ; check if the destination happens to be in gen 0 + mov r10, r11 + shr r10, cl + add r10, [g_region_to_generation_table] + cmp byte ptr [r10], 0 + je &BASENAME&_NoBarrierRequired_&REFREG& +&BASENAME&_SkipCheck_&REFREG&: + + cmp [g_region_use_bitwise_write_barrier], 0 + mov rcx, r11 + je &BASENAME&_CheckCardTableByte_&REFREG& + + ; compute card table bit + mov r10b, 1 + shr rcx, 
8 + and cl, 7 + shl r10b, cl + + ; Check if we need to update the card table + ; Calc pCardByte + mov rcx, r11 + shr rcx, 0Bh + add rcx, [g_card_table] + + ; Check if this card table bit is already set + test byte ptr [rcx], r10b + jne &BASENAME&_NoBarrierRequired_&REFREG& + + lock or byte ptr [rcx], r10b + + mov rcx, r11 + shr rcx, 15h + jmp &BASENAME&_CheckCardBundle_&REFREG& + +&BASENAME&_CheckCardTableByte_&REFREG&: + ;; We have a location on the GC heap being updated with a reference to an ephemeral object so we must ;; track this write. The location address is translated into an offset in the card table bitmap. We set ;; an entire byte in the card table since it's quicker than messing around with bitmasks and we only write @@ -137,11 +186,14 @@ endif ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES ;; Shift rcx by 0Ah more to get the card bundle byte (we shifted by 0x0B already) shr rcx, 0Ah +&BASENAME&_CheckCardBundle_&REFREG&: add rcx, [g_card_bundle_table] cmp byte ptr [rcx], 0FFh je &BASENAME&_NoBarrierRequired_&REFREG& mov byte ptr [rcx], 0FFh +else +&BASENAME&_CheckCardBundle_&REFREG&: endif &BASENAME&_NoBarrierRequired_&REFREG&: @@ -317,6 +369,54 @@ RhpByRefAssignRef_CheckCardTable: cmp rcx, [g_ephemeral_high] jae RhpByRefAssignRef_NoBarrierRequired + mov r10, rcx + + mov cl, [g_region_shr] + test cl, cl + je RhpByRefAssignRef_SkipCheck + + ; check if the source is in gen 2 - then it's not an ephemeral pointer + shr r10, cl + add r10, [g_region_to_generation_table] + cmp byte ptr [r10], 82h + je RhpByRefAssignRef_NoBarrierRequired + + ; check if the destination happens to be in gen 0 + mov r10, rdi + shr r10, cl + add r10, [g_region_to_generation_table] + cmp byte ptr [r10], 0 + je RhpByRefAssignRef_NoBarrierRequired +RhpByRefAssignRef_SkipCheck: + + cmp [g_region_use_bitwise_write_barrier], 0 + je RhpByRefAssignRef_CheckCardTableByte + + ; compute card table bit + mov rcx, rdi + mov r10b, 1 + shr rcx, 8 + and cl, 7 + shl r10b, cl + + ; Check if we need to 
update the card table + ; Calc pCardByte + mov rcx, rdi + shr rcx, 0Bh + add rcx, [g_card_table] + + ; Check if this card table bit is already set + test byte ptr [rcx], r10b + jne RhpByRefAssignRef_NoBarrierRequired + + lock or byte ptr [rcx], r10b + + mov rcx, rdi + shr rcx, 15h + jmp RhpByRefAssignRef_CheckCardBundle + +RhpByRefAssignRef_CheckCardTableByte: + ;; move current rdi value into rcx, we need to keep rdi and eventually increment by 8 mov rcx, rdi @@ -335,11 +435,14 @@ RhpByRefAssignRef_CheckCardTable: ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES ;; Shift rcx by 0Ah more to get the card bundle byte (we shifted by 0Bh already) shr rcx, 0Ah +RhpByRefAssignRef_CheckCardBundle: add rcx, [g_card_bundle_table] cmp byte ptr [rcx], 0FFh je RhpByRefAssignRef_NoBarrierRequired mov byte ptr [rcx], 0FFh +else +RhpByRefAssignRef_CheckCardBundle: endif RhpByRefAssignRef_NoBarrierRequired: diff --git a/src/coreclr/nativeaot/Runtime/gcenv.ee.cpp b/src/coreclr/nativeaot/Runtime/gcenv.ee.cpp index 2cb9445144c220..4e89921528246a 100644 --- a/src/coreclr/nativeaot/Runtime/gcenv.ee.cpp +++ b/src/coreclr/nativeaot/Runtime/gcenv.ee.cpp @@ -415,6 +415,9 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args) assert(args->ephemeral_high != nullptr); g_ephemeral_low = args->ephemeral_low; g_ephemeral_high = args->ephemeral_high; + g_region_to_generation_table = args->region_to_generation_table; + g_region_shr = args->region_shr; + g_region_use_bitwise_write_barrier = args->region_use_bitwise_write_barrier; return; case WriteBarrierOp::Initialize: // This operation should only be invoked once, upon initialization. 
@@ -442,6 +445,9 @@ void GCToEEInterface::StompWriteBarrier(WriteBarrierParameters* args) g_lowest_address = args->lowest_address; g_highest_address = args->highest_address; + g_region_to_generation_table = args->region_to_generation_table; + g_region_shr = args->region_shr; + g_region_use_bitwise_write_barrier = args->region_use_bitwise_write_barrier; g_ephemeral_low = args->ephemeral_low; g_ephemeral_high = args->ephemeral_high; return; diff --git a/src/coreclr/nativeaot/Runtime/gcheaputilities.cpp b/src/coreclr/nativeaot/Runtime/gcheaputilities.cpp index 42f7928cd9e252..eae41a68c04aaa 100644 --- a/src/coreclr/nativeaot/Runtime/gcheaputilities.cpp +++ b/src/coreclr/nativeaot/Runtime/gcheaputilities.cpp @@ -21,6 +21,9 @@ GPTR_IMPL_INIT(uint8_t, g_highest_address, nullptr); GVAL_IMPL_INIT(GCHeapType, g_heap_type, GC_HEAP_INVALID); uint8_t* g_ephemeral_low = (uint8_t*)1; uint8_t* g_ephemeral_high = (uint8_t*)~0; +uint8_t* g_region_to_generation_table = nullptr; +uint8_t g_region_shr = 0; +bool g_region_use_bitwise_write_barrier = false; #ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES uint32_t* g_card_bundle_table = nullptr; diff --git a/src/coreclr/nativeaot/Runtime/gcheaputilities.h b/src/coreclr/nativeaot/Runtime/gcheaputilities.h index ef99c354163f33..d7c9dd2b43e797 100644 --- a/src/coreclr/nativeaot/Runtime/gcheaputilities.h +++ b/src/coreclr/nativeaot/Runtime/gcheaputilities.h @@ -27,6 +27,9 @@ extern "C" uint32_t* g_card_bundle_table; extern "C" uint8_t* g_ephemeral_low; extern "C" uint8_t* g_ephemeral_high; +extern "C" uint8_t* g_region_to_generation_table; +extern "C" uint8_t g_region_shr; +extern "C" bool g_region_use_bitwise_write_barrier; #ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP extern "C" bool g_sw_ww_enabled_for_gc_heap;