Runtime lookup clean up, enable for helper-based tail calls#83430
Runtime lookup clean up, enable for helper-based tail calls#83430EgorBo merged 4 commits intodotnet:mainfrom
Conversation
|
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak Issue DetailsClean up some dead code and enable runtime lookup expansion for helper-based tail-calls where we previously only used slow fallbacks.
|
src/coreclr/jit/importer.cpp
Outdated
| impAppendTree(qmark, CHECK_SPILL_NONE, impCurStmtDI); | ||
|
|
||
| return gtNewLclvNode(slotLclNum, TYP_I_IMPL); | ||
| assert(!pRuntimeLookup->testForFixup); |
There was a problem hiding this comment.
This should be deleted from JIT/EE interface.
|
A couple of diffs in coreclr_tests where we have ; Program:GenInterfaceForwardG
@@ -39,11 +42,26 @@ G_M7779_IG01: ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
mov rsi, r9
; gcrRegs +[rsi]
;; size=44 bbWeight=1 PerfScore 10.58
-G_M7779_IG02: ; bbWeight=1, gcrefRegs=00C0 {rsi rdi}, byrefRegs=0000 {}, byref
+G_M7779_IG02: ; bbWeight=1, gcrefRegs=00C0 {rsi rdi}, byrefRegs=0000 {}, byref, isz
+ mov rdx, qword ptr [rcx+38H]
+ cmp qword ptr [rdx+10H], 32
+ jle SHORT G_M7779_IG05
+ ;; size=11 bbWeight=1 PerfScore 6.00
+G_M7779_IG03: ; bbWeight=0.80, gcrefRegs=00C0 {rsi rdi}, byrefRegs=0000 {}, byref, isz
+ mov rdx, qword ptr [rdx+20H]
+ test rdx, rdx
+ je SHORT G_M7779_IG05
+ ;; size=9 bbWeight=0.80 PerfScore 2.60
+G_M7779_IG04: ; bbWeight=0.64, gcrefRegs=00C0 {rsi rdi}, byrefRegs=0000 {}, byref, isz
+ jmp SHORT G_M7779_IG06
+ ;; size=2 bbWeight=0.64 PerfScore 1.28
+G_M7779_IG05: ; bbWeight=0.36, gcrefRegs=00C0 {rsi rdi}, byrefRegs=0000 {}, byref
mov rdx, 0xD1FFAB1E ; global ptr
call CORINFO_HELP_RUNTIMEHANDLE_METHOD
; gcr arg pop 0
mov rdx, rax
+ ;; size=18 bbWeight=0.36 PerfScore 0.54
+G_M7779_IG06: ; bbWeight=1, gcrefRegs=00C0 {rsi rdi}, byrefRegs=0000 {}, byref
mov rcx, rsi
; gcrRegs +[rcx]
mov r8, 0xD1FFAB1E ; token handle
@@ -67,8 +85,8 @@ G_M7779_IG02: ; bbWeight=1, gcrefRegs=00C0 {rsi rdi}, byrefRegs=0000 {},
; gcr arg pop 0
mov rax, gword ptr [rsp+30H]
; gcrRegs +[rax]
- ;; size=91 bbWeight=1 PerfScore 11.50
-G_M7779_IG03: ; bbWeight=1, epilog, nogc, extend
+ ;; size=73 bbWeight=1 PerfScore 10.00
+G_M7779_IG07: ; bbWeight=1, epilog, nogc, extend
add rsp, 96
pop rbx
pop rsi
@@ -76,7 +94,7 @@ G_M7779_IG03: ; bbWeight=1, epilog, nogc, extend
ret
;; size=8 bbWeight=1 PerfScore 2.75
-; Total bytes of code 143 |
|
Nice! Any benchmarks? BTW, doesn't the block layout look kind of odd here: +G_M7779_IG03: ; bbWeight=0.80, gcrefRegs=00C0 {rsi rdi}, byrefRegs=0000 {}, byref, isz
+ mov rdx, qword ptr [rdx+20H]
+ test rdx, rdx
+ je SHORT G_M7779_IG05
+ ;; size=9 bbWeight=0.80 PerfScore 2.60
+G_M7779_IG04: ; bbWeight=0.64, gcrefRegs=00C0 {rsi rdi}, byrefRegs=0000 {}, byref, isz
+ jmp SHORT G_M7779_IG06
+ ;; size=2 bbWeight=0.64 PerfScore 1.28
+G_M7779_IG05: ; bbWeight=0.36, gcrefRegs=00C0 {rsi rdi}, byrefRegs=0000 {}, byref
mov rdx, 0xD1FFAB1E ; global ptr
call CORINFO_HELP_RUNTIMEHANDLE_METHOD
; gcr arg pop 0
mov rdx, rax
+ ;; size=18 bbWeight=0.36 PerfScore 0.54
+G_M7779_IG06: ; bbWeight=1, gcrefRegs=00C0 {rsi rdi}, byrefRegs=0000 {}, byrefCouldn't this be |
|
Quick benchmark: public struct ForceHelper
{
public int A, B, C, D, E, F, G;
}
public interface IFoo<T>
{
long Count(int count, long sum, ForceHelper fh);
}
public unsafe class C<T> : IFoo<T>
{
public long Count(int iterations, long sum, ForceHelper fh)
{
if (iterations == 0)
return sum;
ForceHelper local = fh;
local.A = 30;
IL.Push(this);
IL.Push(iterations - 1);
IL.Push(sum + iterations);
IL.Push(local);
IL.Emit.Tail();
IL.Emit.Callvirt(new MethodRef(typeof(IFoo<T>), "Count"));
return IL.Return<long>();
}
}
🚀 |
Thanks for checking - I was puzzled on how to properly benchmark it 🙂.
We emit block layout like this: // prevBb(BBJ_NONE): [weight: 1.0]
// ...
//
// nullcheckBb(BBJ_COND): [weight: 1.0]
// if (*fastPathValue == null)
// goto fallbackBb;
//
// fastPathBb(BBJ_ALWAYS): [weight: 0.8]
// rtLookupLcl = *fastPathValue;
// goto block;
//
// fallbackBb(BBJ_NONE): [weight: 0.2]
// rtLookupLcl = HelperCall();
//
// block(...): [weight: 1.0]
// use(rtLookupLcl);We probably need to re-order fastPathBb and fallbackBb |
Follow up to #81635
Clean up some dead code and enable runtime lookup expansion for helper-based tail-calls where we previously only used slow fallbacks.