[WIP] Implement portable tailcall helpers#26418
[WIP] Implement portable tailcall helpers#26418jakobbotsch wants to merge 164 commits intodotnet:masterfrom
Conversation
|
FYI @janvorli @dotnet/jit-contrib This now works for some very basic examples on unix x64 (using InlineIL.Fody): using InlineIL;
using System;
using System.Diagnostics;
using System.Runtime.Intrinsics;
using System.Threading.Tasks;
internal class Program
{
private static int Main()
{
s_prev = 10000001;
Stopwatch timer = Stopwatch.StartNew();
bool result = IsOddNewHelper(10000000);
Console.WriteLine("Time: {0}ms", timer.ElapsedMilliseconds);
return result ? 100 : 0;
}
private static readonly S32 s_s32 = new S32
{ A = 1, B = 2, C = 3, D = 4, };
private static int s_prev;
static unsafe bool IsEvenNewHelper(int x, S32 onStack)
{
Trace.Assert(x == s_prev - 1);
s_prev = x;
if (x == 0)
return true;
IL.Push(x - 1);
IL.Emit.Tail();
IL.Emit.Call(new MethodRef(typeof(Program), nameof(IsOddNewHelper)));
return IL.Return<bool>();
}
static unsafe bool IsOddNewHelper(int x)
{
Trace.Assert(x == s_prev - 1);
s_prev = x;
if (x == 0)
return false;
IL.Push(x - 1);
IL.Push(s_s32);
IL.Emit.Tail();
IL.Emit.Call(new MethodRef(typeof(Program), nameof(IsEvenNewHelper)));
return IL.Return<bool>();
}
struct S32 { public long A, B, C, D; }
}builds and runs successfully. The following IL stubs are generated for the call to Incoming sig: Boolean (Int32, S32)
StoreArgs sig: Void (IntPtr, System.Int32, S32)
// CallMethod {
/*( 0)*/ ldc.i4.s 0x30
/*( 1)*/ ldc.i4.0
/*( 2)*/ conv.i
/*( 2)*/ call native int [System.Private.CoreLib] System.Runtime.CompilerServices.RuntimeHelpers::AllocTailCallArgBuffer(int32,native int)
/*( 1)*/ stloc.0
/*( 0)*/ ldloc.0
/*( 1)*/ ldarg.0
/*( 2)*/ stind.i
/*( 0)*/ ldloc.0
/*( 1)*/ ldc.i4.8
/*( 2)*/ add
/*( 1)*/ ldarg.1
/*( 2)*/ stobj System.Int32
/*( 0)*/ ldloc.0
/*( 1)*/ ldc.i4.s 0x10
/*( 2)*/ add
/*( 1)*/ ldarg.2
/*( 2)*/ stobj Program+S32
/*( 0)*/ ret
// } CallMethod
CallTarget sig: System.Boolean ()
// CallMethod {
/*( 0)*/ call native int [System.Private.CoreLib] System.Runtime.CompilerServices.RuntimeHelpers::GetTailCallTls()
/*( 1)*/ stloc.1
/*( 0)*/ ldloc.1
/*( 1)*/ ldfld System.Runtime.CompilerServices.TailCallTls::Frame
/*( 1)*/ stloc.2
/*( 0)*/ ldloc.2
/*( 1)*/ ldfld System.Runtime.CompilerServices.TailCallFrame::ReturnAddress
/*( 1)*/ call native int [System.Private.CoreLib] System.StubHelpers.StubHelpers::ReturnAddress()
/*( 2)*/ bne.un IL_002c
/*( 0)*/ ldloc.2
/*( 1)*/ ldftn System.Boolean /* MT: 0x00007FFF7CFADAF8 */ [example1] dynamicClass::IL_STUB_CallTailCallTarget()
/*( 2)*/ conv.i
/*( 2)*/ stfld System.Runtime.CompilerServices.TailCallFrame::NextCall
/*( 0)*/ ldloc.0
/*( 1)*/ ret
IL_002c: /*( 0)*/ ldloca.s 0x3
/*( 1)*/ ldloc.2
/*( 2)*/ stfld System.Runtime.CompilerServices.TailCallFrame::Prev
/*( 0)*/ ldloca.s 0x3
/*( 1)*/ ldc.i4.0
/*( 2)*/ conv.i
/*( 2)*/ stfld System.Runtime.CompilerServices.TailCallFrame::NextCall
/*( 0)*/ ldloc.1
/*( 1)*/ ldloca.s 0x3
/*( 2)*/ stfld System.Runtime.CompilerServices.TailCallTls::Frame
/*( 0)*/ ldloc.1
/*( 1)*/ ldfld System.Runtime.CompilerServices.TailCallTls::ArgBuffer
/*( 1)*/ stloc.s 0x4
/*( 0)*/ ldloc.s 0x4
/*( 1)*/ ldind.i
/*( 1)*/ stloc.s 0x5
/*( 0)*/ ldloc.s 0x4
/*( 1)*/ ldc.i4.8
/*( 2)*/ add
/*( 1)*/ ldobj System.Int32
/*( 1)*/ stloc.s 0x6
/*( 0)*/ ldloc.s 0x4
/*( 1)*/ ldc.i4.s 0x10
/*( 2)*/ add
/*( 1)*/ ldobj Program+S32
/*( 1)*/ stloc.s 0x7
/*( 0)*/ call void [System.Private.CoreLib] System.Runtime.CompilerServices.RuntimeHelpers::FreeTailCallArgBuffer()
/*( 0)*/ ldloca.s 0x3
/*( 1)*/ call native int [System.Private.CoreLib] System.StubHelpers.StubHelpers::NextCallReturnAddress()
/*( 2)*/ stfld System.Runtime.CompilerServices.TailCallFrame::ReturnAddress
/*( 0)*/ ldloc.s 0x6
/*( 1)*/ ldloc.s 0x7
/*( 2)*/ ldloc.s 0x5
/*( 3)*/ calli System.Boolean /* MT: 0x00007FFF7CFADAF8 */(System.Int32 /* MT: 0x00007FFF7CFD6090 */,Program/S32 /* MT: 0x00007FFF7D133DF0 */)
/*( 1)*/ stloc.0
/*( 0)*/ ldloc.1
/*( 1)*/ ldloc.3
/*( 2)*/ ldfld System.Runtime.CompilerServices.TailCallFrame::Prev
/*( 2)*/ stfld System.Runtime.CompilerServices.TailCallTls::Frame
/*( 0)*/ ldloc.3
/*( 1)*/ ldfld System.Runtime.CompilerServices.TailCallFrame::NextCall
/*( 1)*/ brfalse IL_00ab
/*( 0)*/ ldloc.3
/*( 1)*/ ldfld System.Runtime.CompilerServices.TailCallFrame::NextCall
/*( 1)*/ tail.
/*( 1)*/ calli System.Boolean /* MT: 0x00007FFF7CFADAF8 */()
/*( 1)*/ ret
IL_00ab: /*( 0)*/ ldloc.0
/*( 1)*/ ret
// } CallMethod generating respectively ; Assembly listing for method ILStubClass:IL_STUB_StoreTailCallArgs(long,int,struct)
; Emitting BLENDED_CODE for X64 CPU with AVX - Unix
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 3, 3 ) long -> rbx
; V01 arg1 [V01,T01] ( 3, 3 ) int -> r14
; V02 arg2 [V02,T03] ( 1, 1 ) struct (32) [rsp+0x20] do-not-enreg[SB]
; V03 loc0 [V03,T02] ( 4, 4 ) long -> rax
;# V04 OutArgs [V04 ] ( 1, 1 ) lclBlk ( 0) [rsp+0x00] "OutgoingArgSpace"
;
; Lcl frame size = 8
G_M9479_IG01:
4156 push r14
53 push rbx
50 push rax
C5F877 vzeroupper
488BDF mov rbx, rdi
448BF6 mov r14d, esi
G_M9479_IG02:
BF30000000 mov edi, 48
33F6 xor rsi, rsi
E857671479 call System.Runtime.CompilerServices.RuntimeHelpers:AllocTailCallArgBuffer(int,long):long
488918 mov qword ptr [rax], rbx
44897008 mov dword ptr [rax+8], r14d
4883C010 add rax, 16
C5FA6F442420 vmovdqu xmm0, qword ptr [rsp+20H]
C5FA7F00 vmovdqu qword ptr [rax], xmm0
C5FA6F442430 vmovdqu xmm0, qword ptr [rsp+30H]
C5FA7F4010 vmovdqu qword ptr [rax+16], xmm0
G_M9479_IG03:
4883C408 add rsp, 8
5B pop rbx
415E pop r14
C3 ret
; Total bytes of code 65, prolog size 7 for method ILStubClass:IL_STUB_StoreTailCallArgs(long,int,struct)
; ============================================================and ; Assembly listing for method ILStubClass:IL_STUB_CallTailCallTarget():bool
; Emitting BLENDED_CODE for X64 CPU with AVX - Unix
; optimized code
; rbp based frame
; partially interruptible
; Final local variable assignments
;
; V00 loc0 [V00,T04] ( 3, 1.50) bool -> [rbp-0x1C]
; V01 loc1 [V01,T00] ( 5, 3.50) long -> rbx
; V02 loc2 [V02,T01] ( 4, 3 ) long -> rdi
; V03 loc3 [V03 ] ( 7, 3.50) struct (24) [rbp-0x38] do-not-enreg[XS] addr-exposed ld-addr-op
; V04 loc4 [V04,T02] ( 4, 2 ) long -> rax
; V05 loc5 [V05,T05] ( 2, 1 ) long -> r14
; V06 loc6 [V06,T06] ( 2, 1 ) int -> r15
; V07 loc7 [V07,T07] ( 2, 1 ) struct (32) [rbp-0x58] do-not-enreg[SB]
; V08 OutArgs [V08 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V09 tmp1 [V09,T08] ( 1, 1 ) long -> [rbp+0x08] "Return address"
; V10 tmp2 [V10,T03] ( 2, 2 ) long -> rax "impImportIndirectCall"
; V11 tmp3 [V11 ] ( 3, 1.50) long -> [rbp-0x38] do-not-enreg[X] addr-exposed V03.Prev(offs=0x00) P-DEP "field V03.Prev (fldOffset=0x0)"
; V12 tmp4 [V12 ] ( 2, 1 ) long -> [rbp-0x30] do-not-enreg[X] addr-exposed V03.ReturnAddress(offs=0x08) P-DEP "field V03.ReturnAddress (fldOffset=0x8)"
; V13 tmp5 [V13 ] ( 4, 2 ) long -> [rbp-0x28] do-not-enreg[X] addr-exposed V03.NextCall(offs=0x10) P-DEP "field V03.NextCall (fldOffset=0x10)"
;
; Lcl frame size = 104
G_M31660_IG01:
55 push rbp
4157 push r15
4156 push r14
53 push rbx
4883EC68 sub rsp, 104
C5F877 vzeroupper
488DAC2480000000 lea rbp, [rsp+80H]
G_M31660_IG02:
E806681479 call System.Runtime.CompilerServices.RuntimeHelpers:GetTailCallTls():long
488BD8 mov rbx, rax
488B3B mov rdi, qword ptr [rbx]
4C8B7508 mov r14, qword ptr [rbp+08H]
4C397708 cmp qword ptr [rdi+8], r14
751C jne SHORT G_M31660_IG04
48B89858F27CFF7F0000 mov rax, 0x7FFF7CF25898
48894710 mov qword ptr [rdi+16], rax
8B45E4 mov eax, dword ptr [rbp-1CH]
G_M31660_IG03:
488D65E8 lea rsp, [rbp-18H]
5B pop rbx
415E pop r14
415F pop r15
5D pop rbp
C3 ret
G_M31660_IG04:
48897DC8 mov qword ptr [rbp-38H], rdi
33C0 xor rax, rax
488945D8 mov qword ptr [rbp-28H], rax
488D45C8 lea rax, bword ptr [rbp-38H]
488903 mov qword ptr [rbx], rax
488B4308 mov rax, qword ptr [rbx+8]
4C8B30 mov r14, qword ptr [rax]
448B7808 mov r15d, dword ptr [rax+8]
4883C010 add rax, 16
C5FA6F00 vmovdqu xmm0, qword ptr [rax]
C5FA7F45A8 vmovdqu qword ptr [rbp-58H], xmm0
C5FA6F4010 vmovdqu xmm0, qword ptr [rax+16]
C5FA7F45B8 vmovdqu qword ptr [rbp-48H], xmm0
E872671479 call System.Runtime.CompilerServices.RuntimeHelpers:FreeTailCallArgBuffer()
488D3D27000000 lea rdi, G_M31660_IG05
48897DD0 mov qword ptr [rbp-30H], rdi
C5FA6F45A8 vmovdqu xmm0, qword ptr [rbp-58H]
C5FA7F0424 vmovdqu qword ptr [rsp], xmm0
C5FA6F45B8 vmovdqu xmm0, qword ptr [rbp-48H]
C5FA7F442410 vmovdqu qword ptr [rsp+10H], xmm0
418BFF mov edi, r15d
41FFD6 call r14
G_M31660_IG05:
448BF0 mov r14d, eax
488B45C8 mov rax, qword ptr [rbp-38H]
488903 mov qword ptr [rbx], rax
48837DD800 cmp qword ptr [rbp-28H], 0
7411 je SHORT G_M31660_IG07
488B45D8 mov rax, qword ptr [rbp-28H]
G_M31660_IG06:
488D65E8 lea rsp, [rbp-18H]
5B pop rbx
415E pop r14
415F pop r15
5D pop rbp
48FFE0 rex.jmp rax
G_M31660_IG07:
418BC6 mov eax, r14d
G_M31660_IG08:
488D65E8 lea rsp, [rbp-18H]
5B pop rbx
415E pop r14
415F pop r15
5D pop rbp
C3 ret
; Total bytes of code 212, prolog size 21 for method ILStubClass:IL_STUB_CallTailCallTarget():bool
; ============================================================In the JIT the most interesting changes were to expose two new intrinsics ; V09 tmp1 [V09,T08] ( 1, 1 ) long -> [rbp+0x08] "Return address"
G_M31660_IG02:
E806681479 call System.Runtime.CompilerServices.RuntimeHelpers:GetTailCallTls():long
488BD8 mov rbx, rax
488B3B mov rdi, qword ptr [rbx]
4C8B7508 mov r14, qword ptr [rbp+08H]
4C397708 cmp qword ptr [rdi+8], r14and here: E872671479 call System.Runtime.CompilerServices.RuntimeHelpers:FreeTailCallArgBuffer()
488D3D27000000 lea rdi, G_M31660_IG05
48897DD0 mov qword ptr [rbp-30H], rdi
C5FA6F45A8 vmovdqu xmm0, qword ptr [rbp-58H]
C5FA7F0424 vmovdqu qword ptr [rsp], xmm0
C5FA6F45B8 vmovdqu xmm0, qword ptr [rbp-48H]
C5FA7F442410 vmovdqu qword ptr [rsp+10H], xmm0
418BFF mov edi, r15d
41FFD6 call r14
G_M31660_IG05:Other than that the JIT basically just does some different morphing than before (the diff looks large because this is rebased on top of #26158). It morphs the end of IN0014: 000068 vmovdqu xmm0, qword ptr [rsi]
IN0015: 00006C vmovdqu qword ptr [V01 rsp], xmm0
IN0016: 000071 vmovdqu xmm0, qword ptr [rsi+16]
IN0017: 000076 vmovdqu qword ptr [V01+0x10 rsp+10H], xmm0
IN0018: 00007C lea esi, [rbx-1]
IN0019: 00007F mov rdi, 0x7FFF7CF0EF90
IN001a: 000089 call ILStubClass:IL_STUB_StoreTailCallArgs(long,int,struct)
IN001b: 00008E nop
G_M31660_IG07: ; offs=00008FH, size=000BH, epilog, nogc, emitadd
IN0026: 00008F lea rsp, [rbp-08H]
IN0027: 000093 pop rbx
IN0028: 000094 pop rbp
IN0029: 000095 jmp ILStubClass:IL_STUB_CallTailCallTarget():boolEDIT: Note that this is now outdated and not completely how it looks anymore. |
src/vm/tailcallhelp.cpp
Outdated
| const ArgBufferValue& arg = info.ArgBufLayout.Values[i]; | ||
| emitOffs(arg.Offset); | ||
| pCode->EmitLDARG(argIndex++); | ||
| pCode->EmitSTOBJ(pCode->GetToken(arg.TyHnd)); |
There was a problem hiding this comment.
@dotnet/jit-contrib This ends up emitting write barriers for object references. There won't be any need for that as this will always be into TLS that will need special treatment for the GC anyway. Does anyone know how if there is an easy way to get around it? If I just try STIND_I I get asserts in JIT.
There was a problem hiding this comment.
Maybe I should just use CPBLK actually...
This is primarily for logging purposes when logging IL.
We will need this to allow the runtime to generate call-target stubs that do not require the target pointer.
* CallTarget stub now tries hard to use a call/callvirt whenever possible. This makes VSDs work for free. * Add CORINFO_TAILCALL_HELP and CORINFO_TAILCALL_FLAGS to communicate helper information between runtime and JIT. The flags currently only contains CORINFO_TAILCALL_STORE_TARGET which means that the JIT needs to pass the target address to the store-args stub (necessary for calli, for instance).
Done, here is the link https://dev.azure.com/dnceng/public/_build/results?buildId=379909 |
|
Sorry, I don't think any great idea about the QMARK issue. Since the target is now the last arg, introducing flow control at the point will require spilling pretty much all call args to locals. Probably that happens anyway since the target tree will contain another call. The only idea I can throw in is "Is there any change to move all this acrobatics to lowering?". Trees are pretty good for many things but terrible for this kind of low level control over ordering that you need here. |
I assume because it will be much simpler in LIR? |
Only a bit simpler, not too much. It should be easier to find a better insertion point ("One solution to the QMARK problem I can think of is modifying the statements in the block so that the control flow appears at the top level"). But it's unlikely to help with inserting flow control in the middle of a block. Come to think of it, it's probably going to be worse throughput wise - you'll probably need to traverse the block backwards in order to find SDSU edges that cross the split point and spill them to locals. |
Ah, wait. These are supposed to be tail calls so the call is probably more or less the last thing in the block. Might work better than expected, but it's still going to be a bit of work, couple of days to get it all right I would guess. |
|
Another thing that may work is to not use a real call and the associated flow control - just create some sort of "intrinsic" node for which generates all the needed code. Normally you can't generate calls in codegen like that but this helper only has 2 args it seems. All args would go in registers on all targets so that would hopefully avoid nested call restrictions. |
If I can construct the tree so it is just the last arg of the call (without introducing statements) I suppose morph will just handle things correctly?
I see, that is interesting. I guess this should be done in combination with somehow spilling everything? So add the placeholder node last, make I am quite partial to this solution. There is already a |
Ignoring the QMARK, yes, arg sorting should prevent nested calls from occurring (on x64 & co, they're valid on x86).
Mmm, no spilling should be needed in this case. The idea is that the intrinsic node would avoid the need for inserting flow control and also hide the helper call. At least if I did not misunderstood something and calls having only reg args also cannot be nested. |
That's what I don't really understand. If I just change the last arg to |
Right, since this call appears after all args, even after late args that contain register args. Hrm, I have no idea if LSRA is able to deal with this. In theory it could spill already loaded call args, load the new args for I'm starting to get the impression that there's no way to avoid some manual spilling of args to lclvars. And that would likely be somewhat easier to do in LIR. It is what it is. The team is trying to do things backwards. The JIT representation and handling of calls has some serious issues. The normal thing to do is to clean the mess up and then implement new features. |
Wait a moment. The target address is now a real call arg, if you set |
I see -- ok, this was my original assumption of what would be needed. I think this is what I will attempt to do.
Right. One possibility would be to have a better interface for querying ABI details. We should be able to determine during import whether we will be able to do a fast tailcall or not for explicit tailcalls, since this decision has only to do with ABI details and the signatures of the caller and callee. With this it would not be hard to create the proper trees from here. Unfortunately as it is we need to wait until morph and then we run into problems. |
If it's invariant, yes, it could be. I actually made some improvements to the runtime lookup tree that the importer generates but it's been a while and I don't remember exactly what it all involves. I'll check more when I get back home but I'm rather skeptical that there's a lot to lose in terms of optimizations by not expanding the lookup in the importer.
Quite the contrary, I'm thinking that a lot of call related stuff should happen in lowering (or LIR more generally). We'll see, after the tail call work is done I plan to continue with my call IR refactoring experiments :) |
Sounds good. I guess the best way to find out is to experiment with this change in isolation.
I see. One thing I personally found odd is how many ABI specific details are handled very early in the JIT which would be nice to do later. I know @CarolEidt has had this on her list for a while too. A lot of it might also be related to first class struct work. |
|
Thank you for your contribution. As announced in dotnet/coreclr#27549 this repository will be moving to dotnet/runtime on November 13. If you would like to continue working on this PR after this date, the easiest way to move the change to dotnet/runtime is:
|
|
I have gone through the change and the past conversation. I believe I agree that the conversation around the QMark logic is worth fixing. However, I am going to suggest not blocking the change on that. My reasoning is that the logic here seems to be sound, even though it adds duplication. Specifically this is a large change that seems like it can naturally be broken into several post steps. The longer this pr is open the more debt we will accumulate on this change. Lastly, going through the change, there is most likely PR feedback that will be required by at least @CarolEidt and @AndyAyersMS., I think this change is basically ready to go barring regular PR feedback and I think it would be a shame to block on a potentially complex refactoring. /cc @dotnet/jit-contrib @BruceForstall |
|
@jashook I am not against merging it now, however I am not very confident in the current way the runtime lookup is done for this case. It is missing the check that is normally done by QMARK, and I am not familiar enough with the runtime lookups to know how this affects it. Besides, the runtime lookup generation code is a little intricate and it seems many of the cases were not hit when I tested it, so I do not know if it is even right for all those cases. If it is the case that it will be merged soonish then I also have an updated design document that I can add and also it looks like there are some conflicts. I will try to fix them tomorrow or on the weekend if I do not get around to it tomorrow. |
Implement the portable tailcall helpers described in https://github.com/dotnet/coreclr/blob/master/Documentation/design-docs/tailcalls-with-helpers.md with a few variations.
ReturnAddressintrinsics for detecting previous framestail. callitail. calliExceptionswe already have coverage in https://github.com/dotnet/coreclr/tree/master/tests/src/JIT/Methodical/Invoke/SEHFixes #26311
Fixes #6675
Fixes #2556