(Some now fixed by https://github.com/dotnet/coreclr/pull/19429)
This is problematic for example when adding formatting and/or buffering capabilities over Span<T>
As seen in "Add support for BufferWriter<T> to the JsonWriter" https://github.com/dotnet/corefxlab/pull/2366
When a struct wraps a struct (but has no additional fields) it becomes a non-zero cost abstraction for its constructor (but not method calls) e.g. sharplab.io
Given
using System.Runtime.CompilerServices;
// Repro harness: each method allocates a C, wraps it in zero to five nested
// single-field structs, and then calls M() through the outermost wrapper.
public static class Program
{
// Baseline: no struct wrapper; M() is called directly on the new C.
public static void ViaClass()
{
var c = new C();
c.M();
}
// One wrapper level: S1(C).
public static void ViaStruct1()
{
var s1 = new S1(new C());
s1.M();
}
// Two wrapper levels: S2(S1(C)).
public static void ViaStruct2()
{
var s2 = new S2(new S1(new C()));
s2.M();
}
// Three wrapper levels: S3(S2(S1(C))).
public static void ViaStruct3()
{
var s3 = new S3(new S2(new S1(new C())));
s3.M();
}
// Four wrapper levels: S4(S3(S2(S1(C)))).
public static void ViaStruct4()
{
var s4 = new S4(new S3(new S2(new S1(new C()))));
s4.M();
}
// Five wrapper levels: S5(S4(S3(S2(S1(C))))).
public static void ViaStruct5()
{
var s5 = new S5(new S4(new S3(new S2(new S1(new C())))));
s5.M();
}
}
// Reference type at the bottom of every wrapper chain in this snippet.
public class C
{
// NoInlining stops the JIT from inlining M(), so each caller ends in an explicit `call C.M()`.
[MethodImpl(MethodImplOptions.NoInlining)]
public void M()
{
}
}
// First wrapper level: a struct whose only field is a C reference.
public struct S1
{
private C _c;
// Stores the wrapped C.
public S1(C c) => _c = c;
// Forwards to C.M() on the wrapped instance.
public void M() => _c.M();
}
// Second wrapper level: a struct whose only field is an S1.
public struct S2
{
private S1 _s;
// Stores the wrapped S1 (copied by value).
public S2(S1 s) => _s = s;
// Forwards to S1.M().
public void M() => _s.M();
}
// Third wrapper level: a struct whose only field is an S2.
public struct S3
{
private S2 _s;
// Stores the wrapped S2 (copied by value).
public S3(S2 s) => _s = s;
// Forwards to S2.M().
public void M() => _s.M();
}
// Fourth wrapper level: a struct whose only field is an S3.
public struct S4
{
private S3 _s;
// Stores the wrapped S3 (copied by value).
public S4(S3 s) => _s = s;
// Forwards to S3.M().
public void M() => _s.M();
}
// Fifth (outermost) wrapper level: a struct whose only field is an S4.
public struct S5
{
private S4 _s;
// Stores the wrapped S4 (copied by value).
public S5(S4 s) => _s = s;
// Forwards to S4.M().
public void M() => _s.M();
}
Each additional wrapper introduces redundant work in the asm e.g.
Program.ViaClass()
L0000: sub rsp, 0x28
L0004: mov rcx, 0x7ffa88b77008
L000e: call 0x7ffadf232520
L0013: mov rcx, rax
L0016: call C.M()
L001b: nop
L001c: add rsp, 0x28
L0020: ret
Program.ViaStruct1()
L0000: sub rsp, 0x28
L0004: mov rcx, 0x7ffa88b77008
L000e: call 0x7ffadf232520
L0013: mov rcx, rax
L0016: call C.M()
L001b: nop
L001c: add rsp, 0x28
L0020: ret
Program.ViaStruct2()
L0000: sub rsp, 0x28
L0004: mov rcx, 0x7ffa88b77008
L000e: call 0x7ffadf232520
L0013: mov rcx, rax
L0016: mov rax, rcx
L0019: mov rcx, rax
L001c: mov [rsp+0x20], rcx
L0021: mov rcx, [rsp+0x20]
L0026: cmp [rcx], ecx
L0028: call C.M()
L002d: nop
L002e: add rsp, 0x28
L0032: ret
Program.ViaStruct3()
L0000: sub rsp, 0x28
L0004: xor eax, eax
L0006: mov [rsp+0x20], rax
L000b: mov rcx, 0x7ffa88b77008
L0015: call 0x7ffadf232520
L001a: mov rcx, rax
L001d: mov rax, rcx
L0020: mov rcx, rax
L0023: mov rax, rcx
L0026: mov rcx, rax
L0029: lea rax, [rsp+0x20]
L002e: mov [rax], rcx
L0031: mov rcx, [rsp+0x20]
L0036: cmp [rcx], ecx
L0038: call C.M()
L003d: nop
L003e: add rsp, 0x28
L0042: ret
Program.ViaStruct4()
L0000: push rdi
L0001: sub rsp, 0x40
L0005: lea rdi, [rsp+0x28]
L000a: mov ecx, 0x6
L000f: xor eax, eax
L0011: rep stosd
L0013: mov rcx, 0x7ffa88b77008
L001d: call 0x7ffadf232520
L0022: mov rcx, rax
L0025: mov rax, rcx
L0028: mov rcx, rax
L002b: mov rax, rcx
L002e: xor ecx, ecx
L0030: mov [rsp+0x30], rcx
L0035: mov rcx, rax
L0038: lea rax, [rsp+0x30]
L003d: mov [rax], rcx
L0040: mov rcx, [rsp+0x30]
L0045: mov [rsp+0x28], rcx
L004a: mov rcx, [rsp+0x28]
L004f: lea rax, [rsp+0x38]
L0054: mov [rax], rcx
L0057: mov rcx, [rsp+0x38]
L005c: cmp [rcx], ecx
L005e: call C.M()
L0063: nop
L0064: add rsp, 0x40
L0068: pop rdi
L0069: ret
Program.ViaStruct5()
L0000: push rdi
L0001: sub rsp, 0x50
L0005: lea rdi, [rsp+0x28]
L000a: mov ecx, 0xa
L000f: xor eax, eax
L0011: rep stosd
L0013: mov rcx, 0x7ffa88b77008
L001d: call 0x7ffadf232520
L0022: mov rcx, rax
L0025: mov rax, rcx
L0028: mov rcx, rax
L002b: mov rax, rcx
L002e: xor ecx, ecx
L0030: mov [rsp+0x40], rcx
L0035: mov rcx, rax
L0038: lea rax, [rsp+0x40]
L003d: mov [rax], rcx
L0040: xor ecx, ecx
L0042: mov [rsp+0x38], rcx
L0047: mov rcx, [rsp+0x40]
L004c: mov [rsp+0x30], rcx
L0051: mov rcx, [rsp+0x30]
L0056: lea rax, [rsp+0x38]
L005b: mov [rax], rcx
L005e: mov rcx, [rsp+0x38]
L0063: mov [rsp+0x28], rcx
L0068: mov rcx, [rsp+0x28]
L006d: lea rax, [rsp+0x48]
L0072: mov [rax], rcx
L0075: mov rcx, [rsp+0x48]
L007a: cmp [rcx], ecx
L007c: call C.M()
L0081: nop
L0082: add rsp, 0x50
L0086: pop rdi
L0087: ret
Method calls themselves are almost zero cost, or at least don't increase at the same rate:
// Call-only baseline: the C is constructed by the caller, so only the M() call is measured here.
public static void ViaClass(C c)
{
c.M();
}
// Call-only variant for a pre-built S1. The ViaStruct3 name is an overload shared with the
// S2 and S3 variants; the disassembly for this one is labelled Program.ViaStruct3(S1).
public static void ViaStruct3(S1 s1)
{
s1.M();
}
// Call-only variant for a pre-built S2 (overload of ViaStruct3; listed as Program.ViaStruct3(S2)).
public static void ViaStruct3(S2 s2)
{
s2.M();
}
// Call-only variant for a pre-built S3: no constructor runs in this method.
public static void ViaStruct3(S3 s3)
{
s3.M();
}
// Call-only variant for a pre-built S4: no constructor runs in this method.
public static void ViaStruct4(S4 s4)
{
s4.M();
}
// Call-only variant for a pre-built S5: no constructor runs in this method.
public static void ViaStruct5(S5 s5)
{
s5.M();
}
Gives
Program.ViaClass(C)
L0000: sub rsp, 0x28
L0004: cmp [rcx], ecx
L0006: call C.M()
L000b: nop
L000c: add rsp, 0x28
L0010: ret
Program.ViaStruct3(S1)
L0000: sub rsp, 0x28
L0004: cmp [rcx], ecx
L0006: call C.M()
L000b: nop
L000c: add rsp, 0x28
L0010: ret
Program.ViaStruct3(S2)
L0000: sub rsp, 0x28
L0004: mov [rsp+0x30], rcx
L0009: mov rcx, [rsp+0x30]
L000e: cmp [rcx], ecx
L0010: call C.M()
L0015: nop
L0016: add rsp, 0x28
L001a: ret
Program.ViaStruct3(S3)
L0000: sub rsp, 0x28
L0004: mov [rsp+0x30], rcx
L0009: mov rcx, [rsp+0x30]
L000e: cmp [rcx], ecx
L0010: call C.M()
L0015: nop
L0016: add rsp, 0x28
L001a: ret
Program.ViaStruct4(S4)
L0000: sub rsp, 0x28
L0004: mov [rsp+0x30], rcx
L0009: mov rcx, [rsp+0x30]
L000e: cmp [rcx], ecx
L0010: call C.M()
L0015: nop
L0016: add rsp, 0x28
L001a: ret
Program.ViaStruct5(S5)
L0000: sub rsp, 0x28
L0004: mov [rsp+0x30], rcx
L0009: mov rcx, [rsp+0x30]
L000e: cmp [rcx], ecx
L0010: call C.M()
L0015: nop
L0016: add rsp, 0x28
L001a: ret
This becomes more problematic as the size of the original struct increases, for example Span<T> wrappers: sharplab.io
using System;
using System.Runtime.CompilerServices;
// Span variant of the repro: each method wraps the incoming Span<Byte> in one to five
// nested ref structs and returns Length read through the outermost wrapper.
public static class Program
{
// One wrapper level: S1(Span<Byte>).
public static int ViaStruct1(Span<Byte> s)
{
var s1 = new S1(s);
return s1.Length;
}
// Two wrapper levels: S2(S1(span)).
public static int ViaStruct2(Span<Byte> s)
{
var s2 = new S2(new S1(s));
return s2.Length;
}
// Three wrapper levels: S3(S2(S1(span))).
public static int ViaStruct3(Span<Byte> s)
{
var s3 = new S3(new S2(new S1(s)));
return s3.Length;
}
// Four wrapper levels: S4(S3(S2(S1(span)))).
public static int ViaStruct4(Span<Byte> s)
{
var s4 = new S4(new S3(new S2(new S1(s))));
return s4.Length;
}
// Five wrapper levels: S5(S4(S3(S2(S1(span))))).
public static int ViaStruct5(Span<Byte> s)
{
var s5 = new S5(new S4(new S3(new S2(new S1(s)))));
return s5.Length;
}
}
// First wrapper level: a ref struct whose only field is a Span<Byte>.
public ref struct S1
{
private Span<Byte> _s;
// Stores the wrapped span.
public S1(Span<Byte> s) => _s = s;
// Forwards to Span<Byte>.Length.
public int Length => _s.Length;
}
// Second wrapper level: a ref struct whose only field is an S1.
public ref struct S2
{
private S1 _s;
// Stores the wrapped S1 (copied by value).
public S2(S1 s) => _s = s;
// Forwards to S1.Length.
public int Length => _s.Length;
}
// Third wrapper level: a ref struct whose only field is an S2.
public ref struct S3
{
private S2 _s;
// Stores the wrapped S2 (copied by value).
public S3(S2 s) => _s = s;
// Forwards to S2.Length.
public int Length => _s.Length;
}
// Fourth wrapper level: a ref struct whose only field is an S3.
public ref struct S4
{
private S3 _s;
// Stores the wrapped S3 (copied by value).
public S4(S3 s) => _s = s;
// Forwards to S3.Length.
public int Length => _s.Length;
}
// Fifth (outermost) wrapper level: a ref struct whose only field is an S4.
public ref struct S5
{
private S4 _s;
// Stores the wrapped S4 (copied by value).
public S5(S4 s) => _s = s;
// Forwards to S4.Length.
public int Length => _s.Length;
}
Giving
Program.ViaStruct1(System.Span`1<Byte>)
L0000: push rdi
L0001: push rsi
L0002: sub rsp, 0x18
L0006: mov rsi, rcx
L0009: lea rdi, [rsp]
L000d: mov ecx, 0x6
L0012: xor eax, eax
L0014: rep stosd
L0016: mov rcx, rsi
L0019: mov rax, [rcx]
L001c: mov rdx, [rcx+0x8]
L0020: mov ecx, [rcx+0x10]
L0023: lea r8, [rsp]
L0027: mov [r8], rax
L002a: mov [r8+0x8], rdx
L002e: mov [r8+0x10], ecx
L0032: mov eax, [rsp+0x10]
L0036: add rsp, 0x18
L003a: pop rsi
L003b: pop rdi
L003c: ret
Program.ViaStruct2(System.Span`1<Byte>)
L0000: push rdi
L0001: push rsi
L0002: sub rsp, 0x48
L0006: vzeroupper
L0009: mov rsi, rcx
L000c: lea rdi, [rsp]
L0010: mov ecx, 0x12
L0015: xor eax, eax
L0017: rep stosd
L0019: mov rcx, rsi
L001c: xor eax, eax
L001e: lea rdx, [rsp+0x18]
L0023: vxorpd xmm0, xmm0, xmm0
L0028: vmovdqu [rdx], xmm0
L002d: mov [rdx+0x10], rax
L0031: mov rax, [rcx]
L0034: mov rdx, [rcx+0x8]
L0038: mov ecx, [rcx+0x10]
L003b: lea r8, [rsp+0x18]
L0040: mov [r8], rax
L0043: mov [r8+0x8], rdx
L0047: mov [r8+0x10], ecx
L004b: vmovdqu xmm0, [rsp+0x18]
L0052: vmovdqu [rsp], xmm0
L0058: mov rax, [rsp+0x28]
L005d: mov [rsp+0x10], rax
L0062: vmovdqu xmm0, [rsp]
L0068: vmovdqu [rsp+0x30], xmm0
L006f: mov rax, [rsp+0x10]
L0074: mov [rsp+0x40], rax
L0079: mov eax, [rsp+0x40]
L007d: add rsp, 0x48
L0081: pop rsi
L0082: pop rdi
L0083: ret
Program.ViaStruct3(System.Span`1<Byte>)
L0000: push rdi
L0001: push rsi
L0002: sub rsp, 0x68
L0006: vzeroupper
L0009: mov rsi, rcx
L000c: lea rdi, [rsp+0x8]
L0011: mov ecx, 0x18
L0016: xor eax, eax
L0018: rep stosd
L001a: mov rcx, rsi
L001d: xor eax, eax
L001f: lea rdx, [rsp+0x38]
L0024: vxorpd xmm0, xmm0, xmm0
L0029: vmovdqu [rdx], xmm0
L002e: mov [rdx+0x10], rax
L0032: mov rax, [rcx]
L0035: mov rdx, [rcx+0x8]
L0039: mov ecx, [rcx+0x10]
L003c: lea r8, [rsp+0x38]
L0041: mov [r8], rax
L0044: mov [r8+0x8], rdx
L0048: mov [r8+0x10], ecx
L004c: vmovdqu xmm0, [rsp+0x38]
L0053: vmovdqu [rsp+0x8], xmm0
L005a: mov rax, [rsp+0x48]
L005f: mov [rsp+0x18], rax
L0064: vmovdqu xmm0, [rsp+0x8]
L006b: vmovdqu [rsp+0x20], xmm0
L0072: mov rax, [rsp+0x18]
L0077: mov [rsp+0x30], rax
L007c: vmovdqu xmm0, [rsp+0x20]
L0083: vmovdqu [rsp+0x50], xmm0
L008a: mov rax, [rsp+0x30]
L008f: mov [rsp+0x60], rax
L0094: mov eax, [rsp+0x60]
L0098: add rsp, 0x68
L009c: pop rsi
L009d: pop rdi
L009e: ret
Program.ViaStruct4(System.Span`1<Byte>)
L0000: push rdi
L0001: push rsi
L0002: sub rsp, 0x78
L0006: vzeroupper
L0009: mov rsi, rcx
L000c: lea rdi, [rsp]
L0010: mov ecx, 0x1e
L0015: xor eax, eax
L0017: rep stosd
L0019: mov rcx, rsi
L001c: xor eax, eax
L001e: lea rdx, [rsp+0x48]
L0023: vxorpd xmm0, xmm0, xmm0
L0028: vmovdqu [rdx], xmm0
L002d: mov [rdx+0x10], rax
L0031: mov rax, [rcx]
L0034: mov rdx, [rcx+0x8]
L0038: mov ecx, [rcx+0x10]
L003b: lea r8, [rsp+0x48]
L0040: mov [r8], rax
L0043: mov [r8+0x8], rdx
L0047: mov [r8+0x10], ecx
L004b: vmovdqu xmm0, [rsp+0x48]
L0052: vmovdqu [rsp], xmm0
L0058: mov rax, [rsp+0x58]
L005d: mov [rsp+0x10], rax
L0062: vmovdqu xmm0, [rsp]
L0068: vmovdqu [rsp+0x30], xmm0
L006f: mov rax, [rsp+0x10]
L0074: mov [rsp+0x40], rax
L0079: vmovdqu xmm0, [rsp+0x30]
L0080: vmovdqu [rsp+0x18], xmm0
L0087: mov rax, [rsp+0x40]
L008c: mov [rsp+0x28], rax
L0091: vmovdqu xmm0, [rsp+0x18]
L0098: vmovdqu [rsp+0x60], xmm0
L009f: mov rax, [rsp+0x28]
L00a4: mov [rsp+0x70], rax
L00a9: mov eax, [rsp+0x70]
L00ad: add rsp, 0x78
L00b1: pop rsi
L00b2: pop rdi
L00b3: ret
Program.ViaStruct5(System.Span`1<Byte>)
L0000: push rdi
L0001: push rsi
L0002: sub rsp, 0x98
L0009: vzeroupper
L000c: mov rsi, rcx
L000f: lea rdi, [rsp+0x8]
L0014: mov ecx, 0x24
L0019: xor eax, eax
L001b: rep stosd
L001d: mov rcx, rsi
L0020: xor eax, eax
L0022: lea rdx, [rsp+0x68]
L0027: vxorpd xmm0, xmm0, xmm0
L002c: vmovdqu [rdx], xmm0
L0031: mov [rdx+0x10], rax
L0035: mov rax, [rcx]
L0038: mov rdx, [rcx+0x8]
L003c: mov ecx, [rcx+0x10]
L003f: lea r8, [rsp+0x68]
L0044: mov [r8], rax
L0047: mov [r8+0x8], rdx
L004b: mov [r8+0x10], ecx
L004f: vmovdqu xmm0, [rsp+0x68]
L0056: vmovdqu [rsp+0x8], xmm0
L005d: mov rax, [rsp+0x78]
L0062: mov [rsp+0x18], rax
L0067: vmovdqu xmm0, [rsp+0x8]
L006e: vmovdqu [rsp+0x50], xmm0
L0075: mov rax, [rsp+0x18]
L007a: mov [rsp+0x60], rax
L007f: vmovdqu xmm0, [rsp+0x50]
L0086: vmovdqu [rsp+0x38], xmm0
L008d: mov rax, [rsp+0x60]
L0092: mov [rsp+0x48], rax
L0097: vmovdqu xmm0, [rsp+0x38]
L009e: vmovdqu [rsp+0x20], xmm0
L00a5: mov rax, [rsp+0x48]
L00aa: mov [rsp+0x30], rax
L00af: vmovdqu xmm0, [rsp+0x20]
L00b6: vmovdqu [rsp+0x80], xmm0
L00c0: mov rax, [rsp+0x30]
L00c5: mov [rsp+0x90], rax
L00cd: mov eax, [rsp+0x90]
L00d4: add rsp, 0x98
L00db: pop rsi
L00dc: pop rdi
L00dd: ret
Again the method call wrapping doesn't increase in complexity
// Read-only variant: the S1 is built by the caller; only the Length read happens here.
public static int ViaStruct(S1 s)
=> s.Length;
// Read-only variant for a pre-built S2: Length is read through two wrapper levels.
public static int ViaStruct(S2 s)
=> s.Length;
// Read-only variant for a pre-built S3: Length is read through three wrapper levels.
public static int ViaStruct(S3 s)
=> s.Length;
// Read-only variant for a pre-built S4: Length is read through four wrapper levels.
public static int ViaStruct(S4 s)
=> s.Length;
// Read-only variant for a pre-built S5: Length is read through five wrapper levels.
public static int ViaStruct(S5 s)
=> s.Length;
Giving
Program.ViaStruct(S1)
L0000: mov eax, [rcx+0x10]
L0003: ret
Program.ViaStruct(S2)
L0000: mov eax, [rcx+0x10]
L0003: ret
Program.ViaStruct(S3)
L0000: mov eax, [rcx+0x10]
L0003: ret
Program.ViaStruct(S4)
L0000: mov eax, [rcx+0x10]
L0003: ret
Program.ViaStruct(S5)
L0000: mov eax, [rcx+0x10]
L0003: ret
@mikedn @AndyAyersMS the extended mov shimmy
L001a: mov rcx, rax
L001d: mov rax, rcx
L0020: mov rcx, rax
L0023: mov rax, rcx
L0026: mov rcx, rax
L0029: lea rax, [rsp+0x20]
L002e: mov [rax], rcx
L0031: mov rcx, [rsp+0x20]
L0036: cmp [rcx], ecx
L0038: call C.M()
Possibly related: dotnet/runtime#7279, dotnet/runtime#9165, dotnet/runtime#9519, dotnet/runtime#10296.
Last time I looked there were some quirks in how the RA's preferencing worked that could lead to this behavior. I tried tweaking this some with dotnet/coreclr#16028 but am not convinced it really helped much.
If you have just the right pattern of live ranges -- something like a local with long span between first def and last use (and intermittent liveness) and a bunch of chained temps within -- the RA wants to restore the local to the same reg at each point it becomes live. So you can get a back and forth pattern going, and by layering inlines like you do here, you can extend it to alarming lengths.
cc @CarolEidt and @dotnet/jit-contrib
Note one of the proposals for UTF8 wraps ReadOnlySpan
Can't get the overall size of the ref struct as all my tools work with a generic param, and ref structs don't like that; but the example I was looking at is in https://github.com/dotnet/corefxlab/pull/2366
// Create the writer, then make one call on it. The comments in between trace the chain of
// constructors and fields that get inlined into the CreateUtf8 call.
// The local is named `json` so that the WriteObjectStart() call at the end of this snippet refers to it.
JsonWriterUtf8 json = JsonWriter.CreateUtf8(output, prettyPrint);
// Inlines:
// return new JsonWriterUtf8(BufferWriter.Create(new IBufferWriter(bufferWriter)), prettyPrint);
// public IBufferWriter(IBufferWriter<byte> writer)
// IBufferWriter<byte> _writer;
// static BufferWriter<TOutput> Create<TOutput>(TOutput output)
// new BufferWriter<TOutput>(output);
// public ref partial struct BufferWriter<T> where T : IBufferWriter<byte>
// IBufferWriter _output;
// Span<byte> _span;
// int _buffered;
// ReadOnlySpan<byte> NewLine
// new JsonWriterUtf8
// ref struct JsonWriterUtf8
// JsonWriterUtf8<IBufferWriter>
// new JsonWriterUtf8<IBufferWriter>(bufferWriter, prettyPrint);
// ref struct JsonWriterUtf8<JsonWriterUtf8<IBufferWriter>>
// readonly bool _prettyPrint;
// BufferWriter<JsonWriterUtf8<IBufferWriter>> _bufferWriter;
// int _indent;
// bool _firstItem;
json.WriteObjectStart();
Becomes
// JsonWriterUtf8 json = JsonWriter.CreateUtf8(output, formatted);
mov rcx,rdx
lea rdx,[rsp+88h]
vxorps xmm0,xmm0,xmm0
vmovdqu xmmword ptr [rdx],xmm0
vmovdqu xmmword ptr [rdx+10h],xmm0
vmovdqu xmmword ptr [rdx+20h],xmm0
xor edx,edx
mov dword ptr [rsp+88h],edx
lea rdx,[rsp+90h]
mov qword ptr [rdx],rcx
lea rdx,[rsp+98h]
mov r11,7FFC140F0340h
xor r8d,r8d
mov rax,7FFC140F0340h
cmp dword ptr [rcx],ecx
call qword ptr [rax]
mov rcx,7FFC1435F2C8h
xor edx,edx
call coreclr!GC_VersionInfo+0x25a30
mov rcx,21F58C52C90h
mov rcx,qword ptr [rcx]
test rcx,rcx
jne M01_L00
xor eax,eax
xor edx,edx
jmp M01_L01
M01_L00
lea rax,[rcx+10h]
mov edx,dword ptr [rcx+8]
M01_L01
lea rcx,[rsp+0A8h]
mov qword ptr [rcx],rax
mov dword ptr [rcx+8],edx
vmovdqu xmm0,xmmword ptr [rsp+88h]
vmovdqu xmmword ptr [rsp+0B8h],xmm0
vmovdqu xmm0,xmmword ptr [rsp+98h]
vmovdqu xmmword ptr [rsp+0C8h],xmm0
vmovdqu xmm0,xmmword ptr [rsp+0A8h]
vmovdqu xmmword ptr [rsp+0D8h],xmm0
vmovdqu xmm0,xmmword ptr [rsp+0B8h]
vmovdqu xmmword ptr [rsp+58h],xmm0
vmovdqu xmm0,xmmword ptr [rsp+0C8h]
vmovdqu xmmword ptr [rsp+68h],xmm0
vmovdqu xmm0,xmmword ptr [rsp+0D8h]
vmovdqu xmmword ptr [rsp+78h],xmm0
xor ecx,ecx
lea rax,[rsp+20h]
vxorps xmm0,xmm0,xmm0
vmovdqu xmmword ptr [rax],xmm0
vmovdqu xmmword ptr [rax+10h],xmm0
vmovdqu xmmword ptr [rax+20h],xmm0
mov qword ptr [rax+30h],rcx
vmovdqu xmm0,xmmword ptr [rsp+58h]
vmovdqu xmmword ptr [rsp+28h],xmm0
vmovdqu xmm0,xmmword ptr [rsp+68h]
vmovdqu xmmword ptr [rsp+38h],xmm0
vmovdqu xmm0,xmmword ptr [rsp+78h]
vmovdqu xmmword ptr [rsp+48h],xmm0
movzx ecx,sil
mov byte ptr [rsp+24h],cl
mov dword ptr [rsp+20h],0FFFFFFFFh
mov byte ptr [rsp+25h],1
vmovdqu xmm0,xmmword ptr [rsp+20h]
vmovdqu xmmword ptr [rsp+0E8h],xmm0
vmovdqu xmm0,xmmword ptr [rsp+30h]
vmovdqu xmmword ptr [rsp+0F8h],xmm0
vmovdqu xmm0,xmmword ptr [rsp+40h]
vmovdqu xmmword ptr [rsp+108h],xmm0
mov rcx,qword ptr [rsp+50h]
mov qword ptr [rsp+118h],rcx
vmovdqu xmm0,xmmword ptr [rsp+0E8h]
vmovdqu xmmword ptr [rsp+120h],xmm0
vmovdqu xmm0,xmmword ptr [rsp+0F8h]
vmovdqu xmmword ptr [rsp+130h],xmm0
vmovdqu xmm0,xmmword ptr [rsp+108h]
vmovdqu xmmword ptr [rsp+140h],xmm0
mov rcx,qword ptr [rsp+118h]
mov qword ptr [rsp+150h],rcx
lea rcx,[rsp+120h]
call System.Text.JsonLab.JsonWriterUtf8`1[[System.Text.JsonLab.IBufferWriter, System.Text.JsonLab]].WriteObjectStart()
Interesting that I see different disassembly from BenchmarkDotNet (both on .Net Core 2.1 and .Net Framework 4.7.2):



Interesting that I see different disassembly from BenchmarkDotNet (both on .Net Core 2.1 and .Net Framework 4.7.2):
So it is less bad in .NET Core 2.1, but still not great: they should all collapse to the same code as the class version (when inlined), rather than shuffling the pointer back and forth between registers, moving it higher and higher through stack space, and finally returning it to the original register before invoking the call.
The JIT compilers are different between the two: (.Net Core 2.1 and .Net Framework 4.7.2):
With Net Core 2.1 being newer.
Guessing this would also affect the sync path in async, as that is lots of structs wrapping structs? (though exception handling may complicate things)
JitDump for ViaStruct5 https://gist.github.com/benaadams/ad76003a334f3da6916ac5c8a80d95d1
I suppose an orthogonal question is why it generates:
G_M45457_IG01: ; func=00, offs=000000H, size=0013H, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, byref, nogc <-- Prolog IG
IN0018: 000000 push rdi
IN0019: 000001 sub rsp, 80
IN001a: 000005 lea rdi, [rsp+28H]
IN001b: 00000A mov ecx, 10
IN001c: 00000F xor rax, rax
IN001d: 000011 rep stosd
G_M45457_IG02: ; offs=000013H, size=0061H, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, byref
IN0001: 000013 mov rcx, 0x7FFC12345508
IN0002: 00001D call CORINFO_HELP_NEWSFAST
IN0003: 000022 mov rcx, rax
IN0004: 000025 mov rax, rcx
IN0005: 000028 mov rcx, rax
IN0006: 00002B mov rax, rcx
IN0007: 00002E mov rcx, rax
IN0008: 000031 lea rax, bword ptr [V05 rsp+40H]
IN0009: 000036 mov gword ptr [rax], rcx
IN000a: 000039 mov rcx, gword ptr [V05 rsp+40H]
IN000b: 00003E mov gword ptr [V09 rsp+30H], rcx
IN000c: 000043 mov rcx, gword ptr [V09 rsp+30H]
IN000d: 000048 lea rax, bword ptr [V06 rsp+38H]
IN000e: 00004D mov gword ptr [rax], rcx
IN000f: 000050 mov rcx, gword ptr [V06 rsp+38H]
IN0010: 000055 mov gword ptr [V10 rsp+28H], rcx
IN0011: 00005A mov rcx, gword ptr [V10 rsp+28H]
IN0012: 00005F lea rax, bword ptr [V00 rsp+48H]
IN0013: 000064 mov gword ptr [rax], rcx
IN0014: 000067 mov rcx, gword ptr [V00 rsp+48H]
IN0015: 00006C cmp dword ptr [rcx], ecx
IN0016: 00006E call C:M():this
IN0017: 000073 nop
G_M45457_IG03: ; offs=000074H, size=0006H, epilog, nogc, emitadd
IN001e: 000074 add rsp, 80
IN001f: 000078 pop rdi
IN0020: 000079 ret
Rather than something like
push rdi
sub rsp, 0x28
mov rcx, 0x7FFC12345508
call CORINFO_HELP_NEWSFAST
mov rcx, rax
mov rax, rcx
mov rcx, rax
mov rax, rcx
mov rcx, rax
mov rax, rcx
mov rcx, rax
mov rax, rcx
mov rcx, rax
mov rax, rcx
mov rcx, rax
mov rax, rcx
mov rcx, rax
mov rax, rcx
mov rcx, rax
call C:M():this
nop
add rsp, 0x28
pop rdi
ret
; V00 loc0 [V00,T05] ( 2, 2 ) struct ( 8) [rsp+0x48] do-not-enreg[SF] must-init ld-addr-op
; V01 OutArgs [V01 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
; V02 tmp1 [V02,T00] ( 2, 4 ) ref -> rcx class-hnd exact
;* V03 tmp2 [V03 ] ( 0, 0 ) struct ( 8) zero-ref
;* V04 tmp3 [V04 ] ( 0, 0 ) struct ( 8) zero-ref
; V05 tmp4 [V05 ] ( 2, 4 ) struct ( 8) [rsp+0x40] do-not-enreg[XSFB] must-init addr-exposed
; V06 tmp5 [V06 ] ( 2, 4 ) struct ( 8) [rsp+0x38] do-not-enreg[XSF] must-init addr-exposed
;* V07 tmp6 [V07 ] ( 0, 0 ) struct ( 8) zero-ref
;* V08 tmp7 [V08 ] ( 0, 0 ) struct ( 8) zero-ref
; V09 tmp8 [V09 ] ( 2, 4 ) struct ( 8) [rsp+0x30] do-not-enreg[XSF] must-init addr-exposed
; V10 tmp9 [V10 ] ( 2, 4 ) struct ( 8) [rsp+0x28] do-not-enreg[XSF] must-init addr-exposed
do-not-enreg[XSF] must-init addr-exposed?
Is it because they are passed byref?
[000063] --C-G------- * CALL nullcheck void C.M
[000062] ----G------- this in rcx \--* FIELD ref _c
[000052] ----G------- \--* ADDR byref
[000053] ----G------- \--* FIELD struct _s
[000054] ----G------- \--* ADDR byref
[000055] ----G------- \--* FIELD struct _s
[000056] ----G------- \--* ADDR byref
[000057] ----G------- \--* FIELD struct _s
[000058] ----G------- \--* ADDR byref
[000059] ----G------- \--* FIELD struct _s
[000060] L----------- \--* ADDR byref
[000061] ------------ \--* LCL_VAR struct V00 arg0
"Inlining" via Unsafe.As gets a bit closer, though obviously its not very flexible; and still has some in and out of stack action:
// Manual "inlining" experiment: only the S1 constructor runs; the S1 is then reinterpreted
// as an S5 through Unsafe.As instead of going through the S2..S5 constructors.
public static void ViaStruct5()
{
var s1 = new S1(new C());
// Reinterprets the reference to s1 as a reference to S5 and copies the value into s5.
var s5 = Unsafe.As<S1, S5>(ref s1);
s5.M();
}
; Assembly listing for method Program:ViaStruct5()
;* V00 loc0 [V00 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op
; V01 loc1 [V01,T02] ( 2, 2 ) struct ( 8) [rsp+0x20] do-not-enreg[SFB] must-init ld-addr-op
; V02 OutArgs [V02 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
; V03 tmp1 [V03,T00] ( 2, 4 ) ref -> rax class-hnd exact
; V04 tmp2 [V04,T01] ( 2, 2 ) ref -> rax V00._c(offs=0x00) P-INDEP
;
; Lcl frame size = 40
G_M30623_IG01:
4883EC28 sub rsp, 40
33C0 xor rax, rax
4889442420 mov qword ptr [rsp+20H], rax
G_M30623_IG02:
48B908555B0DFC7F0000 mov rcx, 0x7FFC0D5B5508
E81627AE5F call CORINFO_HELP_NEWSFAST
488D4C2420 lea rcx, bword ptr [rsp+20H]
488901 mov gword ptr [rcx], rax
488B4C2420 mov rcx, gword ptr [rsp+20H]
3909 cmp dword ptr [rcx], ecx
E842FEFFFF call C:M():this
90 nop
G_M30623_IG03:
4883C428 add rsp, 40
C3 ret
; Total bytes of code 52, prolog size 11 for method Program:ViaStruct5()
Rather than
// Original form for comparison: all five wrapper constructors are nested around the new C.
public static void ViaStruct5()
{
var s5 = new S5(new S4(new S3(new S2(new S1(new C())))));
s5.M();
}
; Assembly listing for method Program:ViaStruct5()
; V00 loc0 [V00,T05] ( 2, 2 ) struct ( 8) [rsp+0x48] do-not-enreg[SF] must-init ld-addr-op
; V01 OutArgs [V01 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
; V02 tmp1 [V02,T00] ( 2, 4 ) ref -> rcx class-hnd exact
;* V03 tmp2 [V03 ] ( 0, 0 ) struct ( 8) zero-ref
;* V04 tmp3 [V04 ] ( 0, 0 ) struct ( 8) zero-ref
; V05 tmp4 [V05 ] ( 2, 4 ) struct ( 8) [rsp+0x40] do-not-enreg[XSFB] must-init addr-exposed
; V06 tmp5 [V06 ] ( 2, 4 ) struct ( 8) [rsp+0x38] do-not-enreg[XSF] must-init addr-exposed
;* V07 tmp6 [V07 ] ( 0, 0 ) struct ( 8) zero-ref
;* V08 tmp7 [V08 ] ( 0, 0 ) struct ( 8) zero-ref
; V09 tmp8 [V09 ] ( 2, 4 ) struct ( 8) [rsp+0x30] do-not-enreg[XSF] must-init addr-exposed
; V10 tmp9 [V10 ] ( 2, 4 ) struct ( 8) [rsp+0x28] do-not-enreg[XSF] must-init addr-exposed
; V11 tmp10 [V11,T01] ( 2, 2 ) ref -> rax V03._c(offs=0x00) P-INDEP
; V12 tmp11 [V12,T02] ( 2, 2 ) ref -> rax V04._s(offs=0x00) P-INDEP
; V13 tmp12 [V13,T03] ( 2, 2 ) ref -> rcx V07._c(offs=0x00) P-INDEP
; V14 tmp13 [V14,T04] ( 2, 2 ) ref -> rcx V08._s(offs=0x00) P-INDEP
;
; Lcl frame size = 80
G_M30626_IG01:
57 push rdi
4883EC50 sub rsp, 80
488D7C2428 lea rdi, [rsp+28H]
B90A000000 mov ecx, 10
33C0 xor rax, rax
F3AB rep stosd
G_M30626_IG02:
48B90855590DFC7F0000 mov rcx, 0x7FFC0D595508
E80E27B05F call CORINFO_HELP_NEWSFAST
488BC8 mov rcx, rax
488BC1 mov rax, rcx
488BC8 mov rcx, rax
488BC1 mov rax, rcx
488BC8 mov rcx, rax
488D442440 lea rax, bword ptr [rsp+40H]
488908 mov gword ptr [rax], rcx
488B4C2440 mov rcx, gword ptr [rsp+40H]
48894C2430 mov gword ptr [rsp+30H], rcx
488B4C2430 mov rcx, gword ptr [rsp+30H]
488D442438 lea rax, bword ptr [rsp+38H]
488908 mov gword ptr [rax], rcx
488B4C2438 mov rcx, gword ptr [rsp+38H]
48894C2428 mov gword ptr [rsp+28H], rcx
488B4C2428 mov rcx, gword ptr [rsp+28H]
488D442448 lea rax, bword ptr [rsp+48H]
488908 mov gword ptr [rax], rcx
488B4C2448 mov rcx, gword ptr [rsp+48H]
3909 cmp dword ptr [rcx], ecx
E8FDFDFFFF call C:M():this
90 nop
G_M30626_IG03:
4883C450 add rsp, 80
5F pop rdi
C3 ret
; Total bytes of code 122, prolog size 19 for method Program:ViaStruct5()
Also note struct promotion is not as general as one might hope -- struct of structs generally won't get promoted except in one very special case (struct of struct of primitive).
Excuse my poor drawings...
What I was hoping it would do when all the .ctors were inlined and then optimized would be:

Requiring sizeof(Struct5) of stack space; if it was a single pointer promotion would be a bonus.
However what it seems to do is

Requiring sizeof(Struct5) + sizeof(Struct4) + sizeof(Struct3) + sizeof(Struct2) + sizeof(Struct1) of stack space and lots of copy ops.
If I've got that right?
Could be -- the jit will be very conservative in places because of aliasing worries.
I'll take a look, maybe early next week some time?
That would be great! I think async has the same issue
e.g.
// Entry point: awaits ValueTask1() and returns its result.
static async Task<int> Main(string[] args)
{
return await ValueTask1();
}
// Awaits ValueTask2() with ConfigureAwait(false); this is the method whose state machine
// (<ValueTask1>d__1:MoveNext) is examined in the JitDump.
public static async ValueTask<int> ValueTask1()
{
return await ValueTask2().ConfigureAwait(false);
}
// Non-async: wraps the constant result 1 in a ValueTask<int> and returns it.
public static ValueTask<int> ValueTask2()
{
return new ValueTask<int>(1);
}
The JitDump for <ValueTask1>d__1:MoveNext() shows the same sort of thing for its inlined .ctors:
Inlines into 0600001F <ValueTask1>d__1:MoveNext():this
[1 IL=0010 TR=000067 06000003] [below ALWAYS_INLINE size] Program:ValueTask2():struct
[2 IL=0001 TR=000158 06002C45] [aggressive inline attribute] ValueTask`1:.ctor(int):this
[3 IL=0019 TR=000077 06002C57] [aggressive inline attribute] ValueTask`1:ConfigureAwait(bool):struct:this
[4 IL=0019 TR=000210 06002C48] [aggressive inline attribute] ValueTask`1:.ctor(ref,int,short,bool):this
[5 IL=0024 TR=000223 06005070] [aggressive inline attribute] ConfiguredValueTaskAwaitable`1:.ctor(struct):this
[6 IL=0028 TR=000087 06005071] [aggressive inline attribute] ConfiguredValueTaskAwaitable`1:GetAwaiter():struct:this
[7 IL=0006 TR=000293 06005072] [aggressive inline attribute] ConfiguredValueTaskAwaiter:.ctor(struct):this
[8 IL=0036 TR=000096 06005073] [aggressive inline attribute] ConfiguredValueTaskAwaiter:get_IsCompleted():bool:this
[9 IL=0006 TR=000322 06002C51] [aggressive inline attribute] ValueTask`1:get_IsCompleted():bool:this
[10 IL=0023 TR=000382 060029E5] [below ALWAYS_INLINE size] Task:get_IsCompleted():bool:this
[11 IL=0010 TR=000412 060029E6] [below ALWAYS_INLINE size] Task:IsCompletedMethod(int):bool
....
IN0009: 00004B lea r9, bword ptr [V16 rbp-58H]
IN000a: 00004F mov gword ptr [r9], rdx ; <-- loading rdx
IN000b: 000052 mov dword ptr [r9+8], ecx
IN000c: 000056 mov word ptr [r9+12], ax
IN000d: 00005B mov byte ptr [r9+14], r8b
G_M13791_IG04: ; offs=00005FH, size=000CH, nogc, emitadd
IN000e: 00005F vmovdqu xmm0, qword ptr [V16 rbp-58H] ; copy
IN000f: 000065 vmovdqu qword ptr [V12 rbp-48H], xmm0
G_M13791_IG05: ; offs=00006BH, size=000CH, nogc, emitadd
IN0010: 00006B vmovdqu xmm0, qword ptr [V12 rbp-48H] ; copy
IN0011: 000071 vmovdqu qword ptr [V05 rbp-38H], xmm0
G_M13791_IG06: ; offs=000077H, size=000CH, nogc, emitadd
IN0012: 000077 vmovdqu xmm0, qword ptr [V05 rbp-38H] ; copy
IN0013: 00007D vmovdqu qword ptr [V18 rbp-78H], xmm0
G_M13791_IG07: ; offs=000083H, size=000CH, nogc, emitadd
IN0014: 000083 vmovdqu xmm0, qword ptr [V18 rbp-78H] ; copy
IN0015: 000089 vmovdqu qword ptr [V17 rbp-68H], xmm0
G_M13791_IG08: ; offs=00008FH, size=000CH, nogc, emitadd
IN0016: 00008F vmovdqu xmm0, qword ptr [V17 rbp-68H] ; copy
IN0017: 000095 vmovdqu qword ptr [V03 rbp-28H], xmm0
G_M13791_IG09: ; offs=00009BH, size=0010H, isz, emitadd
IN0018: 00009B mov rsi, gword ptr [V03 rbp-28H] ; <--- this is rdx ?
IN0019: 00009F test rsi, rsi
Probably unhelpful; but I was manually doing this
4C8D4DA8 lea r9, bword ptr [rbp-58H]
498911 mov gword ptr [r9], rdx
41894908 mov dword ptr [r9+8], ecx
664189410C mov word ptr [r9+12], ax
4588410E mov byte ptr [r9+14], r8b
C4E17A6F45A8 vmovdqu xmm0, qword ptr [rbp-58H]
C4E17A7F45B8 vmovdqu qword ptr [rbp-48H], xmm0
C4E17A6F45B8 vmovdqu xmm0, qword ptr [rbp-48H]
C4E17A7F45C8 vmovdqu qword ptr [rbp-38H], xmm0
C4E17A6F45C8 vmovdqu xmm0, qword ptr [rbp-38H]
C4E17A7F4588 vmovdqu qword ptr [rbp-78H], xmm0
C4E17A6F4588 vmovdqu xmm0, qword ptr [rbp-78H]
C4E17A7F4598 vmovdqu qword ptr [rbp-68H], xmm0
C4E17A6F4598 vmovdqu xmm0, qword ptr [rbp-68H]
C4E17A7F45D8 vmovdqu qword ptr [rbp-28H], xmm0
488B75D8 mov rsi, gword ptr [rbp-28H]
4885F6 test rsi, rsi
7507 jne SHORT G_M2754_IG10
BF01000000 mov edi, 1
EB4E jmp SHORT G_M2754_IG12
I'd probably go through these steps
- 4C8D4DA8 lea r9, bword ptr [rbp-58H]
+ 4C8D4DA8 lea r9, bword ptr [rbp-48H]
498911 mov gword ptr [r9], rdx
41894908 mov dword ptr [r9+8], ecx
664189410C mov word ptr [r9+12], ax
4588410E mov byte ptr [r9+14], r8b
- C4E17A6F45A8 vmovdqu xmm0, qword ptr [rbp-58H]
- C4E17A7F45B8 vmovdqu qword ptr [rbp-48H], xmm0
C4E17A6F45B8 vmovdqu xmm0, qword ptr [rbp-48H]
C4E17A7F45C8 vmovdqu qword ptr [rbp-38H], xmm0
C4E17A6F45C8 vmovdqu xmm0, qword ptr [rbp-38H]
C4E17A7F4588 vmovdqu qword ptr [rbp-78H], xmm0
C4E17A6F4588 vmovdqu xmm0, qword ptr [rbp-78H]
C4E17A7F4598 vmovdqu qword ptr [rbp-68H], xmm0
C4E17A6F4598 vmovdqu xmm0, qword ptr [rbp-68H]
C4E17A7F45D8 vmovdqu qword ptr [rbp-28H], xmm0
488B75D8 mov rsi, gword ptr [rbp-28H]
4885F6 test rsi, rsi
7507 jne SHORT G_M2754_IG10
BF01000000 mov edi, 1
EB4E jmp SHORT G_M2754_IG12
- 4C8D4DA8 lea r9, bword ptr [rbp-48H]
+ 4C8D4DA8 lea r9, bword ptr [rbp-38H]
498911 mov gword ptr [r9], rdx
41894908 mov dword ptr [r9+8], ecx
664189410C mov word ptr [r9+12], ax
4588410E mov byte ptr [r9+14], r8b
- C4E17A6F45B8 vmovdqu xmm0, qword ptr [rbp-48H]
- C4E17A7F45C8 vmovdqu qword ptr [rbp-38H], xmm0
C4E17A6F45C8 vmovdqu xmm0, qword ptr [rbp-38H]
C4E17A7F4588 vmovdqu qword ptr [rbp-78H], xmm0
C4E17A6F4588 vmovdqu xmm0, qword ptr [rbp-78H]
C4E17A7F4598 vmovdqu qword ptr [rbp-68H], xmm0
C4E17A6F4598 vmovdqu xmm0, qword ptr [rbp-68H]
C4E17A7F45D8 vmovdqu qword ptr [rbp-28H], xmm0
488B75D8 mov rsi, gword ptr [rbp-28H]
4885F6 test rsi, rsi
7507 jne SHORT G_M2754_IG10
BF01000000 mov edi, 1
EB4E jmp SHORT G_M2754_IG12
- 4C8D4DA8 lea r9, bword ptr [rbp-38H]
+ 4C8D4DA8 lea r9, bword ptr [rbp-78H]
498911 mov gword ptr [r9], rdx
41894908 mov dword ptr [r9+8], ecx
664189410C mov word ptr [r9+12], ax
4588410E mov byte ptr [r9+14], r8b
- C4E17A6F45C8 vmovdqu xmm0, qword ptr [rbp-38H]
- C4E17A7F4588 vmovdqu qword ptr [rbp-78H], xmm0
C4E17A6F4588 vmovdqu xmm0, qword ptr [rbp-78H]
C4E17A7F4598 vmovdqu qword ptr [rbp-68H], xmm0
C4E17A6F4598 vmovdqu xmm0, qword ptr [rbp-68H]
C4E17A7F45D8 vmovdqu qword ptr [rbp-28H], xmm0
488B75D8 mov rsi, gword ptr [rbp-28H]
4885F6 test rsi, rsi
7507 jne SHORT G_M2754_IG10
BF01000000 mov edi, 1
EB4E jmp SHORT G_M2754_IG12
- 4C8D4DA8 lea r9, bword ptr [rbp-78H]
+ 4C8D4DA8 lea r9, bword ptr [rbp-68H]
498911 mov gword ptr [r9], rdx
41894908 mov dword ptr [r9+8], ecx
664189410C mov word ptr [r9+12], ax
4588410E mov byte ptr [r9+14], r8b
- C4E17A6F4588 vmovdqu xmm0, qword ptr [rbp-78H]
- C4E17A7F4598 vmovdqu qword ptr [rbp-68H], xmm0
C4E17A6F4598 vmovdqu xmm0, qword ptr [rbp-68H]
C4E17A7F45D8 vmovdqu qword ptr [rbp-28H], xmm0
488B75D8 mov rsi, gword ptr [rbp-28H]
4885F6 test rsi, rsi
7507 jne SHORT G_M2754_IG10
BF01000000 mov edi, 1
EB4E jmp SHORT G_M2754_IG12
- 4C8D4DA8 lea r9, bword ptr [rbp-68H]
+ 4C8D4DA8 lea r9, bword ptr [rbp-28H]
498911 mov gword ptr [r9], rdx
41894908 mov dword ptr [r9+8], ecx
664189410C mov word ptr [r9+12], ax
4588410E mov byte ptr [r9+14], r8b
- C4E17A6F4598 vmovdqu xmm0, qword ptr [rbp-68H]
- C4E17A7F45D8 vmovdqu qword ptr [rbp-28H], xmm0
488B75D8 mov rsi, gword ptr [rbp-28H]
4885F6 test rsi, rsi
7507 jne SHORT G_M2754_IG10
BF01000000 mov edi, 1
EB4E jmp SHORT G_M2754_IG12
- 4C8D4DA8 lea r9, bword ptr [rbp-28H] ; maybe?
- 498911 mov gword ptr [r9], rdx ; maybe?
- 41894908 mov dword ptr [r9+8], ecx ; maybe?
- 664189410C mov word ptr [r9+12], ax ; maybe?
- 4588410E mov byte ptr [r9+14], r8b ; maybe?
- 488B75D8 mov rsi, gword ptr [rbp-28H]
+ 488B75D8 mov rsi, rdx
4885F6 test rsi, rsi
7507 jne SHORT G_M2754_IG10
BF01000000 mov edi, 1
EB4E jmp SHORT G_M2754_IG12
To end up with
- 4C8D4DA8 lea r9, bword ptr [rbp-58H]
- 498911 mov gword ptr [r9], rdx
- 41894908 mov dword ptr [r9+8], ecx
- 664189410C mov word ptr [r9+12], ax
- 4588410E mov byte ptr [r9+14], r8b
- C4E17A6F45A8 vmovdqu xmm0, qword ptr [rbp-58H]
- C4E17A7F45B8 vmovdqu qword ptr [rbp-48H], xmm0
- C4E17A6F45B8 vmovdqu xmm0, qword ptr [rbp-48H]
- C4E17A7F45C8 vmovdqu qword ptr [rbp-38H], xmm0
- C4E17A6F45C8 vmovdqu xmm0, qword ptr [rbp-38H]
- C4E17A7F4588 vmovdqu qword ptr [rbp-78H], xmm0
- C4E17A6F4588 vmovdqu xmm0, qword ptr [rbp-78H]
- C4E17A7F4598 vmovdqu qword ptr [rbp-68H], xmm0
- C4E17A6F4598 vmovdqu xmm0, qword ptr [rbp-68H]
- C4E17A7F45D8 vmovdqu qword ptr [rbp-28H], xmm0
- 488B75D8 mov rsi, gword ptr [rbp-28H]
- 4885F6 test rsi, rsi
+ 4885F6 test rdx, rdx
7507 jne SHORT G_M2754_IG10
BF01000000 mov edi, 1
EB4E jmp SHORT G_M2754_IG12
Cutting out 184 bytes of copy to and from stack?
May be related https://github.com/dotnet/corefx/issues/30655
Yes, the first case is a limitation in struct promotion.... given a pointer-sized non-float primitive type x, the jit can promote struct(x) and struct(struct(x)) but not cases with three or more levels of wrapper.
The changes here are a rough cut at arbitrary unwrapping... but the diffs aren't as clean as one might hope, as the register shuffling is still there, and for some reason we still have one struct hanging around:
; Assembly listing for method Program:ViaStruct5()
; Emitting BLENDED_CODE for X64 CPU with AVX
; optimized code
; partially interruptible
; Final local variable assignments
;
-; V00 loc0 [V00,T05] ( 2, 2 ) struct ( 8) [rsp+0x48] do-not-enreg[SF] must-init ld-addr-op
+;* V00 loc0 [V00 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op
; V01 OutArgs [V01 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
; V02 tmp1 [V02,T00] ( 2, 4 ) ref -> rcx class-hnd exact
;* V03 tmp2 [V03 ] ( 0, 0 ) struct ( 8) zero-ref
;* V04 tmp3 [V04 ] ( 0, 0 ) struct ( 8) zero-ref
-; V05 tmp4 [V05 ] ( 2, 4 ) struct ( 8) [rsp+0x40] do-not-enreg[XSFB] must-init addr-exposed
-; V06 tmp5 [V06 ] ( 2, 4 ) struct ( 8) [rsp+0x38] do-not-enreg[XSF] must-init addr-exposed
+;* V05 tmp4 [V05 ] ( 0, 0 ) struct ( 8) zero-ref
+;* V06 tmp5 [V06 ] ( 0, 0 ) struct ( 8) zero-ref
;* V07 tmp6 [V07 ] ( 0, 0 ) struct ( 8) zero-ref
;* V08 tmp7 [V08 ] ( 0, 0 ) struct ( 8) zero-ref
-; V09 tmp8 [V09 ] ( 2, 4 ) struct ( 8) [rsp+0x30] do-not-enreg[XSF] must-init addr-exposed
-; V10 tmp9 [V10 ] ( 2, 4 ) struct ( 8) [rsp+0x28] do-not-enreg[XSF] must-init addr-exposed
-; V11 tmp10 [V11,T01] ( 2, 2 ) ref -> rax V03._c(offs=0x00) P-INDEP
-; V12 tmp11 [V12,T02] ( 2, 2 ) ref -> rax V04._s(offs=0x00) P-INDEP
-; V13 tmp12 [V13,T03] ( 2, 2 ) ref -> rcx V07._c(offs=0x00) P-INDEP
-; V14 tmp13 [V14,T04] ( 2, 2 ) ref -> rcx V08._s(offs=0x00) P-INDEP
-;
-; Lcl frame size = 80
+;* V09 tmp8 [V09 ] ( 0, 0 ) struct ( 8) zero-ref
+;* V10 tmp9 [V10 ] ( 0, 0 ) struct ( 8) zero-ref
+; V11 tmp10 [V11,T01] ( 2, 2 ) ref -> [rsp+0x20] do-not-enreg[F] V00._s(offs=0x00) P-INDEP
+; V12 tmp11 [V12,T02] ( 2, 2 ) ref -> rax V03._c(offs=0x00) P-INDEP
+; V13 tmp12 [V13,T03] ( 2, 2 ) ref -> rax V04._s(offs=0x00) P-INDEP
+; V14 tmp13 [V14,T04] ( 2, 2 ) ref -> rax V05._s(offs=0x00) P-INDEP
+; V15 tmp14 [V15,T05] ( 2, 2 ) ref -> rax V06._s(offs=0x00) P-INDEP
+; V16 tmp15 [V16,T06] ( 2, 2 ) ref -> rcx V07._c(offs=0x00) P-INDEP
+; V17 tmp16 [V17,T07] ( 2, 2 ) ref -> rcx V08._s(offs=0x00) P-INDEP
+; V18 tmp17 [V18,T08] ( 2, 2 ) ref -> rcx V09._s(offs=0x00) P-INDEP
+; V19 tmp18 [V19,T09] ( 2, 2 ) ref -> rcx V10._s(offs=0x00) P-INDEP
+;
+; Lcl frame size = 40
G_M60507_IG01:
- push rdi
- sub rsp, 80
- lea rdi, [rsp+28H]
- mov ecx, 10
- xor rax, rax
- rep stosd
+ sub rsp, 40
G_M60507_IG02:
mov rcx, 0xD1FFAB1E
call CORINFO_HELP_NEWSFAST
@@ -274,40 +264,31 @@ G_M60507_IG02:
mov rcx, rax
mov rax, rcx
mov rcx, rax
- lea rax, bword ptr [rsp+40H]
- mov gword ptr [rax], rcx
- mov rcx, gword ptr [rsp+40H]
- mov gword ptr [rsp+30H], rcx
- mov rcx, gword ptr [rsp+30H]
- lea rax, bword ptr [rsp+38H]
- mov gword ptr [rax], rcx
- mov rcx, gword ptr [rsp+38H]
- mov gword ptr [rsp+28H], rcx
- mov rcx, gword ptr [rsp+28H]
- lea rax, bword ptr [rsp+48H]
- mov gword ptr [rax], rcx
- mov rcx, gword ptr [rsp+48H]
+ mov rax, rcx
+ mov rcx, rax
+ mov rax, rcx
+ mov rcx, rax
+ mov gword ptr [rsp+20H], rcx
+ mov rcx, gword ptr [rsp+20H]
cmp dword ptr [rcx], ecx
call C:M():this
nop
G_M60507_IG03:
- add rsp, 80
- pop rdi
+ add rsp, 40
ret
-; Total bytes of code 122, prolog size 19 for method Program:ViaStruct5()
+; Total bytes of code 69, prolog size 4 for method Program:ViaStruct5()
We should be able to further generalize I think... let me play with this some more.
As detailed in https://github.com/dotnet/corefx/issues/30655#issuecomment-400317238, the micro benchmark shows a 10x speed difference between getting a flat struct vs. a one-level nested struct, irrespective of whether the fields are primitives or not.
Any performance enhancements with wrapped structs would help all ConfigureAwait() calls, and would be _super useful_ when awaiting completed tasks.
Thanks.
It is not as easy to promote struct(struct(x, y)) as it is to promote struct(x, struct(y)), as in the former the number of fields changes. The jit's nested struct promotion is quite limited and put there to ensure mainly that Span<T> can be promoted: it contains two fields, one a nested struct ByReference<T> with one field.
To generalize here, we probably need to augment the jit promotion descriptors with something more like the field sequence info, so we can have promotion trees, and then work through all the implications....
cc @erozenfeld
Using that change on coreclr (where I didn't expect to see much) I see the following:
Total bytes of diff: -2 (0.00% of base)
diff is an improvement.
Total byte diff includes 0 bytes from reconciling methods
Base had 0 unique methods, 0 unique bytes
Diff had 0 unique methods, 0 unique bytes
Top file improvements by size (bytes):
-2 : System.Private.CoreLib.dasm (0.00% of base)
1 total files with size differences (1 improved, 0 regressed), 0 unchanged.
Top method regessions by size (bytes):
22 : System.Private.CoreLib.dasm - CommonlyUsedGenericInstantiations:AsyncHelper() (2 methods)
6 : System.Private.CoreLib.dasm - AsyncValueTaskMethodBuilder:Create():struct
Top method improvements by size (bytes):
-30 : System.Private.CoreLib.dasm - AsyncVoidMethodBuilder:Create():struct
3 total methods with size differences (1 improved, 2 regressed), 17429 unchanged.
Improvement
; Assembly listing for method AsyncVoidMethodBuilder:Create():struct
;
; V00 RetBuf [V00,T00] ( 5, 5 ) byref -> rsi
; V01 loc0 [V01,T01] ( 5, 4 ) ref -> rdi class-hnd
-; V02 loc1 [V02,T02] ( 3, 3 ) struct (16) [rsp+0x20] do-not-enreg[SFB] must-init ld-addr-op
-; V03 OutArgs [V03 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
+;* V02 loc1 [V02 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op
+; V03 OutArgs [V03 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
+; V04 tmp1 [V04,T02] ( 2, 2 ) ref -> rdx V02._synchronizationContext(offs=0x00) P-INDEP
+; V05 tmp2 [V05,T03] ( 2, 2 ) ref -> rbx V02._builder(offs=0x08) P-INDEP
;
-; Lcl frame size = 48
+; Lcl frame size = 32
G_M38150_IG01:
push rdi
push rsi
push rbx
- sub rsp, 48
- xor rax, rax
- mov qword ptr [rsp+20H], rax
- mov qword ptr [rsp+28H], rax
- mov rbx, rcx
+ sub rsp, 32
+ mov rsi, rcx
G_M38150_IG02:
call SynchronizationContext:get_Current():ref
- mov rsi, rax
- test rsi, rsi
+ mov rdi, rax
+ test rdi, rdi
je SHORT G_M38150_IG03
- mov rcx, rsi
- mov rax, qword ptr [rsi]
+ mov rcx, rdi
+ mov rax, qword ptr [rdi]
mov rax, qword ptr [rax+64]
call qword ptr [rax+48]SynchronizationContext:OperationStarted():this
-G_M38150_IG03:
- lea rax, bword ptr [rsp+20H]
-G_M38150_IG04:
- xorps xmm0, xmm0
- movdqu qword ptr [rax], xmm0
-G_M38150_IG05:
- mov gword ptr [rsp+20H], rsi
- mov rdi, rbx
- lea rsi, bword ptr [rsp+20H]
- call CORINFO_HELP_ASSIGN_BYREF
- call CORINFO_HELP_ASSIGN_BYREF
- mov rax, rbx
-G_M38150_IG06:
- add rsp, 48
+ G_M38150_IG03:
+ xor rbx, rbx
+ mov rdx, rdi
+ lea rcx, bword ptr [rsi]
+ call CORINFO_HELP_CHECKED_ASSIGN_REF
+ mov gword ptr [rsi+8], rbx
+ mov rax, rsi
+ G_M38150_IG04:
+ add rsp, 32
pop rbx
pop rsi
pop rdi
ret
-; Total bytes of code 94, prolog size 19 for method AsyncVoidMethodBuilder:Create():struct
+; Total bytes of code 64, prolog size 7 for method AsyncVoidMethodBuilder:Create():struct
Regression
; Assembly listing for method AsyncValueTaskMethodBuilder:Create():struct
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rcx
-;* V01 loc0 [V01 ] ( 0, 0 ) struct (16) zero-ref do-not-enreg[SB] ld-addr-op
+;* V01 loc0 [V01 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op
;# V02 OutArgs [V02 ] ( 1, 1 ) lclBlk ( 0) [rsp+0x00]
+;* V03 tmp1 [V03,T03] ( 0, 0 ) bool -> zero-ref V01._haveResult(offs=0x00) P-INDEP
+; V04 tmp2 [V04,T01] ( 3, 3 ) bool -> rax V01._useBuilder(offs=0x01) P-INDEP
+; V05 tmp3 [V05,T02] ( 2, 2 ) ref -> rdx V01._methodBuilder(offs=0x08) P-INDEP
;
; Lcl frame size = 0
G_M1524_IG01:
nop
G_M1524_IG02:
- xorps xmm0, xmm0
- movdqu qword ptr [rcx], xmm0
+ xor rdx, rdx
+ mov byte ptr [rcx], al
+ mov byte ptr [rcx+1], al
+ mov gword ptr [rcx+8], rdx
mov rax, rcx
G_M1524_IG03:
ret
-; Total bytes of code 16, prolog size 5 for method AsyncValueTaskMethodBuilder:Create():struct
+; Total bytes of code 22, prolog size 5 for method AsyncValueTaskMethodBuilder:Create():struct
The AsyncVoidMethodBuilder:Create change diff is good (@stephentoub this is the one for awaiting a non generic Task?)
However, the AsyncValueTaskMethodBuilder:Create diff would suggest it's not necessarily going to be a clean win?
> The AsyncVoidMethodBuilder:Create change diff is good (@stephentoub this is the one for awaiting a non generic Task?)
AsyncVoidMethodBuilder is the builder for async void methods, e.g.
```C#
public async void Foo() { }
```
produces a stub like:
```C#
[AsyncStateMachine(typeof(<Foo>d__0))]
public void Foo()
{
    <Foo>d__0 <Foo>d__ = default(<Foo>d__0);
    <Foo>d__.<>t__builder = AsyncVoidMethodBuilder.Create();
    <Foo>d__.<>1__state = -1;
    AsyncVoidMethodBuilder <>t__builder = <Foo>d__.<>t__builder;
    <>t__builder.Start(ref <Foo>d__);
}
```
Ah... shame. Am not worried about the code gen for async void; that's just a form of trouble 😉
Would it be possible to fold (last use) into each other? Or wouldn't it be a common enough pattern?
e.g. going from https://gist.github.com/benaadams/ad76003a334f3da6916ac5c8a80d95d1
After fgSsaBuild:
***** BB01, stmt 1
( 17, 16) [000007] * STMT void (IL 0x000...0x027)
N005 ( 17, 16) [000004] | /--* CALL help ref HELPER.CORINFO_HELP_NEWSFAST
N003 ( 3, 10) [000003] arg0 in rcx | | \--* CNS_INT(h) long 0x7ffc12345508 method
N007 ( 17, 16) [000006] \--* ASG ref
N006 ( 1, 1) [000005] \--* LCL_VAR ref V02 tmp1 d:3
***** BB01, stmt 2
( 5, 4) [000064] * STMT void (IL 0x007... ???)
N001 ( 1, 1) [000011] | /--* LCL_VAR ref V02 tmp1 u:3 (last use)
N003 ( 5, 4) [000063] \--* ASG ref
N002 ( 3, 2) [000062] \--* LCL_VAR ref V11 tmp10 d:3
***** BB01, stmt 3
( 7, 5) [000078] * STMT void (IL 0x00C... ???)
N001 ( 3, 2) [000182] | /--* LCL_VAR ref V11 tmp10 u:3 (last use)
N003 ( 7, 5) [000183] \--* ASG ref
N002 ( 3, 2) [000181] \--* LCL_VAR ref V13 tmp12 d:3
***** BB01, stmt 4
( 7, 5) [000074] * STMT void (IL 0x00C... ???)
N001 ( 3, 2) [000185] | /--* LCL_VAR ref V13 tmp12 u:3 (last use)
N003 ( 7, 5) [000186] \--* ASG ref
N002 ( 3, 2) [000184] \--* LCL_VAR ref V12 tmp11 d:3
***** BB01, stmt 5
( 7, 5) [000092] * STMT void (IL 0x011... ???)
N001 ( 3, 2) [000188] | /--* LCL_VAR ref V12 tmp11 u:3 (last use)
N003 ( 7, 5) [000189] \--* ASG ref
N002 ( 3, 2) [000187] \--* LCL_VAR ref V14 tmp13 d:3
***** BB01, stmt 6
( 10, 10) [000088] * STMT void (IL 0x011... ???)
N006 ( 3, 2) [000195] | /--* LCL_VAR ref V14 tmp13 u:3 (last use)
N007 ( 10, 10) [000196] \--* ASG ref
N005 ( 6, 7) [000194] \--* IND ref
N003 ( 1, 1) [000192] | /--* CNS_INT long 0 Fseq[_s]
N004 ( 5, 7) [000193] \--* ADD byref
N002 ( 3, 5) [000190] \--* ADDR byref
N001 ( 3, 4) [000191] \--* LCL_FLD struct V05 tmp4 [+0] Fseq[_s]
***** BB01, stmt 1
( 17, 16) [000007] * STMT void (IL 0x000...0x027)
N005 ( 17, 16) [000004] | /--* CALL help ref HELPER.CORINFO_HELP_NEWSFAST
N003 ( 3, 10) [000003] arg0 in rcx | | \--* CNS_INT(h) long 0x7ffc12345508 method
N007 ( 17, 16) [000006] \--* ASG ref
-N006 ( 1, 1) [000005] \--* LCL_VAR ref V02 tmp1 d:3
-***** BB01, stmt 2
- ( 5, 4) [000064] * STMT void (IL 0x007... ???)
-N001 ( 1, 1) [000011] | /--* LCL_VAR ref V02 tmp1 u:3 (last use)
-N003 ( 5, 4) [000063] \--* ASG ref
N002 ( 3, 2) [000062] \--* LCL_VAR ref V11 tmp10 d:3
***** BB01, stmt 3
( 7, 5) [000078] * STMT void (IL 0x00C... ???)
N001 ( 3, 2) [000182] | /--* LCL_VAR ref V11 tmp10 u:3 (last use)
N003 ( 7, 5) [000183] \--* ASG ref
N002 ( 3, 2) [000181] \--* LCL_VAR ref V13 tmp12 d:3
***** BB01, stmt 4
( 7, 5) [000074] * STMT void (IL 0x00C... ???)
N001 ( 3, 2) [000185] | /--* LCL_VAR ref V13 tmp12 u:3 (last use)
N003 ( 7, 5) [000186] \--* ASG ref
N002 ( 3, 2) [000184] \--* LCL_VAR ref V12 tmp11 d:3
***** BB01, stmt 5
( 7, 5) [000092] * STMT void (IL 0x011... ???)
N001 ( 3, 2) [000188] | /--* LCL_VAR ref V12 tmp11 u:3 (last use)
N003 ( 7, 5) [000189] \--* ASG ref
N002 ( 3, 2) [000187] \--* LCL_VAR ref V14 tmp13 d:3
***** BB01, stmt 6
( 10, 10) [000088] * STMT void (IL 0x011... ???)
N006 ( 3, 2) [000195] | /--* LCL_VAR ref V14 tmp13 u:3 (last use)
N007 ( 10, 10) [000196] \--* ASG ref
N005 ( 6, 7) [000194] \--* IND ref
N003 ( 1, 1) [000192] | /--* CNS_INT long 0 Fseq[_s]
N004 ( 5, 7) [000193] \--* ADD byref
N002 ( 3, 5) [000190] \--* ADDR byref
N001 ( 3, 4) [000191] \--* LCL_FLD struct V05 tmp4 [+0] Fseq[_s]
***** BB01, stmt 1
( 17, 16) [000007] * STMT void (IL 0x000...0x027)
N005 ( 17, 16) [000004] | /--* CALL help ref HELPER.CORINFO_HELP_NEWSFAST
N003 ( 3, 10) [000003] arg0 in rcx | | \--* CNS_INT(h) long 0x7ffc12345508 method
N007 ( 17, 16) [000006] \--* ASG ref
-N002 ( 3, 2) [000062] \--* LCL_VAR ref V11 tmp10 d:3
-
-***** BB01, stmt 3
- ( 7, 5) [000078] * STMT void (IL 0x00C... ???)
-N001 ( 3, 2) [000182] | /--* LCL_VAR ref V11 tmp10 u:3 (last use)
-N003 ( 7, 5) [000183] \--* ASG ref
N002 ( 3, 2) [000181] \--* LCL_VAR ref V13 tmp12 d:3
***** BB01, stmt 4
( 7, 5) [000074] * STMT void (IL 0x00C... ???)
N001 ( 3, 2) [000185] | /--* LCL_VAR ref V13 tmp12 u:3 (last use)
N003 ( 7, 5) [000186] \--* ASG ref
N002 ( 3, 2) [000184] \--* LCL_VAR ref V12 tmp11 d:3
***** BB01, stmt 5
( 7, 5) [000092] * STMT void (IL 0x011... ???)
N001 ( 3, 2) [000188] | /--* LCL_VAR ref V12 tmp11 u:3 (last use)
N003 ( 7, 5) [000189] \--* ASG ref
N002 ( 3, 2) [000187] \--* LCL_VAR ref V14 tmp13 d:3
***** BB01, stmt 6
( 10, 10) [000088] * STMT void (IL 0x011... ???)
N006 ( 3, 2) [000195] | /--* LCL_VAR ref V14 tmp13 u:3 (last use)
N007 ( 10, 10) [000196] \--* ASG ref
N005 ( 6, 7) [000194] \--* IND ref
N003 ( 1, 1) [000192] | /--* CNS_INT long 0 Fseq[_s]
N004 ( 5, 7) [000193] \--* ADD byref
N002 ( 3, 5) [000190] \--* ADDR byref
N001 ( 3, 4) [000191] \--* LCL_FLD struct V05 tmp4 [+0] Fseq[_s]
***** BB01, stmt 1
( 17, 16) [000007] * STMT void (IL 0x000...0x027)
N005 ( 17, 16) [000004] | /--* CALL help ref HELPER.CORINFO_HELP_NEWSFAST
N003 ( 3, 10) [000003] arg0 in rcx | | \--* CNS_INT(h) long 0x7ffc12345508 method
N007 ( 17, 16) [000006] \--* ASG ref
-N002 ( 3, 2) [000181] \--* LCL_VAR ref V13 tmp12 d:3
-
-***** BB01, stmt 4
- ( 7, 5) [000074] * STMT void (IL 0x00C... ???)
-N001 ( 3, 2) [000185] | /--* LCL_VAR ref V13 tmp12 u:3 (last use)
-N003 ( 7, 5) [000186] \--* ASG ref
N002 ( 3, 2) [000184] \--* LCL_VAR ref V12 tmp11 d:3
***** BB01, stmt 5
( 7, 5) [000092] * STMT void (IL 0x011... ???)
N001 ( 3, 2) [000188] | /--* LCL_VAR ref V12 tmp11 u:3 (last use)
N003 ( 7, 5) [000189] \--* ASG ref
N002 ( 3, 2) [000187] \--* LCL_VAR ref V14 tmp13 d:3
***** BB01, stmt 6
( 10, 10) [000088] * STMT void (IL 0x011... ???)
N006 ( 3, 2) [000195] | /--* LCL_VAR ref V14 tmp13 u:3 (last use)
N007 ( 10, 10) [000196] \--* ASG ref
N005 ( 6, 7) [000194] \--* IND ref
N003 ( 1, 1) [000192] | /--* CNS_INT long 0 Fseq[_s]
N004 ( 5, 7) [000193] \--* ADD byref
N002 ( 3, 5) [000190] \--* ADDR byref
N001 ( 3, 4) [000191] \--* LCL_FLD struct V05 tmp4 [+0] Fseq[_s]
***** BB01, stmt 1
( 17, 16) [000007] * STMT void (IL 0x000...0x027)
N005 ( 17, 16) [000004] | /--* CALL help ref HELPER.CORINFO_HELP_NEWSFAST
N003 ( 3, 10) [000003] arg0 in rcx | | \--* CNS_INT(h) long 0x7ffc12345508 method
N007 ( 17, 16) [000006] \--* ASG ref
-N002 ( 3, 2) [000184] \--* LCL_VAR ref V12 tmp11 d:3
-
-***** BB01, stmt 5
- ( 7, 5) [000092] * STMT void (IL 0x011... ???)
-N001 ( 3, 2) [000188] | /--* LCL_VAR ref V12 tmp11 u:3 (last use)
-N003 ( 7, 5) [000189] \--* ASG ref
N002 ( 3, 2) [000187] \--* LCL_VAR ref V14 tmp13 d:3
***** BB01, stmt 6
( 10, 10) [000088] * STMT void (IL 0x011... ???)
N006 ( 3, 2) [000195] | /--* LCL_VAR ref V14 tmp13 u:3 (last use)
N007 ( 10, 10) [000196] \--* ASG ref
N005 ( 6, 7) [000194] \--* IND ref
N003 ( 1, 1) [000192] | /--* CNS_INT long 0 Fseq[_s]
N004 ( 5, 7) [000193] \--* ADD byref
N002 ( 3, 5) [000190] \--* ADDR byref
N001 ( 3, 4) [000191] \--* LCL_FLD struct V05 tmp4 [+0] Fseq[_s]
***** BB01, stmt 1
( 17, 16) [000007] * STMT void (IL 0x000...0x027)
N005 ( 17, 16) [000004] | /--* CALL help ref HELPER.CORINFO_HELP_NEWSFAST
N003 ( 3, 10) [000003] arg0 in rcx | | \--* CNS_INT(h) long 0x7ffc12345508 method
N007 ( 17, 16) [000006] \--* ASG ref
-N002 ( 3, 2) [000187] \--* LCL_VAR ref V14 tmp13 d:3
-
-***** BB01, stmt 6
- ( 10, 10) [000088] * STMT void (IL 0x011... ???)
-N006 ( 3, 2) [000195] | /--* LCL_VAR ref V14 tmp13 u:3 (last use)
-N007 ( 10, 10) [000196] \--* ASG ref
N005 ( 6, 7) [000194] \--* IND ref
N003 ( 1, 1) [000192] | /--* CNS_INT long 0 Fseq[_s]
N004 ( 5, 7) [000193] \--* ADD byref
N002 ( 3, 5) [000190] \--* ADDR byref
N001 ( 3, 4) [000191] \--* LCL_FLD struct V05 tmp4 [+0] Fseq[_s]
***** BB01, stmt 1
( 17, 16) [000007] * STMT void (IL 0x000...0x027)
N005 ( 17, 16) [000004] | /--* CALL help ref HELPER.CORINFO_HELP_NEWSFAST
N003 ( 3, 10) [000003] arg0 in rcx | | \--* CNS_INT(h) long 0x7ffc12345508 method
N007 ( 17, 16) [000006] \--* ASG ref
N005 ( 6, 7) [000194] \--* IND ref
N003 ( 1, 1) [000192] | /--* CNS_INT long 0 Fseq[_s]
N004 ( 5, 7) [000193] \--* ADD byref
N002 ( 3, 5) [000190] \--* ADDR byref
N001 ( 3, 4) [000191] \--* LCL_FLD struct V05 tmp4 [+0] Fseq[_s]
Maybe that's what Copy Propagation should be doing?
*************** In optVnCopyProp()
*************** In SsaBuilder::ComputeDominators(Compiler*, ...)
Copy Assertion for BB01
curSsaName stack: { }
Live vars: {} => {V02}
Live vars: {V02} => {}
Live vars: {} => {V11}
Live vars: {V11} => {}
Live vars: {} => {V13}
Live vars: {V13} => {}
Live vars: {} => {V12}
Live vars: {V12} => {}
Live vars: {} => {V14}
Live vars: {V14} => {}
Live vars: {} => {V00}
Live vars: {V00} => {}
*************** In optOptimizeCSEs()
Would it be possible to fold (last use) into each other? Or wouldn't it be a common enough pattern?
The preferencing in the register allocator should be taking care of some of that, but it's got some issues ... I've got a branch that I'm working on (https://github.com/dotnet/coreclr/pull/19429) that improves it overall, and eliminates many of the copies (your ViaStruct5 example has 5 fewer copies, going from 122 bytes to 107). Overall it's a .18% improvement, but I'm currently trying to reduce the cases where things get worse.
That said, I agree that one would want copy prop to do a better job of this upstream.
Looking at the ValueTask example asyncValueTask_JitDump.txt; it would have to be done after the IR Rationalization phase as before that BB03 (one I'm looking at with additional copies) is quite complex and has lots of COMMA nodes.
I had a go at adding a Copy Elision (?) step after lowering https://github.com/dotnet/coreclr/compare/master...benaadams:copyprop?expand=1 by traversing a block's statements backwards and looking for a matching ValueNum on VAR_DEATH, as most of the copying seems to be contained within single blocks.
However, I'm not sure it's the right approach? Or how to morph the tree...
However not sure its the right approach?
Not really, for various reasons - it has nothing to do with lowering, GTF_VAR_DEATH is likely not up to date at that point, VNs are probably out of date at that point etc.
I'll see if I can find some time to try some copy prop tweaks I did a while ago, to see if they improve this case. Though AFAIR I've never been able to prevent those tweaks from introducing regressions as well; the JIT's copy prop implementation relies on rather unwieldy heuristics...
it has nothing to do with lowering
I was adding it after lowering as the patterns seemed more obvious after Rationalize? (Though that might be more because I'm not quite fluent in what the syntax means.)
Trees before IR Rationalize
------------ BB03 [00A..02B) -> BB05 (cond), preds={BB02} succs={BB04,BB05}
***** BB03, stmt 3
( 0, 0) [000155] ------------ * STMT void (IL 0x00A... ???)
N006 ( 0, 0) [000788] ------------ | /--* NOP void $383
N007 ( 0, 0) [000789] ------------ \--* COMMA void $383
N004 ( 0, 0) [000784] ------------ | /--* NOP void $382
N005 ( 0, 0) [000785] ------------ \--* COMMA void $382
N002 ( 0, 0) [000780] ------------ | /--* NOP void $381
N003 ( 0, 0) [000781] ------------ \--* COMMA void $381
N001 ( 0, 0) [000777] ------------ \--* NOP void $380
***** BB03, stmt 4
( 0, 0) [000073] ------------ * STMT void (IL ???... ???)
N006 ( 0, 0) [000803] ------------ | /--* NOP void $387
N007 ( 0, 0) [000804] ------------ \--* COMMA void $387
N004 ( 0, 0) [000799] ------------ | /--* NOP void $386
N005 ( 0, 0) [000800] ------------ \--* COMMA void $386
N002 ( 0, 0) [000795] ------------ | /--* NOP void $385
N003 ( 0, 0) [000796] ------------ \--* COMMA void $385
N001 ( 0, 0) [000792] ------------ \--* NOP void $384
***** BB03, stmt 5
( 0, 0) [000207] ------------ * STMT void (IL 0x010... ???)
N006 ( 0, 0) [000818] ------------ | /--* NOP void $38b
N007 ( 0, 0) [000819] ------------ \--* COMMA void $38b
N004 ( 0, 0) [000814] ------------ | /--* NOP void $38a
N005 ( 0, 0) [000815] ------------ \--* COMMA void $38a
N002 ( 0, 0) [000810] ------------ | /--* NOP void $389
N003 ( 0, 0) [000811] ------------ \--* COMMA void $389
N001 ( 0, 0) [000807] ------------ \--* NOP void $388
***** BB03, stmt 6
( 1, 3) [000267] ------------ * STMT void (IL 0x010... ???)
N001 ( 1, 1) [000202] ------------ | /--* CNS_INT int 0 $40
N003 ( 1, 3) [000266] -A------R--- \--* ASG short $40
N002 ( 1, 1) [000265] D------N---- \--* LCL_VAR int V15 tmp8 d:3 $40
***** BB03, stmt 7
( 1, 3) [000240] ------------ * STMT void (IL 0x010... ???)
N001 ( 1, 1) [000237] ------------ | /--* CNS_INT ref null $VN.Null
N003 ( 1, 3) [000239] -A------R--- \--* ASG ref $VN.Null
N002 ( 1, 1) [000238] D------N---- \--* LCL_VAR ref V47 tmp40 d:3 $VN.Null
***** BB03, stmt 8
( 1, 3) [000246] ------------ * STMT void (IL 0x010... ???)
N001 ( 1, 1) [000243] ------------ | /--* CNS_INT int 1 $44
N003 ( 1, 3) [000245] -A------R--- \--* ASG int $44
N002 ( 1, 1) [000244] D------N---- \--* LCL_VAR int V48 tmp41 d:3 $44
***** BB03, stmt 9
( 1, 3) [000252] ------------ * STMT void (IL 0x010... ???)
N001 ( 1, 1) [000249] ------------ | /--* CNS_INT int 0 $40
N003 ( 1, 3) [000251] -A------R--- \--* ASG short $40
N002 ( 2, 2) [000250] D------N---- \--* LCL_VAR short V49 tmp42 d:3 $40
***** BB03, stmt 10
( 1, 3) [000258] ------------ * STMT void (IL 0x010... ???)
N001 ( 1, 1) [000255] ------------ | /--* CNS_INT int 0 $40
N003 ( 1, 3) [000257] -A------R--- \--* ASG bool $40
N002 ( 2, 2) [000256] D------N---- \--* LCL_VAR bool V50 tmp43 d:3 $40
***** BB03, stmt 11
( 31, 30) [000281] ------------ * STMT void (IL 0x010... ???)
N030 ( 2, 2) [000850] -------N---- | /--* LCL_VAR bool V50 tmp43 u:3 (last use) $40
N031 ( 8, 8) [000851] -A---------- | /--* ASG bool $VN.Void
N029 ( 5, 5) [000849] *------N---- | | \--* IND bool $40
N027 ( 1, 1) [000847] ------------ | | | /--* CNS_INT long 14 Fseq[_continueOnCapturedContext] $144
N028 ( 2, 2) [000848] -------N---- | | \--* ADD byref $285
N026 ( 1, 1) [000846] ------------ | | \--* LCL_VAR byref V52 tmp45 u:3 (last use) $440
N032 ( 31, 30) [000852] -A---------- \--* COMMA void $VN.Void
N023 ( 2, 2) [000843] -------N---- | /--* LCL_VAR short V49 tmp42 u:3 (last use) $40
N024 ( 8, 8) [000844] -A---------- | /--* ASG short $VN.Void
N022 ( 5, 5) [000842] *------N---- | | \--* IND short $40
N020 ( 1, 1) [000840] ------------ | | | /--* CNS_INT long 12 Fseq[_token] $143
N021 ( 2, 2) [000841] -------N---- | | \--* ADD byref $284
N019 ( 1, 1) [000839] ------------ | | \--* LCL_VAR byref V52 tmp45 u:3 $440
N025 ( 23, 22) [000845] -A---------- \--* COMMA void $VN.Void
N016 ( 1, 1) [000836] -------N---- | /--* LCL_VAR int V48 tmp41 u:3 (last use) $44
N017 ( 6, 6) [000837] -A---------- | /--* ASG int $VN.Void
N015 ( 4, 4) [000835] *------N---- | | \--* IND int $44
N013 ( 1, 1) [000833] ------------ | | | /--* CNS_INT long 8 Fseq[_result] $142
N014 ( 2, 2) [000834] -------N---- | | \--* ADD byref $283
N012 ( 1, 1) [000832] ------------ | | \--* LCL_VAR byref V52 tmp45 u:3 $440
N018 ( 15, 14) [000838] -A---------- \--* COMMA void $VN.Void
N009 ( 1, 1) [000829] -------N---- | /--* LCL_VAR ref V47 tmp40 u:3 (last use) $VN.Null
N010 ( 6, 5) [000830] -A---------- | /--* ASG ref $VN.Void
N008 ( 4, 3) [000828] *------N---- | | \--* IND ref $VN.Null
N006 ( 1, 1) [000826] ------------ | | | /--* CNS_INT long 0 Fseq[_obj] $140
N007 ( 3, 3) [000827] -------N---- | | \--* ADD byref $282
N005 ( 1, 1) [000825] ------------ | | \--* LCL_VAR byref V52 tmp45 u:3 $440
N011 ( 9, 8) [000831] -A---------- \--* COMMA void $VN.Void
N002 ( 3, 3) [000821] L----------- | /--* ADDR byref $440
N001 ( 3, 2) [000822] -------N---- | | \--* LCL_VAR struct(AX) V16 tmp9 $2c1
N004 ( 3, 3) [000824] -A------R--- \--* ASG byref $440
N003 ( 1, 1) [000823] D------N---- \--* LCL_VAR byref V52 tmp45 d:3 $440
***** BB03, stmt 12
( 10, 10) [000277] ------------ * STMT void (IL 0x010... ???)
N001 ( 3, 2) [000272] ----G------- | /--* LCL_VAR struct(AX) V16 tmp9 $2c2
N005 ( 10, 10) [000276] -A--G---R--- \--* ASG struct (copy) $VN.Void
N004 ( 6, 7) [000275] x----------- \--* BLK(16) struct
N003 ( 3, 5) [000274] ------------ \--* ADDR byref $286
N002 ( 3, 4) [000271] D------N---- \--* LCL_FLD struct V12 tmp5 d:3[+0] Fseq[_value]
***** BB03, stmt 13
( 7, 5) [000084] ------------ * STMT void (IL ???... ???)
N001 ( 3, 2) [000228] ------------ | /--* LCL_VAR struct V12 tmp5 u:3 (last use) $302
N003 ( 7, 5) [000231] -A------R--- \--* ASG struct (copy) $VN.Void
N002 ( 3, 2) [000230] D------N---- \--* LCL_VAR struct V05 loc4 d:3
***** BB03, stmt 14
( 10, 10) [000315] ------------ * STMT void (IL 0x01A... ???)
N003 ( 6, 7) [000295] x---G------- | /--* IND struct $500
N002 ( 3, 5) [000294] ------------ | | \--* ADDR byref $287
N001 ( 3, 4) [000285] -------N---- | | \--* LCL_FLD struct V05 loc4 u:3[+0] Fseq[_value] (last use) $500
N005 ( 10, 10) [000314] -A--G---R--- \--* ASG struct (copy) $VN.Void
N004 ( 3, 2) [000312] D------N---- \--* LCL_VAR struct V18 tmp11 d:3
***** BB03, stmt 15
( 10, 10) [000311] ------------ * STMT void (IL 0x01A... ???)
N001 ( 3, 2) [000306] ------------ | /--* LCL_VAR struct V18 tmp11 u:3 (last use) $500
N005 ( 10, 10) [000310] -A------R--- \--* ASG struct (copy) $VN.Void
N004 ( 6, 7) [000309] x----------- \--* BLK(16) struct
N003 ( 3, 5) [000308] ------------ \--* ADDR byref $288
N002 ( 3, 4) [000305] D------N---- \--* LCL_FLD struct V17 tmp10 d:3[+0] Fseq[_value]
***** BB03, stmt 16
( 7, 5) [000093] ------------ * STMT void (IL ???... ???)
N001 ( 3, 2) [000298] ------------ | /--* LCL_VAR struct V17 tmp10 u:3 (last use) $500
N003 ( 7, 5) [000301] -A--G---R--- \--* ASG struct (copy) $VN.Void
N002 ( 3, 2) [000300] D---G--N---- \--* LCL_VAR struct(AX) V03 loc2
***** BB03, stmt 17
( 3, 4) [000335] ------------ * STMT void (IL 0x022... ???)
N001 ( 3, 4) [000331] ----G------- | /--* LCL_FLD ref V03 loc2 [+0] Fseq[_value, _obj] $347
N003 ( 3, 4) [000334] -A--G---R--- \--* ASG ref $347
N002 ( 1, 1) [000333] D------N---- \--* LCL_VAR ref V20 tmp13 d:3 $347
***** BB03, stmt 18
( 5, 5) [000340] ------------ * STMT void (IL 0x022... ???)
N004 ( 5, 5) [000339] ------------ \--* JTRUE void
N002 ( 1, 1) [000337] ------------ | /--* CNS_INT ref null $VN.Null
N003 ( 3, 3) [000338] J------N---- \--* NE int $242
N001 ( 1, 1) [000336] ------------ \--* LCL_VAR ref V20 tmp13 u:3 $347
Trees after IR Rationalize
------------ BB03 [00A..02B) -> BB05 (cond), preds={BB02} succs={BB04,BB05}
( 0, 0) [000155] ------------ IL_OFFSET void IL offset: 0xa
( 0, 0) [000207] ------------ IL_OFFSET void IL offset: 0x10
( 1, 3) [000267] ------------ IL_OFFSET void IL offset: 0x10
N001 ( 1, 1) [000202] ------------ t202 = CNS_INT int 0 $40
/--* t202 int
N003 ( 1, 3) [000266] DA---------- * STORE_LCL_VAR int V15 tmp8 d:3
( 1, 3) [000240] ------------ IL_OFFSET void IL offset: 0x10
N001 ( 1, 1) [000237] ------------ t237 = CNS_INT ref null $VN.Null
/--* t237 ref
N003 ( 1, 3) [000239] DA---------- * STORE_LCL_VAR ref V47 tmp40 d:3
( 1, 3) [000246] ------------ IL_OFFSET void IL offset: 0x10
N001 ( 1, 1) [000243] ------------ t243 = CNS_INT int 1 $44
/--* t243 int
N003 ( 1, 3) [000245] DA---------- * STORE_LCL_VAR int V48 tmp41 d:3
( 1, 3) [000252] ------------ IL_OFFSET void IL offset: 0x10
N001 ( 1, 1) [000249] ------------ t249 = CNS_INT int 0 $40
/--* t249 int
N003 ( 1, 3) [000251] DA---------- * STORE_LCL_VAR short V49 tmp42 d:3
( 1, 3) [000258] ------------ IL_OFFSET void IL offset: 0x10
N001 ( 1, 1) [000255] ------------ t255 = CNS_INT int 0 $40
/--* t255 int
N003 ( 1, 3) [000257] DA---------- * STORE_LCL_VAR bool V50 tmp43 d:3
( 31, 30) [000281] ------------ IL_OFFSET void IL offset: 0x10
N001 ( 3, 2) [000822] -------N---- t822 = LCL_VAR_ADDR byref V16 tmp9
/--* t822 byref
N004 ( 3, 3) [000824] DA---------- * STORE_LCL_VAR byref V52 tmp45 d:3
N005 ( 1, 1) [000825] ------------ t825 = LCL_VAR byref V52 tmp45 u:3 $440
N006 ( 1, 1) [000826] ------------ t826 = CNS_INT long 0 Fseq[_obj] $140
/--* t825 byref
+--* t826 long
N007 ( 3, 3) [000827] -------N---- t827 = * ADD byref $282
N009 ( 1, 1) [000829] -------N---- t829 = LCL_VAR ref V47 tmp40 u:3 (last use) $VN.Null
/--* t827 byref
+--* t829 ref
[001021] -A---------- * STOREIND ref
N012 ( 1, 1) [000832] ------------ t832 = LCL_VAR byref V52 tmp45 u:3 $440
N013 ( 1, 1) [000833] ------------ t833 = CNS_INT long 8 Fseq[_result] $142
/--* t832 byref
+--* t833 long
N014 ( 2, 2) [000834] -------N---- t834 = * ADD byref $283
N016 ( 1, 1) [000836] -------N---- t836 = LCL_VAR int V48 tmp41 u:3 (last use) $44
/--* t834 byref
+--* t836 int
[001022] -A---------- * STOREIND int
N019 ( 1, 1) [000839] ------------ t839 = LCL_VAR byref V52 tmp45 u:3 $440
N020 ( 1, 1) [000840] ------------ t840 = CNS_INT long 12 Fseq[_token] $143
/--* t839 byref
+--* t840 long
N021 ( 2, 2) [000841] -------N---- t841 = * ADD byref $284
N023 ( 2, 2) [000843] -------N---- t843 = LCL_VAR short V49 tmp42 u:3 (last use) $40
/--* t841 byref
+--* t843 short
[001023] -A---------- * STOREIND short
N026 ( 1, 1) [000846] ------------ t846 = LCL_VAR byref V52 tmp45 u:3 (last use) $440
N027 ( 1, 1) [000847] ------------ t847 = CNS_INT long 14 Fseq[_continueOnCapturedContext] $144
/--* t846 byref
+--* t847 long
N028 ( 2, 2) [000848] -------N---- t848 = * ADD byref $285
N030 ( 2, 2) [000850] -------N---- t850 = LCL_VAR bool V50 tmp43 u:3 (last use) $40
/--* t848 byref
+--* t850 bool
[001024] -A---------- * STOREIND bool
( 10, 10) [000277] ------------ IL_OFFSET void IL offset: 0x10
N001 ( 3, 2) [000272] ------------ t272 = LCL_VAR struct(AX) V16 tmp9 $2c2
N002 ( 3, 4) [000271] D------N---- t271 = LCL_FLD_ADDR byref V12 tmp5 d:3[+0] Fseq[_value]
/--* t271 byref
+--* t272 struct
N004 ( 6, 7) [000275] xA--G------- * STORE_BLK(16) struct (copy)
N001 ( 3, 2) [000228] ------------ t228 = LCL_VAR struct V12 tmp5 u:3 (last use) $302
N002 ( 3, 2) [000230] D------N---- t230 = LCL_VAR_ADDR byref V05 loc4 d:3
/--* t230 byref
+--* t228 struct
[001025] xA---------- * STORE_BLK(16) struct (copy)
( 10, 10) [000315] ------------ IL_OFFSET void IL offset: 0x1a
N001 ( 3, 4) [000285] -------N---- t285 = LCL_FLD_ADDR byref V05 loc4 u:3[+0] Fseq[_value] (last use)
/--* t285 byref
N003 ( 6, 7) [000295] x---G------- t295 = * IND struct $500
N004 ( 3, 2) [000312] D------N---- t312 = LCL_VAR_ADDR byref V18 tmp11 d:3
/--* t312 byref
+--* t295 struct
[001026] xA--G------- * STORE_BLK(16) struct (copy)
( 10, 10) [000311] ------------ IL_OFFSET void IL offset: 0x1a
N001 ( 3, 2) [000306] ------------ t306 = LCL_VAR struct V18 tmp11 u:3 (last use) $500
N002 ( 3, 4) [000305] D------N---- t305 = LCL_FLD_ADDR byref V17 tmp10 d:3[+0] Fseq[_value]
/--* t305 byref
+--* t306 struct
N004 ( 6, 7) [000309] xA---------- * STORE_BLK(16) struct (copy)
N001 ( 3, 2) [000298] ------------ t298 = LCL_VAR struct V17 tmp10 u:3 (last use) $500
N002 ( 3, 2) [000300] D------N---- t300 = LCL_VAR_ADDR byref V03 loc2
/--* t300 byref
+--* t298 struct
[001027] xA---------- * STORE_BLK(16) struct (copy)
( 3, 4) [000335] ------------ IL_OFFSET void IL offset: 0x22
N001 ( 3, 4) [000331] ------------ t331 = LCL_FLD ref V03 loc2 [+0] Fseq[_value, _obj] $347
/--* t331 ref
N003 ( 3, 4) [000334] DA--G------- * STORE_LCL_VAR ref V20 tmp13 d:3
( 5, 5) [000340] ------------ IL_OFFSET void IL offset: 0x22
N001 ( 1, 1) [000336] ------------ t336 = LCL_VAR ref V20 tmp13 u:3 $347
N002 ( 1, 1) [000337] ------------ t337 = CNS_INT ref null $VN.Null
/--* t336 ref
+--* t337 ref
N003 ( 3, 3) [000338] J------N---- t338 = * NE int $242
/--* t338 int
N004 ( 5, 5) [000339] ------------ * JTRUE void
With the aim of changing
4C8D4DA8 lea r9, bword ptr [rbp-58H]
498911 mov gword ptr [r9], rdx
41894908 mov dword ptr [r9+8], ecx
664189410C mov word ptr [r9+12], ax
4588410E mov byte ptr [r9+14], r8b
C4E17A6F45A8 vmovdqu xmm0, qword ptr [rbp-58H]
C4E17A7F45B8 vmovdqu qword ptr [rbp-48H], xmm0
C4E17A6F45B8 vmovdqu xmm0, qword ptr [rbp-48H]
C4E17A7F45C8 vmovdqu qword ptr [rbp-38H], xmm0
C4E17A6F45C8 vmovdqu xmm0, qword ptr [rbp-38H]
C4E17A7F4588 vmovdqu qword ptr [rbp-78H], xmm0
C4E17A6F4588 vmovdqu xmm0, qword ptr [rbp-78H]
C4E17A7F4598 vmovdqu qword ptr [rbp-68H], xmm0
C4E17A6F4598 vmovdqu xmm0, qword ptr [rbp-68H]
C4E17A7F45D8 vmovdqu qword ptr [rbp-28H], xmm0
488B75D8 mov rsi, gword ptr [rbp-28H]
4885F6 test rsi, rsi
7507 jne SHORT G_M2754_IG10
BF01000000 mov edi, 1
EB4E jmp SHORT G_M2754_IG12
To
4885F6 test rdx, rdx
7507 jne SHORT G_M2754_IG10
BF01000000 mov edi, 1
EB4E jmp SHORT G_M2754_IG12
Seems to be a common pattern after calling async methods
From TechEmpower aspnetcore Platform benchmark
(Jit triggered by running and sending GET request to http://127.0.0.1:8080/plaintext)
// Async method whose only work is awaiting Writer.FlushAsync().
// The compiler-generated state machine for it (<OnReadCompletedAsync>d__52:MoveNext) is the listing that follows.
public async ValueTask OnReadCompletedAsync()
{
await Writer.FlushAsync();
}
; Assembly listing for method <OnReadCompletedAsync>d__52:MoveNext():this
G_M39073_IG01:
55 push rbp
57 push rdi
56 push rsi
4881ECA0000000 sub rsp, 160
C5F877 vzeroupper
488DAC24B0000000 lea rbp, [rsp+B0H]
488BF1 mov rsi, rcx
488D7D80 lea rdi, [rbp-80H]
B91C000000 mov ecx, 28
33C0 xor rax, rax
F3AB rep stosd
488BCE mov rcx, rsi
4889A570FFFFFF mov qword ptr [rbp-90H], rsp
48894D10 mov bword ptr [rbp+10H], rcx
G_M39073_IG02:
488B5510 mov rdx, bword ptr [rbp+10H]
8B5208 mov edx, dword ptr [rdx+8]
488B4D10 mov rcx, bword ptr [rbp+10H]
488B09 mov rcx, gword ptr [rcx]
G_M39073_IG03:
85D2 test edx, edx
0F8405010000 je G_M39073_IG11
488B4910 mov rcx, gword ptr [rcx+16]
488D55C0 lea rdx, bword ptr [rbp-40H]
4533C0 xor r8, r8
488B01 mov rax, qword ptr [rcx]
488B4040 mov rax, qword ptr [rax+64]
FF5038 call qword ptr [rax+56]PipeWriter:FlushAsync(struct):struct:this
G_M39073_IG04:
C4E17A6F45C0 vmovdqu xmm0, qword ptr [rbp-40H] ; (1)
C4E17A7F4588 vmovdqu qword ptr [rbp-78H], xmm0 ; (1) rbp-78H == rbp-40H
488B55D0 mov rdx, qword ptr [rbp-30H] ; (2)
48895598 mov qword ptr [rbp-68H], rdx ; (2) rbp-68H == rbp-30H
G_M39073_IG05:
C4E17A6F4588 vmovdqu xmm0, qword ptr [rbp-78H] ; (1)
C4E17A7F45A0 vmovdqu qword ptr [rbp-60H], xmm0 ; (1) rbp-60H == ... == rbp-40H
488B5598 mov rdx, qword ptr [rbp-68H] ; (2)
488955B0 mov qword ptr [rbp-50H], rdx ; (2) rbp-50H == ... == rbp-30H
G_M39073_IG06:
C4E17A6F45A0 vmovdqu xmm0, qword ptr [rbp-60H] ; (1)
C4E17A7F45D8 vmovdqu qword ptr [rbp-28H], xmm0 ; (1) rbp-28H == ... == rbp-40H
488B55B0 mov rdx, qword ptr [rbp-50H] ; (2)
488955E8 mov qword ptr [rbp-18H], rdx ; (2) rbp-18H == ... == rbp-30H
G_M39073_IG07:
488B75D8 mov rsi, gword ptr [rbp-28H] ; (1) rsi == ... == rbp-40H (1)
4885F6 test rsi, rsi
7507 jne SHORT G_M39073_IG08
BF01000000 mov edi, 1
EB54 jmp SHORT G_M39073_IG10
With rbp-18H and rbp-28H becoming the only ones of these locations used after the above block e.g.
G_M39073_IG08:
488BD6 mov rdx, rsi
48B928A5143CFA7F0000 mov rcx, 0x7FFA3C14A528
E8B4A77C5F call CORINFO_HELP_ISINSTANCEOFCLASS
4885C0 test rax, rax
7413 je SHORT G_M39073_IG09
8B4834 mov ecx, dword ptr [rax+52]
F7C100006001 test ecx, 0x1600000
400F95C7 setne dil
400FB6FF movzx rdi, dil
EB2A jmp SHORT G_M39073_IG10
G_M39073_IG09:
488BCE mov rcx, rsi
480FBF55E0 movsx rdx, word ptr [rbp-20H]
49BB8810BF3BFA7F0000 mov r11, 0x7FFA3BBF1088
48B88810BF3BFA7F0000 mov rax, 0x7FFA3BBF1088
3909 cmp dword ptr [rcx], ecx
FF10 call qword ptr [rax]IValueTaskSource`1:GetStatus(short):int:this
85C0 test eax, eax
400F95C7 setne dil
400FB6FF movzx rdi, dil
G_M39073_IG10:
85FF test edi, edi
0F8586000000 jne G_M39073_IG14
488B4510 mov rax, bword ptr [rbp+10H]
33D2 xor edx, edx
895008 mov dword ptr [rax+8], edx
488B4510 mov rax, bword ptr [rbp+10H]
488D7820 lea rdi, bword ptr [rax+32]
488D75D8 lea rsi, bword ptr [rbp-28H] ; (1) load &rbp-28H to rsi
E890AA7C5F call CORINFO_HELP_ASSIGN_BYREF
48A5 movsq
48A5 movsq
488B5510 mov rdx, bword ptr [rbp+10H]
3912 cmp dword ptr [rdx], edx
488B5510 mov rdx, bword ptr [rbp+10H]
4883C210 add rdx, 16
C6420101 mov byte ptr [rdx+1], 1
3912 cmp dword ptr [rdx], edx
488D4A08 lea rcx, bword ptr [rdx+8]
488D55D8 lea rdx, bword ptr [rbp-28H] ; (1) load &rbp-28H to rdx
4C8B4510 mov r8, bword ptr [rbp+10H]
E85FE8FFFF call AsyncTaskMethodBuilder`1:AwaitUnsafeOnCompleted(byref,byref):this
E900010000 jmp G_M39073_IG22
G_M39073_IG11:
488B5510 mov rdx, bword ptr [rbp+10H]
4883C220 add rdx, 32
G_M39073_IG12:
C4E17A6F02 vmovdqu xmm0, qword ptr [rdx]
C4E17A7F45D8 vmovdqu qword ptr [rbp-28H], xmm0 ; (1) store to rbp-28H
488B4A10 mov rcx, qword ptr [rdx+16]
48894DE8 mov qword ptr [rbp-18H], rcx ; (2) store to rbp-18H
G_M39073_IG13:
33D2 xor rdx, rdx
488B4D10 mov rcx, bword ptr [rbp+10H]
4883C120 add rcx, 32
C4E17857C0 vxorps xmm0, xmm0
C4E17A7F01 vmovdqu qword ptr [rcx], xmm0
48895110 mov qword ptr [rcx+16], rdx
488B5510 mov rdx, bword ptr [rbp+10H]
C74208FFFFFFFF mov dword ptr [rdx+8], -1
G_M39073_IG14:
488B75D8 mov rsi, gword ptr [rbp-28H] ; (1) load from rbp-28H
4885F6 test rsi, rsi
7507 jne SHORT G_M39073_IG15
480FBE7DE8 movsx rdi, byte ptr [rbp-18H] ; (2) load from rbp-18H
EB5C jmp SHORT G_M39073_IG18
What specific pattern are you referring to? You're highlighting things in assembly that do not make sense, such as equating lea instructions with loads.
What specific pattern are you referring to?
The specific pattern is
G_M39073_IG04:
C4E17A6F45C0 vmovdqu xmm0, qword ptr [rbp-40H] ; (1)
C4E17A7F4588 vmovdqu qword ptr [rbp-78H], xmm0 ; (1) rbp-78H == rbp-40H
488B55D0 mov rdx, qword ptr [rbp-30H] ; (2)
48895598 mov qword ptr [rbp-68H], rdx ; (2) rbp-68H == rbp-30H
G_M39073_IG05:
C4E17A6F4588 vmovdqu xmm0, qword ptr [rbp-78H] ; (1)
C4E17A7F45A0 vmovdqu qword ptr [rbp-60H], xmm0 ; (1) rbp-60H == ... == rbp-40H
488B5598 mov rdx, qword ptr [rbp-68H] ; (2)
488955B0 mov qword ptr [rbp-50H], rdx ; (2) rbp-50H == ... == rbp-30H
G_M39073_IG06:
C4E17A6F45A0 vmovdqu xmm0, qword ptr [rbp-60H] ; (1)
C4E17A7F45D8 vmovdqu qword ptr [rbp-28H], xmm0 ; (1) rbp-28H == ... == rbp-40H
488B55B0 mov rdx, qword ptr [rbp-50H] ; (2)
488955E8 mov qword ptr [rbp-18H], rdx ; (2) rbp-18H == ... == rbp-30H
G_M39073_IG07:
488B75D8 mov rsi, gword ptr [rbp-28H] ; (1) rsi == ... == rbp-40H (1)
You're highlighting things in assembly that do not make sense, such as equating lea instructions with loads.
Was highlighting later location usage
I see, so only the first part of the disassembly illustrates the issue. It does look like a typical copy propagation issue but it's not clear what exactly generates those copies to begin with. One copy is likely needed - from the return buffer to the heap location. But how exactly the other 2 or 3 copies appeared is not clear.
Can you also post the relevant IL code?
Can you also post the relevant IL code?
Starting here (not sure which bit it ends on)
Inlines into 06000091 <OnReadCompletedAsync>d__52:MoveNext():this
[1 IL=0018 TR=000078 0600001E] [below ALWAYS_INLINE size] BenchmarkApplication:get_Writer():ref:this
[0 IL=0032 TR=000091 06000093] [FAILED: target not direct] PipeWriter:FlushAsync(struct):struct:this
[2 IL=0041 TR=000101 06002A95] [aggressive inline attribute] ValueTask`1:GetAwaiter():struct:this
[3 IL=0006 TR=000177 06004F09] [aggressive inline attribute] ValueTaskAwaiter`1:.ctor(struct):this
[4 IL=0049 TR=000110 06004F0A] [aggressive inline attribute] ValueTaskAwaiter`1:get_IsCompleted():bool:this
[5 IL=0006 TR=000204 06002A90] [aggressive inline attribute] ValueTask`1:get_IsCompleted():bool:this
[6 IL=0023 TR=000264 0600282A] [below ALWAYS_INLINE size] Task:get_IsCompleted():bool:this
[7 IL=0010 TR=000294 0600282B] [below ALWAYS_INLINE size] Task:IsCompletedMethod(int):bool
[8 IL=0030 TR=000243 0600010F] [aggressive inline attribute] Unsafe:As(ref):ref
[0 IL=0041 TR=000254 06002AA0] [FAILED: target not direct] IValueTaskSource`1:GetStatus(short):int:this
[9 IL=0081 TR=000140 06004E6E] [profitable inline] AsyncValueTaskMethodBuilder:AwaitUnsafeOnCompleted(byref,byref):this
...
IL_0020: callvirt instance valuetype [System.Runtime]System.Threading.Tasks.ValueTask`1<valuetype [System.IO.Pipelines]System.IO.Pipelines.FlushResult> [System.IO.Pipelines]System.IO.Pipelines.PipeWriter::FlushAsync(valuetype [System.Runtime]System.Threading.CancellationToken)
IL_0025: stloc.s V_4
IL_0027: ldloca.s V_4
IL_0029: call instance valuetype [System.Runtime]System.Runtime.CompilerServices.ValueTaskAwaiter`1<!0> valuetype [System.Runtime]System.Threading.Tasks.ValueTask`1<valuetype [System.IO.Pipelines]System.IO.Pipelines.FlushResult>::GetAwaiter()
IL_002e: stloc.2
IL_002f: ldloca.s V_2
IL_0031: call instance bool valuetype [System.Runtime]System.Runtime.CompilerServices.ValueTaskAwaiter`1<valuetype [System.IO.Pipelines]System.IO.Pipelines.FlushResult>::get_IsCompleted()
IL_0036: brtrue.s IL_0074
IL_0038: ldarg.0
IL_0039: ldc.i4.0
IL_003a: dup
IL_003b: stloc.0
IL_003c: stfld int32 PlatformBenchmarks.BenchmarkApplication/'<OnReadCompletedAsync>d__52'::'<>1__state'
IL_0041: ldarg.0
IL_0042: ldloc.2
IL_0043: stfld valuetype [System.Runtime]System.Runtime.CompilerServices.ValueTaskAwaiter`1<valuetype [System.IO.Pipelines]System.IO.Pipelines.FlushResult> PlatformBenchmarks.BenchmarkApplication/'<OnReadCompletedAsync>d__52'::'<>u__1'
IL_0048: ldarg.0
IL_0049: ldflda valuetype [System.Runtime]System.Runtime.CompilerServices.AsyncValueTaskMethodBuilder PlatformBenchmarks.BenchmarkApplication/'<OnReadCompletedAsync>d__52'::'<>t__builder'
IL_004e: ldloca.s V_2
IL_0050: ldarg.0
IL_0051: call instance void [System.Runtime]System.Runtime.CompilerServices.AsyncValueTaskMethodBuilder::AwaitUnsafeOnCompleted<valuetype [System.Runtime]System.Runtime.CompilerServices.ValueTaskAwaiter`1<valuetype [System.IO.Pipelines]System.IO.Pipelines.FlushResult>,valuetype PlatformBenchmarks.BenchmarkApplication/'<OnReadCompletedAsync>d__52'>(!!0&,
Looking at this again...
[000102] ------------ * STMT void (IL 0x016... ???)
[000203] n----------- | /--* IND ref
[000202] L----------- | | \--* ADDR byref
[000097] -----+------ | | \--* LCL_VAR struct(AX) V09 tmp8
[000101] -A--G+------ \--* ASG ref
[000100] *---G+-N---- \--* IND ref
[000099] -----+------ \--* ADDR byref
[000096] D----+-N---- \--* LCL_FLD struct V06 tmp5 [+0] Fseq[_s]
@CarolEidt Why isn't this just ASG(LCL_FLD, LCL_FLD)? It's not a struct copy, it doesn't involve any struct/field reinterpretation etc. Not sure how much that would help but at least we'd get rid of the extra LEA from
L0038: lea rax, [rsp+0x30]
L003d: mov [rax], rcx
For async, an example from the socket transport in Kestrel async Task SocketConnection.ProcessSends()
var result = await output.ReadAsync();
call qword ptr [rax+40]PipeReader:ReadAsync(struct):struct:this
G_M43661_IG05:
vmovdqu xmm0, qword ptr [rbp-E0H] -->
vmovdqu qword ptr [rbp-158H], xmm0 158H
vmovdqu xmm0, qword ptr [rbp-D0H] -->
vmovdqu qword ptr [rbp-148H], xmm0 148H
vmovdqu xmm0, qword ptr [rbp-C0H] -->
vmovdqu qword ptr [rbp-138H], xmm0 138H
mov rdx, qword ptr [rbp-B0H]
mov qword ptr [rbp-128H], rdx
G_M43663_IG06:
vmovdqu xmm0, qword ptr [rbp-158H] 158H -->
vmovdqu qword ptr [rbp-120H], xmm0 120H
vmovdqu xmm0, qword ptr [rbp-148H] 148H -->
vmovdqu qword ptr [rbp-110H], xmm0 110H
vmovdqu xmm0, qword ptr [rbp-138H] 138H -->
vmovdqu qword ptr [rbp-100H], xmm0 100H
mov rdx, qword ptr [rbp-128H]
mov qword ptr [rbp-F0H], rdx
G_M43663_IG07:
vmovdqu xmm0, qword ptr [rbp-120H] 120H -->
vmovdqu qword ptr [rbp-A8H], xmm0 A8H
vmovdqu xmm0, qword ptr [rbp-110H] 110H -->
vmovdqu qword ptr [rbp-98H], xmm0
vmovdqu xmm0, qword ptr [rbp-100H] 100H -->
vmovdqu qword ptr [rbp-88H], xmm0
mov rdx, qword ptr [rbp-F0H]
mov qword ptr [rbp-78H], rdx
G_M43663_IG08:
mov rsi, gword ptr [rbp-A8H] A8H -->
test rsi, rsi
jne SHORT G_M43663_IG09
mov edi, 1
jmp SHORT G_M43663_IG11
Looking at the il for <ProcessSends>d__26:MoveNext():this it might be the exception handling blocking the optimisations as AsyncTaskMethodBuilder::SetException is used in the catch so crosses the EH?
; V00 this byref -> [rbp+0x10] do-not-enreg[H] this
; V01 loc0 int -> rcx
; V02 loc1 ref -> rbx class-hnd
; V03 loc2 struct (40) [rbp-0x50] do-not-enreg[SFB] must-init ld-addr-op
; V04 loc3 struct (32) [rbp-0x70] do-not-enreg[XSFB] must-init addr-exposed ld-addr-op
; V05 loc4 struct (56) [rbp-0xA8] do-not-enreg[XSFB] must-init addr-exposed ld-addr-op
;* V06 loc5 struct ( 8) zero-ref ld-addr-op
; V07 loc6 struct (56) [rbp-0xE0] do-not-enreg[XSB] must-init addr-exposed ld-addr-op
; V08 loc7 ref -> [rbp-0xE8] do-not-enreg[X] must-init addr-exposed ld-addr-op class-hnd
;* V09 loc8 ref -> zero-ref class-hnd
; V10 OutArgs lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
;...
Looking at the il for <ProcessSends>d__26:MoveNext():this it might be the exception handling blocking the optimisations as AsyncTaskMethodBuilder::SetException is used in the catch so crosses the EH?
Exception handling blocks enregistration but these look more like redundant struct copies.
I think we should include aspnet (or at least Kestrel) assemblies in the typical JIT diff. The forward substitution experiment I did a while ago did get rid of some copies like these in some FX assemblies but I have no idea if that would help Kestrel or not.
I think we should include aspnet (or at least Kestrel) assemblies in the typical JIT diff.
Giving it a go... https://github.com/dotnet/jitutils/pull/195
Funny, some of my experimental JIT changes combined show a 2700 bytes improvement for Microsoft.AspNetCore.Server.Kestrel.Core. But most of improvements come from better struct copy codegen rather than from eliminating redundant copies. Similar for Microsoft.AspNetCore.Server.Kestrel.Transport.Sockets. Oh well, need to look at the dumps to see what exactly is going on.
Async gets big fast, so this method becomes 2180 bytes of asm
// Loops over reads from Output: awaits a read, sends any non-empty buffer via _sender,
// then calls AdvanceTo on the output. Exits when a read result is canceled or completed.
private async Task ProcessSends()
{
var output = Output;
while (true)
{
var result = await output.ReadAsync();
if (result.IsCanceled)
{
break;
}
var buffer = result.Buffer;
// End position and completion flag are copied into locals before the send is awaited.
var end = buffer.End;
var isCompleted = result.IsCompleted;
if (!buffer.IsEmpty)
{
await _sender.SendAsync(buffer);
}
// Advance to the end captured above, whether or not anything was sent.
output.AdvanceTo(end);
if (isCompleted)
{
break;
}
}
}
; Assembly listing for method SocketConnection:ProcessSends():ref:this
; Lcl frame size = 152
; Total bytes of code 90, prolog size 29 for method SocketConnection:ProcessSends():ref:this
md5-6b79212e98faf2152a78cc5179d291d8
; Assembly listing for method AsyncMethodBuilderCore:Start(byref)
; Lcl frame size = 72
; Total bytes of code 216, prolog size 18 for method AsyncMethodBuilderCore:Start(byref)
md5-6b79212e98faf2152a78cc5179d291d8
; Assembly listing for method AsyncStateMachineBox`1:MoveNext(ref):this
; Lcl frame size = 56
; Total bytes of code 349, prolog size 11 for method AsyncStateMachineBox`1:MoveNext(ref):this
md5-6b79212e98faf2152a78cc5179d291d8
; Assembly listing for method <ProcessSends>d__26:MoveNext():this
; Lcl frame size = 408
; Total bytes of code 1525, prolog size 59 for method <ProcessSends>d__26:MoveNext():this
It would be worse if ASP.NET Core used .ConfigureAwait anywhere, which they don't 😅, as that's a further struct wrapper on top.
Looking at StreamReader.ReadBufferAsync() in coreclr they are all marked as do-not-enreg
NewObj constructor temp is one of the reasons, and must-init addr-exposed ld-addr-op the other (could be due to `in`, which reduced an additional copy? https://github.com/dotnet/coreclr/pull/22738)
; Assembly listing for method <ReadBufferAsync>d__69:MoveNext():this
call qword ptr [rax+32]Stream:ReadAsync(struct,struct):struct:this
mov rcx, gword ptr [rbp-58H]
mov eax, dword ptr [rbp-50H]
mov edx, dword ptr [rbp-4CH]
movsx rdx, dx
xor r8d, r8d
lea r9, bword ptr [rbp-B8H] ; do-not-enreg[XSB] must-init addr-exposed ld-addr-op "Inline stloc first use temp"
mov gword ptr [r9], rcx
mov dword ptr [r9+8], eax
mov word ptr [r9+12], dx
mov byte ptr [r9+14], r8b
G_M42774_IG33:
movdqu xmm0, qword ptr [rbp-B8H]
movdqu qword ptr [rbp-C8H], xmm0 ; do-not-enreg[SFB] must-init "NewObj constructor temp"
G_M42774_IG34:
movdqu xmm0, qword ptr [rbp-C8H]
movdqu qword ptr [rbp-68H], xmm0 ; do-not-enreg[XSFB] must-init addr-exposed ld-addr-op
G_M42774_IG35:
movdqu xmm0, qword ptr [rbp-68H]
movdqu qword ptr [rbp-D8H], xmm0 ; do-not-enreg[SFB] must-init "NewObj constructor temp"
G_M42774_IG36:
movdqu xmm0, qword ptr [rbp-D8H]
movdqu qword ptr [rbp-78H], xmm0 ; do-not-enreg[XSFB] must-init addr-exposed ld-addr-op
G_M42774_IG37:
mov rsi, gword ptr [rbp-78H]
***** BB41, stmt 109
( 10, 10) [001137] ------------ * STMT void (IL 0x187... ???)
N005 ( 10, 10) [001136] -A--G---R--- \--* ASG struct (copy) $VN.Void
N004 ( 6, 7) [001135] n----------- +--* BLK(16) struct
N003 ( 3, 5) [001134] ------------ | \--* ADDR byref $19d
N002 ( 3, 4) [001129] D------N---- | \--* LCL_FLD struct V51 tmp37 d:2[+0] Fseq[_value]
N001 ( 3, 2) [001131] ----G--N---- \--* LCL_VAR struct(AX) V50 tmp36 $345
***** BB41, stmt 110
( 7, 5) [000142] ------------ * STMT void (IL ???... ???)
N003 ( 7, 5) [001091] -A------R--- \--* ASG struct (copy) $VN.Void
N002 ( 3, 2) [001090] D------N---- +--* LCL_VAR struct V10 loc9 d:2
N001 ( 3, 2) [001088] ------------ \--* LCL_VAR struct V51 tmp37 u:2 (last use) $383
***** BB41, stmt 111
( 13, 15) [001168] ------------ * STMT void (IL 0x191... ???)
N007 ( 13, 15) [001167] -A--G---R--- \--* ASG struct (copy) $VN.Void
N006 ( 6, 7) [001166] n----------- +--* BLK(16) struct
N005 ( 3, 5) [001165] ------------ | \--* ADDR byref $19f
N004 ( 3, 4) [001158] D------N---- | \--* LCL_FLD struct V55 tmp41 d:2[+0] Fseq[_value]
N003 ( 6, 7) [001163] n---G------- \--* IND struct $3c3
N002 ( 3, 5) [001159] ----G------- \--* ADDR byref $19e
N001 ( 3, 4) [001162] -------N---- \--* LCL_FLD struct V10 loc9 u:2[+0] Fseq[_value] (last use) $3c3
***** BB41, stmt 112
( 7, 5) [000151] ------------ * STMT void (IL ???... ???)
N003 ( 7, 5) [001155] -A--G---R--- \--* ASG struct (copy) $VN.Void
N002 ( 3, 2) [001154] D---G--N---- +--* LCL_VAR struct(AX) V12 loc11
N001 ( 3, 2) [001152] ------------ \--* LCL_VAR struct V55 tmp41 u:2 (last use) $346
***** BB41, stmt 113
( 3, 4) [001186] ------------ * STMT void (IL 0x19A... ???)
N003 ( 3, 4) [001185] -A--G---R--- \--* ASG ref $31d
N002 ( 1, 1) [001184] D------N---- +--* LCL_VAR ref V57 tmp43 d:2 $31d
N001 ( 3, 4) [001182] ----G------- \--* LCL_FLD ref V12 loc11 [+0] Fseq[_value, _obj] $31d
N1223. IL_OFFSET IL offset: 0x187
N1225. V50 MEM
N1227. LCL_FLD_ADDR V51 tmp37 d:2[+0] Fseq[_value] NA
N1229. STORE_BLK(16)
N1231. V51 MEM
N1233. LCL_VAR_ADDR V10 loc9 d:2 NA
N1235. STORE_BLK(16)
N1237. IL_OFFSET IL offset: 0x191
N1239. LCL_FLD_ADDR V10 loc9 u:2[+0] Fseq[_value] NA (last use)
N1241. IND
N1243. LCL_FLD_ADDR V55 tmp41 d:2[+0] Fseq[_value] NA
N1245. STORE_BLK(16)
N1247. V55 MEM
N1249. LCL_VAR_ADDR V12 loc11 NA
N1251. STORE_BLK(16)
N1253. IL_OFFSET IL offset: 0x19a
N1255. rsi = V12 MEM
md5-6b79212e98faf2152a78cc5179d291d8
IN00e3: mov byte ptr [r9+14], r8b
genIPmappingAdd: ignoring duplicate IL offset 0x187
Generating: N1223 ( 10, 10) [001137] ------------ IL_OFFSET void IL offset: 0x187 REG NA
Generating: N1225 ( 3, 2) [001131] -c-----N---- t1131 = LCL_VAR struct(AX) V50 tmp36 NA REG NA $345
Generating: N1227 ( 3, 4) [001129] Dc-----N---- t1129 = LCL_FLD_ADDR byref V51 tmp37 d:2[+0] Fseq[_value] NA REG NA
/--* t1129 byref
+--* t1131 struct
Generating: N1229 ( 6, 7) [001135] nA--G------- * STORE_BLK(16) struct (copy) (Unroll) REG NA
G_M33302_IG32: ; offs=000353H, funclet=00
IN00e4: movdqu xmm0, qword ptr [V50 rbp-B8H]
IN00e5: movdqu qword ptr [V51 rbp-C8H], xmm0
Generating: N1231 ( 3, 2) [001088] -c---------- t1088 = LCL_VAR struct V51 tmp37 u:2 NA (last use) REG NA $383
Generating: N1233 ( 3, 2) [001090] Dc-----N---- t1090 = LCL_VAR_ADDR byref V10 loc9 d:2 NA REG NA
/--* t1090 byref
+--* t1088 struct
Generating: N1235 (???,???) [002170] nA---------- * STORE_BLK(16) struct (copy) (Unroll) REG NA
G_M33302_IG33: ; offs=0003A5H, funclet=00
IN00e6: movdqu xmm0, qword ptr [V51 rbp-C8H]
IN00e7: movdqu qword ptr [V10 rbp-68H], xmm0
Added IP mapping: 0x0191 STACK_EMPTY (G_M33302_IG34,ins#2,ofs#13)
Generating: N1237 ( 13, 15) [001168] ------------ IL_OFFSET void IL offset: 0x191 REG NA
Generating: N1239 ( 3, 4) [001162] -c--G--N---- t1162 = LCL_FLD_ADDR byref V10 loc9 u:2[+0] Fseq[_value] NA (last use) REG NA
/--* t1162 byref
Generating: N1241 ( 6, 7) [001163] nc--G------- t1163 = * IND struct REG NA $3c3
Generating: N1243 ( 3, 4) [001158] Dc-----N---- t1158 = LCL_FLD_ADDR byref V55 tmp41 d:2[+0] Fseq[_value] NA REG NA
/--* t1158 byref
+--* t1163 struct
Generating: N1245 ( 6, 7) [001166] nA--G------- * STORE_BLK(16) struct (copy) (Unroll) REG NA
G_M33302_IG34: ; offs=0003B5H, funclet=00
IN00e8: movdqu xmm0, qword ptr [V10 rbp-68H]
IN00e9: movdqu qword ptr [V55 rbp-D8H], xmm0
Generating: N1247 ( 3, 2) [001152] -c---------- t1152 = LCL_VAR struct V55 tmp41 u:2 NA (last use) REG NA $346
Generating: N1249 ( 3, 2) [001154] Dc-----N---- t1154 = LCL_VAR_ADDR byref V12 loc11 NA REG NA
/--* t1154 byref
+--* t1152 struct
Generating: N1251 (???,???) [002171] nA---------- * STORE_BLK(16) struct (copy) (Unroll) REG NA
G_M33302_IG35: ; offs=0003C2H, funclet=00
IN00ea: movdqu xmm0, qword ptr [V55 rbp-D8H]
IN00eb: movdqu qword ptr [V12 rbp-78H], xmm0
Added IP mapping: 0x019A STACK_EMPTY (G_M33302_IG36,ins#2,ofs#13)
Generating: N1253 ( 3, 4) [001186] ------------ IL_OFFSET void IL offset: 0x19a REG NA
Generating: N1255 ( 3, 4) [001182] ------------ t1182 = LCL_FLD ref V12 loc11 [+0] Fseq[_value, _obj] rsi REG rsi $31d
G_M33302_IG36: ; offs=0003CFH, funclet=00
IN00ec: mov rsi, gword ptr [V12 rbp-78H]
Probably because there are a bunch of struct typed LCL_FLD nodes that the JIT cannot handle well.
struct typed LCL_FLD nodes that the JIT cannot handle well.
Hmm, because it's in the il?
await tmpStream.ReadAsync(new Memory<byte>(tmpByteBuffer)).ConfigureAwait(false)
Becoming stloc.s, ldloca.s chains
.locals init (
...
[8] valuetype System.Threading.Tasks.ValueTask`1<int32>,
[9] valuetype System.Runtime.CompilerServices.ConfiguredValueTaskAwaitable`1<int32>,
...
[11] valuetype System.Runtime.CompilerServices.ConfiguredValueTaskAwaitable`1/ConfiguredValueTaskAwaiter<int32>,
...
)
IL_0180: callvirt instance valuetype System.Threading.Tasks.ValueTask`1<int32> System.IO.Stream::ReadAsync(valuetype System.Memory`1<uint8>, valuetype System.Threading.CancellationToken)
IL_0185: stloc.s 8
IL_0187: ldloca.s 8
IL_0189: ldc.i4.0
IL_018a: call instance valuetype System.Runtime.CompilerServices.ConfiguredValueTaskAwaitable`1<!0> valuetype System.Threading.Tasks.ValueTask`1<int32>::ConfigureAwait(bool)
IL_018f: stloc.s 9
IL_0191: ldloca.s 9
IL_0193: call instance valuetype System.Runtime.CompilerServices.ConfiguredValueTaskAwaitable`1/ConfiguredValueTaskAwaiter<!0> valuetype System.Runtime.CompilerServices.ConfiguredValueTaskAwaitable`1<int32>::GetAwaiter()
IL_0198: stloc.s 11
IL_019a: ldloca.s 11
IL_019c: call instance bool valuetype System.Runtime.CompilerServices.ConfiguredValueTaskAwaitable`1/ConfiguredValueTaskAwaiter<int32>::get_IsCompleted()
IL_01a1: brtrue.s IL_01e4
So the Jit is doing what it's asked to do, in a way?
Well, I don't know where those LCL_FLDs come from, presumably they were present in some IL - FSeq shows a field named _value. What's for sure is that the JIT's handling of struct typed fields needs improvement. Hopefully we'll get there sooner or later, I have a pending PR that's supposed to provide the preliminaries for that.
Might not help that they are different types, even though they are the exact same sized structs containing the same data? (as they are wrappers over each other)
If I fold ConfiguredValueTaskAwaitable<TResult>.ConfiguredValueTaskAwaiter into ConfiguredValueTaskAwaitable<TResult> and change GetAwaiter from
public ConfiguredValueTaskAwaiter GetAwaiter() => new ConfiguredValueTaskAwaiter(in _value);
to
public ConfiguredValueTaskAwaitable<TResult> GetAwaiter() => this;
Then one of the ; do-not-enreg[SFB] must-init "NewObj constructor temp" drops out and it's down to 3 copies rather than 4:
call qword ptr [rax+32]Stream:ReadAsync(struct,struct):struct:this
mov rcx, gword ptr [rbp-58H]
mov eax, dword ptr [rbp-50H]
mov edx, dword ptr [rbp-4CH]
movsx rdx, dx
xor r8d, r8d
lea r9, bword ptr [rbp-A8H] ; do-not-enreg[XSB] must-init addr-exposed ld-addr-op "Inline stloc first use temp"
mov gword ptr [r9], rcx
mov dword ptr [r9+8], eax
mov word ptr [r9+12], dx
mov byte ptr [r9+14], r8b
G_M42772_IG32:
movdqu xmm0, qword ptr [rbp-A8H]
movdqu qword ptr [rbp-B8H], xmm0 ; do-not-enreg[SFB] must-init "NewObj constructor temp"
G_M42772_IG33:
movdqu xmm0, qword ptr [rbp-B8H]
movdqu qword ptr [rbp-68H], xmm0 ; do-not-enreg[SB] must-init ld-addr-op
G_M42772_IG34:
movdqu xmm0, qword ptr [rbp-68H] ; do-not-enreg[XSFB] must-init addr-exposed ld-addr-op
movdqu qword ptr [rbp-78H], xmm0
G_M42772_IG35:
mov rsi, gword ptr [rbp-78H]
test rsi, rsi
Guessing that's `new` no longer being called rather than the types now matching though?
Here's a derived example with similar results (3 back to back copies):
```C#
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
/// <summary>
/// Reduced value-task-like struct used by this repro of back-to-back struct copies.
/// Holds an object reference, a short token and a bool flag.
/// </summary>
[StructLayout(LayoutKind.Auto)]
public readonly struct xValueTask
{
internal readonly object _obj;
internal readonly short _token;
internal readonly bool _continueOnCapturedContext;
/// <summary>
/// Stores <paramref name="o"/> in _obj, sets _continueOnCapturedContext to true and _token to 0.
/// Throws (through ThrowHelper) when <paramref name="o"/> is null.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public xValueTask(object o)
{
if (o == null)
{
ThrowHelper();
}
_obj = o;
_continueOnCapturedContext = true;
_token = 0;
}
// The throw lives in its own method rather than in the AggressiveInlining constructor body.
static void ThrowHelper()
{
throw new Exception();
}
/// <summary>
/// True when _obj is null or is a string; false for any other object.
/// </summary>
public bool IsCompleted
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get
{
object obj = _obj;
if (obj == null)
{
return true;
}
// The pattern variable s is never read; only the type test matters here.
if (obj is string s)
{
return true;
}
return false;
}
}
}
/// <summary>
/// Struct wrapper whose only field is an xValueTask.
/// </summary>
[StructLayout(LayoutKind.Auto)]
public readonly struct xConfiguredValueTaskAwaitable
{
private readonly xValueTask _value;
// Takes the wrapped value by `in` reference and copies it into _value.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal xConfiguredValueTaskAwaitable(in xValueTask value) => _value = value;
// Builds a new xConfiguredValueTaskAwaiter from _value, passed by `in` reference.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public xConfiguredValueTaskAwaiter GetAwaiter() => new xConfiguredValueTaskAwaiter(in _value);
}
/// <summary>
/// Second struct wrapper whose only field is an xValueTask; exposes the wrapped value's IsCompleted.
/// </summary>
[StructLayout(LayoutKind.Auto)]
public readonly struct xConfiguredValueTaskAwaiter
{
private readonly xValueTask _value;
// Takes the wrapped value by `in` reference and copies it into _value.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal xConfiguredValueTaskAwaiter(in xValueTask value) => _value = value;
// Forwards to xValueTask.IsCompleted. Unlike the other members, this getter carries no AggressiveInlining attribute.
public bool IsCompleted
{
get => _value.IsCompleted;
}
}
// Driver for the repro: wraps an xValueTask twice and queries IsCompleted through the wrappers.
class X
{
// Returns 100 when IsCompleted is true, otherwise 0.
// With the string "xyz" as the wrapped object, IsCompleted is true.
public static int Main()
{
xValueTask xvt = new xValueTask("xyz");
xConfiguredValueTaskAwaitable xcvta = new xConfiguredValueTaskAwaitable(in xvt);
return xcvta.GetAwaiter().IsCompleted ? 100 : 0;
}
}
```
we end up with
```asm
G_M52876_IG04:
C5FA6F442430 vmovdqu xmm0, qword ptr [rsp+30H]
C5FA7F442450 vmovdqu qword ptr [rsp+50H], xmm0
G_M52876_IG05:
C5FA6F442450 vmovdqu xmm0, qword ptr [rsp+50H]
C5FA7F442420 vmovdqu qword ptr [rsp+20H], xmm0
G_M52876_IG06:
C5FA6F442420 vmovdqu xmm0, qword ptr [rsp+20H]
C5FA7F442440 vmovdqu qword ptr [rsp+40H], xmm0
```
Pipeline use in Kestrel is this variation on the above
+ using System.Buffers;
+ [Flags]
+ internal enum ResultFlags : byte
+ {
+ None = 0x0,
+ Canceled = 0x1,
+ Completed = 0x2
+ }
+ public readonly struct ReadResult
+ {
+ internal readonly ReadOnlySequence<byte> _resultBuffer;
+ internal readonly ResultFlags _resultFlags;
+ }
[StructLayout(LayoutKind.Auto)]
public readonly struct xValueTask
{
+ internal readonly ReadResult _result;
...
Which has 3 xmm0 shuffles per copy sharplab.io:
L005e: vmovdqu xmm0, [rsp+0xe0]
L0067: vmovdqu [rsp+0x50], xmm0
L006d: vmovdqu xmm0, [rsp+0xf0]
L0076: vmovdqu [rsp+0x60], xmm0
L007c: vmovdqu xmm0, [rsp+0x100]
L0085: vmovdqu [rsp+0x70], xmm0
L008b: vmovdqu xmm0, [rsp+0x50]
L0091: vmovdqu [rsp+0xb0], xmm0
L009a: vmovdqu xmm0, [rsp+0x60]
L00a0: vmovdqu [rsp+0xc0], xmm0
L00a9: vmovdqu xmm0, [rsp+0x70]
L00af: vmovdqu [rsp+0xd0], xmm0
L00b8: vmovdqu xmm0, [rsp+0xb0]
L00c1: vmovdqu [rsp+0x20], xmm0
L00c7: vmovdqu xmm0, [rsp+0xc0]
L00d0: vmovdqu [rsp+0x30], xmm0
L00d6: vmovdqu xmm0, [rsp+0xd0]
L00df: vmovdqu [rsp+0x40], xmm0
L00e5: vmovdqu xmm0, [rsp+0x20]
L00eb: vmovdqu [rsp+0x80], xmm0
L00f4: vmovdqu xmm0, [rsp+0x30]
L00fa: vmovdqu [rsp+0x90], xmm0
L0103: vmovdqu xmm0, [rsp+0x40]
L0109: vmovdqu [rsp+0xa0], xmm0
Guess would be copyprop? Currently outputs this
Copy Assertion for BB02
curSsaName stack: { }
Live vars: {V00} => {}
Live vars: {} => {V04}
Live vars: {V04} => {}
Live vars: {} => {V01}
Live vars: {V01} => {}
Live vars: {} => {V05}
Live vars: {V05} => {}
Live vars: {} => {V02}
Live vars: {V02} => {}
Live vars: {} => {V07}
But doesn't then do any copyprop
Adding in the loop exit conditions
Copy Assertion for BB02
curSsaName stack: { }
!tree->IsLocal()
!tree->IsLocal()
tree->OperGet() == GT_PHI_ARG || tree->OperGet() == GT_LCL_FLD
!tree->IsLocal()
!tree->IsLocal()
tree->OperGet() == GT_PHI_ARG || tree->OperGet() == GT_LCL_FLD
!tree->IsLocal()
!tree->IsLocal()
tree->OperGet() == GT_PHI_ARG || tree->OperGet() == GT_LCL_FLD
!tree->IsLocal()
!tree->IsLocal()
tree->OperGet() == GT_PHI_ARG || tree->OperGet() == GT_LCL_FLD
!tree->IsLocal()
!tree->IsLocal()
!tree->IsLocal()
Live vars: {V00} => {}
lclNum == newLclNum
Live vars: {} => {V04}
tree->OperGet() == GT_PHI_ARG || tree->OperGet() == GT_LCL_FLD
!tree->IsLocal()
!tree->IsLocal()
!tree->IsLocal()
Live vars: {V04} => {}
opVN != tree->gtVNPair.GetConservative()
lclNum == newLclNum
Live vars: {} => {V01}
tree->gtFlags & GTF_VAR_DEF
!tree->IsLocal()
Live vars: {V01} => {}
tree->OperGet() == GT_PHI_ARG || tree->OperGet() == GT_LCL_FLD
!tree->IsLocal()
!tree->IsLocal()
Live vars: {} => {V05}
tree->OperGet() == GT_PHI_ARG || tree->OperGet() == GT_LCL_FLD
!tree->IsLocal()
!tree->IsLocal()
!tree->IsLocal()
Live vars: {V05} => {}
opVN != tree->gtVNPair.GetConservative()
opVN == ValueNumStore::NoVN
opVN == ValueNumStore::NoVN
lclNum == newLclNum
Live vars: {} => {V02}
tree->gtFlags & GTF_VAR_DEF
!tree->IsLocal()
Live vars: {V02} => {}
tree->OperGet() == GT_PHI_ARG || tree->OperGet() == GT_LCL_FLD
Live vars: {} => {V07}
tree->gtFlags & GTF_VAR_DEF
!tree->IsLocal()
op->TypeGet() != tree->TypeGet()
opVN == ValueNumStore::NoVN
opVN == ValueNumStore::NoVN
opVN == ValueNumStore::NoVN
opVN == ValueNumStore::NoVN
lclNum == newLclNum
!tree->IsLocal()
!tree->IsLocal()
!tree->IsLocal()
And stage before
***** BB02, stmt 5 (before)
N005 ( 8, 9) [000071] IA------R--- * ASG struct (init)
N004 ( 6, 7) [000070] n------N---- +--* BLK(32) struct
N003 ( 3, 5) [000068] ------------ | \--* ADDR byref
N002 ( 3, 4) [000066] U------N---- | \--* LCL_FLD struct V00 loc0 ud:4->5[+16] Fseq[_result]
N001 ( 1, 1) [000069] ------------ \--* CNS_INT int 0
N001 [000069] CNS_INT 0 => $40 {IntCns 0}
VNApplySelectors:
VNForHandle(_result) is $c4, fieldType is struct, size = 32
AX2: $c4 != $c3 ==> select([$201]store($1c1, $c3, $40), $c4) ==> select($1c1, $c4).
AX2: $c4 != $c2 ==> select([$1c1]store($102, $c2, $43), $c4) ==> select($102, $c4).
AX2: $c4 != $c1 ==> select([$102]store($1, $c1, $100), $c4) ==> select($1, $c4).
VNForMapSelect($201, $c4):struct returns $VN.ZeroMap
*** Mismatched types in VNApplySelectorsTypeCheck (indType is TYP_STRUCT)
VNApplySelectors:
VNForHandle(_result) is $c4, fieldType is struct, size = 32
AX2: $c4 != $c3 ==> select([$200]store($1c0, $c3, $40), $c4) ==> select($1c0, $c4).
AX2: $c4 != $c2 ==> select([$1c0]store($101, $c2, $43), $c4) ==> select($101, $c4).
AX2: $c4 != $c1 ==> select([$101]store($1, $c1, $142), $c4) ==> select($1, $c4).
VNForMapSelect($200, $c4):struct returns $VN.ZeroMap
*** Mismatched types in VNApplySelectorsTypeCheck (indType is TYP_STRUCT)
N002 [000066] LCL_FLD V00 loc0 ud:4->5[+16] Fseq[_result] => <l:$143 {143}, c:$144 {144}>
FieldSeq {_result} is $240
N003 [000068] ADDR => $280 {PtrToLoc($40, $240)}
N005 [000071] ASG V00/5 => $2c0 {2c0}
N005 [000071] ASG => $VN.Void
***** BB02, stmt 5 (after)
N005 ( 8, 9) [000071] IA------R--- * ASG struct (init) $VN.Void
N004 ( 6, 7) [000070] n------N---- +--* BLK(32) struct
N003 ( 3, 5) [000068] ------------ | \--* ADDR byref $280
N002 ( 3, 4) [000066] U------N---- | \--* LCL_FLD struct V00 loc0 ud:4->5[+16] Fseq[_result] <l:$143, c:$144>
N001 ( 1, 1) [000069] ------------ \--* CNS_INT int 0 $40
---------
***** BB02, stmt 6 (before)
N005 ( 10, 10) [000084] -A------R--- * ASG struct (copy)
N004 ( 6, 7) [000083] n----------- +--* BLK(48) struct
N003 ( 3, 5) [000082] ------------ | \--* ADDR byref
N002 ( 3, 4) [000077] D------N---- | \--* LCL_FLD struct V04 tmp1 d:2[+0] Fseq[_value]
N001 ( 3, 2) [000079] -------N---- \--* LCL_VAR struct V00 loc0 u:5 (last use)
N001 [000079] LCL_VAR V00 loc0 u:5 (last use) => $2c0 {2c0}
FieldSeq {_value} is $241
N003 [000082] ADDR => $281 {PtrToLoc($44, $241)}
VNApplySelectorsAssign:
VNForHandle(_value) is $c5, fieldType is struct
VNForMapStore($2c1, $c5, $2c0):struct returns $300 {$2c1[$c5 := $2c0]}
VNApplySelectorsAssign:
VNForHandle(_value) is $c5, fieldType is struct
VNForMapStore($2c1, $c5, $2c0):struct returns $300 {$2c1[$c5 := $2c0]}
Tree [000084] assigned VN to local var V04/2: $300 {$2c1[$c5 := $2c0]}
N005 [000084] ASG => $VN.Void
***** BB02, stmt 6 (after)
N005 ( 10, 10) [000084] -A------R--- * ASG struct (copy) $VN.Void
N004 ( 6, 7) [000083] n----------- +--* BLK(48) struct
N003 ( 3, 5) [000082] ------------ | \--* ADDR byref $281
N002 ( 3, 4) [000077] D------N---- | \--* LCL_FLD struct V04 tmp1 d:2[+0] Fseq[_value]
N001 ( 3, 2) [000079] -------N---- \--* LCL_VAR struct V00 loc0 u:5 (last use) $2c0
---------
***** BB02, stmt 7 (before)
N003 ( 7, 5) [000016] -A------R--- * ASG struct (copy)
N002 ( 3, 2) [000014] D------N---- +--* LCL_VAR struct V01 loc1 d:2
N001 ( 3, 2) [000013] ------------ \--* LCL_VAR struct V04 tmp1 u:2 (last use)
N001 [000013] LCL_VAR V04 tmp1 u:2 (last use) => $300 {$2c1[$c5 := $2c0]}
Tree [000016] assigned VN to local var V01/2: $300 {$2c1[$c5 := $2c0]}
N003 [000016] ASG => $VN.Void
***** BB02, stmt 7 (after)
N003 ( 7, 5) [000016] -A------R--- * ASG struct (copy) $VN.Void
N002 ( 3, 2) [000014] D------N---- +--* LCL_VAR struct V01 loc1 d:2
N001 ( 3, 2) [000013] ------------ \--* LCL_VAR struct V04 tmp1 u:2 (last use) $300
---------
***** BB02, stmt 8 (before)
N007 ( 13, 15) [000115] -A--G---R--- * ASG struct (copy)
N006 ( 6, 7) [000114] n----------- +--* BLK(48) struct
N005 ( 3, 5) [000113] ------------ | \--* ADDR byref
N004 ( 3, 4) [000106] D------N---- | \--* LCL_FLD struct V05 tmp2 d:2[+0] Fseq[_value]
N003 ( 6, 7) [000111] n---G------- \--* IND struct
N002 ( 3, 5) [000107] ----G------- \--* ADDR byref
N001 ( 3, 4) [000110] -------N---- \--* LCL_FLD struct V01 loc1 u:2[+0] Fseq[_value] (last use)
VNApplySelectors:
VNForHandle(_value) is $c5, fieldType is struct, size = 48
AX1: select([$2c1]store($300, $c5, $2c0), $c5) ==> $2c0.
VNForMapSelect($300, $c5):struct returns $2c0 {2c0}
VNApplySelectors:
VNForHandle(_value) is $c5, fieldType is struct, size = 48
AX1: select([$2c1]store($300, $c5, $2c0), $c5) ==> $2c0.
VNForMapSelect($300, $c5):struct returns $2c0 {2c0}
N001 [000110] LCL_FLD V01 loc1 u:2[+0] Fseq[_value] (last use) => $2c0 {2c0}
FieldSeq {_value} is $241
N002 [000107] ADDR => $282 {PtrToLoc($43, $241)}
VNApplySelectors:
VNForHandle(_value) is $c5, fieldType is struct, size = 48
AX1: select([$2c1]store($300, $c5, $2c0), $c5) ==> $2c0.
VNForMapSelect($300, $c5):struct returns $2c0 {2c0}
VNApplySelectors:
VNForHandle(_value) is $c5, fieldType is struct, size = 48
AX1: select([$2c1]store($300, $c5, $2c0), $c5) ==> $2c0.
VNForMapSelect($300, $c5):struct returns $2c0 {2c0}
N003 [000111] IND => $2c0 {2c0}
FieldSeq {_value} is $242
N005 [000113] ADDR => $283 {PtrToLoc($46, $242)}
VNApplySelectors:
VNForHandle(_value) is $c5, fieldType is struct, size = 48
AX1: select([$2c1]store($300, $c5, $2c0), $c5) ==> $2c0.
VNForMapSelect($300, $c5):struct returns $2c0 {2c0}
VNApplySelectors:
VNForHandle(_value) is $c5, fieldType is struct, size = 48
AX1: select([$2c1]store($300, $c5, $2c0), $c5) ==> $2c0.
VNForMapSelect($300, $c5):struct returns $2c0 {2c0}
VNApplySelectorsAssign:
VNForHandle(_value) is $c6, fieldType is struct
VNForMapStore($2c3, $c6, $2c0):struct returns $301 {$2c3[$c6 := $2c0]}
VNApplySelectorsAssign:
VNForHandle(_value) is $c6, fieldType is struct
VNForMapStore($2c3, $c6, $2c0):struct returns $301 {$2c3[$c6 := $2c0]}
Tree [000115] assigned VN to local var V05/2: $301 {$2c3[$c6 := $2c0]}
N007 [000115] ASG => $VN.Void
***** BB02, stmt 8 (after)
N007 ( 13, 15) [000115] -A--G---R--- * ASG struct (copy) $VN.Void
N006 ( 6, 7) [000114] n----------- +--* BLK(48) struct
N005 ( 3, 5) [000113] ------------ | \--* ADDR byref $283
N004 ( 3, 4) [000106] D------N---- | \--* LCL_FLD struct V05 tmp2 d:2[+0] Fseq[_value]
N003 ( 6, 7) [000111] n---G------- \--* IND struct $2c0
N002 ( 3, 5) [000107] ----G------- \--* ADDR byref $282
N001 ( 3, 4) [000110] -------N---- \--* LCL_FLD struct V01 loc1 u:2[+0] Fseq[_value] (last use) $2c0
---------
***** BB02, stmt 9 (before)
N003 ( 7, 5) [000103] -A------R--- * ASG struct (copy)
N002 ( 3, 2) [000102] D------N---- +--* LCL_VAR struct V02 loc2 d:2
N001 ( 3, 2) [000100] ------------ \--* LCL_VAR struct V05 tmp2 u:2 (last use)
N001 [000100] LCL_VAR V05 tmp2 u:2 (last use) => $301 {$2c3[$c6 := $2c0]}
Tree [000103] assigned VN to local var V02/2: $301 {$2c3[$c6 := $2c0]}
N003 [000103] ASG => $VN.Void
***** BB02, stmt 9 (after)
N003 ( 7, 5) [000103] -A------R--- * ASG struct (copy) $VN.Void
N002 ( 3, 2) [000102] D------N---- +--* LCL_VAR struct V02 loc2 d:2
N001 ( 3, 2) [000100] ------------ \--* LCL_VAR struct V05 tmp2 u:2 (last use) $301
---------
***** BB02, stmt 10 (before)
N003 ( 3, 4) [000133] -A--G---R--- * ASG ref
N002 ( 1, 1) [000132] D------N---- +--* LCL_VAR ref V07 tmp4 d:2
N001 ( 3, 4) [000130] ------------ \--* LCL_FLD ref V02 loc2 u:2[+0] Fseq[_value, _obj] (last use)
VNApplySelectors:
VNForHandle(_value) is $c6, fieldType is struct, size = 48
AX1: select([$2c3]store($301, $c6, $2c0), $c6) ==> $2c0.
VNForMapSelect($301, $c6):struct returns $2c0 {2c0}
VNApplySelectors:
VNForHandle(_obj) is $c1, fieldType is ref
VNForMapSelect($2c0, $c1):ref returns $243 {$2c0[$c1]}
VNApplySelectors:
VNForHandle(_value) is $c6, fieldType is struct, size = 48
AX1: select([$2c3]store($301, $c6, $2c0), $c6) ==> $2c0.
VNForMapSelect($301, $c6):struct returns $2c0 {2c0}
VNApplySelectors:
VNForHandle(_obj) is $c1, fieldType is ref
VNForMapSelect($2c0, $c1):ref returns $243 {$2c0[$c1]}
N001 [000130] LCL_FLD V02 loc2 u:2[+0] Fseq[_value, _obj] (last use) => $243 {$2c0[$c1]}
N002 [000132] LCL_VAR V07 tmp4 d:2 => $243 {$2c0[$c1]}
N003 [000133] ASG => $243 {$2c0[$c1]}
***** BB02, stmt 10 (after)
N003 ( 3, 4) [000133] -A--G---R--- * ASG ref $243
N002 ( 1, 1) [000132] D------N---- +--* LCL_VAR ref V07 tmp4 d:2 $243
N001 ( 3, 4) [000130] ------------ \--* LCL_FLD ref V02 loc2 u:2[+0] Fseq[_value, _obj] (last use) $243
Or
***** BB02, stmt 5
N005 ( 8, 9) [000071] IA------R--- * ASG struct (init) $VN.Void
N004 ( 6, 7) [000070] n------N---- +--* BLK(32) struct
N003 ( 3, 5) [000068] ------------ | \--* ADDR byref $280
N002 ( 3, 4) [000066] U------N---- | \--* LCL_FLD struct V00 loc0 ud:4->5[+16] Fseq[_result] <l:$143, c:$144>
N001 ( 1, 1) [000069] ------------ \--* CNS_INT int 0 $40
***** BB02, stmt 6
N005 ( 10, 10) [000084] -A------R--- * ASG struct (copy) $VN.Void
N004 ( 6, 7) [000083] n----------- +--* BLK(48) struct
N003 ( 3, 5) [000082] ------------ | \--* ADDR byref $281
N002 ( 3, 4) [000077] D------N---- | \--* LCL_FLD struct V04 tmp1 d:2[+0] Fseq[_value]
N001 ( 3, 2) [000079] -------N---- \--* LCL_VAR struct V00 loc0 u:5 (last use) $2c0
***** BB02, stmt 7
N003 ( 7, 5) [000016] -A------R--- * ASG struct (copy) $VN.Void
N002 ( 3, 2) [000014] D------N---- +--* LCL_VAR struct V01 loc1 d:2
N001 ( 3, 2) [000013] ------------ \--* LCL_VAR struct V04 tmp1 u:2 (last use) $300
***** BB02, stmt 8
N007 ( 13, 15) [000115] -A--G---R--- * ASG struct (copy) $VN.Void
N006 ( 6, 7) [000114] n----------- +--* BLK(48) struct
N005 ( 3, 5) [000113] ------------ | \--* ADDR byref $283
N004 ( 3, 4) [000106] D------N---- | \--* LCL_FLD struct V05 tmp2 d:2[+0] Fseq[_value]
N003 ( 6, 7) [000111] n---G------- \--* IND struct $2c0
N002 ( 3, 5) [000107] ----G------- \--* ADDR byref $282
N001 ( 3, 4) [000110] -------N---- \--* LCL_FLD struct V01 loc1 u:2[+0] Fseq[_value] (last use) $2c0
***** BB02, stmt 9
N003 ( 7, 5) [000103] -A------R--- * ASG struct (copy) $VN.Void
N002 ( 3, 2) [000102] D------N---- +--* LCL_VAR struct V02 loc2 d:2
N001 ( 3, 2) [000100] ------------ \--* LCL_VAR struct V05 tmp2 u:2 (last use) $301
***** BB02, stmt 10
N003 ( 3, 4) [000133] -A--G---R--- * ASG ref $243
N002 ( 1, 1) [000132] D------N---- +--* LCL_VAR ref V07 tmp4 d:2 $243
N001 ( 3, 4) [000130] ------------ \--* LCL_FLD ref V02 loc2 u:2[+0] Fseq[_value, _obj] (last use) $243
Hmm not sure how to do this.
What I'm envisaging: if a copy to a variable (its first use) is followed by a copy from that same variable that is its (last use)
var1 -> var2 (first use)
var2 (last use) -> var3
Then that intermediate copy can be skipped becoming
var1 -> var3
Or in the Live vars output
Live vars: {V00} => {}
Live vars: {} => {V04}
Live vars: {V04} => {}
Live vars: {} => {V01}
Live vars: {V01} => {}
Live vars: {} => {V05}
Live vars: {V05} => {}
Live vars: {} => {V02}
Live vars: {V02} => {}
Live vars: {} => {V07}
becomes
Live vars: {V00} => {}
Live vars: {} => {V07}
Might have something...
https://github.com/dotnet/coreclr/commit/6b3131f1c3b94da36405693f7e8015f7157e72fb identifying the items to change, but not updating the tree correctly as it crashes in the optOptimizeValnumCSEs step
CopyBlk based copy assertion for [000106] V05 @00000003 by [000102] V02 @00000003.
***** BB02 (before)
N007 ( 13, 15) [000115] -A--G---R--- * ASG struct (copy) $VN.Void
N006 ( 6, 7) [000114] n----------- +--* BLK(48) struct
N005 ( 3, 5) [000113] ------------ | \--* ADDR byref $283
N004 ( 3, 4) [000106] D------N---- | \--* LCL_FLD struct V05 tmp2 d:2[+0] Fseq[_value]
N003 ( 6, 7) [000111] n---G------- \--* IND struct $2c0
N002 ( 3, 5) [000107] ----G------- \--* ADDR byref $282
N001 ( 3, 4) [000110] -------N---- \--* LCL_FLD struct V01 loc1 u:2[+0] Fseq[_value] (last use) $2c0
N003 ( 7, 5) [000103] -A------R--- * ASG struct (copy) $VN.Void
N002 ( 3, 2) [000102] D------N---- +--* LCL_VAR struct V02 loc2 d:2
N001 ( 3, 2) [000100] ------------ \--* LCL_VAR struct V05 tmp2 u:2 (last use) $301
Copy propagated to:
***** BB02 (after)
N007 ( 13, 15) [000115] -A--G---R--- * ASG struct (copy) $VN.Void
( 3, 2) [000228] D------N---- +--* LCL_VAR struct V02 loc2 d:2
N003 ( 6, 7) [000111] n---G------- \--* IND struct $2c0
N002 ( 3, 5) [000107] ----G------- \--* ADDR byref $282
N001 ( 3, 4) [000110] -------N---- \--* LCL_FLD struct V01 loc1 u:2[+0] Fseq[_value] (last use) $2c0
N003 ( 7, 5) [000103] ------------ * NOP void
CopyBlk based copy assertion for [000014] V01 @00000003 by [000228] V02 @00000003.
***** BB02 (before)
N003 ( 7, 5) [000016] -A------R--- * ASG struct (copy) $VN.Void
N002 ( 3, 2) [000014] D------N---- +--* LCL_VAR struct V01 loc1 d:2
N001 ( 3, 2) [000013] ------------ \--* LCL_VAR struct V04 tmp1 u:2 (last use) $300
N007 ( 13, 15) [000115] -A--G---R--- * ASG struct (copy) $VN.Void
( 3, 2) [000228] D------N---- +--* LCL_VAR struct V02 loc2 d:2
N003 ( 6, 7) [000111] n---G------- \--* IND struct $2c0
N002 ( 3, 5) [000107] ----G------- \--* ADDR byref $282
N001 ( 3, 4) [000110] -------N---- \--* LCL_FLD struct V01 loc1 u:2[+0] Fseq[_value] (last use) $2c0
Copy propagated to:
***** BB02 (after)
N003 ( 7, 5) [000016] -A------R--- * ASG struct (copy) $VN.Void
( 3, 2) [000229] D------N---- +--* LCL_VAR struct V02 loc2 d:2
N001 ( 3, 2) [000013] ------------ \--* LCL_VAR struct V04 tmp1 u:2 (last use) $300
N007 ( 13, 15) [000115] ------------ * NOP void
CopyBlk based copy assertion for [000077] V04 @00000003 by [000229] V02 @00000003.
***** BB02 (before)
N005 ( 10, 10) [000084] -A------R--- * ASG struct (copy) $VN.Void
N004 ( 6, 7) [000083] n----------- +--* BLK(48) struct
N003 ( 3, 5) [000082] ------------ | \--* ADDR byref $281
N002 ( 3, 4) [000077] D------N---- | \--* LCL_FLD struct V04 tmp1 d:2[+0] Fseq[_value]
N001 ( 3, 2) [000079] -------N---- \--* LCL_VAR struct V00 loc0 u:5 (last use) $2c0
N003 ( 7, 5) [000016] -A------R--- * ASG struct (copy) $VN.Void
( 3, 2) [000229] D------N---- +--* LCL_VAR struct V02 loc2 d:2
N001 ( 3, 2) [000013] ------------ \--* LCL_VAR struct V04 tmp1 u:2 (last use) $300
Copy propagated to:
***** BB02 (after)
N005 ( 10, 10) [000084] -A------R--- * ASG struct (copy) $VN.Void
( 3, 2) [000230] D------N---- +--* LCL_VAR struct V02 loc2 d:2
N001 ( 3, 2) [000079] -------N---- \--* LCL_VAR struct V00 loc0 u:5 (last use) $2c0
N003 ( 7, 5) [000016] ------------ * NOP void
------------ BB02 [000..01D) -> BB04 (cond), preds={BB01} succs={BB03,BB04}
***** BB02, stmt 2
( 9, 17) [000052] ------------ * STMT void (IL 0x000... ???)
N004 ( 9, 17) [000051] -A--G---R--- \--* ASG ref <l:$100, c:$142>
N003 ( 3, 4) [000048] U------N---- +--* LCL_FLD ref V00 loc0 ud:1->2[+0] Fseq[_obj] <l:$102, c:$101>
N002 ( 5, 12) [000193] n---G------- \--* IND ref <l:$100, c:$142>
N001 ( 3, 10) [000192] ------------ \--* CNS_INT(h) long 0x914133C0 "xyz" $c0
***** BB02, stmt 3
( 6, 7) [000058] ------------ * STMT void (IL 0x000... ???)
N003 ( 6, 7) [000057] -A------R--- \--* ASG bool $43
N002 ( 4, 5) [000054] U------N---- +--* LCL_FLD bool V00 loc0 ud:2->3[+10] Fseq[_continueOnCapturedContext] <l:$1c1, c:$1c0>
N001 ( 1, 1) [000055] ------------ \--* CNS_INT int 1 $43
***** BB02, stmt 4
( 6, 7) [000064] ------------ * STMT void (IL 0x000... ???)
N003 ( 6, 7) [000063] -A------R--- \--* ASG short $40
N002 ( 4, 5) [000060] U------N---- +--* LCL_FLD short V00 loc0 ud:3->4[+8] Fseq[_token] <l:$201, c:$200>
N001 ( 1, 1) [000061] ------------ \--* CNS_INT int 0 $40
***** BB02, stmt 5
( 8, 9) [000072] ------------ * STMT void (IL 0x000... ???)
N005 ( 8, 9) [000071] IA------R--- \--* ASG struct (init) $VN.Void
N004 ( 6, 7) [000070] n------N---- +--* BLK(32) struct
N003 ( 3, 5) [000068] ------------ | \--* ADDR byref $280
N002 ( 3, 4) [000066] U------N---- | \--* LCL_FLD struct V00 loc0 ud:4->5[+16] Fseq[_result] <l:$143, c:$144>
N001 ( 1, 1) [000069] ------------ \--* CNS_INT int 0 $40
***** BB02, stmt 6
( 10, 10) [000085] ------------ * STMT void (IL 0x00C... ???)
N005 ( 10, 10) [000084] -A------R--- \--* ASG struct (copy) $VN.Void
( 3, 2) [000230] D------N---- +--* LCL_VAR struct V02 loc2 d:2
N001 ( 3, 2) [000079] -------N---- \--* LCL_VAR struct V00 loc0 u:5 (last use) $2c0
***** BB02, stmt 7
( 7, 5) [000017] ------------ * STMT void (IL 0x013... ???)
N003 ( 7, 5) [000016] ------------ \--* NOP void
***** BB02, stmt 8
( 13, 15) [000116] ------------ * STMT void (IL 0x014... ???)
N007 ( 13, 15) [000115] ------------ \--* NOP void
***** BB02, stmt 9
( 7, 5) [000026] ------------ * STMT void (IL ???... ???)
N003 ( 7, 5) [000103] ------------ \--* NOP void
***** BB02, stmt 10
( 3, 4) [000134] ------------ * STMT void (IL 0x01C... ???)
N003 ( 3, 4) [000133] -A--G---R--- \--* ASG ref $243
N002 ( 1, 1) [000132] D------N---- +--* LCL_VAR ref V07 tmp4 d:2 $243
N001 ( 3, 4) [000130] ------------ \--* LCL_FLD ref V02 loc2 u:2[+0] Fseq[_value, _obj] (last use) $243
***** BB02, stmt 11
( 5, 5) [000139] ------------ * STMT void (IL 0x01C... ???)
N004 ( 5, 5) [000138] ------------ \--* JTRUE void
N003 ( 3, 3) [000137] J------N---- \--* NE int $182
N001 ( 1, 1) [000135] ------------ +--* LCL_VAR ref V07 tmp4 u:2 $243
N002 ( 1, 1) [000136] ------------ \--* CNS_INT ref null $VN.Null
------------ BB03 [01C..01D) -> BB10 (always), preds={BB02} succs={BB10}
I have a change that looks like it addresses this; it just needs cleaning up. It produces the following diff
-; V00 loc0 [V00,T00] ( 5, 5 ) struct (48) [rsp+0xE0] do-not-enreg[SFB] must-init ld-addr-op
+; V00 loc0 [V00,T00] ( 5, 5 ) struct (48) [rsp+0x20] do-not-enreg[SFB] must-init ld-addr-op
-; V01 loc1 [V01,T06] ( 2, 2 ) struct (48) [rsp+0xB0] do-not-enreg[SFB] must-init ld-addr-op
+;* V01 loc1 [V01 ] ( 0, 0 ) struct (48) zero-ref do-not-enreg[SFB] ld-addr-op
-; V02 loc2 [V02,T07] ( 2, 2 ) struct (48) [rsp+0x80] do-not-enreg[SFB] must-init ld-addr-op
+;* V02 loc2 [V02 ] ( 0, 0 ) struct (48) zero-ref do-not-enreg[SFB] ld-addr-op
; V03 OutArgs [V03 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
-; V04 tmp1 [V04,T01] ( 2, 4 ) struct (48) [rsp+0x50] do-not-enreg[SFB] must-init "NewObj constructor temp"
+;* V04 tmp1 [V04 ] ( 0, 0 ) struct (48) zero-ref do-not-enreg[SFB] "NewObj constructor temp"
-; V05 tmp2 [V05,T02] ( 2, 4 ) struct (48) [rsp+0x20] do-not-enreg[SFB] must-init "NewObj constructor temp"
+;* V05 tmp2 [V05 ] ( 0, 0 ) struct (48) zero-ref do-not-enreg[SFB] "NewObj constructor temp"
; V06 tmp3 [V06,T04] ( 4, 2.50) bool -> rdx "Inline return value spill temp"
; V07 tmp4 [V07,T05] ( 3, 2.25) ref -> rax class-hnd "Inline stloc first use temp"
; V08 tmp5 [V08,T08] ( 4, 1.62) ref -> rax class-hnd "spilling QMark2"
; V09 cse0 [V09,T03] ( 3, 3 ) ref -> rax "ValNumCSE"
;
-; Lcl frame size = 272
+; Lcl frame size = 80
G_M11412_IG01:
57 push rdi
- 4881EC10010000 sub rsp, 272
+ 4883EC50 sub rsp, 80
C5F877 vzeroupper
488D7C2420 lea rdi, [rsp+20H]
- B93C000000 mov ecx, 60
+ B90C000000 mov ecx, 12
33C0 xor rax, rax
F3AB rep stosd
G_M11412_IG02:
- 48B8C033BD566E010000 mov rax, 0x16E56BD33C0
+ 48B8C033001016020000 mov rax, 0x216100033C0
488B00 mov rax, gword ptr [rax]
4885C0 test rax, rax
0F844E010000 je G_M11412_IG16
G_M11412_IG03:
- 48898424E0000000 mov gword ptr [rsp+E0H], rax
- C68424EA00000001 mov byte ptr [rsp+EAH], 1
- 66C78424E80000000000 mov word ptr [rsp+E8H], 0
- 488D8424F0000000 lea rax, bword ptr [rsp+F0H]
+ 4889442420 mov gword ptr [rsp+20H], rax
+ C644242A01 mov byte ptr [rsp+2AH], 1
+ 66C74424280000 mov word ptr [rsp+28H], 0
+ 488D442430 lea rax, bword ptr [rsp+30H]
C5F857C0 vxorps xmm0, xmm0
C5FA7F00 vmovdqu qword ptr [rax], xmm0
C5FA7F4010 vmovdqu qword ptr [rax+16], xmm0
-G_M11412_IG04:
- C5FA6F8424E0000000 vmovdqu xmm0, qword ptr [rsp+E0H]
- C5FA7F442450 vmovdqu qword ptr [rsp+50H], xmm0
- C5FA6F8424F0000000 vmovdqu xmm0, qword ptr [rsp+F0H]
- C5FA7F442460 vmovdqu qword ptr [rsp+60H], xmm0
- C5FA6F842400010000 vmovdqu xmm0, qword ptr [rsp+100H]
- C5FA7F442470 vmovdqu qword ptr [rsp+70H], xmm0
-
-G_M11412_IG05:
- C5FA6F442450 vmovdqu xmm0, qword ptr [rsp+50H]
- C5FA7F8424B0000000 vmovdqu qword ptr [rsp+B0H], xmm0
- C5FA6F442460 vmovdqu xmm0, qword ptr [rsp+60H]
- C5FA7F8424C0000000 vmovdqu qword ptr [rsp+C0H], xmm0
- C5FA6F442470 vmovdqu xmm0, qword ptr [rsp+70H]
- C5FA7F8424D0000000 vmovdqu qword ptr [rsp+D0H], xmm0
-
-G_M11412_IG06:
- C5FA6F8424B0000000 vmovdqu xmm0, qword ptr [rsp+B0H]
- C5FA7F442420 vmovdqu qword ptr [rsp+20H], xmm0
- C5FA6F8424C0000000 vmovdqu xmm0, qword ptr [rsp+C0H]
- C5FA7F442430 vmovdqu qword ptr [rsp+30H], xmm0
- C5FA6F8424D0000000 vmovdqu xmm0, qword ptr [rsp+D0H]
- C5FA7F442440 vmovdqu qword ptr [rsp+40H], xmm0
-
-G_M11412_IG07:
- C5FA6F442420 vmovdqu xmm0, qword ptr [rsp+20H]
- C5FA7F842480000000 vmovdqu qword ptr [rsp+80H], xmm0
- C5FA6F442430 vmovdqu xmm0, qword ptr [rsp+30H]
- C5FA7F842490000000 vmovdqu qword ptr [rsp+90H], xmm0
- C5FA6F442440 vmovdqu xmm0, qword ptr [rsp+40H]
- C5FA7F8424A0000000 vmovdqu qword ptr [rsp+A0H], xmm0
G_M11412_IG08:
- 488B842480000000 mov rax, gword ptr [rsp+80H]
+ 488B442420 mov rax, gword ptr [rsp+20H]
4885C0 test rax, rax
This was previously issue 18542 in dotnet/coreclr, and the repro is captured as JIT/Regression/JitBlue/GitHub_18542.
This is much closer now; this chunk (sharplab.io):
L0051: vxorps xmm0, xmm0, xmm0
L0055: vmovdqu [rax], xmm0
L0059: vmovdqu [rax+0x10], xmm0
L005e: vmovdqu xmm0, [rsp+0xe0]
L0067: vmovdqu [rsp+0x50], xmm0
L006d: vmovdqu xmm0, [rsp+0xf0]
L0076: vmovdqu [rsp+0x60], xmm0
L007c: vmovdqu xmm0, [rsp+0x100]
L0085: vmovdqu [rsp+0x70], xmm0
L008b: vmovdqu xmm0, [rsp+0x50]
L0091: vmovdqu [rsp+0xb0], xmm0
L009a: vmovdqu xmm0, [rsp+0x60]
L00a0: vmovdqu [rsp+0xc0], xmm0
L00a9: vmovdqu xmm0, [rsp+0x70]
L00af: vmovdqu [rsp+0xd0], xmm0
L00b8: vmovdqu xmm0, [rsp+0xb0]
L00c1: vmovdqu [rsp+0x20], xmm0
L00c7: vmovdqu xmm0, [rsp+0xc0]
L00d0: vmovdqu [rsp+0x30], xmm0
L00d6: vmovdqu xmm0, [rsp+0xd0]
L00df: vmovdqu [rsp+0x40], xmm0
L00e5: vmovdqu xmm0, [rsp+0x20]
L00eb: vmovdqu [rsp+0x80], xmm0
L00f4: vmovdqu xmm0, [rsp+0x30]
L00fa: vmovdqu [rsp+0x90], xmm0
L0103: vmovdqu xmm0, [rsp+0x40]
L0109: vmovdqu [rsp+0xa0], xmm0
L0112: mov rax, [rsp+0x80]
L011a: test rax, rax
has been trimmed down significantly to the following, which looks to be copying to the output stack?
vxorps xmm0, xmm0
vmovdqu xmmword ptr [rsp+48H], xmm0
vmovdqu xmmword ptr [rsp+58H], xmm0
G_M7880_IG03:
vmovdqu xmm0, xmmword ptr [rsp+38H]
vmovdqu xmmword ptr [rsp+08H], xmm0
vmovdqu xmm0, xmmword ptr [rsp+48H]
vmovdqu xmmword ptr [rsp+18H], xmm0
vmovdqu xmm0, xmmword ptr [rsp+58H]
vmovdqu xmmword ptr [rsp+28H], xmm0
G_M7880_IG04:
mov rax, gword ptr [rsp+08H]
test rax, rax
So will close
Most helpful comment
Could be -- the jit will be very conservative in places because of aliasing worries.
I'll take a look, maybe early next week some time?