Since @alexanderkyte is working on LLVM improvements, it's probably worth looking into why LLVM doesn't help us here.
static unsafe void SetFirst4Elements(byte[] array)
{
if (array.Length >= 4)
{
array[3] = 4; // additional help: access last element first
array[0] = 1;
array[1] = 2;
array[2] = 3;
}
}
Here we have two hints telling mini/llvm that we will never go out of bounds:
1) if (array.Length >= 4) (should be enough to eliminate all of them)
2) array[3] = 4; — we access the last item first, which means there is no need to insert bounds checks for the following [0], [1] and [2]
.NET Core's JIT will eliminate them for this snippet, here is what mono llvm generates:
_P_SetFirst4Elements_byte__:
pushq %rax
movl 24(%rdi), %eax
cmpl $4, %eax
jb 46 <_P_SetFirst4Elements_byte__+0x37>
movl 24(%rdi), %eax
cmpl $4, %eax
jb 40 <_P_SetFirst4Elements_byte__+0x39>
movb $4, 35(%rdi)
cmpl $0, 24(%rdi)
je 30 <_P_SetFirst4Elements_byte__+0x39>
movb $1, 32(%rdi)
movl 24(%rdi), %eax
cmpl $1, %eax
jbe 18 <_P_SetFirst4Elements_byte__+0x39>
movb $2, 33(%rdi)
movl 24(%rdi), %eax
cmpl $2, %eax
jbe 6 <_P_SetFirst4Elements_byte__+0x39>
movb $3, 34(%rdi)
popq %rax
retq
movl $267, %edi
callq 381 <plt__jit_icall_llvm_throw_corlib_exception_abs_trampoline>
nop
; Function Attrs: uwtable
define hidden monocc void @P_SetFirst4Elements_byte__(i64* %arg_array) #5 {
BB0:
%0 = getelementptr i64, i64* %arg_array, i64 3
%1 = bitcast i64* %0 to i32*
%t19 = load volatile i32, i32* %1, align 4
%2 = icmp ult i32 %t19, 4
br i1 %2, label %BB1, label %BB5
BB5: ; preds = %BB0
%t38 = load volatile i32, i32* %1, align 4
%3 = icmp ult i32 %t38, 4
br i1 %3, label %EX_BB3, label %NOEX_BB4
BB1: ; preds = %BB0, %NOEX_BB13
ret void
EX_BB3: ; preds = %BB5
call void @p_1_plt__jit_icall_llvm_throw_corlib_exception_abs_trampoline_llvm(i32 267)
unreachable
NOEX_BB4: ; preds = %BB5
%4 = bitcast i64* %arg_array to i8*
%5 = getelementptr i8, i8* %4, i64 35
store i8 4, i8* %5, align 1
%t40 = load volatile i32, i32* %1, align 4
%6 = icmp eq i32 %t40, 0
br i1 %6, label %EX_BB6, label %NOEX_BB7
EX_BB6: ; preds = %NOEX_BB4
call void @p_1_plt__jit_icall_llvm_throw_corlib_exception_abs_trampoline_llvm(i32 267)
unreachable
NOEX_BB7: ; preds = %NOEX_BB4
%7 = getelementptr i64, i64* %arg_array, i64 4
%8 = bitcast i64* %7 to i8*
store i8 1, i8* %8, align 1
%t42 = load volatile i32, i32* %1, align 4
%9 = icmp ult i32 %t42, 2
br i1 %9, label %EX_BB9, label %NOEX_BB10
EX_BB9: ; preds = %NOEX_BB7
call void @p_1_plt__jit_icall_llvm_throw_corlib_exception_abs_trampoline_llvm(i32 267)
unreachable
NOEX_BB10: ; preds = %NOEX_BB7
%10 = getelementptr i8, i8* %4, i64 33
store i8 2, i8* %10, align 1
%t44 = load volatile i32, i32* %1, align 4
%11 = icmp ult i32 %t44, 3
br i1 %11, label %EX_BB12, label %NOEX_BB13
EX_BB12: ; preds = %NOEX_BB10
call void @p_1_plt__jit_icall_llvm_throw_corlib_exception_abs_trampoline_llvm(i32 267)
unreachable
NOEX_BB13: ; preds = %NOEX_BB10
%12 = getelementptr i8, i8* %4, i64 34
store i8 3, i8* %12, align 1
br label %BB1
}
(NOTE: there are no safe-points in the IR due to https://github.com/mono/mono/pull/13887)
Note that the PR only fixes the issue if the testcase is compiled using /optimize+. Otherwise csc generates:
IL_0002: ldlen
IL_0003: conv.i4
IL_0004: ldc.i4.4
IL_0005: cgt.un
IL_0007: stloc.0
IL_0008: ldloc.0
IL_0009: brfalse.s IL_001d
And our abcrem code is not smart enough to see through that.
@vargaz with your PR I get:
; Function Attrs: norecurse nounwind uwtable
define hidden monocc void @P_SetFirst4Elements_byte__(i64* %arg_array) #5 {
BB0:
%0 = getelementptr i64, i64* %arg_array, i64 3
%1 = bitcast i64* %0 to i32*
%t19 = load volatile i32, i32* %1, align 4
%2 = icmp ult i32 %t19, 4
br i1 %2, label %BB1, label %BB5
BB5: ; preds = %BB0
%3 = bitcast i64* %arg_array to i8*
%4 = getelementptr i8, i8* %3, i64 35
store i8 4, i8* %4, align 1
%5 = getelementptr i64, i64* %arg_array, i64 4
%6 = bitcast i64* %5 to i8*
store i8 1, i8* %6, align 1
%7 = getelementptr i8, i8* %3, i64 33
store i8 2, i8* %7, align 1
%8 = getelementptr i8, i8* %3, i64 34
store i8 3, i8* %8, align 1
br label %BB1
BB1: ; preds = %BB0, %BB5
ret void
}
_P_SetFirst4Elements_byte__:
movl 24(%rdi), %eax
cmpl $4, %eax
jb 7 <_P_SetFirst4Elements_byte__+0xf>
movl $67305985, 32(%rdi)
retq
Wow, it optimized the stores down to a single movl — awesome!
For reference, here is what .NET Core generates:
mov eax, dword ptr [rcx+8]
cmp eax, 3
jbe SHORT G_M28968_IG03
mov byte ptr [rcx+19], 4
mov byte ptr [rcx+16], 1
mov byte ptr [rcx+17], 2
mov byte ptr [rcx+18], 3
/cc @migueldeicaza
Most helpful comment
Note that the PR only fixes the issue if the testcase is compiled using /optimize+. Otherwise csc generates:
IL_0002: ldlen
IL_0003: conv.i4
IL_0004: ldc.i4.4
IL_0005: cgt.un
IL_0007: stloc.0
IL_0008: ldloc.0
IL_0009: brfalse.s IL_001d
And our abcrem code is not smart enough to see through that.