Since @alexanderkyte is working on LLVM improvements, it's probably worth looking into why LLVM doesn't help us here.
static unsafe void SetFirst4Elements(byte[] array)
{
if (array.Length >= 4)
{
array[3] = 4; // additional help: access last element first
array[0] = 1;
array[1] = 2;
array[2] = 3;
}
}
Here we have two hints telling mini/llvm that we will never go out of bounds:
1) if (array.Length >= 4) (should be enough to eliminate all of them)
2) array[3] = 4; — we access the last item first, which means there is no need to insert bounds checks for the following [0], [1] and [2]
.NET Core's JIT will eliminate them for this snippet, here is what mono llvm generates:
_P_SetFirst4Elements_byte__:
pushq %rax
movl 24(%rdi), %eax
cmpl $4, %eax
jb 46 <_P_SetFirst4Elements_byte__+0x37>
movl 24(%rdi), %eax
cmpl $4, %eax
jb 40 <_P_SetFirst4Elements_byte__+0x39>
movb $4, 35(%rdi)
cmpl $0, 24(%rdi)
je 30 <_P_SetFirst4Elements_byte__+0x39>
movb $1, 32(%rdi)
movl 24(%rdi), %eax
cmpl $1, %eax
jbe 18 <_P_SetFirst4Elements_byte__+0x39>
movb $2, 33(%rdi)
movl 24(%rdi), %eax
cmpl $2, %eax
jbe 6 <_P_SetFirst4Elements_byte__+0x39>
movb $3, 34(%rdi)
popq %rax
retq
movl $267, %edi
callq 381 <plt__jit_icall_llvm_throw_corlib_exception_abs_trampoline>
nop
; Function Attrs: uwtable
define hidden monocc void @P_SetFirst4Elements_byte__(i64* %arg_array) #5 {
BB0:
%0 = getelementptr i64, i64* %arg_array, i64 3
%1 = bitcast i64* %0 to i32*
%t19 = load volatile i32, i32* %1, align 4
%2 = icmp ult i32 %t19, 4
br i1 %2, label %BB1, label %BB5
BB5: ; preds = %BB0
%t38 = load volatile i32, i32* %1, align 4
%3 = icmp ult i32 %t38, 4
br i1 %3, label %EX_BB3, label %NOEX_BB4
BB1: ; preds = %BB0, %NOEX_BB13
ret void
EX_BB3: ; preds = %BB5
call void @p_1_plt__jit_icall_llvm_throw_corlib_exception_abs_trampoline_llvm(i32 267)
unreachable
NOEX_BB4: ; preds = %BB5
%4 = bitcast i64* %arg_array to i8*
%5 = getelementptr i8, i8* %4, i64 35
store i8 4, i8* %5, align 1
%t40 = load volatile i32, i32* %1, align 4
%6 = icmp eq i32 %t40, 0
br i1 %6, label %EX_BB6, label %NOEX_BB7
EX_BB6: ; preds = %NOEX_BB4
call void @p_1_plt__jit_icall_llvm_throw_corlib_exception_abs_trampoline_llvm(i32 267)
unreachable
NOEX_BB7: ; preds = %NOEX_BB4
%7 = getelementptr i64, i64* %arg_array, i64 4
%8 = bitcast i64* %7 to i8*
store i8 1, i8* %8, align 1
%t42 = load volatile i32, i32* %1, align 4
%9 = icmp ult i32 %t42, 2
br i1 %9, label %EX_BB9, label %NOEX_BB10
EX_BB9: ; preds = %NOEX_BB7
call void @p_1_plt__jit_icall_llvm_throw_corlib_exception_abs_trampoline_llvm(i32 267)
unreachable
NOEX_BB10: ; preds = %NOEX_BB7
%10 = getelementptr i8, i8* %4, i64 33
store i8 2, i8* %10, align 1
%t44 = load volatile i32, i32* %1, align 4
%11 = icmp ult i32 %t44, 3
br i1 %11, label %EX_BB12, label %NOEX_BB13
EX_BB12: ; preds = %NOEX_BB10
call void @p_1_plt__jit_icall_llvm_throw_corlib_exception_abs_trampoline_llvm(i32 267)
unreachable
NOEX_BB13: ; preds = %NOEX_BB10
%12 = getelementptr i8, i8* %4, i64 34
store i8 3, i8* %12, align 1
br label %BB1
}
(NOTE: there are no safe-points in the IR due to https://github.com/mono/mono/pull/13887)
Note that the PR only fixes the issue if the testcase is compiled using /optimize+. Otherwise csc generates:
IL_0002: ldlen
IL_0003: conv.i4
IL_0004: ldc.i4.4
IL_0005: cgt.un
IL_0007: stloc.0
IL_0008: ldloc.0
IL_0009: brfalse.s IL_001d
And our abcrem code is not smart enough to see through that.
@vargaz with your PR I get:
; Function Attrs: norecurse nounwind uwtable
define hidden monocc void @P_SetFirst4Elements_byte__(i64* %arg_array) #5 {
BB0:
%0 = getelementptr i64, i64* %arg_array, i64 3
%1 = bitcast i64* %0 to i32*
%t19 = load volatile i32, i32* %1, align 4
%2 = icmp ult i32 %t19, 4
br i1 %2, label %BB1, label %BB5
BB5: ; preds = %BB0
%3 = bitcast i64* %arg_array to i8*
%4 = getelementptr i8, i8* %3, i64 35
store i8 4, i8* %4, align 1
%5 = getelementptr i64, i64* %arg_array, i64 4
%6 = bitcast i64* %5 to i8*
store i8 1, i8* %6, align 1
%7 = getelementptr i8, i8* %3, i64 33
store i8 2, i8* %7, align 1
%8 = getelementptr i8, i8* %3, i64 34
store i8 3, i8* %8, align 1
br label %BB1
BB1: ; preds = %BB0, %BB5
ret void
}
_P_SetFirst4Elements_byte__:
movl 24(%rdi), %eax
cmpl $4, %eax
jb 7 <_P_SetFirst4Elements_byte__+0xf>
movl $67305985, 32(%rdi)
retq
Wow, it optimized the stores down to a single movl — awesome!
For reference, here is what .NET Core generates:
mov eax, dword ptr [rcx+8]
cmp eax, 3
jbe SHORT G_M28968_IG03
mov byte ptr [rcx+19], 4
mov byte ptr [rcx+16], 1
mov byte ptr [rcx+17], 2
mov byte ptr [rcx+18], 3
/cc @migueldeicaza
Most helpful comment
Note that the PR only fixes the issue if the testcase is compiled using /optimize+. Otherwise csc generates:
IL_0002: ldlen
IL_0003: conv.i4
IL_0004: ldc.i4.4
IL_0005: cgt.un
IL_0007: stloc.0
IL_0008: ldloc.0
IL_0009: brfalse.s IL_001d
And our abcrem code is not smart enough to see through that.