Rust: a case where bounds checks should be eliminated

Created on 20 Feb 2019  路  6Comments  路  Source: rust-lang/rust

https://godbolt.org/z/y-GVc2

const INPUT: usize = 10;
const OUTPUT: usize = 3;

pub fn get_output1(input: &[u8; INPUT], offset: usize) -> Option<&[u8]> {
    if offset <= INPUT - OUTPUT {
        // This version optimizes out the extra bounds checks for the slice.
        Some(&input[offset..offset + OUTPUT])
    } else {
        None
    }
}

pub fn get_output2(input: &[u8; INPUT], offset: usize) -> Option<&[u8]> {
    if offset <= INPUT - OUTPUT {
        // This version fails to optimize bounds checks.
        Some(&input[offset..][..OUTPUT])
    } else {
        None
    }
}

This is a simplified version of a pessimization that came up when I was trying to optimize the array_ref! macro: https://github.com/droundy/arrayref/pull/16

A-LLVM I-slow T-compiler WG-codegen

Most helpful comment

LLVM was upgraded and both functions generate equal assembly now.

All 6 comments

So this is what the LLVM IR looks like optimized:

; foo::get_output1
; Function Attrs: nonlazybind uwtable
define { i8*, i64 } @_ZN3foo11get_output117haa6e1bf361064f05E([10 x i8]* noalias readonly align 1 dereferenceable(10) %input, i64 %offset) unnamed_addr #0 {
start:
  %0 = icmp ult i64 %offset, 8
  %1 = getelementptr inbounds [10 x i8], [10 x i8]* %input, i64 0, i64 %offset
  %_0.sroa.0.0 = select i1 %0, i8* %1, i8* null
  %2 = insertvalue { i8*, i64 } undef, i8* %_0.sroa.0.0, 0
  %3 = insertvalue { i8*, i64 } %2, i64 3, 1
  ret { i8*, i64 } %3
}

; foo::get_output2
; Function Attrs: nonlazybind uwtable
define { i8*, i64 } @_ZN3foo11get_output217h09c70bf053786004E([10 x i8]* noalias readonly align 1 dereferenceable(10) %input, i64 %offset) unnamed_addr #0 {
start:
  %0 = icmp ult i64 %offset, 8
  br i1 %0, label %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17h0c1ade7fc9d72442E.exit", label %bb5

"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17h0c1ade7fc9d72442E.exit": ; preds = %start
  %1 = sub i64 10, %offset
  %2 = icmp ult i64 %1, 3
  br i1 %2, label %bb4.i.i.i, label %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17hbbe883cdf661ef0bE.exit"

bb4.i.i.i:                                        ; preds = %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17h0c1ade7fc9d72442E.exit"
; call core::slice::slice_index_len_fail
  tail call void @_ZN4core5slice20slice_index_len_fail17h3b63b7b5b8377837E(i64 3, i64 %1)
  unreachable

"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17hbbe883cdf661ef0bE.exit": ; preds = %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17h0c1ade7fc9d72442E.exit"
  %3 = getelementptr inbounds [10 x i8], [10 x i8]* %input, i64 0, i64 %offset
  br label %bb5

bb5:                                              ; preds = %start, %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17hbbe883cdf661ef0bE.exit"
  %_0.sroa.0.0 = phi i8* [ %3, %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17hbbe883cdf661ef0bE.exit" ], [ null, %start ]
  %4 = insertvalue { i8*, i64 } undef, i8* %_0.sroa.0.0, 0
  %5 = insertvalue { i8*, i64 } %4, i64 3, 1
  ret { i8*, i64 } %5
}

Looking at it we can tell that %2 = icmp ult i64 %1, 3 should be implied by the previous %0 = icmp ult i64 %offset, 8 and in fact LLVM does try to do this opt (from a quick looking in multiple passes instcombine, simplifycfg, instsimplify). Unfortunately, it doesn't seem to be able to relate %1 and %offset.

Just manually rewriting that condition as %2 = icmp ugt i64 %offset, 7 causes it to optimize get_output2 to the same as get_output. But that's not exactly a valid general transformation.

This could be decomposed into three optimizations: First, it should be possible to determine that the sub is nuw. I think this is something that CVP should be capable of determining, at least in principle. Second, %y = sub nuw C, %x; icmp P %y, C2 should be instcombined to icmp swap(P) %y, C-C2. Finally CSE will see that the conditions are now the same (this part already works).

The (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C) part of the change landed already in https://reviews.llvm.org/D59916.
The change to teach CVP to add nuw/nsw to sub is at https://reviews.llvm.org/D60036 but unfortunately that optimization is currently disabled by default due to another LLVM bug: https://bugs.llvm.org/show_bug.cgi?id=31181

CVP nowrap inference reenabled in https://reviews.llvm.org/D62776, so this issue should be fixed once LLVM is updated.

LLVM was upgraded and both functions generate equal assembly now.

Was this page helpful?
0 / 5 - 0 ratings