// Size of the input array and of the returned window, respectively.
// Kept as named consts so the guard below (`INPUT - OUTPUT`) reads clearly.
const INPUT: usize = 10;
const OUTPUT: usize = 3;
/// Returns an `OUTPUT`-byte (3-byte) window of `input` starting at `offset`,
/// or `None` when the window would run past the end of the array.
///
/// The window is taken with a single range expression
/// (`offset..offset + OUTPUT`), which LLVM can fold into the
/// `offset <= INPUT - OUTPUT` guard — the optimized IR for this function
/// (quoted later in this issue) has one compare and no panic path.
pub fn get_output1(input: &[u8; INPUT], offset: usize) -> Option<&[u8]> {
if offset <= INPUT - OUTPUT {
// This version optimizes out the extra bounds checks for the slice.
Some(&input[offset..offset + OUTPUT])
} else {
None
}
}
/// Behaviorally identical to `get_output1` — same guard, same window —
/// but slices in two steps (`[offset..]` then `[..OUTPUT]`).
///
/// NOTE(review): the two-step form is the point of this repro. At the time
/// this issue was filed, LLVM could not relate the second index check back
/// to the guard, so a `slice_index_len_fail` panic branch survived in the
/// optimized IR (quoted later in this issue). Do not "simplify" this to the
/// single-range form of `get_output1`.
pub fn get_output2(input: &[u8; INPUT], offset: usize) -> Option<&[u8]> {
if offset <= INPUT - OUTPUT {
// This version fails to optimize bounds checks.
Some(&input[offset..][..OUTPUT])
} else {
None
}
}
This is a simplified version of a pessimization that came up when I was trying to optimize the array_ref! macro: https://github.com/droundy/arrayref/pull/16
So this is what the LLVM IR looks like optimized:
; foo::get_output1
; Function Attrs: nonlazybind uwtable
define { i8*, i64 } @_ZN3foo11get_output117haa6e1bf361064f05E([10 x i8]* noalias readonly align 1 dereferenceable(10) %input, i64 %offset) unnamed_addr #0 {
start:
%0 = icmp ult i64 %offset, 8
%1 = getelementptr inbounds [10 x i8], [10 x i8]* %input, i64 0, i64 %offset
%_0.sroa.0.0 = select i1 %0, i8* %1, i8* null
%2 = insertvalue { i8*, i64 } undef, i8* %_0.sroa.0.0, 0
%3 = insertvalue { i8*, i64 } %2, i64 3, 1
ret { i8*, i64 } %3
}
; foo::get_output2
; Function Attrs: nonlazybind uwtable
define { i8*, i64 } @_ZN3foo11get_output217h09c70bf053786004E([10 x i8]* noalias readonly align 1 dereferenceable(10) %input, i64 %offset) unnamed_addr #0 {
start:
%0 = icmp ult i64 %offset, 8
br i1 %0, label %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17h0c1ade7fc9d72442E.exit", label %bb5
"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17h0c1ade7fc9d72442E.exit": ; preds = %start
%1 = sub i64 10, %offset
%2 = icmp ult i64 %1, 3
br i1 %2, label %bb4.i.i.i, label %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17hbbe883cdf661ef0bE.exit"
bb4.i.i.i: ; preds = %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17h0c1ade7fc9d72442E.exit"
; call core::slice::slice_index_len_fail
tail call void @_ZN4core5slice20slice_index_len_fail17h3b63b7b5b8377837E(i64 3, i64 %1)
unreachable
"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17hbbe883cdf661ef0bE.exit": ; preds = %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17h0c1ade7fc9d72442E.exit"
%3 = getelementptr inbounds [10 x i8], [10 x i8]* %input, i64 0, i64 %offset
br label %bb5
bb5: ; preds = %start, %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17hbbe883cdf661ef0bE.exit"
%_0.sroa.0.0 = phi i8* [ %3, %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17hbbe883cdf661ef0bE.exit" ], [ null, %start ]
%4 = insertvalue { i8*, i64 } undef, i8* %_0.sroa.0.0, 0
%5 = insertvalue { i8*, i64 } %4, i64 3, 1
ret { i8*, i64 } %5
}
Looking at it we can tell that %2 = icmp ult i64 %1, 3 should be implied by the previous %0 = icmp ult i64 %offset, 8, and in fact LLVM does try to do this optimization (from a quick look, in several passes: instcombine, simplifycfg, instsimplify). Unfortunately, it doesn't seem to be able to relate %1 and %offset.
Just manually rewriting that condition as %2 = icmp ugt i64 %offset, 7 causes it to optimize get_output2 to the same as get_output1. But that's not exactly a valid general transformation.
This could be decomposed into three optimizations: First, it should be possible to determine that the sub is nuw. I think this is something that CVP should be capable of determining, at least in principle. Second, %y = sub nuw C, %x; icmp P %y, C2 should be instcombined to icmp swap(P) %x, C-C2. Finally CSE will see that the conditions are now the same (this part already works).
The (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C) part of the change landed already in https://reviews.llvm.org/D59916.
The change to teach CVP to add nuw/nsw to sub is at https://reviews.llvm.org/D60036 but unfortunately that optimization is currently disabled by default due to another LLVM bug: https://bugs.llvm.org/show_bug.cgi?id=31181
CVP nowrap inference reenabled in https://reviews.llvm.org/D62776, so this issue should be fixed once LLVM is updated.
LLVM was upgraded and both functions generate equal assembly now.
Most helpful comment
LLVM was upgraded and both functions generate equal assembly now.