Found while tracking down the root cause of the slowdown in #51340. This is the minimized code which reproduces the performance issue (Playground link).
#![crate_type = "rlib"]

use std::num::NonZeroUsize;

unsafe fn repeat(c: bool) -> Option<NonZeroUsize> {
    if c {
        None
    } else {
        Some(NonZeroUsize::new_unchecked(8))
    }
}

pub unsafe fn calculate_layout(c: bool) -> usize {
    match repeat(c) {
        Some(x) => 8 - x.get(),
        None => std::hint::unreachable_unchecked(),
    }
}
This code produces the following output:
// ASM
playground::calculate_layout:
mov eax, edi
shl rax, 3
ret
// LLVM
define i64 @_ZN10playground16calculate_layout17h33252013a8d7ec56E(i1 zeroext %c) unnamed_addr #0 {
start:
%0 = select i1 %c, i64 8, i64 0
ret i64 %0
}
Note that in practice, it is impossible for this function to return anything other than 0 (8 - 8): the None arm calls unreachable_unchecked, so the compiler may assume repeat only ever returns Some(8).
The version using usize does not suffer from this (Playground link):
#![crate_type = "rlib"]

unsafe fn repeat(c: bool) -> Option<usize> {
    if c {
        None
    } else {
        Some(8)
    }
}

pub unsafe fn calculate_layout(c: bool) -> usize {
    match repeat(c) {
        Some(x) => 8 - x,
        None => std::hint::unreachable_unchecked(),
    }
}
This version produces the following output:
// ASM
playground::calculate_layout:
xor eax, eax
ret
// LLVM
define i64 @_ZN10playground16calculate_layout17h33252013a8d7ec56E(i1 zeroext %c) unnamed_addr #0 {
start:
ret i64 0
}
cc @gnzlbg @rkruppe
Looking through -print-after-all output, the following jumps out:
*** IR Dump After Value Propagation ***
; Function Attrs: uwtable
define i64 @_ZN13nonzero_usize16calculate_layout17h044f84d640378d0bE(i1 zeroext %c) unnamed_addr #0 {
start:
%spec.select.i = select i1 %c, i64 0, i64 8
br i1 %c, label %bb2, label %bb4
bb2: ; preds = %start
unreachable
bb4: ; preds = %start
%0 = sub i64 8, %spec.select.i
%1 = insertvalue { i64, i1 } undef, i64 %0, 0
%2 = insertvalue { i64, i1 } %1, i1 false, 1
%3 = extractvalue { i64, i1 } %2, 1
br i1 %3, label %panic, label %bb6, !prof !0
bb6: ; preds = %bb4
%4 = extractvalue { i64, i1 } %2, 0
ret i64 %4
panic: ; preds = %bb4
call ... ; elided
unreachable
}
*** IR Dump After Simplify the CFG ***
; Function Attrs: uwtable
define i64 @_ZN13nonzero_usize16calculate_layout17h044f84d640378d0bE(i1 zeroext %c) unnamed_addr #0 {
start:
%spec.select.i = select i1 %c, i64 0, i64 8
%0 = sub i64 8, %spec.select.i
%1 = insertvalue { i64, i1 } undef, i64 %0, 0
%2 = insertvalue { i64, i1 } %1, i1 false, 1
%3 = extractvalue { i64, i1 } %2, 1
br i1 %3, label %panic, label %bb6, !prof !0
bb6: ; preds = %start
%4 = extractvalue { i64, i1 } %2, 0
ret i64 %4
panic: ; preds = %start
call ... ; elided
unreachable
}
SimplifyCFG is removing br i1 %c, <bb with unreachable>, <bb with calculations> in favor of falling through to the calculation block, which loses the information that %c being true is UB, information that could have been used to simplify the select at the start. Indeed, manually adding an assume(not %c) makes the function optimize to ret i64 0 (this appears to be instcombine's doing, leaning on the AssumptionCache). Of course, we do want to get rid of that branch, and we don't really want to carry an assume around forever; we'd just like some more optimization of the select to happen before the UB is exploited in another way.
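For illustration, here is a hand-written sketch of that experiment (not actual pass output; the function and value names are made up): the body SimplifyCFG leaves behind, plus an explicit assume recording that %c must be false.
define i64 @calculate_layout_with_assume(i1 zeroext %c) {
start:
  ; record the fact the deleted branch used to encode: %c == true is impossible
  %not.c = xor i1 %c, true
  call void @llvm.assume(i1 %not.c)
  ; what remains after SimplifyCFG folded away the unreachable branch
  %spec.select = select i1 %c, i64 0, i64 8
  %result = sub i64 8, %spec.select
  ret i64 %result
}

declare void @llvm.assume(i1)
With the assume in place, instcombine (via the AssumptionCache) knows %c is false, folds the select to 8 and the subtraction to 0, and the function collapses to ret i64 0.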
In the usize case, the hard work (noticing that the subtraction always results in 0) is done by interprocedural constant propagation. Since Option<usize> is represented as { i64, i64 }, repeat unconditionally (after optimizations) puts 8 into the return value's payload, which can then be propagated before any inlining and regardless of the complexities around the discriminant. So the usize version only accidentally side-steps the problem of "if %c then UB" not being properly exploited, by virtue of using a different representation.
Before/after IR with module-level cruft removed:
*** IR Dump After Infer set function attributes ***
; Function Attrs: inlinehint noreturn uwtable
define internal void @_ZN4core4hint21unreachable_unchecked17h5c82d720186d4847E() unnamed_addr #0 {
start:
unreachable
}
; Function Attrs: uwtable
define internal { i64, i64 } @_ZN13nonzero_usize6repeat17hff27c5667304426aE(i1 zeroext %c) unnamed_addr #1 {
start:
br i1 %c, label %bb1, label %bb2
bb1: ; preds = %start
br label %bb3
bb2: ; preds = %start
br label %bb3
bb3: ; preds = %bb2, %bb1
%_0.sroa.0.0 = phi i64 [ 0, %bb1 ], [ 1, %bb2 ]
%0 = insertvalue { i64, i64 } undef, i64 %_0.sroa.0.0, 0
%1 = insertvalue { i64, i64 } %0, i64 8, 1
ret { i64, i64 } %1
}
; Function Attrs: uwtable
define i64 @_ZN13nonzero_usize16calculate_layout17h044f84d640378d0bE(i1 zeroext %c) unnamed_addr #1 {
start:
%0 = call { i64, i64 } @_ZN13nonzero_usize6repeat17hff27c5667304426aE(i1 zeroext %c)
%.fca.0.extract = extractvalue { i64, i64 } %0, 0
%.fca.1.extract = extractvalue { i64, i64 } %0, 1
%switch = icmp ult i64 %.fca.0.extract, 1
br i1 %switch, label %bb2, label %bb4
bb2: ; preds = %start
call void @_ZN4core4hint21unreachable_unchecked17h5c82d720186d4847E()
unreachable
bb4: ; preds = %start
%1 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 8, i64 %.fca.1.extract)
%2 = extractvalue { i64, i1 } %1, 0
%3 = extractvalue { i64, i1 } %1, 1
br i1 %3, label %panic, label %bb5, !prof !0
bb5: ; preds = %bb4
ret i64 %2
panic: ; preds = %bb4
call void ... ; elided
unreachable
}
*** IR Dump After Interprocedural Sparse Conditional Constant Propagation ***
; Function Attrs: inlinehint noreturn uwtable
define internal void @_ZN4core4hint21unreachable_unchecked17h5c82d720186d4847E() unnamed_addr #0 {
start:
unreachable
}
; Function Attrs: uwtable
define internal { i64, i64 } @_ZN13nonzero_usize6repeat17hff27c5667304426aE(i1 zeroext %c) unnamed_addr #1 {
start:
br i1 %c, label %bb1, label %bb2
bb1: ; preds = %start
br label %bb3
bb2: ; preds = %start
br label %bb3
bb3: ; preds = %bb2, %bb1
%_0.sroa.0.0 = phi i64 [ 0, %bb1 ], [ 1, %bb2 ]
%0 = insertvalue { i64, i64 } undef, i64 %_0.sroa.0.0, 0
%1 = insertvalue { i64, i64 } %0, i64 8, 1
ret { i64, i64 } %1
}
; Function Attrs: uwtable
define i64 @_ZN13nonzero_usize16calculate_layout17h044f84d640378d0bE(i1 zeroext %c) unnamed_addr #1 {
start:
%0 = call { i64, i64 } @_ZN13nonzero_usize6repeat17hff27c5667304426aE(i1 zeroext %c)
%.fca.0.extract = extractvalue { i64, i64 } %0, 0
%switch = icmp ult i64 %.fca.0.extract, 1
br i1 %switch, label %bb2, label %bb4
bb2: ; preds = %start
call void @_ZN4core4hint21unreachable_unchecked17h5c82d720186d4847E()
unreachable
bb4: ; preds = %start
%1 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 8, i64 8)
%2 = extractvalue { i64, i1 } %1, 0
%3 = extractvalue { i64, i1 } %1, 1
br i1 %3, label %panic, label %bb5, !prof !0
bb5: ; preds = %bb4
ret i64 %2
panic: ; preds = %bb4
call void ... ; elided
unreachable
}
@rkruppe Look at this version, which is closer to the original code; it produces the following LLVM IR for playground::new:
define i64 @_ZN10playground3new17hc1cbbc83e9256887E(i64 %c) unnamed_addr #2 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
%0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c, i64 8) #8
%1 = extractvalue { i64, i1 } %0, 1
%2 = extractvalue { i64, i1 } %0, 0
%spec.select.i.i = select i1 %1, i64 0, i64 8
br i1 %1, label %bb4.i1, label %bb4.i
bb4.i: ; preds = %start
%3 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c, i64 16) #8
%4 = extractvalue { i64, i1 } %3, 1
%5 = extractvalue { i64, i1 } %3, 0
%spec.select.i16.i = select i1 %4, i64 0, i64 8
br i1 %4, label %bb4.i1, label %bb11.i
bb11.i: ; preds = %bb4.i
%6 = icmp ult i64 %spec.select.i16.i, %spec.select.i.i
%_0.0.sroa.speculated.i.i.i.i = select i1 %6, i64 %spec.select.i.i, i64 %spec.select.i16.i
%7 = add i64 %2, -1
%8 = add i64 %7, %spec.select.i16.i
%9 = sub nsw i64 0, %spec.select.i16.i
%10 = and i64 %8, %9
%11 = sub i64 %10, %2
%12 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %2, i64 %11) #8
%13 = extractvalue { i64, i1 } %12, 0
%14 = extractvalue { i64, i1 } %12, 1
br i1 %14, label %bb4.i1, label %bb11.i.i
bb11.i.i: ; preds = %bb11.i
%15 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %13, i64 %5) #8
%16 = extractvalue { i64, i1 } %15, 1
br i1 %16, label %bb4.i1, label %bb21.i.i
bb21.i.i: ; preds = %bb11.i.i
%17 = extractvalue { i64, i1 } %15, 0
%18 = add nuw nsw i64 %_0.0.sroa.speculated.i.i.i.i, 15
%19 = and i64 %18, %_0.0.sroa.speculated.i.i.i.i
%20 = icmp ne i64 %19, 0
%21 = icmp eq i64 %_0.0.sroa.speculated.i.i.i.i, 0
%or.cond.i.not.i.i.i = or i1 %21, %20
%22 = sub nsw i64 0, %_0.0.sroa.speculated.i.i.i.i
%23 = icmp ugt i64 %17, %22
%24 = or i1 %or.cond.i.not.i.i.i, %23
br i1 %24, label %bb4.i1, label %"_ZN47_$LT$core..result..Result$LT$T$C$$u20$E$GT$$GT$6unwrap17h2368a06f0a7bf993E.exit"
bb4.i1: ; preds = %bb21.i.i, %bb4.i, %bb11.i.i, %bb11.i, %start
; call core::result::unwrap_failed
tail call fastcc void @_ZN4core6result13unwrap_failed17hde364ddca953ee8aE(), !noalias !6
unreachable
"_ZN47_$LT$core..result..Result$LT$T$C$$u20$E$GT$$GT$6unwrap17h2368a06f0a7bf993E.exit": ; preds = %bb21.i.i
ret i64 %13
}
These lines in particular highlight the problem:
%spec.select.i.i = select i1 %1, i64 0, i64 8
br i1 %1, label %bb4.i1, label %bb4.i
[...]
%spec.select.i16.i = select i1 %4, i64 0, i64 8
br i1 %4, label %bb4.i1, label %bb11.i
In %bb4.i, %spec.select.i.i can only have the value 8, since that block is only reached when %1 is false. The same applies to %spec.select.i16.i in %bb11.i. The missed constant propagation here hurts code generation for the rest of the function.
@rust-lang/wg-codegen
@Amanieu Interesting, that one can't be handled by exploiting UB (but we should still track that other problem; it'll affect users of unchecked_unwrap-style functions). It's too big to properly analyze why it isn't optimized well, but based on what you've identified as the core problem, I came up with this IR test case:
define i64 @foo(i64 %c) {
bb1:
  %mul = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c, i64 8)
  %overflow = extractvalue { i64, i1 } %mul, 1
  %select = select i1 %overflow, i64 0, i64 8
  br i1 %overflow, label %abort, label %bb2

bb2:
  call void @dummy(i64 %select)
  ret i64 %select

abort:
  call void @abort()
  unreachable
}

declare void @abort()
declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
declare void @dummy(i64)
opt -O2 does nothing to this, and neither does NewGVN for that matter.
Removing the call to @dummy makes instcombine optimize the return statement to ret i64 8. I assume this is because of some "has only one use" check in instcombine. I haven't bothered to search for a simple expression that uses %select twice that instcombine can't turn into a single use.
Curiously, it also gets optimized if the select is sunk into bb2. I assume that's because of this cheapskate test. That makes me wonder why the definition isn't sunk into bb2 by any pass in the -O2 pipeline.
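To make that concrete, this is the manually sunk variant I mean (hand-edited sketch; the declarations are the same as in the test case above):
define i64 @foo_sunk(i64 %c) {
bb1:
  %mul = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c, i64 8)
  %overflow = extractvalue { i64, i1 } %mul, 1
  br i1 %overflow, label %abort, label %bb2

bb2:
  ; the select now lives in the block that is only reached when %overflow is false
  %select = select i1 %overflow, i64 0, i64 8
  call void @dummy(i64 %select)
  ret i64 %select

abort:
  call void @abort()
  unreachable
}
On this variant the select does get folded to 8 (apparently thanks to that check), because it now sits in a block that is only entered when %overflow is false.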
Turns out there is a code sinking pass, and opt -sink -instcombine optimizes the above test case as well as anyone could hope. It also does wonders on the IR from @Amanieu's last "more like the original" code. Now to find out why it isn't enabled in the default pipeline...
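For reference, this is roughly the end state I mean (a sketch of the hoped-for result, not verbatim opt output): the select is gone and the constant 8 is used directly, while the overflow check and the call to @abort remain.
define i64 @foo(i64 %c) {
bb1:
  %mul = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c, i64 8)
  %overflow = extractvalue { i64, i1 } %mul, 1
  br i1 %overflow, label %abort, label %bb2

bb2:
  ; %select could only ever be 8 on this path, so it folds away entirely
  call void @dummy(i64 8)
  ret i64 8

abort:
  call void @abort()
  unreachable
}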
Talked about this with @sunfishcode on IRC. Seems like there's no real reason for -sink not to be in the optimization pipeline(s) by default. However, apparently MachineSink was believed to cover many such cases (clearly not true here), and sinking can sometimes be a pessimization because it lengthens the live ranges of the moved instruction's operands. I'm unsure what to do about this issue. Ideally this optimization could be applied without moving code, but writing that pass, getting it upstream, and having it enabled by default seems like a lot of work.
At least there should be an LLVM bug tracking this as well.
I'm somewhat worried about the amount of code that could be affected by this, considering how often Option, Result, and ? are used in Rust.
@rkruppe Could we try enabling the sinking pass in rustc so that we can see what the performance impact looks like (positive or negative)? This seems like something that would benefit Rust much more than C/C++.
AFAIK we still don't have infrastructure for properly measuring the performance impact of a change on the run time of anything other than rustc itself. Without that, I'm not confident in our ability to check for regressions and quantify them.
This seems like something that would benefit Rust much more than C/C++
Possibly, but there's more cases like that (e.g. range checks) and we've generally been hesitant to deviate too far from the established pass pipelines in those cases too.
We did at some point make up a custom LLVM pass pipeline, so experimenting with it is welcome.