Rust: Room for optimization improvement on reference counting

Created on 19 Mar 2014 · 22 comments · Source: rust-lang/rust

Playground

I would expect this function to be ideally optimized to a no-op:

#[inline(never)]
fn foo<T>(t: &Rc<T>) {
    drop(t.clone());  
}

However, when invoked with Rc<int>, we generate the following optimized IR

; Function Attrs: noinline nounwind uwtable
define internal fastcc void @_ZN3foo20h529f5691b0db4704gaa4v0.0E(%"struct.std::rc::Rc<int>[#1]"* nocapture readonly) unnamed_addr #3 {
entry-block:
  %1 = getelementptr inbounds %"struct.std::rc::Rc<int>[#1]"* %0, i64 0, i32 0
  %2 = load %"struct.std::rc::RcBox<int>[#1]"** %1, align 8
  %3 = getelementptr inbounds %"struct.std::rc::RcBox<int>[#1]"* %2, i64 0, i32 1
  %4 = load i64* %3, align 8
  %5 = add i64 %4, 1
  store i64 %5, i64* %3, align 8
  %6 = load %"struct.std::rc::RcBox<int>[#1]"** %1, align 8
  %7 = icmp eq %"struct.std::rc::RcBox<int>[#1]"* %6, null
  br i1 %7, label %_ZN3mem4drop20hf8ea12715443e741vda4v0.0E.exit, label %then-block-241-.i.i.i

then-block-241-.i.i.i:                            ; preds = %entry-block
  %8 = getelementptr inbounds %"struct.std::rc::RcBox<int>[#1]"* %6, i64 0, i32 1
  %9 = load i64* %8, align 8
  %10 = add i64 %9, -1
  store i64 %10, i64* %8, align 8
  %11 = icmp eq i64 %10, 0
  br i1 %11, label %then-block-262-.i.i.i, label %_ZN3mem4drop20hf8ea12715443e741vda4v0.0E.exit

then-block-262-.i.i.i:                            ; preds = %then-block-241-.i.i.i
  %12 = getelementptr inbounds %"struct.std::rc::RcBox<int>[#1]"* %6, i64 0, i32 2
  %13 = load i64* %12, align 8
  %14 = add i64 %13, -1
  store i64 %14, i64* %12, align 8
  %15 = icmp eq i64 %14, 0
  br i1 %15, label %then-block-270-.i.i.i, label %_ZN3mem4drop20hf8ea12715443e741vda4v0.0E.exit

then-block-270-.i.i.i:                            ; preds = %then-block-262-.i.i.i
  %16 = bitcast %"struct.std::rc::RcBox<int>[#1]"* %6 to i8*
  tail call void @free(i8* %16) #2
  br label %_ZN3mem4drop20hf8ea12715443e741vda4v0.0E.exit

_ZN3mem4drop20hf8ea12715443e741vda4v0.0E.exit:    ; preds = %entry-block, %then-block-241-.i.i.i, %then-block-262-.i.i.i, %then-block-270-.i.i.i
  ret void
}

I would expect that we would be able to do better.

A-codegen C-enhancement E-easy I-slow T-libs

Most helpful comment

The current LLVM IR for the snippet in the issue description is:

Click to expand LLVM IR

; ModuleID = 'playground0-3a317160927575f9dc8b26750a523322.rs'
source_filename = "playground0-3a317160927575f9dc8b26750a523322.rs"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"unwind::libunwind::_Unwind_Exception" = type { [0 x i64], i64, [0 x i64], void (i32, %"unwind::libunwind::_Unwind_Exception"*)*, [0 x i64], [6 x i64], [0 x i64] }
%"unwind::libunwind::_Unwind_Context" = type { [0 x i8] }

; Function Attrs: uwtable
define void @foo(i64** noalias nocapture readonly dereferenceable(8) %t) unnamed_addr #0 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
  %t.val = load i64*, i64** %t, align 8
  %.val.i.i.i = load i64, i64* %t.val, align 8
  %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %.val.i.i.i, i64 1) #5
  %1 = extractvalue { i64, i1 } %0, 1
  br i1 %1, label %bb2.i.i.i, label %"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit"

bb2.i.i.i:                                        ; preds = %start
  tail call void @llvm.trap() #5
  unreachable

"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit": ; preds = %start
  %2 = extractvalue { i64, i1 } %0, 0
  %3 = add i64 %2, -1
  store i64 %3, i64* %t.val, align 1
  %4 = icmp eq i64 %3, 0
  br i1 %4, label %bb4.i.i.i, label %_ZN4core3mem4drop17h222f528d1fa068ceE.exit

bb4.i.i.i:                                        ; preds = %"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit"
  %5 = getelementptr inbounds i64, i64* %t.val, i64 1
  %.val.i.i5.i.i.i = load i64, i64* %5, align 8
  %6 = add i64 %.val.i.i5.i.i.i, -1
  store i64 %6, i64* %5, align 1
  %7 = icmp eq i64 %6, 0
  br i1 %7, label %bb9.i.i.i, label %_ZN4core3mem4drop17h222f528d1fa068ceE.exit

bb9.i.i.i:                                        ; preds = %bb4.i.i.i
  %8 = bitcast i64* %t.val to i8*
  tail call void @__rust_dealloc(i8* nonnull %8, i64 24, i64 8) #5
  br label %_ZN4core3mem4drop17h222f528d1fa068ceE.exit

_ZN4core3mem4drop17h222f528d1fa068ceE.exit:       ; preds = %"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit", %bb4.i.i.i, %bb9.i.i.i
  ret void
}

declare i32 @rust_eh_personality(i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*) unnamed_addr #1

; Function Attrs: nounwind readnone speculatable
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #2

; Function Attrs: noreturn nounwind
declare void @llvm.trap() #3

; Function Attrs: nounwind
declare void @__rust_dealloc(i8*, i64, i64) unnamed_addr #4

attributes #0 = { uwtable "probe-stack"="__rust_probestack" }
attributes #1 = { "probe-stack"="__rust_probestack" }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { noreturn nounwind }
attributes #4 = { nounwind "probe-stack"="__rust_probestack" }
attributes #5 = { nounwind }


@eddyb said it's an LLVM bug and then walked off in the distance.

Cc @rust-lang/wg-codegen @rust-lang/wg-compiler-performance

All 22 comments

I think that this is partly because LLVM doesn't understand that &Rc<int> will never alias *RcBox<int>, so this may just be an "implement TBAA" bug

The situation is improved with the parameter marked as noalias, but it still doesn't optimize out.

With aliasing out of the way, LLVM still can't know that the refcount isn't already 0 when entering the function, so it can't remove the code.

Maybe an intrinsic for doing a load with a range assertion would work.

Range asserts don't help here, they're not used to optimize the icmp away.
And last time I tried to add support for that to LLVM, it affected some
other optimizations in a bad way. Leading to worse results overall. Maybe I
just messed up the implementation though
Am 20.07.2014 01:16 schrieb "Daniel Micay" [email protected]:

Maybe an intrinsic for doing a load with a range assertion would work.

—
Reply to this email directly or view it on GitHub
https://github.com/rust-lang/rust/issues/13018#issuecomment-49532087.

21418 didn't actually land :( (had to revert)

Triage: runnable code

use std::rc::Rc;

#[inline(never)]
fn foo<T>(t: &Rc<T>) {
    drop(t.clone());  
}

fn main() {
    let r = Rc::new(5);

    foo(&r);
}

I'm not aware of any improvements in this area, nor am I good enough with LLVM IR to tell if things have changed.

LLVM still can't know that the refcount isn't already 0

Adding assume(self.strong() != 0) to the top of clone() might fix this. rc.rs already uses assume in other places so this wouldn't be unprecedented.

Even with this change, the overflow check in inc_strong might _also_ prevent the optimization. Not sure what to do about that :/

@jruderman that is similar to what was tried in #21418, but was found to be a serious compilation time hit (https://github.com/rust-lang/rust/pull/21418#issuecomment-70929449). The assumes there currently were added in https://github.com/rust-lang/rust/pull/21894.

I'm not aware of any improvements in this area, nor am I good enough with LLVM IR to tell if things have changed.

LLVM IR we produce is still as junk as was before, even with MIR.

It might make sense to attempt reapplying the assume stuff. A year and a half has passed since the first attempt, so LLVM might be able to handle it this time.

I just noticed this issue while reading Rc's code.

As @jruderman mentioned, we are using checked_add in inc_strong now, and the overflow checking is preventing the optimization.
Regardless of the performance impact of the assume now, I'm afraid we can't make the original target code a no-op.
(and yes, adding the assume in original patch won't make it no-op)

Should we think of another solution, or close the issue?

The current LLVM IR for the snippet in the issue description is:

Click to expand LLVM IR

; ModuleID = 'playground0-3a317160927575f9dc8b26750a523322.rs'
source_filename = "playground0-3a317160927575f9dc8b26750a523322.rs"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"unwind::libunwind::_Unwind_Exception" = type { [0 x i64], i64, [0 x i64], void (i32, %"unwind::libunwind::_Unwind_Exception"*)*, [0 x i64], [6 x i64], [0 x i64] }
%"unwind::libunwind::_Unwind_Context" = type { [0 x i8] }

; Function Attrs: uwtable
define void @foo(i64** noalias nocapture readonly dereferenceable(8) %t) unnamed_addr #0 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
  %t.val = load i64*, i64** %t, align 8
  %.val.i.i.i = load i64, i64* %t.val, align 8
  %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %.val.i.i.i, i64 1) #5
  %1 = extractvalue { i64, i1 } %0, 1
  br i1 %1, label %bb2.i.i.i, label %"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit"

bb2.i.i.i:                                        ; preds = %start
  tail call void @llvm.trap() #5
  unreachable

"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit": ; preds = %start
  %2 = extractvalue { i64, i1 } %0, 0
  %3 = add i64 %2, -1
  store i64 %3, i64* %t.val, align 1
  %4 = icmp eq i64 %3, 0
  br i1 %4, label %bb4.i.i.i, label %_ZN4core3mem4drop17h222f528d1fa068ceE.exit

bb4.i.i.i:                                        ; preds = %"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit"
  %5 = getelementptr inbounds i64, i64* %t.val, i64 1
  %.val.i.i5.i.i.i = load i64, i64* %5, align 8
  %6 = add i64 %.val.i.i5.i.i.i, -1
  store i64 %6, i64* %5, align 1
  %7 = icmp eq i64 %6, 0
  br i1 %7, label %bb9.i.i.i, label %_ZN4core3mem4drop17h222f528d1fa068ceE.exit

bb9.i.i.i:                                        ; preds = %bb4.i.i.i
  %8 = bitcast i64* %t.val to i8*
  tail call void @__rust_dealloc(i8* nonnull %8, i64 24, i64 8) #5
  br label %_ZN4core3mem4drop17h222f528d1fa068ceE.exit

_ZN4core3mem4drop17h222f528d1fa068ceE.exit:       ; preds = %"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit", %bb4.i.i.i, %bb9.i.i.i
  ret void
}

declare i32 @rust_eh_personality(i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*) unnamed_addr #1

; Function Attrs: nounwind readnone speculatable
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #2

; Function Attrs: noreturn nounwind
declare void @llvm.trap() #3

; Function Attrs: nounwind
declare void @__rust_dealloc(i8*, i64, i64) unnamed_addr #4

attributes #0 = { uwtable "probe-stack"="__rust_probestack" }
attributes #1 = { "probe-stack"="__rust_probestack" }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { noreturn nounwind }
attributes #4 = { nounwind "probe-stack"="__rust_probestack" }
attributes #5 = { nounwind }


@eddyb said it's an LLVM bug and then walked off in the distance.

Cc @rust-lang/wg-codegen @rust-lang/wg-compiler-performance

Minimal example, which right now adds & subtracts instead of checking & returning the original x:

#![crate_type = "lib"]
#[no_mangle]
pub fn assert_not_max_usize(x: usize) -> usize {
    x.checked_add(1).unwrap().wrapping_sub(1)
}

It's unclear to me that any optimizing compiler can remove the increment/decrement in this case -- we've specified that if the reference count overflows, we expect the program to abort (panic in eddyb's minimal example). LLVM cannot remove that; it would change the program's behavior. As such, it's not clear to me that this issue can ever be fixed, so I'm going to close it.

I don't think this issue should be closed, as shown in https://github.com/rust-lang/rust/issues/13018#issuecomment-377538875 this is not about changing the program behaviour but about making the compiler check whether the value is the maximum value instead of doing the increment-decrement dance.

Right, but today that code snippet will panic if passed usize::max_value(); if we somehow make the compiler not do that then we've changed program behavior and fixed this bug. Maybe I'm wrong, but my understanding is that compilers aren't generally allowed to change that type of program behavior.

The optimisation improvement here is not about making the program not panicking (that would indeed be a behaviour change and thus shouldn't be done), but making it not increment and decrement the actual reference count and instead just panic if it is usize::max_value(). See discussion at
https://github.com/rust-lang/rust/issues/13018#issuecomment-218364078.

To be precise, something in the compiler toolchain could realise that:

#![crate_type = "lib"]
#[no_mangle]
pub fn assert_not_max_usize(x: usize) -> usize {
    x.checked_add(1).unwrap().wrapping_sub(1)
}

can become:

#![crate_type = "lib"]
#[no_mangle]
pub fn assert_not_max_usize(x: usize) -> usize {
    if x == usize::max_value() {
        None.unwrap();
    }
    x
}

Hm, okay, so looking at this more it seems that the answer is that LLVM can, we just need to help out a bit; it seems that LLVM can't quite handle the case where the abort is conditional on the add.

It almost looks like LLVM doesn't see that the overflow (and hence, the abort) only occurs when the value is usize::max_value().

Anyway, reopening. I think we can switch the code in liballoc to explicitly check before hand and abort; that should help with this if at least the small test cases are accurate.

pub fn check(x: usize) -> usize {
    if x == usize::max_value() {
        abort();
    }
    x.checked_add(1).unwrap().wrapping_sub(1)
}

produces

check:
        push    rax
        cmp     rdi, -1
        je      .LBB0_1
        mov     rax, rdi
        pop     rcx
        ret
.LBB0_1:
        call    std::process::abort@PLT
        ud2

@alexcrichton considering #53080 has been merged, could this issue be closed?

Technically this is still an issue in that the above function doesn't optimize to nothing, but practically I think it's "fixed enough", so sure!

Was this page helpful?
0 / 5 - 0 ratings

Related issues

drewcrawford picture drewcrawford · 3 comments

zhendongsu picture zhendongsu · 3 comments

behnam picture behnam · 3 comments

dnsl48 picture dnsl48 · 3 comments

Robbepop picture Robbepop · 3 comments