Rust: Room for optimization improvement on reference counting

Created on 19 Mar 2014 · 22 comments · Source: rust-lang/rust

Playground

I would expect this function to be ideally optimized to a no-op:

#[inline(never)]
fn foo<T>(t: &Rc<T>) {
    drop(t.clone());  
}

However, when invoked with Rc<int>, we generate the following optimized IR

; Function Attrs: noinline nounwind uwtable
define internal fastcc void @_ZN3foo20h529f5691b0db4704gaa4v0.0E(%"struct.std::rc::Rc<int>[#1]"* nocapture readonly) unnamed_addr #3 {
entry-block:
  %1 = getelementptr inbounds %"struct.std::rc::Rc<int>[#1]"* %0, i64 0, i32 0
  %2 = load %"struct.std::rc::RcBox<int>[#1]"** %1, align 8
  %3 = getelementptr inbounds %"struct.std::rc::RcBox<int>[#1]"* %2, i64 0, i32 1
  %4 = load i64* %3, align 8
  %5 = add i64 %4, 1
  store i64 %5, i64* %3, align 8
  %6 = load %"struct.std::rc::RcBox<int>[#1]"** %1, align 8
  %7 = icmp eq %"struct.std::rc::RcBox<int>[#1]"* %6, null
  br i1 %7, label %_ZN3mem4drop20hf8ea12715443e741vda4v0.0E.exit, label %then-block-241-.i.i.i

then-block-241-.i.i.i:                            ; preds = %entry-block
  %8 = getelementptr inbounds %"struct.std::rc::RcBox<int>[#1]"* %6, i64 0, i32 1
  %9 = load i64* %8, align 8
  %10 = add i64 %9, -1
  store i64 %10, i64* %8, align 8
  %11 = icmp eq i64 %10, 0
  br i1 %11, label %then-block-262-.i.i.i, label %_ZN3mem4drop20hf8ea12715443e741vda4v0.0E.exit

then-block-262-.i.i.i:                            ; preds = %then-block-241-.i.i.i
  %12 = getelementptr inbounds %"struct.std::rc::RcBox<int>[#1]"* %6, i64 0, i32 2
  %13 = load i64* %12, align 8
  %14 = add i64 %13, -1
  store i64 %14, i64* %12, align 8
  %15 = icmp eq i64 %14, 0
  br i1 %15, label %then-block-270-.i.i.i, label %_ZN3mem4drop20hf8ea12715443e741vda4v0.0E.exit

then-block-270-.i.i.i:                            ; preds = %then-block-262-.i.i.i
  %16 = bitcast %"struct.std::rc::RcBox<int>[#1]"* %6 to i8*
  tail call void @free(i8* %16) #2
  br label %_ZN3mem4drop20hf8ea12715443e741vda4v0.0E.exit

_ZN3mem4drop20hf8ea12715443e741vda4v0.0E.exit:    ; preds = %entry-block, %then-block-241-.i.i.i, %then-block-262-.i.i.i, %then-block-270-.i.i.i
  ret void
}

I would expect that we would be able to do better.

A-codegen C-enhancement E-easy I-slow T-libs

Most helpful comment

The current LLVM IR for the snippet in the issue description is:

Click to expand LLVM IR

; ModuleID = 'playground0-3a317160927575f9dc8b26750a523322.rs'
source_filename = "playground0-3a317160927575f9dc8b26750a523322.rs"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"unwind::libunwind::_Unwind_Exception" = type { [0 x i64], i64, [0 x i64], void (i32, %"unwind::libunwind::_Unwind_Exception"*)*, [0 x i64], [6 x i64], [0 x i64] }
%"unwind::libunwind::_Unwind_Context" = type { [0 x i8] }

; Function Attrs: uwtable
define void @foo(i64** noalias nocapture readonly dereferenceable(8) %t) unnamed_addr #0 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
  %t.val = load i64*, i64** %t, align 8
  %.val.i.i.i = load i64, i64* %t.val, align 8
  %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %.val.i.i.i, i64 1) #5
  %1 = extractvalue { i64, i1 } %0, 1
  br i1 %1, label %bb2.i.i.i, label %"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit"

bb2.i.i.i:                                        ; preds = %start
  tail call void @llvm.trap() #5
  unreachable

"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit": ; preds = %start
  %2 = extractvalue { i64, i1 } %0, 0
  %3 = add i64 %2, -1
  store i64 %3, i64* %t.val, align 1
  %4 = icmp eq i64 %3, 0
  br i1 %4, label %bb4.i.i.i, label %_ZN4core3mem4drop17h222f528d1fa068ceE.exit

bb4.i.i.i:                                        ; preds = %"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit"
  %5 = getelementptr inbounds i64, i64* %t.val, i64 1
  %.val.i.i5.i.i.i = load i64, i64* %5, align 8
  %6 = add i64 %.val.i.i5.i.i.i, -1
  store i64 %6, i64* %5, align 1
  %7 = icmp eq i64 %6, 0
  br i1 %7, label %bb9.i.i.i, label %_ZN4core3mem4drop17h222f528d1fa068ceE.exit

bb9.i.i.i:                                        ; preds = %bb4.i.i.i
  %8 = bitcast i64* %t.val to i8*
  tail call void @__rust_dealloc(i8* nonnull %8, i64 24, i64 8) #5
  br label %_ZN4core3mem4drop17h222f528d1fa068ceE.exit

_ZN4core3mem4drop17h222f528d1fa068ceE.exit:       ; preds = %"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit", %bb4.i.i.i, %bb9.i.i.i
  ret void
}

declare i32 @rust_eh_personality(i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*) unnamed_addr #1

; Function Attrs: nounwind readnone speculatable
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #2

; Function Attrs: noreturn nounwind
declare void @llvm.trap() #3

; Function Attrs: nounwind
declare void @__rust_dealloc(i8*, i64, i64) unnamed_addr #4

attributes #0 = { uwtable "probe-stack"="__rust_probestack" }
attributes #1 = { "probe-stack"="__rust_probestack" }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { noreturn nounwind }
attributes #4 = { nounwind "probe-stack"="__rust_probestack" }
attributes #5 = { nounwind }


@eddyb said it's an LLVM bug and then walked off in the distance.

Cc @rust-lang/wg-codegen @rust-lang/wg-compiler-performance

All 22 comments

I think that this is partly because LLVM doesn't understand that &Rc<int> will never alias *RcBox<int>, so this may just be an "implement TBAA" bug

The situation is improved with the parameter marked as noalias, but it still doesn't optimize out.

With aliasing out of the way, LLVM still can't know that the refcount isn't already 0 when entering the function, so it can't remove the code.

Maybe an intrinsic for doing a load with a range assertion would work.

Range asserts don't help here, they're not used to optimize the icmp away.
And last time I tried to add support for that to LLVM, it affected some
other optimizations in a bad way. Leading to worse results overall. Maybe I
just messed up the implementation though
Am 20.07.2014 01:16 schrieb "Daniel Micay" [email protected]:

Maybe an intrinsic for doing a load with a range assertion would work.

—
Reply to this email directly or view it on GitHub
https://github.com/rust-lang/rust/issues/13018#issuecomment-49532087.

21418 didn't actually land :( (had to revert)

Triage: runnable code

use std::rc::Rc;

#[inline(never)]
fn foo<T>(t: &Rc<T>) {
    drop(t.clone());  
}

fn main() {
    let r = Rc::new(5);

    foo(&r);
}

I'm not aware of any improvements in this area, nor am I good enough with LLVM IR to tell if things have changed.

LLVM still can't know that the refcount isn't already 0

Adding assume(self.strong() != 0) to the top of clone() might fix this. rc.rs already uses assume in other places so this wouldn't be unprecedented.

Even with this change, the overflow check in inc_strong might _also_ prevent the optimization. Not sure what to do about that :/

@jruderman that is similar to what was tried in #21418, but was found to be a serious compilation time hit (https://github.com/rust-lang/rust/pull/21418#issuecomment-70929449). The assumes there currently were added in https://github.com/rust-lang/rust/pull/21894.

I'm not aware of any improvements in this area, nor am I good enough with LLVM IR to tell if things have changed.

LLVM IR we produce is still as junk as was before, even with MIR.

It might make sense to attempt reapplying the assume stuff. A year and a half has passed since the first attempt, so LLVM might be able to handle it this time.

I just noticed this issue while reading Rc's code.

As @jruderman mentioned, we are using checked_add in inc_strong now, and the overflow checking is preventing the optimization.
Regardless of the performance impact of the assume now, I'm afraid we can't make the original target code a no-op.
(and yes, adding the assume in original patch won't make it no-op)

Should we think of another solution, or close the issue?

The current LLVM IR for the snippet in the issue description is:

Click to expand LLVM IR

; ModuleID = 'playground0-3a317160927575f9dc8b26750a523322.rs'
source_filename = "playground0-3a317160927575f9dc8b26750a523322.rs"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"unwind::libunwind::_Unwind_Exception" = type { [0 x i64], i64, [0 x i64], void (i32, %"unwind::libunwind::_Unwind_Exception"*)*, [0 x i64], [6 x i64], [0 x i64] }
%"unwind::libunwind::_Unwind_Context" = type { [0 x i8] }

; Function Attrs: uwtable
define void @foo(i64** noalias nocapture readonly dereferenceable(8) %t) unnamed_addr #0 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
  %t.val = load i64*, i64** %t, align 8
  %.val.i.i.i = load i64, i64* %t.val, align 8
  %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %.val.i.i.i, i64 1) #5
  %1 = extractvalue { i64, i1 } %0, 1
  br i1 %1, label %bb2.i.i.i, label %"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit"

bb2.i.i.i:                                        ; preds = %start
  tail call void @llvm.trap() #5
  unreachable

"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit": ; preds = %start
  %2 = extractvalue { i64, i1 } %0, 0
  %3 = add i64 %2, -1
  store i64 %3, i64* %t.val, align 1
  %4 = icmp eq i64 %3, 0
  br i1 %4, label %bb4.i.i.i, label %_ZN4core3mem4drop17h222f528d1fa068ceE.exit

bb4.i.i.i:                                        ; preds = %"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit"
  %5 = getelementptr inbounds i64, i64* %t.val, i64 1
  %.val.i.i5.i.i.i = load i64, i64* %5, align 8
  %6 = add i64 %.val.i.i5.i.i.i, -1
  store i64 %6, i64* %5, align 1
  %7 = icmp eq i64 %6, 0
  br i1 %7, label %bb9.i.i.i, label %_ZN4core3mem4drop17h222f528d1fa068ceE.exit

bb9.i.i.i:                                        ; preds = %bb4.i.i.i
  %8 = bitcast i64* %t.val to i8*
  tail call void @__rust_dealloc(i8* nonnull %8, i64 24, i64 8) #5
  br label %_ZN4core3mem4drop17h222f528d1fa068ceE.exit

_ZN4core3mem4drop17h222f528d1fa068ceE.exit:       ; preds = %"_ZN61_$LT$alloc..rc..Rc$LT$T$GT$$u20$as$u20$core..clone..Clone$GT$5clone17h127f785fcc974269E.exit", %bb4.i.i.i, %bb9.i.i.i
  ret void
}

declare i32 @rust_eh_personality(i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*) unnamed_addr #1

; Function Attrs: nounwind readnone speculatable
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #2

; Function Attrs: noreturn nounwind
declare void @llvm.trap() #3

; Function Attrs: nounwind
declare void @__rust_dealloc(i8*, i64, i64) unnamed_addr #4

attributes #0 = { uwtable "probe-stack"="__rust_probestack" }
attributes #1 = { "probe-stack"="__rust_probestack" }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { noreturn nounwind }
attributes #4 = { nounwind "probe-stack"="__rust_probestack" }
attributes #5 = { nounwind }


@eddyb said it's an LLVM bug and then walked off in the distance.

Cc @rust-lang/wg-codegen @rust-lang/wg-compiler-performance

Minimal example, which right now adds & subtracts instead of checking & returning the original x:

#![crate_type = "lib"]
#[no_mangle]
pub fn assert_not_max_usize(x: usize) -> usize {
    x.checked_add(1).unwrap().wrapping_sub(1)
}

It's unclear to me that any optimizing compiler can remove the increment/decrement in this case -- we've specified that if the reference count overflows, we expect the program to abort (panic in eddyb's minimal example). LLVM cannot remove that; it would change the program's behavior. As such, it's not clear to me that this issue can ever be fixed, so I'm going to close it.

I don't think this issue should be closed, as shown in https://github.com/rust-lang/rust/issues/13018#issuecomment-377538875 this is not about changing the program behaviour but about making the compiler check whether the value is the maximum value instead of doing the increment-decrement dance.

Right, but today that code snippet will panic if passed usize::max_value(); if we somehow make the compiler not do that then we've changed program behavior and fixed this bug. Maybe I'm wrong, but my understanding is that compilers aren't generally allowed to change that type of program behavior.

The optimisation improvement here is not about making the program not panicking (that would indeed be a behaviour change and thus shouldn't be done), but making it not increment and decrement the actual reference count and instead just panic if it is usize::max_value(). See discussion at
https://github.com/rust-lang/rust/issues/13018#issuecomment-218364078.

To be precise, something in the compiler toolchain could realise that:

#![crate_type = "lib"]
#[no_mangle]
pub fn assert_not_max_usize(x: usize) -> usize {
    x.checked_add(1).unwrap().wrapping_sub(1)
}

can become:

#![crate_type = "lib"]
#[no_mangle]
pub fn assert_not_max_usize(x: usize) -> usize {
    if x == usize::max_value() {
        None.unwrap();
    }
    x
}

Hm, okay, so looking at this more it seems that the answer is that LLVM can, we just need to help out a bit; it seems that LLVM can't quite handle the case where the abort is conditional on the add.

It almost looks like LLVM doesn't see that the overflow (and hence, the abort) only occurs when the value is usize::max_value().

Anyway, reopening. I think we can switch the code in liballoc to explicitly check before hand and abort; that should help with this if at least the small test cases are accurate.

pub fn check(x: usize) -> usize {
    if x == usize::max_value() {
        abort();
    }
    x.checked_add(1).unwrap().wrapping_sub(1)
}

produces

check:
        push    rax
        cmp     rdi, -1
        je      .LBB0_1
        mov     rax, rdi
        pop     rcx
        ret
.LBB0_1:
        call    std::process::abort@PLT
        ud2

@alexcrichton considering #53080 has been merged, could this issue be closed?

Technically this is still an issue in that the above function doesn't optimize to nothing, but practically I think it's "fixed enough", so sure!

Was this page helpful?
0 / 5 - 0 ratings

Related issues

drewcrawford picture drewcrawford · 3 comments

zhendongsu picture zhendongsu · 3 comments

behnam picture behnam · 3 comments

dnsl48 picture dnsl48 · 3 comments

Robbepop picture Robbepop · 3 comments