Rust: Optimisation-caused UB during cross-crate compilation

Created on 5 Sep 2020 · 17Comments · Source: rust-lang/rust

The issue reproduces only if FatPtr code resides in a separate crate and only if compiled in release configuration.

I tried this code:

fat_ptr's crate lib.rs:

pub struct FatPtr {
    ptr: *mut u8,
    len: usize,
}

impl FatPtr {
    pub fn new(len: usize) -> FatPtr {
        let ptr = Box::into_raw(vec![42u8; len].into_boxed_slice()) as *mut u8;

        FatPtr { ptr, len }
    }
}

impl std::ops::Deref for FatPtr {
    type Target = [u8];

    #[inline]
    fn deref(&self) -> &[u8] {
        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
    }
}

impl std::ops::Drop for FatPtr {
    fn drop(&mut self) {
        println!("Drop");
    }
}

test's crate main.rs:

use fat_ptr::FatPtr;

fn print(data: &[u8]) {
    println!("{:#?}", data);
}

fn main() {
    let ptr = FatPtr::new(20);
    let data = unsafe { std::slice::from_raw_parts(ptr.as_ptr(), ptr.len()) };

    print(data);
}

I expected to see this happen: 42 printed 20 times.

Instead, this happened: 42 printed 20 times in debug build, but not in release. Instead, slice length appears to be corrupted, so println prints memory dump outside of allocated slice and eventually segfaults. However, if Drop implementation for FatPtr is removed, then slice is truncated to a single byte, which is still incorrect, but doesn't crash the process.

If print(data) call is replaced with println!("{:#?}", data); directly then everything works as intended even on release builds. If FatPtr code placed in the same crate as main then everything works as intended as well. Removing #[inline] from deref fixed the issue as well.

This reproduces on Linux and MacOS (haven't tried Windows).

Most helpful comment

I'm working on a patch for LLVM

Aaron1011 on 10 Sep 2020

❤7

All 17 comments

This passes under Miri, and it looks like the invariants for std::slice::from_raw_parts are being upheld.

Aaron1011 on 6 Sep 2020

❤1

-Z unpretty=mir: https://gist.github.com/jyn514/8cc91dae699c828e7e9164f9de03cd38

jyn514 on 6 Sep 2020

I can reproduce this at least as far back as nightly-2018-07-07 (although it's an infinite loop there, not a segfault). So I'd guess this isn't a regression and just never worked.

jyn514 on 6 Sep 2020

The slice length returned from FatPtr::deref is considered dead and eliminated
by Dead Argument Elimination, that doesn't seem right since there are two
calls to FatPtr::deref, one that uses returned ptr and the other one that uses
returned length.

Minimized LLVM IR

; ModuleID = 'c.ll'
source_filename = "b.7rcbfp3g-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@anon.7406e8c0c880c9bceef10e4e4055b7c3.1 = private unnamed_addr constant <{ [4 x i8] }> <{ [4 x i8] c"ab\0A\00" }>, align 1

; Function Attrs: inlinehint nonlazybind uwtable
define internal { [0 x i8]*, i64 } @from_raw_parts(i8* %data, i64 %len) unnamed_addr #0 {
start:
  %0 = bitcast i8* %data to [0 x i8]*
  %1 = insertvalue { [0 x i8]*, i64 } undef, [0 x i8]* %0, 0
  %2 = insertvalue { [0 x i8]*, i64 } %1, i64 %len, 1
  ret { [0 x i8]*, i64 } %2
}

; Function Attrs: inlinehint nonlazybind uwtable
define internal i64 @slice_len([0 x i8]* noalias nonnull readonly align 1 %data, i64 %len) unnamed_addr #0 {
start:
  ret i64 %len
}

; Function Attrs: inlinehint nonlazybind uwtable
define internal i8* @as_ptr([0 x i8]* noalias nonnull readonly align 1 %data, i64 %len) unnamed_addr #0 {
start:
  %0 = bitcast [0 x i8]* %data to i8*
  ret i8* %0
}

; Function Attrs: inlinehint nonlazybind uwtable
define internal { [0 x i8]*, i64 } @deref({ i8*, i64 }* noalias readonly align 8 dereferenceable(16) %self) unnamed_addr #0 {
start:
  %0 = getelementptr inbounds { i8*, i64 }, { i8*, i64 }* %self, i64 0, i32 0
  %_2 = load i8*, i8** %0, align 8
  %1 = getelementptr inbounds { i8*, i64 }, { i8*, i64 }* %self, i64 0, i32 1
  %_3 = load i64, i64* %1, align 8
  %2 = call { [0 x i8]*, i64 } @from_raw_parts(i8* %_2, i64 %_3)
  ret { [0 x i8]*, i64 } %2
}

; Function Attrs: noinline norecurse nounwind nonlazybind optnone readnone uwtable
define { i8*, i64 } @crate() unnamed_addr #2 {
  ret { i8*, i64 } { i8* getelementptr inbounds (<{ [4 x i8] }>, <{ [4 x i8] }>* @anon.7406e8c0c880c9bceef10e4e4055b7c3.1, i64 0, i32 0, i64 0), i64 3 }
}

; Function Attrs: nonlazybind
declare dso_local i64 @write(i32, i8* nocapture readonly, i64) local_unnamed_addr #3

; Function Attrs: nonlazybind
define i32 @main(i32 %argc, i8** %argv) unnamed_addr #3 {
start:
  %ptr = alloca { i8*, i64 }, align 8
  %0 = call { i8*, i64 } @crate()
  store { i8*, i64 } %0, { i8*, i64 }* %ptr, align 8
  %1 = call { [0 x i8]*, i64 } @deref({ i8*, i64 }* noalias readonly align 8 dereferenceable(16) %ptr)
  %_1.0 = extractvalue { [0 x i8]*, i64 } %1, 0
  %_1.1 = extractvalue { [0 x i8]*, i64 } %1, 1
  %2 = call i8* @as_ptr([0 x i8]* noalias nonnull readonly align 1 %_1.0, i64 %_1.1)
  %3 = call { [0 x i8]*, i64 } @deref({ i8*, i64 }* noalias readonly align 8 dereferenceable(16) %ptr)
  %_3.0 = extractvalue { [0 x i8]*, i64 } %3, 0
  %_3.1 = extractvalue { [0 x i8]*, i64 } %3, 1
  %4 = call i64 @slice_len([0 x i8]* noalias nonnull readonly align 1 %_3.0, i64 %_3.1)
  %5 = call { [0 x i8]*, i64 } @from_raw_parts(i8* %2, i64 %4)
  %_5.0 = extractvalue { [0 x i8]*, i64 } %5, 0
  %_5.1 = extractvalue { [0 x i8]*, i64 } %5, 1
  %6 = bitcast [0 x i8]* %_5.0 to i8*
  %7 = call i64 @write(i32 1, i8* nonnull %6, i64 %_5.1)
  ret i32 0
}

attributes #0 = { inlinehint nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #1 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #2 = { noinline norecurse nounwind nonlazybind optnone readnone uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #3 = { nonlazybind "target-cpu"="x86-64" }

!llvm.module.flags = !{!0, !1, !2}

!0 = !{i32 7, !"PIC Level", i32 2}
!1 = !{i32 7, !"PIE Level", i32 2}
!2 = !{i32 2, !"RtLibUseGOT", i32 1}

tmiasko on 6 Sep 2020

👍2

Assigning P-high as discussed as part of the Prioritization Working Group procedure.

jyn514 on 6 Sep 2020

Nominating this for a discussion in T-compiler. While not a "regression" this issue seems just as bad if not worse than some other soundness holes that we’ve had at P-critical. And it seems like it could be fairly easy to trigger in typical code (and also that there could be an entirely safe reproducer for this issue).

nagisa on 6 Sep 2020

👍4

This is caused by a bug in DeadArgumentElimination:

After inspecting all of the users of a function's return values, MarkValue is called for each return value: https://github.com/rust-lang/llvm-project/blob/833dd1e3d4fd350c7c9f6fb2ce0c5f16af7a1e21/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp#L610-L611

MarkValue either calls MarkLive (which propagates liveness dependencies using the global Uses map), or updates the Uses map with the dependencies that were computed earlier: https://github.com/rust-lang/llvm-project/blob/833dd1e3d4fd350c7c9f6fb2ce0c5f16af7a1e21/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp#L653-L666

However, the dependencies computed for higher-numbered return values can affect the MarkLive calls for lower-numbered dependencies. That is, we may call PropagateLiveness for a RetOrArg before we have fully populated Uses for that RetOrArg.

One solution is to iterate over the return values in two passes - we update the Uses map in the first pass, and call MarkLive in the second pass.

Aaron1011 on 7 Sep 2020

❤1

LLVM bug report https://bugs.llvm.org/show_bug.cgi?id=47444.

tmiasko on 7 Sep 2020

This was re-discussed as part of the Prioritization Working Group procedure and we decided to bump the priority to P-critical.