In the 2017-12-14 Nightly I saw a significant (almost 20%) compilation-time increase when using --emit=metadata on my code. The following day (2017-12-15) compilation got a bit slower still, and the run-time of the binary also became about 8% slower. On the following days the situation kept getting slightly worse. Extracting parts of my code to show compiler/code performance is not easy, so I've composed this benchmark function (it solves an Euler problem; if this code isn't enough I will write other similar benchmarks):
Source code
use std::mem::size_of;

macro_rules! isqrt { ($e:expr, $t:ident) => ((($e) as f64).sqrt() as $t) }

struct SieveBits { bits: Vec<usize> }

impl SieveBits {
    const BPC: usize = size_of::<usize>() * 8;

    fn is_bit_set(bits: &[usize], i: usize) -> bool {
        debug_assert!(i < bits.len() * Self::BPC);
        let offset = i / Self::BPC;
        let mask = 1 << (i % Self::BPC);
        unsafe { (*bits.get_unchecked(offset) & mask) != 0 }
    }

    fn reset_bit(bits: &mut [usize], i: usize) {
        debug_assert!(i < bits.len() * Self::BPC);
        let offset = i / Self::BPC;
        let mask = 1 << (i % Self::BPC);
        unsafe { *bits.get_unchecked_mut(offset) &= !mask; }
    }

    fn new(m: usize) -> Self {
        if m < 2 {
            return Self { bits: vec![] };
        }
        let mut primes = vec![std::usize::MAX; (m / 3) / Self::BPC + 1];
        Self::reset_bit(&mut primes, 0);
        let lim = isqrt!(m, usize) + 1;
        let mut i = 5;
        let mut step = 2;
        while i < lim {
            if Self::is_bit_set(&primes, i / 3) {
                let mut j = i * i;
                let mut step2 = step;
                while j < m {
                    Self::reset_bit(&mut primes, j / 3);
                    j += step2 * i;
                    step2 = 6 - step2;
                }
            }
            i += step;
            step = 6 - step;
        }
        Self { bits: primes }
    }

    fn is_prime(&self, i: usize) -> bool {
        if i == 2 || i == 3 { true }
        else if i % 2 == 0 || i % 3 == 0 { false }
        else { Self::is_bit_set(&self.bits, i / 3) }
    }
}
fn eratosthenes_sieve_u32(limit: usize) -> Vec<u32> {
    assert!(limit <= std::u32::MAX as usize);

    #[inline(always)]
    fn fill_u8(data: &mut [u8], value: u8) {
        unsafe {
            std::ptr::write_bytes(data.as_mut_ptr(), value, data.len());
        }
    }

    const L1D_CACHE_SIZE: u32 = 32_768;

    let mut result = vec![];
    if limit < 2 {
        return result;
    } else {
        result.push(2);
    }

    let lsqrt = isqrt!(limit, u32);
    let mut is_prime = vec![1u8; lsqrt as usize + 1];
    let mut i = 2;
    while i * i <= lsqrt {
        unsafe {
            if *is_prime.get_unchecked(i as usize) != 0 {
                let mut j = i * i;
                while j <= lsqrt {
                    *is_prime.get_unchecked_mut(j as usize) = 0;
                    j += i;
                }
            }
        }
        i += 1;
    }

    let segment_size = lsqrt.max(L1D_CACHE_SIZE);
    let mut s: usize = 3;
    let mut n: usize = 3;
    let mut sieve = vec![1u8; segment_size as usize]; // Vector used for sieving.
    let mut primes: Vec<u32> = vec![];
    let mut next: Vec<u32> = vec![];
    let mut low: usize = 0;

    while low <= limit {
        fill_u8(&mut sieve, 1);
        let high = (low + segment_size as usize - 1).min(limit);
        unsafe {
            while s * s <= high {
                if *is_prime.get_unchecked(s) != 0 {
                    primes.push(s as u32);
                    next.push((s * s - low) as u32);
                }
                s += 2;
            }
            for (i, &p) in primes.iter().enumerate() {
                let k = p * 2;
                let mut j = *next.get_unchecked(i);
                while j < segment_size {
                    *sieve.get_unchecked_mut(j as usize) = 0;
                    j += k;
                }
                *next.get_unchecked_mut(i) = j - segment_size;
            }
            while n <= high {
                if *sieve.get_unchecked(n - low) != 0 { // n is a prime.
                    result.push(n as u32);
                }
                n += 2;
            }
        }
        low += segment_size as usize;
    }
    result
}
fn main() {
    fn e249() -> u64 {
        const N: usize = 5_000;
        const M: u64 = 10_000_000_000_000_000_u64;
        // Entries at most double per prime processed, so reducing modulo M
        // every `check_every` primes prevents u64 overflow.
        let check_every = (((1u64 << 63) / M) as f64).log2().ceil() as usize;
        let primes = eratosthenes_sieve_u32(N);
        let mut ways = vec![0u64; primes.iter().sum::<u32>() as usize + 1];
        ways[0] = 1;
        let mut max: usize = 0;
        for (i, &p) in primes.iter().enumerate() {
            for j in (0 .. max + 1).rev() {
                unsafe {
                    *ways.get_unchecked_mut(j + p as usize) =
                        *ways.get_unchecked(j + p as usize) +
                        *ways.get_unchecked(j);
                }
            }
            if (i + 1) % check_every == 0 {
                for w in ways[.. max + 1].iter_mut() {
                    *w = *w % M;
                }
            }
            max += p as usize;
        }
        let sieve = SieveBits::new(max);
        let mut count = ways[2];
        let mut i = 3;
        while i <= max {
            if sieve.is_prime(i) {
                count = (count + ways[i]) % M;
            }
            i += 2;
        }
        count
    }
    assert_eq!(e249(), 9_275_262_564_250_418);
}
Compiled with:
rustc -C opt-level=3 -C target-cpu=native -C panic=abort test.rs
host: x86_64-pc-windows-gnu
Runtime:
f8af59d95 2017-12-13: 0.30 seconds
edbd7d232 2017-12-20: 0.53 seconds
Asm from the 2017-12-13 compiler:
https://gist.github.com/anonymous/5c475819a2b7d0e01c036582c76cbd19
Asm from the 2017-12-20 compiler:
https://gist.github.com/anonymous/bb984787c802da22b76c182d0f260872
Could someone try to reproduce and bisect this?
cc @kennytm I think
We shouldn't have such big asm differences between successive nightlies, and I don't see anything suspicious in the middle, so it would be good to try and nail down the commit that made the difference.
I see a regression with this code (note: this is distinct from #46863).
#![crate_type="rlib"]

pub fn f(l: &Vec<u64>) -> u64 {
    let mut sum = 0;
    for i in 0..l.len() {
        sum += l[i]
    }
    sum
}
@kennytm the bad code accesses the stack pointer (%rsp). You have a fast network connection and know how to use bisect-rust; could you please do so?
You should grep for %rsp to tell if the code is bad - the bad code includes the following line:
subq $24, %rsp
I bisected @leonardo-m's original example, just testing the speed of the executable created by a stage 1 rustc -C opt-level=3 -C target-cpu=native -C panic=abort test.rs, and came upon 8a26e0422d (PR #46623) as the culprit: the runtime on my machine is around 0.3 seconds before that commit and around 0.7 seconds after.
@arielb1's latest example also has a clear difference. The code before that commit (just disassembling the .rlib):
0000000000000000 <_ZN5test21f17h7c517ed04f8b4e1bE>:
0: 48 8b 57 10 mov rdx,QWORD PTR [rdi+0x10]
4: 48 85 d2 test rdx,rdx
7: 74 1b je 24 <_ZN5test21f17h7c517ed04f8b4e1bE+0x24>
9: 48 8b 0f mov rcx,QWORD PTR [rdi]
c: 31 c0 xor eax,eax
e: 31 f6 xor esi,esi
10: 48 39 f2 cmp rdx,rsi
13: 76 12 jbe 27 <_ZN5test21f17h7c517ed04f8b4e1bE+0x27>
15: 48 03 04 f1 add rax,QWORD PTR [rcx+rsi*8]
19: 48 83 c6 01 add rsi,0x1
1d: 48 39 d6 cmp rsi,rdx
20: 72 ee jb 10 <_ZN5test21f17h7c517ed04f8b4e1bE+0x10>
22: eb 02 jmp 26 <_ZN5test21f17h7c517ed04f8b4e1bE+0x26>
24: 31 c0 xor eax,eax
26: c3 ret
27: 50 push rax
28: 48 8d 3d 00 00 00 00 lea rdi,[rip+0x0] # 2f <_ZN5test21f17h7c517ed04f8b4e1bE+0x2f>
2f: e8 00 00 00 00 call 34 <_ZN5test21f17h7c517ed04f8b4e1bE+0x34>
34: 0f 0b ud2
And after:
0000000000000000 <_ZN5test21f17h7c517ed04f8b4e1bE>:
0: 48 83 ec 18 sub rsp,0x18
4: 4c 8b 07 mov r8,QWORD PTR [rdi]
7: 48 8b 57 10 mov rdx,QWORD PTR [rdi+0x10]
b: 31 f6 xor esi,esi
d: 31 c0 xor eax,eax
f: 48 39 d6 cmp rsi,rdx
12: 72 18 jb 2c <_ZN5test21f17h7c517ed04f8b4e1bE+0x2c>
14: eb 1f jmp 35 <_ZN5test21f17h7c517ed04f8b4e1bE+0x35>
16: 66 2e 0f 1f 84 00 00 nop WORD PTR cs:[rax+rax*1+0x0]
1d: 00 00 00
20: 49 03 04 f0 add rax,QWORD PTR [r8+rsi*8]
24: 48 89 fe mov rsi,rdi
27: 48 39 d6 cmp rsi,rdx
2a: 73 09 jae 35 <_ZN5test21f17h7c517ed04f8b4e1bE+0x35>
2c: 48 89 f7 mov rdi,rsi
2f: 48 83 c7 01 add rdi,0x1
33: 73 0b jae 40 <_ZN5test21f17h7c517ed04f8b4e1bE+0x40>
35: 48 89 f7 mov rdi,rsi
38: 31 c9 xor ecx,ecx
3a: 31 f6 xor esi,esi
3c: eb 10 jmp 4e <_ZN5test21f17h7c517ed04f8b4e1bE+0x4e>
3e: 66 90 xchg ax,ax
40: 48 c7 44 24 08 01 00 mov QWORD PTR [rsp+0x8],0x1
47: 00 00
49: b9 01 00 00 00 mov ecx,0x1
4e: 48 89 74 cc 08 mov QWORD PTR [rsp+rcx*8+0x8],rsi
53: 48 83 7c 24 08 01 cmp QWORD PTR [rsp+0x8],0x1
59: 75 18 jne 73 <_ZN5test21f17h7c517ed04f8b4e1bE+0x73>
5b: 48 8b 74 24 10 mov rsi,QWORD PTR [rsp+0x10]
60: 48 39 f2 cmp rdx,rsi
63: 77 bb ja 20 <_ZN5test21f17h7c517ed04f8b4e1bE+0x20>
65: 48 8d 3d 00 00 00 00 lea rdi,[rip+0x0] # 6c <_ZN5test21f17h7c517ed04f8b4e1bE+0x6c>
6c: e8 00 00 00 00 call 71 <_ZN5test21f17h7c517ed04f8b4e1bE+0x71>
71: 0f 0b ud2
73: 48 83 c4 18 add rsp,0x18
77: c3 ret
That's unfortunate. cc @eddyb
cc @bluss
Thanks @Deewiant
We're looking into it
@Deewiant > just testing the speed of the executable
I am not sure they share the same cause: first there was a significant compiler performance decrease using --emit=metadata, then in the next Nightly a decrease in the performance of the produced binary.
@leonardo-m
They should not. However, quite a few big commits landed in the Dec 14 nightly.
If you could provide an example that exhibits the slowdown with --emit=metadata, that would allow us to fix it (is the Dec 13 nightly faster than the Dec 14 nightly which has the same speed, but worse runtime performance, than the Dec 15 nightly?)
The last nightly (2017-12-21) has fixed all three problems.
Despite some attempts, I've failed to find a small piece of code that shows a significant compilation slowdown from 2017-12-14 to 2017-12-20; perhaps it's a more diffuse problem that emerges only when compiling my whole, large codebase. I am sorry.
(is the Dec 13 nightly faster than the Dec 14 nightly which has the same speed, but worse runtime performance, than the Dec 15 nightly?)
I don't fully understand your question, but I try to answer creating a table with some data for some of the last Nightlies:
Rustc version                --emit=metadata   Opt build   Binary size   Run-time
1.24.0-nightly 2017-12-11    4.32              49.78       1_345_024     12.93
1.24.0-nightly 2017-12-12    4.31              48.50       1_345_024     12.84
1.24.0-nightly 2017-12-13    5.15              50.69       1_344_512     12.86
1.24.0-nightly 2017-12-15    5.33              52.98       1_378_304     14.04
1.24.0-nightly 2017-12-17    5.44              53.31       1_378_304     14.03
1.24.0-nightly 2017-12-20    5.47              54.49       1_378_304     14.14
1.24.0-nightly 2017-12-21    4.28              48.82       1_344_512     12.96
The --emit=metadata column is the check time.
The Opt build column is the compilation time for a well-optimized build.
The binary size is of the stripped binary, in bytes.
All times are in seconds.
@leonardo-m Can you paste the exact output of rustc -V for the last two entries in your table? That way we can easily list all the commits in between (I don't know how else to do it and I want to be sure we don't mix up commit dates and nightly dates).
EDIT: @SimonSapin told me about these files, so the range is edbd7d232ee1272285be332b8a38eb47b4c8f5c6...7eb64b86ce44cc1828dd176a8b981e37ea08fc38.
EDIT2: at a quick glance, none of those changes should affect binary size or run time.
@leonardo-m How many times have you run each of these? How much do they vary?
They should be:
rustc 1.24.0-nightly (7eb64b86c 2017-12-20)
binary: rustc
commit-hash: 7eb64b86ce44cc1828dd176a8b981e37ea08fc38
commit-date: 2017-12-20
host: x86_64-pc-windows-gnu
release: 1.24.0-nightly
LLVM version: 4.0
rustc 1.24.0-nightly (250b49205 2017-12-21)
binary: rustc
commit-hash: 250b492052e94d76bd318a4b932159395da55981
commit-date: 2017-12-21
host: x86_64-pc-windows-gnu
release: 1.24.0-nightly
LLVM version: 4.0
Mixing up things seems quite easy here.
@leonardo-m Ah, that's why I asked, so the range is 7eb64b86ce44cc1828dd176a8b981e37ea08fc38...250b492052e94d76bd318a4b932159395da55981.
Still not seeing anything clearly relevant, but maybe someone else from @rust-lang/compiler does?
How many times have you run each of these? How much do they vary?
The --emit=metadata times are the minimum of four tries. Their variance is very small (on the order of ±0.05 seconds), so I think they are reliable.
The binary run-time is the minimum of 5 runs; their variance is higher, about ±0.12, perhaps because they do some I/O (despite using a SSD).
The optimized compilation is slow, so I've timed it only once; those values are the least reliable, but differences above 1.5 seconds should be meaningful.
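For what it's worth, here is a minimal sketch of the "minimum of N runs" measurement described above (an illustration of the method, not my actual harness; the function name is made up):

use std::time::{Duration, Instant};

/// Runs `f` `tries` times and returns the fastest wall-clock time;
/// taking the minimum filters out OS and cache noise.
fn min_runtime<F: FnMut()>(mut f: F, tries: usize) -> Duration {
    (0..tries)
        .map(|_| {
            let start = Instant::now();
            f();
            start.elapsed()
        })
        .min()
        .expect("tries must be > 0")
}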
@eddyb
For the compilation time issue, maybe there was a bug in NLL that made it active without it being enabled.
For the run-time issue, we need to actually investigate this.
@arielb1 As in, investigate the cause of the run-time fluctuations and how it unregressed?
This is my good old friend the sink torpedo.
LLVM optimizes <Range as Iterator>::next into this:
; Function Attrs: inlinehint nounwind uwtable
define internal fastcc void @"_ZN4core4iter5range93_$LT$impl$u20$core..iter..iterator..Iterator$u20$for$u20$core..ops..range..Range$LT$A$GT$$GT$4next17h9ca7473ffcbaaeeeE"(%"core::option::Option<usize>"* noalias nocapture sret dereferenceable(16), { i64, i64 }* nocapture dereferenceable(16) %self) unnamed_addr #3 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
  %1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %self, i64 0, i32 0
  %2 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %self, i64 0, i32 1
  %3 = load i64, i64* %1, align 8, !alias.scope !1, !noalias !4
  %4 = load i64, i64* %2, align 8, !alias.scope !4, !noalias !1
  %5 = icmp ult i64 %3, %4
  br i1 %5, label %bb3, label %bb12

bb3:                                              ; preds = %start
  %6 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %3, i64 1) #6
  %7 = extractvalue { i64, i1 } %6, 1
  br i1 %7, label %bb12, label %bb7

bb7:                                              ; preds = %bb3
  %8 = extractvalue { i64, i1 } %6, 0
  store i64 %8, i64* %1, align 1
  %9 = getelementptr inbounds %"core::option::Option<usize>", %"core::option::Option<usize>"* %0, i64 0, i32 0, i64 0
  store i64 1, i64* %9, align 8
  br label %bb12

bb12:                                             ; preds = %start, %bb7, %bb3
  %.sink16 = phi i64 [ 1, %bb7 ], [ 0, %bb3 ], [ 0, %start ]
  %.sink14 = phi i64 [ %3, %bb7 ], [ 0, %bb3 ], [ 0, %start ]
  ; WHAT?
  %10 = getelementptr inbounds %"core::option::Option<usize>", %"core::option::Option<usize>"* %0, i64 0, i32 0, i64 %.sink16
  store i64 %.sink14, i64* %10, align 8
  ret void
}
This is basically equivalent to this pseudo-Rust code
struct Range {
    start: u64,
    end: u64
}

fn next(ret: &out Option<u64>, self: &mut Range) {
    let Range { start, end } = *self;
    let (offset, value) = if start < end {
        if let Some(next) = start.checked_add(1) {
            self.start = next;
            ret.<discriminant> = Some;
            (1, start)
        } else {
            (0, 0)
        }
    } else {
        (0, 0)
    };
    // (sic) yes this writes a "value" to an "offset".
    (ret as *mut [u64; 2])[offset] = value; // WHAT?
}
After the "non-constant-address" write is generated, LLVM is sunk: it has no way of getting rid of the non-constant address, and no way of optimizing accesses to the target Option through that non-constant address.
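For context, before SimplifyCFG's sinking merges the two stores, <Range as Iterator>::next is roughly this ordinary Rust (a hedged sketch, not the exact libcore source; the checked_add is what produces the llvm.uadd.with.overflow call above):

fn next(range: &mut std::ops::Range<u64>) -> Option<u64> {
    if range.start < range.end {
        // Overflow of the increment yields None, matching the IR's
        // branch from bb3 to bb12 with a zero discriminant.
        let next = range.start.checked_add(1)?;
        let current = range.start;
        range.start = next;
        Some(current)
    } else {
        None
    }
}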
Upstream LLVM (that's the yet-to-be-released LLVM 6) has this patch, which avoids the SimplifyCFG sink problem by not running sinking before inlining:
https://github.com/llvm-mirror/llvm/commit/7034870f30320d6fbc74effff539d946018cd00a
I have confirmed that; e.g. this code is optimized correctly with the new opt:
define void @slowness_base_1([3 x i64]* noalias nocapture dereferenceable(24), i1) {
start:
  br i1 %1, label %if_true, label %if_false

if_true:
  %x = getelementptr inbounds [3 x i64], [3 x i64]* %0, i32 0, i32 1
  store i64 1, i64* %x
  br label %ret

if_false:
  %y = getelementptr inbounds [3 x i64], [3 x i64]* %0, i32 0, i32 2
  store i64 1, i64* %y
  br label %ret

ret:
  ret void
}

define i64 @slowness_base_2([3 x i64]* noalias nocapture dereferenceable(24), i1) {
start:
  %allok = alloca [3 x i64]
  %allok_px = getelementptr inbounds [3 x i64], [3 x i64]* %allok, i32 0, i32 2
  store i64 2, i64* %allok_px
  call void @slowness_base_1([3 x i64]* %allok, i1 %1)
  %allok_p = getelementptr inbounds [3 x i64], [3 x i64]* %allok, i32 0, i32 2
  %allok_val = load i64, i64* %allok_p
  ret i64 %allok_val
}
Until then, I think representing small enums using a struct (rather than an array) should work around this problem.
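To make that workaround concrete, here is a hedged pseudo-Rust sketch (in the spirit of the pseudo-Rust above, not rustc's actual lowering) of the two ways a small enum slot like Option<u64> can be modeled:

// Array-like model: the discriminant and the payload are interchangeable
// u64 slots of one array, so SimplifyCFG can merge the stores from the
// two branches into a single store at a non-constant index.
struct OptionAsArray {
    slots: [u64; 2], // slots[0]: discriminant, slots[1]: payload
}

// Struct-like model: each field has a distinct, fixed offset, so every
// branch keeps its own constant-address store and later passes can still
// reason about which field holds what.
struct OptionAsStruct {
    discriminant: u64,
    payload: u64,
}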
With the 2017-12-22 Nightly all the problems are back:
Rustc version                --emit=metadata   Opt build   Binary size   Run-time
1.24.0-nightly 2017-12-22    5.36              53.57       1_381_376     13.96
@eddyb
Could you open a PR that turns small unions into LLVM structs (rather than arrays)? That should fix the problem.
The binaries generated by the last Nightly (1.24.0-nightly 4a7c072fa 2017-12-25) are a further few percent slower (perhaps because of ThinLTO?); the compilation times can't be compared now.
@leonardo-m Is there a more stable measure besides compile time / run time? They can be affected too easily by the environment, especially when the run time is expected to be less than 1 second. The generated assembly size seems to be a good measure.
Please re-test next nightly and reopen if not fixed.
The run-time of the binary is now OK. The compilation time with --emit=metadata is still quite large.
@leonardo-m Can you open a separate one for compilation times? Seems to be a separate issue.
@eddyb , I've tried here: https://github.com/rust-lang/rust/issues/47267