Rust: Large performance drop in compiled binary in stable rust 1.45.2 vs 1.44.0 on x86_64 linux

Created on 2 Sep 2020  路  13Comments  路  Source: rust-lang/rust

I have experienced more than 300% longer execution time in specific functions that use loops along with indexing into slices. After several hours of work with a profiler, I was able to isolate the problem from a 60K lines codebase into the following short program

use std::cmp;

#[inline(never)]
pub fn cmp_gt_and(in1: &[i16], in2: &[i16], destination: &mut [bool]) {
    let max = cmp::min(cmp::min(in1.len(), in2.len()), destination.len());

    let src1 = &in1[0..max];
    let src2 = &in2[0..max];
    let dst = &mut destination[0..max];

    for index in 0..max {
        dst[index] &= src1[index] < src2[index];
    }
}

fn main() {
    let len = 100;
    let a: Vec<i16> = (1..len).collect();
    let b: Vec<i16> = (1..len).map(|x| len - x).collect();
    let mut result = vec![false; len as usize];

    for _ in 0..100*1000*1000 {
        cmp_gt_and(&a, &b, &mut result);
    }

    let sum: i32 = b.into_iter().map(|x| x as i32).sum();
    std::process::exit(sum);
}

Code is also available in the following repository

With rust 1.44.0, I observe excecution time around 1.7 sec

$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.44.0 (49cae5576 2020-06-01)
binary: rustc
commit-hash: 49cae55760da0a43428eba73abcb659bb70cf2e4
commit-date: 2020-06-01
host: x86_64-unknown-linux-gnu
release: 1.44.0
LLVM version: 9.0
    Finished release [optimized] target(s) in 0.04s

real    0m1.681s
user    0m1.676s
sys     0m0.004s

Rust versions 1.45.2 and current stable 1.46.0 produce binaries that run more than 6.0 seconds with the same source code

$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.45.2 (d3fb005a3 2020-07-31)
binary: rustc
commit-hash: d3fb005a39e62501b8b0b356166e515ae24e2e54
commit-date: 2020-07-31
host: x86_64-unknown-linux-gnu
release: 1.45.2
LLVM version: 10.0
    Finished release [optimized] target(s) in 0.05s

real    0m6.643s
user    0m6.643s
sys     0m0.000s
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.46.0 (04488afe3 2020-08-24)
binary: rustc
commit-hash: 04488afe34512aa4c33566eb16d8c912a3ae04f9
commit-date: 2020-08-24
host: x86_64-unknown-linux-gnu
release: 1.46.0
LLVM version: 10.0
    Finished release [optimized] target(s) in 0.00s

real    0m6.642s
user    0m6.606s
sys     0m0.012s

I use several more functions like cmp_gt_and in a core of image processing software that also show similar performance drop.

Has anything significantly changed between rustc 1.44 and 1.45 that may have impacted the code so significantly? Maybe LLVM 10 has a different behavior? Any thoughts how to modify the code to gain the performance back with the current compiler or other things to try in order to clarify the problem? For some time, I can stick with 1.44 to keep the performance.

Function cmp_gt_and also appears to have much shorter assembly code with rustc 1.44 than with its successors, not sure if that is the reason for the performnace drop, though:

Rustc 1.44.0

_ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE:
    .cfi_startproc
    push    rax
    .cfi_def_cfa_offset 16
    mov r10, rdi
    cmp rsi, rcx
    mov rdi, rsi
    cmova   rdi, rcx
    cmp rdi, r9
    cmova   rdi, r9
    cmp rdi, rsi
    ja  .LBB8_10
    cmp rdi, rcx
    ja  .LBB8_11
    test    rdi, rdi
    je  .LBB8_9
    cmp rdi, 15
    ja  .LBB8_5
    xor ecx, ecx
    jmp .LBB8_8
.LBB8_5:
    mov rcx, rdi
    and rcx, -16
    xor esi, esi
    pxor    xmm0, xmm0
    .p2align    4, 0x90
.LBB8_6:
    movdqu  xmm1, xmmword ptr [r10 + 2*rsi]
    movdqu  xmm2, xmmword ptr [r10 + 2*rsi + 16]
    movdqu  xmm3, xmmword ptr [rdx + 2*rsi]
    pcmpgtw xmm3, xmm1
    movdqu  xmm1, xmmword ptr [rdx + 2*rsi + 16]
    pcmpgtw xmm1, xmm2
    movq    xmm2, qword ptr [r8 + rsi]
    punpcklbw   xmm2, xmm0
    movq    xmm4, qword ptr [r8 + rsi + 8]
    punpcklbw   xmm4, xmm0
    pcmpeqw xmm2, xmm0
    pandn   xmm2, xmm3
    pcmpeqw xmm4, xmm0
    pandn   xmm4, xmm1
    psrlw   xmm2, 15
    packuswb    xmm2, xmm0
    psrlw   xmm4, 15
    packuswb    xmm4, xmm0
    movq    qword ptr [r8 + rsi], xmm2
    movq    qword ptr [r8 + rsi + 8], xmm4
    add rsi, 16
    cmp rcx, rsi
    jne .LBB8_6
    cmp rdi, rcx
    je  .LBB8_9
    .p2align    4, 0x90
.LBB8_8:
    movzx   esi, word ptr [r10 + 2*rcx]
    cmp si, word ptr [rdx + 2*rcx]
    setl    sil
    cmp byte ptr [r8 + rcx], 0
    setne   al
    and al, sil
    mov byte ptr [r8 + rcx], al
    add rcx, 1
    cmp rcx, rdi
    jb  .LBB8_8
.LBB8_9:
    pop rax
    .cfi_def_cfa_offset 8
    ret
.LBB8_10:
    .cfi_def_cfa_offset 16
    lea rdx, [rip + .L__unnamed_2]
    call    qword ptr [rip + _ZN4core5slice20slice_index_len_fail17he661f5dd1689ef3bE@GOTPCREL]
    ud2
.LBB8_11:
    lea rdx, [rip + .L__unnamed_3]
    mov rsi, rcx
    call    qword ptr [rip + _ZN4core5slice20slice_index_len_fail17he661f5dd1689ef3bE@GOTPCREL]
    ud2
.Lfunc_end8:
    .size   _ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE, .Lfunc_end8-_ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE
    .cfi_endproc

    .section    .rodata.cst16,"aM",@progbits,16
    .p2align    4

Rustc 1.45.2

_ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE:
    .cfi_startproc
    push    rbx
    .cfi_def_cfa_offset 16
    sub rsp, 32
    .cfi_def_cfa_offset 48
    .cfi_offset rbx, -16
    mov r10, rdi
    cmp rsi, rcx
    mov rdi, rsi
    cmova   rdi, rcx
    cmp rdi, r9
    cmova   rdi, r9
    cmp rdi, rsi
    ja  .LBB8_10
    cmp rdi, rcx
    ja  .LBB8_11
    test    rdi, rdi
    je  .LBB8_9
    cmp rdi, 15
    ja  .LBB8_5
    xor esi, esi
    jmp .LBB8_8
.LBB8_5:
    mov rsi, rdi
    and rsi, -16
    xor r11d, r11d
    pxor    xmm0, xmm0
    pcmpeqd xmm1, xmm1
    .p2align    4, 0x90
.LBB8_6:
    movdqu  xmm2, xmmword ptr [r10 + 2*r11]
    movdqu  xmm3, xmmword ptr [r10 + 2*r11 + 16]
    movdqu  xmm4, xmmword ptr [rdx + 2*r11]
    pcmpgtw xmm4, xmm2
    movdqu  xmm2, xmmword ptr [rdx + 2*r11 + 16]
    pcmpgtw xmm2, xmm3
    movq    xmm5, qword ptr [r8 + r11]
    movq    xmm3, qword ptr [r8 + r11 + 8]
    pcmpeqb xmm5, xmm0
    pxor    xmm5, xmm1
    punpcklbw   xmm5, xmm0
    pand    xmm5, xmm4
    pcmpeqb xmm3, xmm0
    pxor    xmm3, xmm1
    punpcklbw   xmm3, xmm0
    pand    xmm3, xmm2
    movdqa  xmmword ptr [rsp], xmm5
    movzx   eax, byte ptr [rsp + 4]
    and al, 1
    movzx   r9d, al
    movzx   eax, byte ptr [rsp + 6]
    and al, 1
    movzx   eax, al
    shl eax, 8
    or  eax, r9d
    movzx   ecx, byte ptr [rsp]
    movzx   r9d, byte ptr [rsp + 2]
    and cl, 1
    movzx   ebx, cl
    and r9b, 1
    movzx   ecx, r9b
    shl ecx, 8
    or  ecx, ebx
    movd    xmm2, ecx
    pinsrw  xmm2, eax, 1
    movzx   eax, byte ptr [rsp + 8]
    and al, 1
    movzx   eax, al
    movzx   ecx, byte ptr [rsp + 10]
    and cl, 1
    movzx   ecx, cl
    shl ecx, 8
    or  ecx, eax
    pinsrw  xmm2, ecx, 2
    movzx   eax, byte ptr [rsp + 12]
    and al, 1
    movzx   eax, al
    movzx   ecx, byte ptr [rsp + 14]
    and cl, 1
    movzx   ecx, cl
    shl ecx, 8
    or  ecx, eax
    pinsrw  xmm2, ecx, 3
    movdqa  xmmword ptr [rsp + 16], xmm3
    movzx   eax, byte ptr [rsp + 20]
    and al, 1
    movzx   eax, al
    movzx   ecx, byte ptr [rsp + 22]
    and cl, 1
    movzx   ecx, cl
    shl ecx, 8
    or  ecx, eax
    movzx   eax, byte ptr [rsp + 16]
    movzx   ebx, byte ptr [rsp + 18]
    and al, 1
    movzx   eax, al
    and bl, 1
    movzx   ebx, bl
    shl ebx, 8
    or  ebx, eax
    movd    xmm3, ebx
    pinsrw  xmm3, ecx, 1
    movzx   eax, byte ptr [rsp + 24]
    and al, 1
    movzx   eax, al
    movzx   ecx, byte ptr [rsp + 26]
    and cl, 1
    movzx   ecx, cl
    shl ecx, 8
    or  ecx, eax
    pinsrw  xmm3, ecx, 2
    movzx   eax, byte ptr [rsp + 28]
    and al, 1
    movzx   eax, al
    movzx   ecx, byte ptr [rsp + 30]
    and cl, 1
    movzx   ecx, cl
    shl ecx, 8
    or  ecx, eax
    pinsrw  xmm3, ecx, 3
    movq    qword ptr [r8 + r11], xmm2
    movq    qword ptr [r8 + r11 + 8], xmm3
    add r11, 16
    cmp rsi, r11
    jne .LBB8_6
    cmp rdi, rsi
    je  .LBB8_9
    .p2align    4, 0x90
.LBB8_8:
    movzx   eax, word ptr [r10 + 2*rsi]
    cmp ax, word ptr [rdx + 2*rsi]
    setl    al
    cmp byte ptr [r8 + rsi], 0
    setne   cl
    and cl, al
    mov byte ptr [r8 + rsi], cl
    add rsi, 1
    cmp rsi, rdi
    jb  .LBB8_8
.LBB8_9:
    add rsp, 32
    .cfi_def_cfa_offset 16
    pop rbx
    .cfi_def_cfa_offset 8
    ret
.LBB8_10:
    .cfi_def_cfa_offset 48
    lea rdx, [rip + .L__unnamed_2]
    call    qword ptr [rip + _ZN4core5slice20slice_index_len_fail17h9254c9506d16ff21E@GOTPCREL]
    ud2
.LBB8_11:
    lea rdx, [rip + .L__unnamed_3]
    mov rsi, rcx
    call    qword ptr [rip + _ZN4core5slice20slice_index_len_fail17h9254c9506d16ff21E@GOTPCREL]
    ud2
.Lfunc_end8:
    .size   _ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE, .Lfunc_end8-_ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE
    .cfi_endproc

    .section    .rodata.cst16,"aM",@progbits,16
    .p2align    4
A-LLVM C-bug I-slow ICEBreaker-Cleanup-Crew ICEBreaker-LLVM P-high T-compiler regression-from-stable-to-stable

Most helpful comment

Changing the code a bit:

-        dst[index] &= src1[index] < src2[index];
+        dst[index] = dst[index] as u8 & (src1[index] < src2[index]) as u8 != 0;

makes the compiler generate an even better code on both 1.44.0 and 1.45.2:

.LBB0_6:
        movq    xmm1, qword ptr [r8 + rsi]
        movq    xmm2, qword ptr [r8 + rsi + 8]
        movdqu  xmm3, xmmword ptr [r10 + 2*rsi]
        movdqu  xmm4, xmmword ptr [r10 + 2*rsi + 16]
        movdqu  xmm5, xmmword ptr [rdx + 2*rsi]
        pcmpgtw xmm5, xmm3
        packsswb        xmm5, xmm5
        movdqu  xmm3, xmmword ptr [rdx + 2*rsi + 16]
        pcmpgtw xmm3, xmm4
        packsswb        xmm3, xmm3
        pand    xmm1, xmm0
        pand    xmm1, xmm5
        pand    xmm2, xmm0
        pand    xmm2, xmm3
        movq    qword ptr [r8 + rsi], xmm1
        movq    qword ptr [r8 + rsi + 8], xmm2
        add     rsi, 16
        cmp     rcx, rsi
        jne     .LBB0_6

It suggests LLVM is struggling with the conversion between boolean vectors and integer vectors.

All 13 comments

It would be interesting to see if this is fixed on nightly, since that has the LLVM 11 bump.

No significant change with nightly and LLVM 11

$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.48.0-nightly (d006f5734 2020-08-28)
binary: rustc
commit-hash: d006f5734f49625c34d6fc33bf6b9967243abca8
commit-date: 2020-08-28
host: x86_64-unknown-linux-gnu
release: 1.48.0-nightly
LLVM version: 11.0
   Compiling rust-perf-demo v0.1.0 (/home/chvatal/prog/rust-perf-demo)
    Finished release [optimized] target(s) in 0.81s

real    0m6.654s
user    0m6.653s
sys     0m0.000s

@rustbot ping cleanup
We have a good MCVE, probably could use a bisection to isolate the culprit

Hey Cleanup Crew ICE-breakers! This bug has been identified as a good
"Cleanup ICE-breaking candidate". In case it's useful, here are some
[instructions] for tackling these sorts of bugs. Maybe take a look?
Thanks! <3

cc @AminArria @camelid @chrissimpkins @contrun @DutchGhost @elshize @ethanboxx @h-michael @HallerPatrick @hdhoang @hellow554 @imtsuki @kanru @KarlK90 @LeSeulArtichaut @MAdrianMattocks @matheus-consoli @mental32 @nmccarty @Noah-Kennedy @pard68 @PeytonT @pierreN @Redblueflame @RobbieClarken @RobertoSnap @robjtede @SarthakSingh31 @senden9 @shekohex @sinato @spastorino @turboladen @woshilapin @yerke

This seems to have happened in the LLVM 10 upgrade.

@rustbot ping llvm

Hey LLVM ICE-breakers! This bug has been identified as a good
"LLVM ICE-breaking candidate". In case it's useful, here are some
[instructions] for tackling these sorts of bugs. Maybe take a look?
Thanks! <3

cc @camelid @comex @cuviper @DutchGhost @hdhoang @heyrutvik @higuoxing @JOE1994 @jryans @mmilenko @nagisa @nikic @Noah-Kennedy @SiavoshZarrasvand @spastorino @vertexclique

I have created a script that returns nonzero exit code in case that bianry run time is larger than 3.0 seconds (reasonable threshold on my system) that can be used as script with cargo bisect-rustc

#!/bin/bash
rustc --version --verbose
cargo build --release
TIME=`{ time -p cargo run --release; } |& grep real | awk '{print $2}'`;
MAX_TIME=3.0
echo "${TIME}"

if (( $(echo "$TIME > $MAX_TIME" |bc -l) )); then
    echo "BAD: Slow"
    exit 1
else
    echo "GOOD: Fast"
fi

My particular result so far is

searched toolchains nightly-2020-05-20 through nightly-2020-07-02


********************************************************************************
Regression in nightly-2020-05-22
********************************************************************************

Waiting for the rest to complete...

found 7 bors merge commits in the specified range
commit[0] 2020-05-20UTC: Auto merge of #72378 - Dylan-DPC:rollup-m87bp2d, r=Dylan-DPC
commit[1] 2020-05-20UTC: Auto merge of #67759 - nikic:llvm-10, r=Mark-Simulacrum
commit[2] 2020-05-21UTC: Auto merge of #70705 - lcnr:generic_discriminant, r=nikomatsakis
commit[3] 2020-05-21UTC: Auto merge of #72205 - ecstatic-morse:nrvo, r=oli-obk
commit[4] 2020-05-21UTC: Auto merge of #72422 - RalfJung:rollup-u81z4mw, r=RalfJung
commit[5] 2020-05-21UTC: Auto merge of #71718 - NeoRaider:ffi_const_pure, r=Amanieu
commit[6] 2020-05-21UTC: Auto merge of #71930 - Nadrieril:exhaustiveness-remove-tyerr, r=varkor
validated commits found, specifying toolchains

installing 0aa6751c19d3ba80df5b0b02c00bf44e13c97e80
std for x86_64-unknown-linux-gnu: 15.79 MB / 15.79 MB [===============================================================================] 100.00 % 269.41 KB/s testing...
RESULT: 0aa6751c19d3ba80df5b0b02c00bf44e13c97e80, ===> No
uninstalling 0aa6751c19d3ba80df5b0b02c00bf44e13c97e80

installing 9310e3bd4f425f84fc27878ebf2bda1f30935a63
std for x86_64-unknown-linux-gnu: 15.80 MB / 15.80 MB [===============================================================================] 100.00 % 646.62 KB/s testing...
RESULT: 9310e3bd4f425f84fc27878ebf2bda1f30935a63, ===> Yes
uninstalling 9310e3bd4f425f84fc27878ebf2bda1f30935a63

installing 7f79e98c0356642db62e5113327e436c951e843d
std for x86_64-unknown-linux-gnu: 15.76 MB / 15.76 MB [===============================================================================] 100.00 % 297.44 KB/s testing...
RESULT: 7f79e98c0356642db62e5113327e436c951e843d, ===> Yes
uninstalling 7f79e98c0356642db62e5113327e436c951e843d

installing 82911b3bba76e73afe2881b732fe6b0edb35d5d3
std for x86_64-unknown-linux-gnu: 15.79 MB / 15.79 MB [===============================================================================] 100.00 % 221.07 KB/s testing...
RESULT: 82911b3bba76e73afe2881b732fe6b0edb35d5d3, ===> Yes
uninstalling 82911b3bba76e73afe2881b732fe6b0edb35d5d3

searched toolchains 0aa6751c19d3ba80df5b0b02c00bf44e13c97e80 through 9310e3bd4f425f84fc27878ebf2bda1f30935a63


Regression in 82911b3bba76e73afe2881b732fe6b0edb35d5d3


searched nightlies: from nightly-2020-05-20 to nightly-2020-07-02
regressed nightly: nightly-2020-05-22
searched commits: from https://github.com/rust-lang/rust/commit/0aa6751c19d3ba80df5b0b02c00bf44e13c97e80 to https://github.com/rust-lang/rust/commit/9310e3bd4f425f84fc27878ebf2bda1f30935a63
regressed commit: https://github.com/rust-lang/rust/commit/82911b3bba76e73afe2881b732fe6b0edb35d5d3

Changing the code a bit:

-        dst[index] &= src1[index] < src2[index];
+        dst[index] = dst[index] as u8 & (src1[index] < src2[index]) as u8 != 0;

makes the compiler generate an even better code on both 1.44.0 and 1.45.2:

.LBB0_6:
        movq    xmm1, qword ptr [r8 + rsi]
        movq    xmm2, qword ptr [r8 + rsi + 8]
        movdqu  xmm3, xmmword ptr [r10 + 2*rsi]
        movdqu  xmm4, xmmword ptr [r10 + 2*rsi + 16]
        movdqu  xmm5, xmmword ptr [rdx + 2*rsi]
        pcmpgtw xmm5, xmm3
        packsswb        xmm5, xmm5
        movdqu  xmm3, xmmword ptr [rdx + 2*rsi + 16]
        pcmpgtw xmm3, xmm4
        packsswb        xmm3, xmm3
        pand    xmm1, xmm0
        pand    xmm1, xmm5
        pand    xmm2, xmm0
        pand    xmm2, xmm3
        movq    qword ptr [r8 + rsi], xmm1
        movq    qword ptr [r8 + rsi + 8], xmm2
        add     rsi, 16
        cmp     rcx, rsi
        jne     .LBB0_6

It suggests LLVM is struggling with the conversion between boolean vectors and integer vectors.

@viktorchvatal @pcpthm do you know if an issue about this exist on LLVM? can you file one there maybe?.

@viktorchvatal @pcpthm do you know if an issue about this exist on LLVM? can you file one there maybe?.

Well, I do not feel to have the knowledge to file a LLVM bug.. yet.. What is the recommended way of doing so, anyway? Take the LLVM IR from the rust 1.44 and find a performance regression in the LLVM itself using just the IR? Also note that LLVM IR generated by rustc 1.44 and 1.45.2 slightly vary

Was this page helpful?
0 / 5 - 0 ratings