I have experienced more than 300% longer execution time in specific functions that use loops along with indexing into slices. After several hours of work with a profiler, I was able to isolate the problem from a 60K lines codebase into the following short program
use std::cmp;
#[inline(never)]
pub fn cmp_gt_and(in1: &[i16], in2: &[i16], destination: &mut [bool]) {
let max = cmp::min(cmp::min(in1.len(), in2.len()), destination.len());
let src1 = &in1[0..max];
let src2 = &in2[0..max];
let dst = &mut destination[0..max];
for index in 0..max {
dst[index] &= src1[index] < src2[index];
}
}
fn main() {
let len = 100;
let a: Vec<i16> = (1..len).collect();
let b: Vec<i16> = (1..len).map(|x| len - x).collect();
let mut result = vec![false; len as usize];
for _ in 0..100*1000*1000 {
cmp_gt_and(&a, &b, &mut result);
}
let sum: i32 = b.into_iter().map(|x| x as i32).sum();
std::process::exit(sum);
}
Code is also available in the following repository
With rust 1.44.0, I observe excecution time around 1.7 sec
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.44.0 (49cae5576 2020-06-01)
binary: rustc
commit-hash: 49cae55760da0a43428eba73abcb659bb70cf2e4
commit-date: 2020-06-01
host: x86_64-unknown-linux-gnu
release: 1.44.0
LLVM version: 9.0
Finished release [optimized] target(s) in 0.04s
real 0m1.681s
user 0m1.676s
sys 0m0.004s
Rust versions 1.45.2 and current stable 1.46.0 produce binaries that run more than 6.0 seconds with the same source code
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.45.2 (d3fb005a3 2020-07-31)
binary: rustc
commit-hash: d3fb005a39e62501b8b0b356166e515ae24e2e54
commit-date: 2020-07-31
host: x86_64-unknown-linux-gnu
release: 1.45.2
LLVM version: 10.0
Finished release [optimized] target(s) in 0.05s
real 0m6.643s
user 0m6.643s
sys 0m0.000s
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.46.0 (04488afe3 2020-08-24)
binary: rustc
commit-hash: 04488afe34512aa4c33566eb16d8c912a3ae04f9
commit-date: 2020-08-24
host: x86_64-unknown-linux-gnu
release: 1.46.0
LLVM version: 10.0
Finished release [optimized] target(s) in 0.00s
real 0m6.642s
user 0m6.606s
sys 0m0.012s
I use several more functions like cmp_gt_and
in a core of image processing software that also show similar performance drop.
Has anything significantly changed between rustc 1.44 and 1.45 that may have impacted the code so significantly? Maybe LLVM 10 has a different behavior? Any thoughts how to modify the code to gain the performance back with the current compiler or other things to try in order to clarify the problem? For some time, I can stick with 1.44 to keep the performance.
Function cmp_gt_and
also appears to have much shorter assembly code with rustc 1.44 than with its successors, not sure if that is the reason for the performnace drop, though:
Rustc 1.44.0
_ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE:
.cfi_startproc
push rax
.cfi_def_cfa_offset 16
mov r10, rdi
cmp rsi, rcx
mov rdi, rsi
cmova rdi, rcx
cmp rdi, r9
cmova rdi, r9
cmp rdi, rsi
ja .LBB8_10
cmp rdi, rcx
ja .LBB8_11
test rdi, rdi
je .LBB8_9
cmp rdi, 15
ja .LBB8_5
xor ecx, ecx
jmp .LBB8_8
.LBB8_5:
mov rcx, rdi
and rcx, -16
xor esi, esi
pxor xmm0, xmm0
.p2align 4, 0x90
.LBB8_6:
movdqu xmm1, xmmword ptr [r10 + 2*rsi]
movdqu xmm2, xmmword ptr [r10 + 2*rsi + 16]
movdqu xmm3, xmmword ptr [rdx + 2*rsi]
pcmpgtw xmm3, xmm1
movdqu xmm1, xmmword ptr [rdx + 2*rsi + 16]
pcmpgtw xmm1, xmm2
movq xmm2, qword ptr [r8 + rsi]
punpcklbw xmm2, xmm0
movq xmm4, qword ptr [r8 + rsi + 8]
punpcklbw xmm4, xmm0
pcmpeqw xmm2, xmm0
pandn xmm2, xmm3
pcmpeqw xmm4, xmm0
pandn xmm4, xmm1
psrlw xmm2, 15
packuswb xmm2, xmm0
psrlw xmm4, 15
packuswb xmm4, xmm0
movq qword ptr [r8 + rsi], xmm2
movq qword ptr [r8 + rsi + 8], xmm4
add rsi, 16
cmp rcx, rsi
jne .LBB8_6
cmp rdi, rcx
je .LBB8_9
.p2align 4, 0x90
.LBB8_8:
movzx esi, word ptr [r10 + 2*rcx]
cmp si, word ptr [rdx + 2*rcx]
setl sil
cmp byte ptr [r8 + rcx], 0
setne al
and al, sil
mov byte ptr [r8 + rcx], al
add rcx, 1
cmp rcx, rdi
jb .LBB8_8
.LBB8_9:
pop rax
.cfi_def_cfa_offset 8
ret
.LBB8_10:
.cfi_def_cfa_offset 16
lea rdx, [rip + .L__unnamed_2]
call qword ptr [rip + _ZN4core5slice20slice_index_len_fail17he661f5dd1689ef3bE@GOTPCREL]
ud2
.LBB8_11:
lea rdx, [rip + .L__unnamed_3]
mov rsi, rcx
call qword ptr [rip + _ZN4core5slice20slice_index_len_fail17he661f5dd1689ef3bE@GOTPCREL]
ud2
.Lfunc_end8:
.size _ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE, .Lfunc_end8-_ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE
.cfi_endproc
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
Rustc 1.45.2
_ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
sub rsp, 32
.cfi_def_cfa_offset 48
.cfi_offset rbx, -16
mov r10, rdi
cmp rsi, rcx
mov rdi, rsi
cmova rdi, rcx
cmp rdi, r9
cmova rdi, r9
cmp rdi, rsi
ja .LBB8_10
cmp rdi, rcx
ja .LBB8_11
test rdi, rdi
je .LBB8_9
cmp rdi, 15
ja .LBB8_5
xor esi, esi
jmp .LBB8_8
.LBB8_5:
mov rsi, rdi
and rsi, -16
xor r11d, r11d
pxor xmm0, xmm0
pcmpeqd xmm1, xmm1
.p2align 4, 0x90
.LBB8_6:
movdqu xmm2, xmmword ptr [r10 + 2*r11]
movdqu xmm3, xmmword ptr [r10 + 2*r11 + 16]
movdqu xmm4, xmmword ptr [rdx + 2*r11]
pcmpgtw xmm4, xmm2
movdqu xmm2, xmmword ptr [rdx + 2*r11 + 16]
pcmpgtw xmm2, xmm3
movq xmm5, qword ptr [r8 + r11]
movq xmm3, qword ptr [r8 + r11 + 8]
pcmpeqb xmm5, xmm0
pxor xmm5, xmm1
punpcklbw xmm5, xmm0
pand xmm5, xmm4
pcmpeqb xmm3, xmm0
pxor xmm3, xmm1
punpcklbw xmm3, xmm0
pand xmm3, xmm2
movdqa xmmword ptr [rsp], xmm5
movzx eax, byte ptr [rsp + 4]
and al, 1
movzx r9d, al
movzx eax, byte ptr [rsp + 6]
and al, 1
movzx eax, al
shl eax, 8
or eax, r9d
movzx ecx, byte ptr [rsp]
movzx r9d, byte ptr [rsp + 2]
and cl, 1
movzx ebx, cl
and r9b, 1
movzx ecx, r9b
shl ecx, 8
or ecx, ebx
movd xmm2, ecx
pinsrw xmm2, eax, 1
movzx eax, byte ptr [rsp + 8]
and al, 1
movzx eax, al
movzx ecx, byte ptr [rsp + 10]
and cl, 1
movzx ecx, cl
shl ecx, 8
or ecx, eax
pinsrw xmm2, ecx, 2
movzx eax, byte ptr [rsp + 12]
and al, 1
movzx eax, al
movzx ecx, byte ptr [rsp + 14]
and cl, 1
movzx ecx, cl
shl ecx, 8
or ecx, eax
pinsrw xmm2, ecx, 3
movdqa xmmword ptr [rsp + 16], xmm3
movzx eax, byte ptr [rsp + 20]
and al, 1
movzx eax, al
movzx ecx, byte ptr [rsp + 22]
and cl, 1
movzx ecx, cl
shl ecx, 8
or ecx, eax
movzx eax, byte ptr [rsp + 16]
movzx ebx, byte ptr [rsp + 18]
and al, 1
movzx eax, al
and bl, 1
movzx ebx, bl
shl ebx, 8
or ebx, eax
movd xmm3, ebx
pinsrw xmm3, ecx, 1
movzx eax, byte ptr [rsp + 24]
and al, 1
movzx eax, al
movzx ecx, byte ptr [rsp + 26]
and cl, 1
movzx ecx, cl
shl ecx, 8
or ecx, eax
pinsrw xmm3, ecx, 2
movzx eax, byte ptr [rsp + 28]
and al, 1
movzx eax, al
movzx ecx, byte ptr [rsp + 30]
and cl, 1
movzx ecx, cl
shl ecx, 8
or ecx, eax
pinsrw xmm3, ecx, 3
movq qword ptr [r8 + r11], xmm2
movq qword ptr [r8 + r11 + 8], xmm3
add r11, 16
cmp rsi, r11
jne .LBB8_6
cmp rdi, rsi
je .LBB8_9
.p2align 4, 0x90
.LBB8_8:
movzx eax, word ptr [r10 + 2*rsi]
cmp ax, word ptr [rdx + 2*rsi]
setl al
cmp byte ptr [r8 + rsi], 0
setne cl
and cl, al
mov byte ptr [r8 + rsi], cl
add rsi, 1
cmp rsi, rdi
jb .LBB8_8
.LBB8_9:
add rsp, 32
.cfi_def_cfa_offset 16
pop rbx
.cfi_def_cfa_offset 8
ret
.LBB8_10:
.cfi_def_cfa_offset 48
lea rdx, [rip + .L__unnamed_2]
call qword ptr [rip + _ZN4core5slice20slice_index_len_fail17h9254c9506d16ff21E@GOTPCREL]
ud2
.LBB8_11:
lea rdx, [rip + .L__unnamed_3]
mov rsi, rcx
call qword ptr [rip + _ZN4core5slice20slice_index_len_fail17h9254c9506d16ff21E@GOTPCREL]
ud2
.Lfunc_end8:
.size _ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE, .Lfunc_end8-_ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE
.cfi_endproc
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
It would be interesting to see if this is fixed on nightly, since that has the LLVM 11 bump.
No significant change with nightly and LLVM 11
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.48.0-nightly (d006f5734 2020-08-28)
binary: rustc
commit-hash: d006f5734f49625c34d6fc33bf6b9967243abca8
commit-date: 2020-08-28
host: x86_64-unknown-linux-gnu
release: 1.48.0-nightly
LLVM version: 11.0
Compiling rust-perf-demo v0.1.0 (/home/chvatal/prog/rust-perf-demo)
Finished release [optimized] target(s) in 0.81s
real 0m6.654s
user 0m6.653s
sys 0m0.000s
@rustbot ping cleanup
We have a good MCVE, probably could use a bisection to isolate the culprit
Hey Cleanup Crew ICE-breakers! This bug has been identified as a good
"Cleanup ICE-breaking candidate". In case it's useful, here are some
[instructions] for tackling these sorts of bugs. Maybe take a look?
Thanks! <3
cc @AminArria @camelid @chrissimpkins @contrun @DutchGhost @elshize @ethanboxx @h-michael @HallerPatrick @hdhoang @hellow554 @imtsuki @kanru @KarlK90 @LeSeulArtichaut @MAdrianMattocks @matheus-consoli @mental32 @nmccarty @Noah-Kennedy @pard68 @PeytonT @pierreN @Redblueflame @RobbieClarken @RobertoSnap @robjtede @SarthakSingh31 @senden9 @shekohex @sinato @spastorino @turboladen @woshilapin @yerke
This seems to have happened in the LLVM 10 upgrade.
@rustbot ping llvm
Hey LLVM ICE-breakers! This bug has been identified as a good
"LLVM ICE-breaking candidate". In case it's useful, here are some
[instructions] for tackling these sorts of bugs. Maybe take a look?
Thanks! <3
cc @camelid @comex @cuviper @DutchGhost @hdhoang @heyrutvik @higuoxing @JOE1994 @jryans @mmilenko @nagisa @nikic @Noah-Kennedy @SiavoshZarrasvand @spastorino @vertexclique
I have created a script that returns nonzero exit code in case that bianry run time is larger than 3.0 seconds (reasonable threshold on my system) that can be used as script with cargo bisect-rustc
#!/bin/bash
rustc --version --verbose
cargo build --release
TIME=`{ time -p cargo run --release; } |& grep real | awk '{print $2}'`;
MAX_TIME=3.0
echo "${TIME}"
if (( $(echo "$TIME > $MAX_TIME" |bc -l) )); then
echo "BAD: Slow"
exit 1
else
echo "GOOD: Fast"
fi
My particular result so far is
searched toolchains nightly-2020-05-20 through nightly-2020-07-02
********************************************************************************
Regression in nightly-2020-05-22
********************************************************************************
Waiting for the rest to complete...
found 7 bors merge commits in the specified range
commit[0] 2020-05-20UTC: Auto merge of #72378 - Dylan-DPC:rollup-m87bp2d, r=Dylan-DPC
commit[1] 2020-05-20UTC: Auto merge of #67759 - nikic:llvm-10, r=Mark-Simulacrum
commit[2] 2020-05-21UTC: Auto merge of #70705 - lcnr:generic_discriminant, r=nikomatsakis
commit[3] 2020-05-21UTC: Auto merge of #72205 - ecstatic-morse:nrvo, r=oli-obk
commit[4] 2020-05-21UTC: Auto merge of #72422 - RalfJung:rollup-u81z4mw, r=RalfJung
commit[5] 2020-05-21UTC: Auto merge of #71718 - NeoRaider:ffi_const_pure, r=Amanieu
commit[6] 2020-05-21UTC: Auto merge of #71930 - Nadrieril:exhaustiveness-remove-tyerr, r=varkor
validated commits found, specifying toolchains
installing 0aa6751c19d3ba80df5b0b02c00bf44e13c97e80
std for x86_64-unknown-linux-gnu: 15.79 MB / 15.79 MB [===============================================================================] 100.00 % 269.41 KB/s testing...
RESULT: 0aa6751c19d3ba80df5b0b02c00bf44e13c97e80, ===> No
uninstalling 0aa6751c19d3ba80df5b0b02c00bf44e13c97e80
installing 9310e3bd4f425f84fc27878ebf2bda1f30935a63
std for x86_64-unknown-linux-gnu: 15.80 MB / 15.80 MB [===============================================================================] 100.00 % 646.62 KB/s testing...
RESULT: 9310e3bd4f425f84fc27878ebf2bda1f30935a63, ===> Yes
uninstalling 9310e3bd4f425f84fc27878ebf2bda1f30935a63
installing 7f79e98c0356642db62e5113327e436c951e843d
std for x86_64-unknown-linux-gnu: 15.76 MB / 15.76 MB [===============================================================================] 100.00 % 297.44 KB/s testing...
RESULT: 7f79e98c0356642db62e5113327e436c951e843d, ===> Yes
uninstalling 7f79e98c0356642db62e5113327e436c951e843d
installing 82911b3bba76e73afe2881b732fe6b0edb35d5d3
std for x86_64-unknown-linux-gnu: 15.79 MB / 15.79 MB [===============================================================================] 100.00 % 221.07 KB/s testing...
RESULT: 82911b3bba76e73afe2881b732fe6b0edb35d5d3, ===> Yes
uninstalling 82911b3bba76e73afe2881b732fe6b0edb35d5d3
searched toolchains 0aa6751c19d3ba80df5b0b02c00bf44e13c97e80 through 9310e3bd4f425f84fc27878ebf2bda1f30935a63
Regression in 82911b3bba76e73afe2881b732fe6b0edb35d5d3
searched nightlies: from nightly-2020-05-20 to nightly-2020-07-02
regressed nightly: nightly-2020-05-22
searched commits: from https://github.com/rust-lang/rust/commit/0aa6751c19d3ba80df5b0b02c00bf44e13c97e80 to https://github.com/rust-lang/rust/commit/9310e3bd4f425f84fc27878ebf2bda1f30935a63
regressed commit: https://github.com/rust-lang/rust/commit/82911b3bba76e73afe2881b732fe6b0edb35d5d3
Changing the code a bit:
- dst[index] &= src1[index] < src2[index];
+ dst[index] = dst[index] as u8 & (src1[index] < src2[index]) as u8 != 0;
makes the compiler generate an even better code on both 1.44.0 and 1.45.2:
.LBB0_6:
movq xmm1, qword ptr [r8 + rsi]
movq xmm2, qword ptr [r8 + rsi + 8]
movdqu xmm3, xmmword ptr [r10 + 2*rsi]
movdqu xmm4, xmmword ptr [r10 + 2*rsi + 16]
movdqu xmm5, xmmword ptr [rdx + 2*rsi]
pcmpgtw xmm5, xmm3
packsswb xmm5, xmm5
movdqu xmm3, xmmword ptr [rdx + 2*rsi + 16]
pcmpgtw xmm3, xmm4
packsswb xmm3, xmm3
pand xmm1, xmm0
pand xmm1, xmm5
pand xmm2, xmm0
pand xmm2, xmm3
movq qword ptr [r8 + rsi], xmm1
movq qword ptr [r8 + rsi + 8], xmm2
add rsi, 16
cmp rcx, rsi
jne .LBB0_6
It suggests LLVM is struggling with the conversion between boolean vectors and integer vectors.
@viktorchvatal @pcpthm do you know if an issue about this exist on LLVM? can you file one there maybe?.
This was discussed during today's compiler meeting removing nomination.
@viktorchvatal @pcpthm do you know if an issue about this exist on LLVM? can you file one there maybe?.
Well, I do not feel to have the knowledge to file a LLVM bug.. yet.. What is the recommended way of doing so, anyway? Take the LLVM IR from the rust 1.44 and find a performance regression in the LLVM itself using just the IR? Also note that LLVM IR generated by rustc 1.44 and 1.45.2 slightly vary
Most helpful comment
Changing the code a bit:
makes the compiler generate an even better code on both 1.44.0 and 1.45.2:
It suggests LLVM is struggling with the conversion between boolean vectors and integer vectors.