Rust: Big performance problem with closed intervals looping

Created on 12 Oct 2017  ยท  18Comments  ยท  Source: rust-lang/rust

In my code I've essentially stopped using loops with ... (intervals closed on the right, recently written with the syntax ..=) because they give performance problems. This is a simple example that shows the problem:

#![feature(inclusive_range_syntax)]
#![allow(private_no_mangle_fns)]

#[inline(never)]
#[no_mangle]
fn foo1(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0 .. n {
        for j in (0 .. n + 1).rev() {
            count += j;
        }
    }
    count
}

#[inline(never)]
#[no_mangle]
fn foo2(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0 .. n {
        for j in (0 ..= n).rev() {
            count += j;
        }
    }
    count
}

fn main() {
    let n: u64 = std::env::args().nth(1).unwrap().parse().unwrap();
    let what: u32 = std::env::args().nth(2).unwrap().parse().unwrap();

    match what {
        1 => println!("{}", foo1(n)),
        2 => println!("{}", foo2(n)),
        _ => panic!(),
    }
}

Compiled with the last Nightly:

rustc 1.22.0-nightly (d6d711dd8 2017-10-10)
binary: rustc
commit-hash: d6d711dd8f7ad5885294b8e1f0009a23dc1f8b1f
commit-date: 2017-10-10
host: x86_64-pc-windows-gnu
release: 1.22.0-nightly
LLVM version: 4.0

Compiled with:
rustc -O test.rs

Running it calling foo1 takes 0.02 seconds:

...>elaps test 100000 1
500005000000000

Running it calling foo2 takes about 13.65 seconds:

...>elaps test 100000 2
500005000000000

The asm I am seeing using "--emit asm"

foo1:
    testq   %rcx, %rcx
    je  .LBB5_1
    movq    %rcx, %r8
    imulq   %r8, %r8
    leaq    -1(%rcx), %rdx
    movq    %rcx, %rax
    mulq    %rdx
    shldq   $63, %rax, %rdx
    subq    %rdx, %r8
    cmpq    $3, %rcx
    jbe .LBB5_3
    movq    %rcx, %rdx
    andq    $-4, %rdx
    je  .LBB5_3
    movd    %r8, %xmm0
    pshufd  $68, %xmm0, %xmm2
    leaq    -4(%rdx), %r9
    movl    %r9d, %eax
    shrl    $2, %eax
    incl    %eax
    andq    $3, %rax
    je  .LBB5_8
    cmpq    $-1, %rcx
    pxor    %xmm0, %xmm0
    pxor    %xmm3, %xmm3
    je  .LBB5_11
    movdqa  %xmm2, %xmm3
.LBB5_11:
    negq    %rax
    xorl    %r10d, %r10d
    pxor    %xmm1, %xmm1
    .p2align    4, 0x90
.LBB5_12:
    paddq   %xmm3, %xmm0
    paddq   %xmm3, %xmm1
    addq    $4, %r10
    incq    %rax
    jne .LBB5_12
    jmp .LBB5_13
.LBB5_3:
    xorl    %eax, %eax
    xorl    %edx, %edx
.LBB5_4:
    xorl    %r9d, %r9d
    cmpq    $-1, %rcx
    cmoveq  %r9, %r8
    .p2align    4, 0x90
.LBB5_5:
    incq    %rdx
    addq    %r8, %rax
    cmpq    %rcx, %rdx
    jb  .LBB5_5
.LBB5_19:
    retq
.LBB5_1:
    xorl    %eax, %eax
    retq
.LBB5_8:
    xorl    %r10d, %r10d
    pxor    %xmm0, %xmm0
    pxor    %xmm1, %xmm1
.LBB5_13:
    cmpq    $12, %r9
    jb  .LBB5_18
    cmpq    $-1, %rcx
    pxor    %xmm3, %xmm3
    je  .LBB5_16
    movdqa  %xmm2, %xmm3
.LBB5_16:
    movq    %rdx, %rax
    subq    %r10, %rax
    .p2align    4, 0x90
.LBB5_17:
    paddq   %xmm3, %xmm0
    paddq   %xmm3, %xmm1
    paddq   %xmm3, %xmm0
    paddq   %xmm3, %xmm1
    paddq   %xmm3, %xmm0
    paddq   %xmm3, %xmm1
    paddq   %xmm3, %xmm0
    paddq   %xmm3, %xmm1
    addq    $-16, %rax
    jne .LBB5_17
.LBB5_18:
    paddq   %xmm1, %xmm0
    pshufd  $78, %xmm0, %xmm1
    paddq   %xmm0, %xmm1
    movd    %xmm1, %rax
    cmpq    %rcx, %rdx
    jne .LBB5_4
    jmp .LBB5_19



foo2:
    pushq   %rsi
    pushq   %rdi
    pushq   %rbx
    testq   %rcx, %rcx
    je  .LBB6_1
    testb   $1, %cl
    jne .LBB6_4
    xorl    %eax, %eax
    xorl    %r8d, %r8d
    cmpq    $1, %rcx
    jne .LBB6_11
    jmp .LBB6_23
.LBB6_1:
    xorl    %eax, %eax
    jmp .LBB6_23
.LBB6_4:
    xorl    %r8d, %r8d
    movq    $-1, %r9
    xorl    %r10d, %r10d
    movq    %rcx, %r11
    xorl    %eax, %eax
    jmp .LBB6_5
    .p2align    4, 0x90
.LBB6_8:
    addq    %r11, %rax
    movq    %rdi, %r10
    movq    %rdx, %r11
.LBB6_5:
    cmpq    %r11, %r10
    movl    $1, %esi
    cmovbq  %r9, %rsi
    cmoveq  %r8, %rsi
    testq   %rsi, %rsi
    movl    $1, %edi
    movl    $0, %edx
    je  .LBB6_8
    cmpq    $-1, %rsi
    jne .LBB6_9
    leaq    -1(%r11), %rdx
    movq    %r10, %rdi
    jmp .LBB6_8
.LBB6_9:
    movl    $1, %r8d
    cmpq    $1, %rcx
    je  .LBB6_23
.LBB6_11:
    xorl    %r9d, %r9d
    movq    $-1, %r10
    .p2align    4, 0x90
.LBB6_12:
    xorl    %r11d, %r11d
    movq    %rcx, %rdx
    jmp .LBB6_13
    .p2align    4, 0x90
.LBB6_16:
    addq    %rdx, %rax
    movq    %rbx, %r11
    movq    %rsi, %rdx
.LBB6_13:
    cmpq    %rdx, %r11
    movl    $1, %edi
    cmovbq  %r10, %rdi
    cmoveq  %r9, %rdi
    testq   %rdi, %rdi
    movl    $1, %ebx
    movl    $0, %esi
    je  .LBB6_16
    cmpq    $-1, %rdi
    jne .LBB6_17
    leaq    -1(%rdx), %rsi
    movq    %r11, %rbx
    jmp .LBB6_16
    .p2align    4, 0x90
.LBB6_17:
    addq    $2, %r8
    xorl    %r11d, %r11d
    movq    %rcx, %rdx
    jmp .LBB6_18
    .p2align    4, 0x90
.LBB6_21:
    addq    %rdx, %rax
    movq    %rbx, %r11
    movq    %rsi, %rdx
.LBB6_18:
    cmpq    %rdx, %r11
    movl    $1, %edi
    cmovbq  %r10, %rdi
    cmoveq  %r9, %rdi
    testq   %rdi, %rdi
    movl    $1, %ebx
    movl    $0, %esi
    je  .LBB6_21
    cmpq    $-1, %rdi
    jne .LBB6_22
    leaq    -1(%rdx), %rsi
    movq    %r11, %rbx
    jmp .LBB6_21
    .p2align    4, 0x90
.LBB6_22:
    cmpq    %rcx, %r8
    jb  .LBB6_12
.LBB6_23:
    popq    %rbx
    popq    %rdi
    popq    %rsi
    retq

C-enhancement I-slow T-compiler

Most helpful comment

All 18 comments

Performing the low-hanging fruit for minimization:

#![feature(inclusive_range_syntax)]
#![allow(private_no_mangle_fns)]

#[inline(never)]
#[no_mangle]
fn triangle_exc(n: u64) -> u64 {
    let mut count = 0;
    for j in (0 .. n + 1) {
        count += j;
    }
    count
}

#[inline(never)]
#[no_mangle]
fn triangle_inc(n: u64) -> u64 {
    let mut count = 0;
    for j in 0 ..= n {
        count += j;
    }
    count
}

fn main() {
    let n: u64 = std::env::args().nth(1).unwrap().parse().unwrap();

    println!("{}", triangle_exc(n));
    println!("{}", triangle_inc(n));
}

Good:

//-----------------------------------------
       โ”‚     0000000000007300 <triangle_exc>:
       โ”‚     triangle_exc():
       โ”‚       inc    %rdi
       โ”‚     โ†“ je     8c
       โ”‚       cmp    $0x3,%rdi
       โ”‚     โ†“ jbe    7b
       โ”‚       mov    %rdi,%rcx
       โ”‚       and    $0xfffffffffffffffc,%rcx
       โ”‚     โ†“ je     7b
       โ”‚       lea    -0x4(%rcx),%rax
       โ”‚       mov    %eax,%esi
       โ”‚       shr    $0x2,%esi
       โ”‚       inc    %esi
       โ”‚       and    $0x3,%rsi
       โ”‚     โ†“ je     8f
       โ”‚       neg    %rsi
       โ”‚       mov    $0x1,%edx
       โ”‚       movq   %rdx,%xmm0
       โ”‚       pslldq $0x8,%xmm0
       โ”‚       pxor   %xmm2,%xmm2
       โ”‚       xor    %edx,%edx
       โ”‚       movdqa _fini+0x44,%xmm3
       โ”‚       movdqa _fini+0x54,%xmm4
       โ”‚       pxor   %xmm1,%xmm1
       โ”‚       data16 nopw %cs:0x0(%rax,%rax,1)
       โ”‚ 60:   paddq  %xmm0,%xmm2
       โ”‚       paddq  %xmm0,%xmm1
       โ”‚       paddq  %xmm3,%xmm1
       โ”‚       add    $0x4,%rdx
       โ”‚       paddq  %xmm4,%xmm0
       โ”‚       inc    %rsi
       โ”‚     โ†‘ jne    60
       โ”‚ 7b:   xor    %eax,%eax
       โ”‚       xor    %ecx,%ecx
       โ”‚       nop
       โ”‚ 80:   add    %rcx,%rax
       โ”‚       inc    %rcx
       โ”‚       cmp    %rdi,%rcx
       โ”‚     โ†‘ jb     80
       โ”‚ 8b: โ† retq
       โ”‚ 8c:   xor    %eax,%eax
       โ”‚     โ† retq
       โ”‚ 8f:   xor    %edx,%edx
       โ”‚       mov    $0x1,%esi
       โ”‚       movq   %rsi,%xmm0
       โ”‚       pslldq $0x8,%xmm0
       โ”‚       pxor   %xmm2,%xmm2
       โ”‚       pxor   %xmm1,%xmm1
       โ”‚ a8:   cmp    $0xc,%rax
       โ”‚       movdqa %xmm2,%xmm6
       โ”‚     โ†“ jb     106
       โ”‚       mov    %rcx,%rax
       โ”‚       sub    %rdx,%rax
       โ”‚       movdqa _fini+0x64,%xmm3
       โ”‚       movdqa _fini+0x74,%xmm4
       โ”‚       movdqa _fini+0x84,%xmm5
       โ”‚ d0:   paddq  %xmm0,%xmm2
       โ”‚       paddq  %xmm0,%xmm1
 20.00 โ”‚       movdqa %xmm0,%xmm6
 10.00 โ”‚       paddq  %xmm6,%xmm6
       โ”‚       paddq  %xmm6,%xmm1
 10.00 โ”‚       paddq  %xmm2,%xmm6
 30.00 โ”‚       paddq  %xmm0,%xmm6
       โ”‚       paddq  %xmm0,%xmm1
 10.00 โ”‚       paddq  %xmm3,%xmm6
       โ”‚       paddq  %xmm4,%xmm1
       โ”‚       paddq  %xmm5,%xmm0
 20.00 โ”‚       movdqa %xmm6,%xmm2
       โ”‚     โ†‘ jne    d0
       โ”‚106:   paddq  %xmm1,%xmm6
       โ”‚       pshufd $0x4e,%xmm6,%xmm0
       โ”‚       paddq  %xmm6,%xmm0
       โ”‚       movq   %xmm0,%rax
       โ”‚       cmp    %rcx,%rdi
       โ”‚     โ†‘ jne    80
       โ”‚     โ†‘ jmpq   8b

Bad:

       โ”‚    0000000000007430 <triangle_inc>:
       โ”‚    triangle_inc():
       โ”‚      xor    %r8d,%r8d
       โ”‚      mov    $0xffffffffffffffff,%r9
       โ”‚      xor    %r10d,%r10d
       โ”‚      xor    %eax,%eax
       โ”‚    โ†“ jmp    29
       โ”‚      data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
       โ”‚20:   add    %r10,%rax
  1.79 โ”‚      mov    %rdx,%rdi
  2.98 โ”‚      mov    %rsi,%r10
 17.86 โ”‚29:   cmp    %rdi,%r10
       โ”‚      mov    $0x1,%ecx
  7.14 โ”‚      cmovb  %r9,%rcx
 16.07 โ”‚      cmove  %r8,%rcx
  3.57 โ”‚      test   %rcx,%rcx
  1.79 โ”‚      mov    $0x0,%edx
  1.79 โ”‚      mov    $0x1,%esi
 14.29 โ”‚    โ†‘ je     20
       โ”‚      cmp    $0xffffffffffffffff,%rcx
       โ”‚    โ†“ jne    57
  2.98 โ”‚      lea    0x1(%r10),%rsi
  3.57 โ”‚      mov    %rdi,%rdx
 26.19 โ”‚    โ†‘ jmp    20
       โ”‚57: โ† retq

Lost some kind of fast lane?

Apparently llvm is way happier optimizing the open interval with simd .

Referring to the example in https://github.com/rust-lang/rust/issues/45222#issuecomment-335998038, the first code is recognized by LLVM loop-vectorize, while the second doesn't.

$ rustc +nightly --crate-type staticlib -C debuginfo=1 -C codegen-units=1 -C opt-level=3 -C panic=abort -C target-cpu=native -C remark=loop-vectorize 1.rs

note: optimization analysis for loop-vectorize at 1.rs:9:0: loop not vectorized: loop control flow is not understood by vectorizer

note: optimization missed for loop-vectorize at 1.rs:9:0: loop not vectorized

After vectorization the .. loop becomes just a * (a+1) / 2, which explains the huge time difference.

If we black-box the j in count += j when benchmarking, the result becomes more realistic (2.6ร— slowdown, not 680ร— slowdown):

$ time ./1 100000 1
500005000000000

real    0m5.680s
user    0m5.645s
sys 0m0.012s

$ time ./1 100000 2
500005000000000

real    0m15.088s
user    0m15.015s
sys 0m0.026s

We may see if upgrading to LLVM 6 can help this case.

Also note that the two pieces of code are not equivalent: the n+1 version cannot properly handle the case n == u64::max_value() (although rare).

I've checked again using the LLVM 6 build. The performance is improved, but there is still a large gap (2.6ร— โ†’ 2.0ร—).

Vectorization is still not recognized for 0 ..= n with the naked j.

Timings

$ rustc +nightly -C codegen-units=1 -C opt-level=3 -C target-cpu=native 1.rs

$ time ./1 30000 2
13500450000000

real    0m5.389s
user    0m5.136s
sys 0m0.012s

$ time ./1 30000 1
13500450000000

real    0m2.195s
user    0m2.192s
sys 0m0.000s

$ rustc +38bd38147d2fa21f8a684b019fc0763adf8fd436 -C codegen-units=1 -C opt-level=3 -C target-cpu=native 1.rs

$ time ./1 30000 2
13500450000000

real    0m4.445s
user    0m4.332s
sys 0m0.000s

$ time ./1 30000 1
13500450000000

real    0m2.330s
user    0m2.184s
sys 0m0.016s

This might just be the classic external-iteration-is-slower-sometimes problem. Note that even (0..=n).sum() is currently generating unfortunate code.

Fix for internal iteration is up at https://github.com/rust-lang/rust/pull/48012

I believe this can be fixed by adding an extra field to RangeInclusive like this:

struct FixedRangeInclusive {
    start: u64,
    end: u64,
    done: bool,
}

fn fixed_range_inclusive(start: u64, end: u64) -> FixedRangeInclusive {
    FixedRangeInclusive {
        start,
        end,
        done: false,
    }
}

impl Iterator for FixedRangeInclusive {
    type Item = u64;
    fn next(&mut self) -> Option<Self::Item> {
        if !self.done {
            if self.start == self.end {
                self.done = true;
            }
            let new = self.start.wrapping_add(1);
            Some(std::mem::replace(&mut self.start, new))
        } else {
            None
        }
    }
}

Check out the assembly on the playground.

The current two-field RangeInclusive follows from rust-lang/rfcs#1980. The done field was the original design in RFC 1192 but was changed due to https://github.com/rust-lang/rfcs/pull/1192#issuecomment-137864421.

I'm aware that RangeInclusive has gone through many different designs but the current design was clearly not chosen with performance in mind. Of course ideally RangeInclusive wouldn't have any public fields so these kind of changes can be made easily.

Of course ideally RangeInclusive wouldn't have any public fields so these kind of changes can be made easily.

This would be jarring, in consideration of the fact that Range does have public data members.

It is unfortunate. Were this not the case I could almost picture something like this:

pub struct RangeInclusive<T> {
    // NOTE: not pub
    start: T, // actually, these should probably be ManuallyDrop<T> 
    end: T,   // or union MaybeUninit<T> { value: T, empty: () }
    done: bool,
}

impl RangeInclusive<T> {
    // Expose an API that matches the functionality of the enum type
    #[inline] pub fn new(start: T, end: T) -> Self { ... }
    #[inline] pub fn new_done() -> Self { ... }
    #[inline] pub fn endpoints(&self) -> Option<(&T, &T)> { ... }
    #[inline] pub fn endpoints_mut(&mut self) -> Option<(&mut T, &mut T)> { ... }
    #[inline] pub fn into_endpoints(self) -> Option<(T, T)> { ... }
}

and ISTM (note: haven't tested) that this should optimize just as well as the three-field struct, since it IS the three-field struct (just with statically enforced usage patterns). But I suppose that, even then, it would seem questionable to have a standard library type that simulates a enum (rather than being one) solely for performance concerns.


Edit: I misread somewhat and thought that the enum was the current proposal.

@leonardo-m Can you share some non-simple examples where the current form is a problem? https://github.com/rust-lang/rust/pull/48012 will make it so that the example in here is fine if written the easier way count += (0 ..= n).sum().

For simple things like sums of iota, it's easy to get suboptimal codegen from all kinds of different iterators. Like using for x in (0..3).chain(3..x) to add things instead of folding the same iterator has the identical problem as was raised here: https://godbolt.org/g/tYt7TX

Edit: https://github.com/rust-lang/rust/pull/48057 has also improved things in recent nightlies, though it's still not perfect.

For record: Enabling Polly (#50044) doesn't fix the issue.

@ollie27 Just a note, if you are going to use a bool your example always do two tests, maybe something like this could speed up thing because it only test two time for the two last value:

impl Iterator for FixedRangeInclusive {
    type Item = u64;
    fn next(&mut self) -> Option<Self::Item> {
        if self.start < self.end {
            let new = self.start.wrapping_add(1);
            Some(std::mem::replace(&mut self.start, new))
        } else if !self.done {
            self.done = true;
            Some(self.start)
        } else {
            None
        }
    }
}

To answer Issue #56516, this is a first example of the performance problem, better examples could follow. Code example from:
http://ericniebler.com/2014/04/27/range-comprehensions/

fn triples() -> impl Iterator<Item=(u32, u32, u32)> {
    (1 ..).flat_map(|z| (1 .. z + 1)
                        .flat_map(move |x| (x .. z + 1u32)
                                           .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                                           .map(move |y| (x, y, z))))
}

fn main() {
    let result: u32 = triples().take(3_000).map(|(x, y, z)| x + y + z).sum();
    println!("{}", result); // 10650478, about 2.8 seconds.
}

If I replace the open intervals with closed ones:

fn triples() -> impl Iterator<Item=(u32, u32, u32)> {
    (1 ..).flat_map(|z| (1u32 ..= z)
                        .flat_map(move |x| (x ..= z)
                                           .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                                           .map(move |y| (x, y, z))))
}

fn main() {
    let result: u32 = triples().take(3_000).map(|(x, y, z)| x + y + z).sum();
    println!("{}", result);
}

For the second version I am seeing a run-time of about 6 seconds.

58122 has significantly improved the situation!


My benchmark based on the @leonardo-m snippet

#![feature(test)]
extern crate test;

use test::Bencher;

fn triples_exclusive() -> impl Iterator<Item = (u32, u32, u32)> {
    (1u32..).flat_map(|z| {
        (1..z + 1).flat_map(move |x| {
            (x..z + 1)
                .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                .map(move |y| (x, y, z))
        })
    })
}

fn triples_inclusive() -> impl Iterator<Item = (u32, u32, u32)> {
    (1u32..).flat_map(|z| {
        (1..=z).flat_map(move |x| {
            (x..=z)
                .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                .map(move |y| (x, y, z))
        })
    })
}

#[bench]
fn range_exclusive(b: &mut Bencher) {
    b.iter(|| triples_exclusive().take(1_000).map(|(x, y, z)| x + y + z).sum::<u32>());
}

#[bench]
fn range_inclusive(b: &mut Bencher) {
    b.iter(|| triples_inclusive().take(1_000).map(|(x, y, z)| x + y + z).sum::<u32>());
}

Here are the results on my laptop:

Run 1:

test range_exclusive ... bench: 169,132,205 ns/iter (+/- 9,048,718)
test range_inclusive ... bench: 173,425,958 ns/iter (+/- 16,198,459)

Run 2:

test range_exclusive ... bench: 172,896,754 ns/iter (+/- 12,204,477)
test range_inclusive ... bench: 174,383,382 ns/iter (+/- 12,032,995)

Run 3:

test range_exclusive ... bench: 169,798,685 ns/iter (+/- 11,155,761)
test range_inclusive ... bench: 174,589,161 ns/iter (+/- 13,358,378)

Inclusive Range is still consistently slower than the exclusive variant, which is not that significant some might say, but it may end up to be a "clever" optimization trick like so many of them in C++ world. Also, in my opinion, the existance of this performance difference goes against "zero-cost" claims, so I think this issue should be kept open.

/cc @matthieu-m

The percentage performance difference I see in my code is quite larger than the 1-2% difference shown above. So better benchmarks are necessary.

@frol

I agree that the performance drop between exclusive and inclusive is annoying, and should ideally be eliminated if possible, however I don't see that as a failure of zero-overhead abstraction.

If you were coding the inclusive range with a for loop, you would also need to handle either the starting/ending bound specially, leading to a different codegen than for exclusive ranges.

For me there are two issues:

  • The codegen issue, as mentioned; there are possibly ways to massage the Rust code to optimize better, or there may be something to be done on the LLVM side to teach LLVM to split loops in the situation of a loop check moving monotonically.
  • The API decision that Range and RangeInclusive directly implement Iterator instead of implementing IntoIterator; in the latter case, it would be possible to dynamically choose between exclusive (if possibility to move either bound further) or full (if not) iteration, which LLVM may optimize better.

The API is water under the bridge for stability reasons, so we need to concentrate on the codegen issues. I think the next step should be involving folks knowledgeable about LLVM, who could understand why the optimizer fails so hard and either point to a specific pattern to avoid or improve LLVM so it doesn't.

I won't be following this up, as I have other projects, so I invite anybody interested to pick up the flag and carry on.


@leonardo-m

For these benchmarks specifically? In my benchmarks I was seeing less than 1% between exclusive and inclusive, which is consistent with what is reported here.

Note that using external iteration (a for loop) may not optimize as well as using internal iteration.

@frol

Also, in my opinion, the existance of this performance difference goes against "zero-cost" claims

I disagree, 0..5 != 0..=4, the zero cost is about the difference if I would manually do a loop with inclusive range like with a u8 from 0 to 255

Was this page helpful?
0 / 5 - 0 ratings