Rust: The new step_by is too much slow

Created on 5 Jul 2017 · 14Comments · Source: rust-lang/rust

As I explained in #42534 the new step_by can't be used:

#![feature(iterator_step_by)]

#[inline(never)]
pub fn first_foo(mut a: i32, b: i32) -> i32 {
    let mut tot = 0;

    while a < b {
        tot += a;
        a += 3;
    }
    tot
}

#[inline(never)]
pub fn second_foo(x: std::ops::Range<i32>) -> i32 {
    let mut tot = 0;

    #[allow(deprecated)]
    for a in x.step_by(3) {
        tot += a;
    }
    tot
}

fn main() {
    println!("{}", first_foo(0, 1000));
    println!("{}", first_foo(0, 100000));
    println!("{}", second_foo(0 .. 1000));
    println!("{}", second_foo(0 .. 100000));
}

Compiling with:
rustc -O --emit asm test.rs

It gives:

_ZN4test9first_foo17he3a0bddb42457919E:
    testl   %ecx, %ecx
    jle .LBB0_1
    xorl    %edx, %edx
    xorl    %eax, %eax
    .p2align    4, 0x90
.LBB0_4:
    addl    %edx, %eax
    addl    $3, %edx
    cmpl    %ecx, %edx
    jl  .LBB0_4
    jmp .LBB0_2
.LBB0_1:
    xorl    %eax, %eax
.LBB0_2:
    retq

_ZN4test10second_foo17hd5d21559ba84e6d1E:
    movq    %rcx, %r9
    shrq    $32, %r9
    xorl    %r8d, %r8d
    xorl    %r10d, %r10d
    xorl    %eax, %eax
    testb   $1, %r10b
    je  .LBB1_11
    jmp .LBB1_2
    .p2align    4, 0x90
.LBB1_7:
    shrq    $32, %rdx
    addl    %eax, %edx
    movb    $1, %r10b
    movl    %edx, %eax
    testb   $1, %r10b
    je  .LBB1_11
.LBB1_2:
    cmpl    %r9d, %ecx
    jge .LBB1_5
    leal    1(%rcx), %edx
    cmpl    %r9d, %edx
    jge .LBB1_4
    leal    2(%rcx), %edx
    cmpl    %r9d, %edx
    jge .LBB1_9
    addl    $3, %ecx
    shlq    $32, %rdx
    movl    $1, %r10d
    jmp .LBB1_6
    .p2align    4, 0x90
.LBB1_11:
    movq    %rcx, %r10
    shlq    $32, %r10
    xorl    %edx, %edx
    cmpl    %r9d, %ecx
    setl    %dl
    cmovgeq %r8, %r10
    addl    %edx, %ecx
    jmp .LBB1_6
    .p2align    4, 0x90
.LBB1_4:
    movl    %edx, %ecx
    jmp .LBB1_5
.LBB1_9:
    movl    %edx, %ecx
    .p2align    4, 0x90
.LBB1_5:
    xorl    %edx, %edx
    xorl    %r10d, %r10d
.LBB1_6:
    orq %r10, %rdx
    testl   %edx, %edx
    jne .LBB1_7
    retq

In a function like second_foo() the old step_by gave something like:

_ZN5test210second_foo17h5b0eb3168418bdd1E:
    movq    %rcx, %rdx
    shrq    $32, %rdx
    xorl    %eax, %eax
    cmpl    %edx, %ecx
    jge .LBB1_3
    xorl    %eax, %eax
    .p2align    4, 0x90
.LBB1_2:
    addl    %ecx, %eax
    addl    $3, %ecx
    cmovol  %edx, %ecx ; what is this for?
    cmpl    %edx, %ecx
    jl  .LBB1_2
.LBB1_3:
    retq

C-enhancement I-slow T-libs

Source

leonardo-m

Most helpful comment

before pull requests about it got merged

Doing this kind of experiment and finding problems is the point of having a Nightly channel.

SimonSapin on 9 Jul 2017

👍5

All 14 comments

Dear bors, before closing down this issue you should show me that the asm after that pull request is good enough. I'll verify it myself tomorrow with the next Nightly.

leonardo-m on 8 Jul 2017

Technically this is closed by GitHub not bors 😄 Feel free to reopen if it is still not fixed.

kennytm on 8 Jul 2017

Reopening so we track verification.

Mark-Simulacrum on 8 Jul 2017

The asm now is:

_ZN4test10second_foo17h387627478d348617E:
    pushq   %r14
    pushq   %rsi
    pushq   %rdi
    pushq   %rbx
    subq    $40, %rsp
    movq    %rcx, %rsi
    movq    %rsi, %rbx
    shrq    $32, %rbx
    xorl    %r14d, %r14d
    xorl    %eax, %eax
    xorl    %edi, %edi
    testb   $1, %al
    jne .LBB1_3
    jmp .LBB1_2
    .p2align    4, 0x90
.LBB1_10:
    shrq    $32, %rcx
    addl    %edi, %ecx
    movb    $1, %al
    movl    %ecx, %edi
    testb   $1, %al
    je  .LBB1_2
.LBB1_3:
    movl    $2, %ecx
    callq   _ZN4core3num69_$LT$impl$u20$core..convert..TryFrom$LT$usize$GT$$u20$for$u20$u32$GT$8try_from17h98ee5a7abea3d10bE
    testl   %eax, %eax
    jne .LBB1_4
    shrq    $32, %rax
    addl    %esi, %eax
    cmpl    %esi, %eax
    jge .LBB1_6
.LBB1_4:
    movl    %ebx, %esi
    xorl    %ecx, %ecx
    xorl    %edx, %edx
.LBB1_7:
    orq %rdx, %rcx
    testl   %ecx, %ecx
    jne .LBB1_10
    jmp .LBB1_9
    .p2align    4, 0x90
.LBB1_2:
    movq    %rsi, %rax
    shlq    $32, %rax
    xorl    %ecx, %ecx
    cmpl    %ebx, %esi
    setl    %cl
    cmovgeq %r14, %rax
    addl    %ecx, %esi
    orq %rax, %rcx
    testl   %ecx, %ecx
    jne .LBB1_10
    jmp .LBB1_9
.LBB1_6:
    movq    %rax, %rcx
    shlq    $32, %rcx
    leal    1(%rax), %esi
    xorl    %edx, %edx
    cmpl    %ebx, %eax
    setl    %dl
    cmovgel %ebx, %esi
    cmovgeq %r14, %rcx
    jmp .LBB1_7
.LBB1_9:
    movl    %edi, %eax
    addq    $40, %rsp
    popq    %rbx
    popq    %rdi
    popq    %rsi
    popq    %r14
    retq

Ti me it looks still too much long. And apparently there's a call to a TryFrom in the middle of the loop...

In some of my code I see a further 10% performance decrease after the recent performance decrease caused by allocators.

leonardo-m on 9 Jul 2017

I pretty much can’t read assembly. Is either the before or after code O(n) for step_by(n).next()?

The call to try_from is because StepBy::next calls nth on the underlying iterator with a usize argument. In the case of Range<i32>, nth will call <i32 as Step>::add_usize which will first try to convert usize to u32 while checking for overflow.

I have more changes to this code in https://github.com/rust-lang/rust/pull/43127

SimonSapin on 9 Jul 2017

Is either the before or after code O(n) for step_by(n).next()?

I think it's now O(1), but the asm is a bit of a mess.

step_by() is part of the basics of for loops in Rust, a system language. So before pull requests about it got merged, someone that knows assembly should carefully review the code.

In the case of Range, nth will call ::add_usize which will first try to convert usize to u32 while checking for overflow.

So having only usize as step_by argument causes problems. Perhaps it's not a good idea. step_by() is meant to be used everywhere, its inner loop will run trillions of times. Perhaps all changes to step_by of the last few weeks should be reverted, and merged again only if and when they are acceptable.

leonardo-m on 9 Jul 2017

👍1

before pull requests about it got merged

Doing this kind of experiment and finding problems is the point of having a Nightly channel.

SimonSapin on 9 Jul 2017

👍5

Perhaps all changes to step_by of the last few weeks should be reverted, and merged again only if and when they are acceptable.

This looks like it's an issue with TryFrom/TryInto rather than step_by itself.

EDIT: The main difference seems to be the overflow checks.

oyvindln on 10 Jul 2017

In addition to the overflow checks, there is a check in the next() function of the StepBy iterator, which checks if any items have been taken from the iterator. This wasn't present in the old version, but is presumably needed in the current (generic) implementation.

Additionally, there is a call to try_from that isn't optimized out, but #43194 should help with that.

The overflow checks are probably the main issue though, so having a specialisation of this trait for ranges would probably be helpful.

oyvindln on 12 Jul 2017

I had a go at adding specializations for the primitive types, however, that triggered #36262, which made the compiler treat range syntax without type annotations (e.g 0...50) as always being i32 rather than inferring the type.

I Also tried to specialize using the Step trait, however I didn't manage to it to work as adding default to type Item = I::Item; caused the compiler to complain about the return type of next and the result of the inner Iterator::next() being different. I'm not sure if I hit a bug, or if i was simply doing something wrong. It's possible that doing it through a trait may also hit #36262, but I didn't get that far.

oyvindln on 9 Aug 2017

❤2

To better show the effects of the new step_by here you can find a test program (that solves two different Euler problems) that uses step_by (run-time 4.40 seconds):
https://gist.github.com/anonymous/05cae75cebb01d452e585effad1e01b8

And the same code with step_by replaced by while loops (run-time 3.81 seconds):
https://gist.github.com/anonymous/9db9fde7fe853c56760f8ef80f037fc3

leonardo-m on 13 Aug 2017

@oyvindln Nice experiment (I'd love to see the code). I think the associated type defaults issue is expected, and it seems best to avoid using associated type specialization for now, even in std.

bluss on 7 Jan 2018

@bluss
I had something left over from the first attempt that(specializing range): https://gist.github.com/oyvindln/a543f0d62da990c18a3575950c39ada1

Didn't find any code from trying to specialize using the step trait.

oyvindln on 9 Jan 2018

Since few days the step_by is fast enough for my usages, so I close this issue down (I still don't like the design of the new step_by that only works with usize steps, but I guess I can't change that). I leave Issue #45222 open because closed intervals are still too much slow.

leonardo-m on 25 Jun 2018

Was this page helpful?

0 / 5 - 0 ratings