Julia: Code generation and cartesian iteration

Created on 20 Nov 2014 · 29 comments · Source: JuliaLang/julia

Compared to manual iteration, the new cartesian iterator has a small performance penalty:

function sumcart_manual(A::AbstractMatrix)
    s = 0.0
    @inbounds for j = 1:size(A,2), i = 1:size(A,1)
        s += A[i,j]
    end
    s
end

function sumcart_iter(A)
    s = 0.0
    @inbounds for I in CartesianIndices(size(A))
        s += A[I]
    end
    s
end

A = rand(10^4,10^4);
sumcart_manual(A);
sumcart_iter(A);

julia> @time sumcart_manual(A)
elapsed time: 0.176363505 seconds (96 bytes allocated)
4.999591778337127e7

julia> @time sumcart_iter(A)
elapsed time: 0.250890195 seconds (96 bytes allocated)
4.999591778337127e7

In this gist, I compare the code that gets generated. The comparison reveals that the CartesianIndex is not fully elided by the compiler.

Any ideas what's happening?

EDIT: the core problem is that next contains a branch, and testing done is a 2nd branch, so there are two branches per iteration. See https://github.com/JuliaLang/julia/issues/16035#issuecomment-214482039
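For reference, here is roughly what for I in iter lowered to under the old start/next/done protocol (a sketch, not the literal lowering); testing done costs one branch per iteration, and the carry test inside next costs a second:

state = start(iter)
while !done(iter, state)               # branch 1: are we finished?
    (I, state) = next(iter, state)     # branch 2 hides inside next: carry or not?
    # loop body
end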

performance

Most helpful comment

So incredible to watch this close before my eyes! I have been waiting almost 6 years for this. You are a hero, @johnnychen94!

All 29 comments

Here's an example that doesn't rely on all the code-generation facilities in the iterators in Base:

module Iter

# Note: this example uses the pre-0.7 start/next/done iteration protocol
# and Julia 0.3-era syntax (immutable, tuple-type annotations).
import Base: start, next, done, getindex

immutable Raw2
    i1::Int
    i2::Int
end
Raw2(dims::(Int,Int)) = Raw2(dims[1], dims[2])

start(sz::Raw2) = Raw2(1,1)
@inline next(sz::Raw2, state::Raw2) = state.i1 < sz.i1 ?
                                      (state, Raw2(state.i1+1, state.i2)) :
                                      (state, Raw2(1, state.i2+1))
done(sz::Raw2, state::Raw2) = state.i2 > sz.i2

getindex(A::AbstractArray, I::Raw2) = A[I.i1,I.i2]

end

function mysum(A)
    s = 0.0
    @inbounds for I in Iter.Raw2(size(A))
        s += A[I]
    end
    s
end

function mysum2(A)
    s = 0.0
    @inbounds for j = 1:size(A,2), i = 1:size(A,1)
        s += A[i,j]
    end
    s
end

Then do

A = reshape(1:12, 3, 4)
@code_llvm mysum(A)
@code_llvm mysum2(A)

You should see that the code for mysum is not nearly as well-optimized as mysum2.

@simonster, while we're on the topic of SubArrays and indexing code generation, if you have time I'd be grateful for any insights you have here.

Comparing mysum and mysum2 for A = rand(10^4, 10^4) reveals that there's a 40% speed gap, too. The critical section of @code_llvm for mysum is

L3:                                               ; preds = %L2, %if1
  %_var0.0 = phi [2 x %Raw2] [ %41, %L2 ], [ %37, %if1 ]
  %42 = extractvalue [2 x %Raw2] %_var0.0, 0, !dbg !796, !julia_type !803
  %43 = extractvalue [2 x %Raw2] %_var0.0, 1, !dbg !796, !julia_type !803
  %44 = extractvalue %Raw2 %42, 0, !dbg !804
  %45 = add i64 %44, -1, !dbg !804
  %46 = extractvalue %Raw2 %42, 1, !dbg !804
  %47 = add i64 %46, -1, !dbg !804
  %48 = mul i64 %47, %13, !dbg !804
  %49 = add i64 %45, %48, !dbg !804
  %50 = getelementptr double* %28, i64 %49, !dbg !804
  %51 = load double* %50, align 8, !dbg !804, !tbaa %jtbaa_user
  %52 = fadd double %s.0, %51, !dbg !804
  %53 = extractvalue %Raw2 %43, 1, !dbg !804
  %54 = icmp slt i64 %26, %53, !dbg !804
  br i1 %54, label %L6, label %L, !dbg !804

compared to

L2:                                               ; preds = %L2.preheader, %L2
  %s.1 = phi double [ %24, %L2 ], [ %s.0, %L2.preheader ]
  %"#s10.0" = phi i64 [ %20, %L2 ], [ 1, %L2.preheader ]
  %20 = add i64 %"#s10.0", 1, !dbg !798
  %21 = add i64 %19, %"#s10.0", !dbg !805
  %22 = getelementptr double* %10, i64 %21, !dbg !805
  %23 = load double* %22, align 8, !dbg !805, !tbaa %jtbaa_user
  %24 = fadd double %s.1, %23, !dbg !805
  %25 = icmp eq i64 %"#s10.0", %15, !dbg !805
  br i1 %25, label %L5, label %L2, !dbg !805

for mysum2.

Anything obvious to change to narrow the gap between these?

This generates better code for me:

start(sz::Raw2) = Raw2(0,0)
@inline function next(sz::Raw2, state::Raw2)
    i1 = state.i1+1
    i2 = state.i2
    if i1 == sz.i1
        i1 = 0
        i2 = state.i2+1
    end
    return (state, Raw2(i1, i2))
end
done(sz::Raw2, state::Raw2) = state.i2 == sz.i2

getindex(A::AbstractArray, I::Raw2) = A[I.i1+1,I.i2+1]

This is ~5% slower on my system instead of ~20%. But both the original code and this change are still suboptimal because the inner loop has a branch:

L:                                                ; preds = %L.preheader, %L2
  %s.0 = phi double [ %24, %L2 ], [ 0.000000e+00, %L.preheader ]
  %"#s2307.0" = phi %Raw2 [ %19, %L2 ], [ zeroinitializer, %L.preheader ]
  %9 = ptrtoint %jl_value_t* %8 to i64
  %10 = extractvalue %Raw2 %"#s2307.0", 0, !dbg !1614
  %11 = add i64 %10, 1, !dbg !1614
  %12 = extractvalue %Raw2 %"#s2307.0", 1, !dbg !1614
  %13 = icmp eq i64 %11, %9, !dbg !1614
  br i1 %13, label %if1, label %L2, !dbg !1614

if1:                                              ; preds = %L
  %14 = add i64 %12, 1, !dbg !1614
  br label %L2, !dbg !1614

L2:                                               ; preds = %L, %if1
  %_var1.0 = phi i64 [ %11, %L ], [ 0, %if1 ]
  %_var2.0 = phi i64 [ %12, %L ], [ %14, %if1 ]
  %15 = ptrtoint %jl_value_t* %2 to i64
  %16 = ptrtoint %jl_value_t* %8 to i64
  %17 = bitcast i8* %6 to double*
  %18 = insertvalue %Raw2 undef, i64 %_var1.0, 0, !dbg !1614
  %19 = insertvalue %Raw2 %18, i64 %_var2.0, 1, !dbg !1614, !julia_type !1621
  %20 = mul i64 %12, %16, !dbg !1622
  %21 = add i64 %20, %10, !dbg !1622
  %22 = getelementptr double* %17, i64 %21, !dbg !1622
  %23 = load double* %22, align 8, !dbg !1622, !tbaa %jtbaa_user
  %24 = fadd double %s.0, %23, !dbg !1622
  %25 = icmp eq i64 %_var2.0, %15, !dbg !1622
  br i1 %25, label %L5, label %L, !dbg !1622

This is pretty much what you'd expect given the iteration protocol. In Julia, it would look something like:

@label start
newi1 = i1 + 1
if newi1 == sz1
    newi1 = 0
    newi2 += 1
else
    newi2 = i2
end
ind = i2*sz1+i1
@inbounds s += A[ind+1]
newi2 == sz2 && @goto done
i1 = newi1
i2 = newi2
@goto start
@label done

(This generates exactly the same assembly for the inner loop). Rearranging this a little allows LLVM to optimize it to match the performance of mysum2:

@label start
ind = i2*sz1+i1
@inbounds s += A[ind+1]
newi1 = i1 + 1
if newi1 == sz1
    newi1 = 0
    newi2 += 1
    newi2 == sz2 && @goto done
else
    newi2 = i2
end
i1 = newi1
i2 = newi2
@goto start
@label done

But I can't see how to express this in the current iteration framework.

edit: Ref iteration protocol issues #6125, #8149, #9182

Worth keeping in mind wrt https://github.com/JuliaLang/julia/issues/8149 – if we could make it easier to express internal iteration _and_ get a performance bump at the same time, that would be the best.

Thanks a million, @simonster. The impact of these "stylistic" differences is quite surprising to me---I've learned to lean on LLVM to do the right thing, but this is a good counterexample.

I'll look into adopting your version for our code generation for next.

Hmm, on my system, my original version is _faster_ than your modification. This is weird.

I tried on another system, which I probably should have done earlier. On my Sandy Bridge system on Linux, mysum is 37% slower than mysum2 with your original version and 35% slower than mysum2 with my modifications. On my Haswell system on OS X, mysum is 22% slower than mysum2 with your original version and 7% slower than mysum2 with my modifications. Both systems are running LLVM 3.3 and the generated assembly is identical, so the performance difference seems to be in the architecture. I'm just benchmarking with:

function bench(A)
    t1 = 0.0
    t2 = 0.0
    for i = 1:50
        t1 += @elapsed mysum(A)
        t2 += @elapsed mysum2(A)
    end
    t1/t2
end

From reading a bit more, it seems that the cost of branches differs a lot among architectures (mine's a Sandy Bridge).

Just for fun I tried #9182, but by itself it didn't change the performance. I wondered whether it might fare better if nextstate were computed at the end of the loop, rather than right after nextval---then you wouldn't essentially need two state variables (old and new).

To test this, I followed your lead and wrote out some explicit versions using while, and got pretty interesting results:

function mysum_while1(A)
    s = 0.0
    i = j = 1
    inew = jnew = 1
    while j <= size(A,2)
        if inew < size(A,1)
            inew += 1
        else
            inew = 1
            jnew += 1
        end
        @inbounds s += A[i,j]
        i = inew
        j = jnew
    end
    s
end

function mysum_while2(A)
    s = 0.0
    i = j = 1
    while j <= size(A,2)
        @inbounds s += A[i,j]
        if i < size(A,1)
            i += 1
        else
            i = 1
            j += 1
        end
    end
    s
end

function mysum_while3(A)
    s = 0.0
    j = 1
    while j <= size(A,2)
        i = 1
        while i <= size(A,1)
            @inbounds s += A[i,j]
            i += 1
        end
        j += 1
    end
    s
end

function mysum_while4(A)
    s = 0.0
    i = j = 1
    inew = jnew = 1
    notdone = true
    while notdone
        inew += 1
        if inew > size(A, 1)
            inew = 1
            jnew += 1
            notdone = jnew <= size(A,2)
        end
        @inbounds s += A[i,j]
        i = inew
        j = jnew
    end
    s
end


function mysum_while5(A)
    s = 0.0
    i = j = 1
    notdone = true
    while notdone
        @inbounds s += A[i,j]
        i += 1
        if i > size(A, 1)
            i = 1
            j += 1
            notdone = j <= size(A,2)
        end
    end
    s
end

function mysum_while6(A)
    s = 0.0
    i = j = 1
    while j <= size(A,2)
        @inbounds s += A[i,j]
        i += 1
        if i > size(A, 1)
            i = 1
            j += 1
        end
    end
    s
end

Timing results after warmup:

julia> @time mysum_while1(A)
elapsed time: 0.258024361 seconds (96 bytes allocated)
4.9997446791483015e7

julia> @time mysum_while2(A)
elapsed time: 0.216438899 seconds (96 bytes allocated)
4.9997446791483015e7

julia> @time mysum_while3(A)
elapsed time: 0.157289985 seconds (96 bytes allocated)
4.9997446791483015e7

julia> @time mysum_while4(A)
elapsed time: 0.227041972 seconds (96 bytes allocated)
4.9997446791483015e7

julia> @time mysum_while5(A)
elapsed time: 0.140768946 seconds (96 bytes allocated)
4.9997446791483015e7

julia> @time mysum_while6(A)
elapsed time: 0.190532756 seconds (96 bytes allocated)
4.9997446791483015e7

So, at least on my machine:

  • the placement of the increment matters (end of the loop is best)
  • using a boolean variable for while testing seems to help

Interestingly, the iterators in multidimensional.jl already use the boolean version, so it might just be a case of rewriting the way next works. But first, can you check on your system that mysum_while5 is among the best performers?

One more:

function mysum_while7(A)
    s = 0.0
    i = j = 1
    notdone = true
    while notdone
        @inbounds s += A[i,j]
        if i < size(A,1)
            i += 1
        else
            i = 1
            j += 1
            notdone = j <= size(A,2)
        end
    end
    s
end

is just as fast as the fastest ones above. So it really seems to come down to (1) putting the increment at the end, and (2) using a boolean for the while testing (and updating that boolean at the deepest level of the increment testing).
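Point (1) is hard to express in the start/next/done protocol itself (the value must be produced before the new state), but (2) can be folded into the state. A hedged sketch in the 0.3-era protocol, using a hypothetical Raw2State type that carries the notdone boolean so that done reduces to a single field test:

immutable Raw2State
    i1::Int
    i2::Int
    notdone::Bool
end

# assumes both dimensions are nonempty, like the while-loop versions above
start(sz::Raw2) = Raw2State(1, 1, sz.i2 > 0)
done(sz::Raw2, s::Raw2State) = !s.notdone
@inline function next(sz::Raw2, s::Raw2State)
    val = Raw2(s.i1, s.i2)          # read the value before incrementing
    if s.i1 < sz.i1
        return (val, Raw2State(s.i1+1, s.i2, true))
    else
        # carry: reset i1, bump i2, and update notdone at the deepest level
        return (val, Raw2State(1, s.i2+1, s.i2 < sz.i2))
    end
end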

The test case in Tim's StackOverflow answer is much improved with the Tuple overhaul. I believe enumerate and zip were slow previously due to nested tuples. That seems to no longer be the case after the tupocalypse! :fireworks:

Unfortunately, the main test case in this PR (for Cartesian Indexing) isn't affected.

Looks like #12090 also solved this guy!

julia> @time sumcart_manual(A); @time sumcart_iter(A); # Edit: cheating with eachindex
 142.564 milliseconds (5 allocations: 176 bytes)
 142.117 milliseconds (5 allocations: 176 bytes)

And the @code_llvm is actually _simpler_ for sumcart_iter since it's only one loop: gist with llvm output.

The Cartesian objects aren't elided for LinearSlow arrays, but I can't measure a performance difference with sparse arrays:

julia> A = sprand(10^3,10^3,.1);

julia> @time sumcart_manual(A); @time sumcart_iter(A)
  35.287 milliseconds (5 allocations: 176 bytes)
  35.934 milliseconds (5 allocations: 176 bytes)
50221.35105393453

Not true for me. I updated the code sample at the top; the change in eachindex to use linear indexing, when applicable, might be confusing?

Duh. That's the second time I've done that, isn't it?

I'm not smart enough to keep track. And really, given how much you've improved Array-type operations, why would I want to??

Current situation:

julia> @btime sumcart_manual(A);
  98.752 ms (1 allocation: 16 bytes)

julia> @btime sumcart_iter(A);
  126.430 ms (1 allocation: 16 bytes)

Seems a bit worse than https://github.com/JuliaLang/julia/issues/9080#issuecomment-275629478 now.

julia> @btime sumcart_manual(A)
  100.859 ms (1 allocation: 16 bytes)
5.000302140597446e7

julia> @btime sumcart_iter(A)
  204.743 ms (1 allocation: 16 bytes)
5.000302140597446e7

I've commented on this already in https://github.com/JuliaLang/julia/issues/15648#issuecomment-511048192. But I don't think iterators are a good abstraction for handling this. I think an approach based on foreach/foldl/reduce would be much better.

Here is a minimal implementation of foreach to handle CartesianIndices-like iteration:

@inline foreach′(f, I::CartesianIndices) =
    foreach_product((args...) -> f(CartesianIndex(reverse(args)...)), reverse(I.indices)...)

@inline function foreach_product(f::F, itr, itrs...) where F
    @inbounds for x in itr
        @inline g(args...) = f(x, args...)
        foreach_product(g, itrs...)
    end
end

@inline function foreach_product(f::F, itr) where F
    @inbounds for x in itr
        f(x)
    end
end

(Note: the order of axes in foreach_product is reversed compared to Iterators.product.)
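As a quick sanity check (a sketch assuming the definitions above), the visit order still matches the column-major order of CartesianIndices despite the internal reversal:

inds = CartesianIndex{2}[]
foreach′(I -> push!(inds, I), CartesianIndices((2, 3)))
@assert inds == vec(collect(CartesianIndices((2, 3))))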

Benchmark code:

using BenchmarkTools

function copyto_manual!(A, B)
    @assert axes(A) == axes(B)
    @inbounds for j = 1:size(A,2), i = 1:size(A,1)
        A[i, j] = B[i, j]
    end
end

function copyto_foreach!(A, B)
    foreach′(eachindex(A, B)) do I
        @inbounds A[I] = B[I]
    end
end

function copyto_iterate!(A, B)
    @inbounds for I in eachindex(A, B)
        A[I] = B[I]
    end
end

B = rand(10^2, 10^2);
A = similar(B');

Results:

julia> @btime copyto_manual!($A, $B');
  6.401 μs (1 allocation: 16 bytes)

julia> @btime copyto_foreach!($A, $B');
  6.243 μs (1 allocation: 16 bytes)

julia> @btime copyto_iterate!($A, $B');
  16.035 μs (1 allocation: 16 bytes)

As you can see, the performance of the foreach-based code is equivalent to the manual loop and much faster than the iterate-based code.

For implementing summation as in the OP, we need foldl and/or reduce. It is actually a better strategy to implement foreach based on foldl to keep things DRY. But I chose foreach for the MWE since I thought it might be simpler to understand.
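For concreteness, here is a hedged sketch of what the foldl analogue could look like (foldl′ and foldl_product are hypothetical names mirroring the foreach definitions above, not a proposed API):

@inline foldl′(op, acc, I::CartesianIndices) =
    foldl_product((acc, args...) -> op(acc, CartesianIndex(reverse(args))),
                  acc, reverse(I.indices)...)

@inline function foldl_product(op::F, acc, itr, itrs...) where F
    for x in itr
        @inline g(acc, args...) = op(acc, x, args...)
        acc = foldl_product(g, acc, itrs...)
    end
    return acc
end

@inline function foldl_product(op::F, acc, itr) where F
    for x in itr
        acc = op(acc, x)
    end
    return acc
end

Summation as in the OP then becomes

sumcart_foldl(A) = foldl′((s, I) -> s + A[I], 0.0, CartesianIndices(A))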

Love the implementation!

But I don't think iterators are a good abstraction for handling this. I think an approach based on foreach/foldl/reduce would be much better.

I am excited about the transducer approach and need to learn & use it more, but I don't think iterators are "wrong." A lot of the virtue of CartesianIndex iteration is being able to perform manipulations on the indexes for stencil operations. Is it easy to write all the operations in https://julialang.org/blog/2016/02/iteration/ in terms of transducers?

I was trying to bring up the difference between foldl and the "iterator", or more specifically the iterate function, as the _implementation_ of, well, iteration (maybe that was confusing). In particular, I have no criticism of the _concrete interfaces_ like eachindex, CartesianIndices, etc. They are all fantastic! What I wanted to point out was that (1) implementing a specific foldl for them (i.e., bypassing iterate) can sometimes yield a huge speedup and (2) "functional" entry points like foreach become more favorable once you have that.

The iterator has been _the_ way to provide customizable iteration in many languages, so I can imagine my point may seem fuzzy. But iterate and foldl are actually equally powerful formulations of customizable iteration (or rather, foldl can be extended to be so). I think it is conceivable to have a programming language where the for loop syntax is lowered to foldl rather than iterate. Of course, I'm not suggesting we do this, as it would be far too disruptive. Furthermore, foldl and iterate are good at different things: foldl is good at consuming collections and iterate is good at constructing them. So, having both of them may ultimately be a good idea. I think it's like any other Julia function; you add a performance specialization on top of a generic fallback when it is a big deal.

Is it easy to write all the operations in https://julialang.org/blog/2016/02/iteration/ in terms of transducers?

(BTW, just a bit of terminology nitpicking (sorry!). You may be using "transducers" as a broad term for the functional stuff I do, but what I'm presenting here has nothing to do with transducers _per se_; it's about foldl. Transducers are tools for using foldl, just like Base.Iterators is a toolkit for using the for loop.)

Transforming a for loop to foldl or foreach is straightforward. You can just replace for x in xs with foreach(xs) do x if there is no state (as in copyto!). When you have state, you can put it in the accumulator of foldl:

function extrema_loop(xs)
    small = typemax(eltype(xs))
    large = typemin(eltype(xs))
    for x in xs
        small = min(small, x)
        large = max(large, x)
    end
    return (small, large)
end

becomes

function extrema_foldl(xs)
    small = typemax(eltype(xs))
    large = typemin(eltype(xs))
    foldl(xs, init=(small, large)) do (small, large), x
        small = min(small, x)
        large = max(large, x)
        (small, large)
    end
end

But I think the more important question is how to expose the algorithms in the blog post as reusable APIs and how to optimize them. Maybe a generalized version of boxcar3 is a good example. You can define foreach_boxcar(f, A::AbstractArray, ::Val{width}) such that boxcar3 can be written as

using Statistics: mean  # mean lives in Statistics, not Base

function boxcar3(A::AbstractArray)
    out = similar(A)
    foreach_boxcar(A, Val(3)) do I, x
        out[I] = mean(x)
    end
    return out
end

This API makes writing other variants (say using median instead of mean) very easy.
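For example, a median-filter variant is just (a sketch with the hypothetical name boxcar_median, assuming the foreach_boxcar defined below):

using Statistics: median

function boxcar_median(A::AbstractArray)
    out = similar(A)
    foreach_boxcar(A, Val(3)) do I, block
        out[I] = median(block)   # any reduction over the block works here
    end
    return out
end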

An implementation of foreach_boxcar looks something like this:

using StaticArrays

_valof(::Val{x}) where {x} = x

function foreach_boxcar(f, A::AbstractArray{<:Any, N}, width::Val = Val(3)) where {N}
    # For simplicity:
    @assert isodd(_valof(width))
    @assert all(s -> s >= _valof(width), size(A))

    R = CartesianIndices(A)
    I1 = oneunit(first(R))

    # Phase 1: Loop over the "main" part (w/o edges).
    main = ntuple(N) do d
        k = _valof(width) ÷ 2
        k+1:size(A, d)-k
    end
    # Pass a static array to `f`, assuming that `width` is small:
    satype = SArray{Tuple{ntuple(_ -> _valof(width), N)...}, eltype(A)}
    for I in R[main...]
        f(I, satype(view(A, I-I1:I+I1)))
    end
    # Note: We can use `foreach(R[main...]) do I` for better
    # performance.

    # Phase 2: Loop over the edges.
    Ifirst, Ilast = first(R), last(R)
    for kinds in Iterators.product(ntuple(_ -> (-1, 0, 1), N)...)
        all(iszero, kinds) && continue
        ranges = ntuple(N) do d
            k = _valof(width) ÷ 2
            if kinds[d] == -1  # left edge
                1:k
            elseif kinds[d] == 1  # right edge
                size(A, d)-k+1:size(A, d)
            else  # main
                k+1:size(A, d)-k
            end
        end
        for I in R[ranges...]
            J = max(Ifirst, I-I1):min(Ilast, I+I1)
            # Pass a view, as the shapes are heterogeneous:
            f(I, view(A, J))
        end
    end
end

I did a bit of optimization that uses SArray while looping over the "main" part. This is to demonstrate that, unlike iterate, you can have heterogeneous "element types" (where an element is a pair (I, block)) without introducing type instability in the functional approach. This is because you can call f in multiple code locations.

The important point is that it's the _implementer of foreach_ who controls the loop. When you expose this with iterate, it's the other way around; i.e., the caller controls the loop. You can do _a lot_ of optimization if you control the loop. In foreach_boxcar, I can pass an SArray without type instability because I can split the loop into phases. In foreach_product, I can "lower" Cartesian iteration to nested for loops because I control the loop. Furthermore, this pattern is composable; I could use foreach_product from foreach_boxcar.

No, I understand that inversion of control can be very useful. But notice that in the key portion of your code here you're using iteration, not foldl. That was my only point. Some things are easier to write using iteration, so it's not a "bad abstraction" (you're using it yourself).

for loop syntax is just sugar. I "ended up" using iterate because Julia's for loop happens to be lowered to it. It has nothing to do with iterate being "good" or "bad." As I said, it is possible to have a language where the for loop is lowered to foldl. In such a language, you'd write foldl for basic concepts like UnitRange using, e.g., a while loop. You don't _need_ to have iterate in such a language. It's a trivial statement, but, if for loops were lowered to foldl in Julia, my code wouldn't have used iterate (although then there would be no need to use foreach or foldl directly like this). Another way to put it: I could, in principle, write foreach_boxcar using a while loop plus some abstractions built on top of it.

Interestingly, the performance regression seems to go away when adding @simd to the Cartesian examples, even when it should be irrelevant, as in the copyto! case:

julia> @btime copyto_iterate!($A, $B');
  13.590 μs (1 allocation: 16 bytes)

julia> @btime copyto_manual!($A, $B');
  3.086 μs (1 allocation: 16 bytes)

julia> @btime copyto_foreach!($A, $B');
  3.348 μs (3 allocations: 80 bytes)

julia> function copyto_iterate!(A, B)
       @inbounds @simd for I in eachindex(A, B)
       A[I] = B[I]
       end
       end
copyto_iterate! (generic function with 1 method)

julia> @btime copyto_iterate!($A, $B');
  3.012 μs (1 allocation: 16 bytes)

Or, the example from the opening post:

julia> A = rand(256,20);

julia> function sumcart_manual(A::AbstractMatrix)
           s = 0.0
           @inbounds for j = 1:size(A,2), i = 1:size(A,1)
               s += A[i,j]
           end
           s
       end
sumcart_manual (generic function with 1 method)

julia> function sumcart_iter(A)
           s = 0.0
           @inbounds for I in CartesianIndices(size(A))
               s += A[I]
           end
           s
       end
sumcart_iter (generic function with 1 method)

julia> @btime sumcart_manual($A)
  4.432 μs (0 allocations: 0 bytes)
2556.470985732226

julia> @btime sumcart_iter($A)
  4.640 μs (0 allocations: 0 bytes)
2556.470985732226

julia> function sumcart_manual(A::AbstractMatrix)
           s = 0.0
           @inbounds for j = 1:size(A,2); @simd for i = 1:size(A,1)
               s += A[i,j]
           end; end
           s
       end
sumcart_manual (generic function with 1 method)

julia> function sumcart_iter(A)
           s = 0.0
           @inbounds @simd for I in CartesianIndices(size(A))
               s += A[I]
           end
           s
       end
sumcart_iter (generic function with 1 method)

julia> @btime sumcart_manual($A)
  333.601 ns (0 allocations: 0 bytes)
2556.470985732225

julia> @btime sumcart_iter($A)
  332.655 ns (0 allocations: 0 bytes)
2556.470985732225

Because of these methods, @simd converts the loop into the equivalent of

R = CartesianIndices(A)
for Itail in CartesianIndices(Base.tail(R.indices)), i in R.indices[1]
    s += A[i, Itail]
end

By splitting out the first loop you mitigate but don't actually fix the problem. You should still be able to detect it if, for example, A is 3-dimensional and the first dimension is very short (e.g., size 1 or 2). Note that the magnitude of the effect is very processor-dependent.

Last time I investigated, the real problem with performance in the non-@simd version is that there are a minimum of two branches per iteration: one to test whether we need to perform a "carry" on the iterator, and another to decide whether iterate returned nothing and the loop should terminate (if you have to perform multiple carries it's even worse). Theoretically the optimizer should be able to realize that iterate won't return nothing if it doesn't carry, and then reorder the operations, but that doesn't seem to happen. As @tkf says, that optimization happens almost automatically for foldl because by calling front you can do that peeling one iterator at a time (#35036), so you never even need to implement a carry operation.
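To illustrate the peeling idea with a hedged sketch (sumpeel and _sumpeel are hypothetical names, not Base's implementation): each recursion level owns one plain 1-d loop over a single axis, peeled off with Base.front, so no level ever needs a carry branch.

sumpeel(A::AbstractArray) = _sumpeel(A, axes(A))   # assumes ndims(A) >= 1

# innermost level: a simple loop over the first dimension
@inline function _sumpeel(A, axs::Tuple{Any}, I...)
    s = 0.0
    @inbounds for i in axs[1]
        s += A[i, I...]
    end
    return s
end

# outer levels: peel the last axis off and recurse
@inline function _sumpeel(A, axs::Tuple, I...)
    s = 0.0
    for j in axs[end]
        s += _sumpeel(A, Base.front(axs), j, I...)
    end
    return s
end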

But we have to fix this anyway, and I don't think it will be hard. I was hoping it would arise from a more general optimization, but perhaps it's time to add a compiler pass that detects calls to iterate(::CartesianIndices{N}) and rewrites the following code block as a set of N nested loops. I should just sit down and do it; it might be an interesting experiment in the context of evaluating deeper integration with LoopVectorization.

If it is easy to fix, I agree it's a good idea to do this, since it would make existing code faster. I had assumed it was a hard problem since this issue has been open for a long time.

But I think it's still fair to say it becomes almost impossible to define an efficient iterate when the collections are "complex enough" (e.g., sparse arrays, B-trees). From this point of view, it's a good idea to stop recommending iterate when writing generic functions. Though I suppose this discussion has to happen elsewhere to keep the scope of this issue focused.

I was curious how easy it would be, so I had a quick go at it. This improves the performance, although it doesn't bring us to the performance of the nested loops:

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 66a22b7005..e3306c1033 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -345,7 +345,7 @@ module IteratorsMD
     end
     @inline function iterate(iter::CartesianIndices, state)
         valid, I = __inc(state.I, first(iter).I, last(iter).I)
-        valid || return nothing
+        valid === Val(true) || return nothing
         return CartesianIndex(I...), CartesianIndex(I...)
     end

@@ -356,15 +356,18 @@ module IteratorsMD
     end

     # increment post check to avoid integer overflow
-    @inline __inc(::Tuple{}, ::Tuple{}, ::Tuple{}) = false, ()
+    @inline __inc(::Tuple{}, ::Tuple{}, ::Tuple{}) = Val(false), ()
     @inline function __inc(state::Tuple{Int}, start::Tuple{Int}, stop::Tuple{Int})
-        valid = state[1] < stop[1]
-        return valid, (state[1]+1,)
+        if state[1] < stop[1]
+            return Val(true), (state[1]+1,)
+        else
+            return Val(false), (state[1]+1,)
+        end
     end

     @inline function __inc(state, start, stop)
         if state[1] < stop[1]
-            return true, (state[1]+1, tail(state)...)
+            return Val(true), (state[1]+1, tail(state)...)
         end
         valid, I = __inc(tail(state), tail(start), tail(stop))
         return valid, (start[1], I...)

Before:

julia> @btime sumcart_manual($A)
  129.077 ms (0 allocations: 0 bytes)
5.000437863394746e7

julia> @btime sumcart_iter($A)
  145.541 ms (0 allocations: 0 bytes)
5.000437863394746e7

After:

julia> @btime sumcart_iter($A)
  135.652 ms (29490 allocations: 617.03 KiB)
5.000437863394746e7

I'm not sure why it allocates a lot, though. The output of @code_typed optimize=true sumcart_iter(A) looks clean to me.

This gives us performance very close to the manual nested loop:

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 66a22b7005..209c3707b8 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -344,30 +344,34 @@ module IteratorsMD
         iterfirst, iterfirst
     end
     @inline function iterate(iter::CartesianIndices, state)
-        valid, I = __inc(state.I, first(iter).I, last(iter).I)
-        valid || return nothing
+        I = __inc(state.I, first(iter).I, last(iter).I)
+        I === nothing && return nothing
         return CartesianIndex(I...), CartesianIndex(I...)
     end

     # increment & carry
     @inline function inc(state, start, stop)
-        _, I = __inc(state, start, stop)
+        I = __inc(state, start, stop)
         return CartesianIndex(I...)
     end

     # increment post check to avoid integer overflow
-    @inline __inc(::Tuple{}, ::Tuple{}, ::Tuple{}) = false, ()
+    @inline __inc(::Tuple{}, ::Tuple{}, ::Tuple{}) = nothing
     @inline function __inc(state::Tuple{Int}, start::Tuple{Int}, stop::Tuple{Int})
-        valid = state[1] < stop[1]
-        return valid, (state[1]+1,)
+        if state[1] < stop[1]
+            return (state[1]+1,)
+        else
+            return nothing
+        end
     end

     @inline function __inc(state, start, stop)
         if state[1] < stop[1]
-            return true, (state[1]+1, tail(state)...)
+            return (state[1]+1, tail(state)...)
         end
-        valid, I = __inc(tail(state), tail(start), tail(stop))
-        return valid, (start[1], I...)
+        I = __inc(tail(state), tail(start), tail(stop))
+        I === nothing && return nothing
+        return (start[1], I...)
     end

     # 0-d cartesian ranges are special-cased to iterate once and only once

After:

julia> @btime sumcart_iter($A)
  133.548 ms (0 allocations: 0 bytes)
5.000437863394746e7

Not sure where the extra ~7 ms is coming from, though (the manual case is 126--129 ms on my laptop).

So incredible to watch this close before my eyes! I have been waiting almost 6 years for this. You are a hero, @johnnychen94!
