Compared to manual iteration, the new cartesian iterator has a small performance penalty:
# Sum every entry of a matrix with an explicit column-major double loop.
# This is the hand-written baseline the cartesian iterator is compared against.
function sumcart_manual(A::AbstractMatrix)
    acc = 0.0
    @inbounds for col = 1:size(A, 2)
        for row = 1:size(A, 1)
            acc += A[row, col]
        end
    end
    return acc
end
# Sum every entry by walking CartesianIndices — the iterator-based variant
# whose generated code this issue compares against the manual loop.
function sumcart_iter(A)
    acc = 0.0
    inds = CartesianIndices(size(A))
    @inbounds for idx in inds
        acc += A[idx]
    end
    return acc
end
A = rand(10^4,10^4);
sumcart_manual(A);
sumcart_iter(A);
julia> @time sumcart_manual(A)
elapsed time: 0.176363505 seconds (96 bytes allocated)
4.999591778337127e7
julia> @time sumcart_iter(A)
elapsed time: 0.250890195 seconds (96 bytes allocated)
4.999591778337127e7
In this gist, I compare the code that gets generated. The comparison reveals that the CartesianIndex
is not fully elided by the compiler.
Any ideas what's happening?
EDIT: the core problem is that next
contains a branch, and testing done
is a 2nd branch, so there are two branches per iteration. See https://github.com/JuliaLang/julia/issues/16035#issuecomment-214482039
Here's an example that doesn't rely on all the code-generation facilities in the iterators in Base:
# Minimal 2-d cartesian iterator written against the pre-0.7 three-function
# iteration protocol (start/next/done). Raw2 doubles as both the iterable
# (holding the dims) and the iteration state (holding the current position).
module Iter
import Base: start, next, done, getindex
# Immutable pair of Ints (old `immutable` keyword, Julia <= 0.5).
immutable Raw2
i1::Int
i2::Int
end
# Construct from a dims tuple (old `(Int, Int)` tuple-type syntax).
Raw2(dims::(Int,Int)) = Raw2(dims[1], dims[2])
# Iteration starts at position (1,1).
start(sz::Raw2) = Raw2(1,1)
# Advance column-major: bump i1 until it reaches sz.i1, then carry into i2.
# Returns the (current value, next state) pair; the carry branch here is the
# one the issue identifies as blocking LLVM's optimization.
@inline next(sz::Raw2, state::Raw2) = state.i1 < sz.i1 ?
(state, Raw2(state.i1+1,state.i2)) :
(state, Raw2(1, state.i2+1))
# Finished once the slow coordinate has run past the second dimension.
done(sz::Raw2, state::Raw2) = state.i2 > sz.i2
# Allow indexing an array directly with a Raw2 position.
getindex(A::AbstractArray, I::Raw2) = A[I.i1,I.i2]
end
# Sum the elements of A by driving the custom Iter.Raw2 iterator defined above.
function mysum(A)
    total = 0.0
    @inbounds for idx in Iter.Raw2(size(A))
        total += A[idx]
    end
    return total
end
# Reference implementation: plain nested loops over columns then rows.
function mysum2(A)
    total = 0.0
    @inbounds for col = 1:size(A, 2)
        for row = 1:size(A, 1)
            total += A[row, col]
        end
    end
    return total
end
Then do
A = reshape(1:12, 3, 4)
@code_llvm mysum(A)
@code_llvm mysum2(A)
You should see that the code for mysum
is not nearly as well-optimized as mysum2
.
@simonster, while we're on the topic of SubArrays and indexing code generation, if you have time I'd be grateful for any insights you have here.
Comparing mysum
and mysum2
for A = rand(10^4, 10^4)
reveals that there's a 40% speed gap, too. The critical section of @code_llvm
for mysum
is
L3: ; preds = %L2, %if1
%_var0.0 = phi [2 x %Raw2] [ %41, %L2 ], [ %37, %if1 ]
%42 = extractvalue [2 x %Raw2] %_var0.0, 0, !dbg !796, !julia_type !803
%43 = extractvalue [2 x %Raw2] %_var0.0, 1, !dbg !796, !julia_type !803
%44 = extractvalue %Raw2 %42, 0, !dbg !804
%45 = add i64 %44, -1, !dbg !804
%46 = extractvalue %Raw2 %42, 1, !dbg !804
%47 = add i64 %46, -1, !dbg !804
%48 = mul i64 %47, %13, !dbg !804
%49 = add i64 %45, %48, !dbg !804
%50 = getelementptr double* %28, i64 %49, !dbg !804
%51 = load double* %50, align 8, !dbg !804, !tbaa %jtbaa_user
%52 = fadd double %s.0, %51, !dbg !804
%53 = extractvalue %Raw2 %43, 1, !dbg !804
%54 = icmp slt i64 %26, %53, !dbg !804
br i1 %54, label %L6, label %L, !dbg !804
compared to
L2: ; preds = %L2.preheader, %L2
%s.1 = phi double [ %24, %L2 ], [ %s.0, %L2.preheader ]
%"#s10.0" = phi i64 [ %20, %L2 ], [ 1, %L2.preheader ]
%20 = add i64 %"#s10.0", 1, !dbg !798
%21 = add i64 %19, %"#s10.0", !dbg !805
%22 = getelementptr double* %10, i64 %21, !dbg !805
%23 = load double* %22, align 8, !dbg !805, !tbaa %jtbaa_user
%24 = fadd double %s.1, %23, !dbg !805
%25 = icmp eq i64 %"#s10.0", %15, !dbg !805
br i1 %25, label %L5, label %L2, !dbg !805
for mysum2
.
Anything obvious to change to narrow the gap between these?
This generates better code for me:
# Alternative protocol methods using 0-based state so both the carry test and
# the termination test become equality (`==`) comparisons instead of orderings.
start(sz::Raw2) = Raw2(0,0)
# Increment with the carry folded into a single `if`; still returns the
# (value, next-state) pair required by the old start/next/done protocol.
@inline function next(sz::Raw2, state::Raw2)
i1 = state.i1+1
i2 = state.i2
if i1 == sz.i1
# Carry: wrap the fast index and bump the slow one.
i1 = 0
i2 = state.i2+1
end
return (state, Raw2(i1, i2))
end
# Termination is an equality test thanks to the 0-based state.
done(sz::Raw2, state::Raw2) = state.i2 == sz.i2
# 0-based state means the array access shifts each coordinate by one.
getindex(A::AbstractArray, I::Raw2) = A[I.i1+1,I.i2+1]
This is ~5% slower on my system instead of ~20%. But both the original code and this change are still suboptimal because the inner loop has a branch:
L: ; preds = %L.preheader, %L2
%s.0 = phi double [ %24, %L2 ], [ 0.000000e+00, %L.preheader ]
%"#s2307.0" = phi %Raw2 [ %19, %L2 ], [ zeroinitializer, %L.preheader ]
%9 = ptrtoint %jl_value_t* %8 to i64
%10 = extractvalue %Raw2 %"#s2307.0", 0, !dbg !1614
%11 = add i64 %10, 1, !dbg !1614
%12 = extractvalue %Raw2 %"#s2307.0", 1, !dbg !1614
%13 = icmp eq i64 %11, %9, !dbg !1614
br i1 %13, label %if1, label %L2, !dbg !1614
if1: ; preds = %L
%14 = add i64 %12, 1, !dbg !1614
br label %L2, !dbg !1614
L2: ; preds = %L, %if1
%_var1.0 = phi i64 [ %11, %L ], [ 0, %if1 ]
%_var2.0 = phi i64 [ %12, %L ], [ %14, %if1 ]
%15 = ptrtoint %jl_value_t* %2 to i64
%16 = ptrtoint %jl_value_t* %8 to i64
%17 = bitcast i8* %6 to double*
%18 = insertvalue %Raw2 undef, i64 %_var1.0, 0, !dbg !1614
%19 = insertvalue %Raw2 %18, i64 %_var2.0, 1, !dbg !1614, !julia_type !1621
%20 = mul i64 %12, %16, !dbg !1622
%21 = add i64 %20, %10, !dbg !1622
%22 = getelementptr double* %17, i64 %21, !dbg !1622
%23 = load double* %22, align 8, !dbg !1622, !tbaa %jtbaa_user
%24 = fadd double %s.0, %23, !dbg !1622
%25 = icmp eq i64 %_var2.0, %15, !dbg !1622
br i1 %25, label %L5, label %L, !dbg !1622
This is pretty much what you'd expect given the iteration protocol. In Julia, it would look something like:
@label start
newi1 = i1 + 1
if newi1 == sz1
newi1 = 0
newi2 += 1
else
newi2 = i2
end
ind = i2*sz1+i1
@inbounds s += A[ind+1]
newi2 == sz2 && @goto done
i1 = newi1
i2 = newi2
@goto start
@label done
(This generates exactly the same assembly for the inner loop). Rearranging this a little allows LLVM to optimize it to match the performance of mysum2
:
@label start
ind = i2*sz1+i1
@inbounds s += A[ind+1]
newi1 = i1 + 1
if newi1 == sz1
newi1 = 0
newi2 += 1
newi2 == sz2 && @goto done
else
newi2 = i2
end
i1 = newi1
i2 = newi2
@goto start
@label done
But I can't see how to express this in the current iteration framework.
edit: Ref iteration protocol issues #6125, #8149, #9182
Worth keeping in mind wrt https://github.com/JuliaLang/julia/issues/8149 – if we could make it easier to express internal iteration _and_ get a performance bump at the same time, that would be the best.
Thanks a million, @simonster. The impact of these "stylistic" differences is quite surprising to me---I've learned to lean on LLVM to do the right thing, but this is a good counterexample.
I'll look into adopting your version for our code generation for next
.
Hmm, on my system, my original version is _faster_ than your modification. This is weird.
I tried on another system, which I probably should have done earlier. On my Sandy Bridge system on Linux, mysum
is 37% slower than mysum2
with your original version and 35% slower than mysum2
with my modifications. On my Haswell system on OS X, mysum
is 22% slower than mysum2
with your original version and 7% slower than mysum2
with my modifications. Both systems are running LLVM 3.3 and the generated assembly is identical, so the performance difference seems to be in the architecture. I'm just benchmarking with:
# Accumulate 50 timing repetitions of each variant and return the ratio
# mysum/mysum2 (values > 1 mean the iterator version is slower).
function bench(A)
    titer = 0.0
    tmanual = 0.0
    for _ = 1:50
        titer += @elapsed mysum(A)
        tmanual += @elapsed mysum2(A)
    end
    return titer / tmanual
end
From reading a bit more, it seems that the cost of branches is a big difference among architectures (mine's a Sandy Bridge).
Just for fun I tried #9182, but by itself it didn't change the performance. I wondered if it did the nextstate
at the end of the loop, rather than right after nextval
, if it might fare better---you wouldn't need to essentially have two state variables (old and new).
To test this, I followed your lead and wrote out some explicit versions using while
, and got pretty interesting results:
# Variant 1: compute the *next* (row, col) before loading, commit it after;
# the increment sits at the top of the loop body.
function mysum_while1(A)
    acc = 0.0
    row = col = 1
    nextrow = nextcol = 1
    while col <= size(A, 2)
        if nextrow >= size(A, 1)
            nextrow = 1
            nextcol += 1
        else
            nextrow += 1
        end
        @inbounds acc += A[row, col]
        row = nextrow
        col = nextcol
    end
    return acc
end
# Variant 2: load first, then advance; a single (row, col) state pair.
function mysum_while2(A)
    acc = 0.0
    row = col = 1
    while col <= size(A, 2)
        @inbounds acc += A[row, col]
        if row >= size(A, 1)
            row = 1
            col += 1
        else
            row += 1
        end
    end
    return acc
end
# Variant 3: textbook nested while loops, one counter per dimension.
function mysum_while3(A)
    acc = 0.0
    col = 1
    while col <= size(A, 2)
        row = 1
        while row <= size(A, 1)
            @inbounds acc += A[row, col]
            row += 1
        end
        col += 1
    end
    return acc
end
# Variant 4: boolean loop flag refreshed only inside the carry branch;
# the increment happens before the load.
function mysum_while4(A)
    acc = 0.0
    row = col = 1
    nextrow = nextcol = 1
    keepgoing = true
    while keepgoing
        nextrow += 1
        if nextrow > size(A, 1)
            nextrow = 1
            nextcol += 1
            keepgoing = nextcol <= size(A, 2)
        end
        @inbounds acc += A[row, col]
        row = nextrow
        col = nextcol
    end
    return acc
end
# Variant 5: load first, advance after; the loop is controlled by a boolean
# that is refreshed only when the column carries.
function mysum_while5(A)
    acc = 0.0
    row = col = 1
    keepgoing = true
    while keepgoing
        @inbounds acc += A[row, col]
        row += 1
        if row > size(A, 1)
            row = 1
            col += 1
            keepgoing = col <= size(A, 2)
        end
    end
    return acc
end
# Variant 6: like variant 5, but the column bound is re-tested in the while
# condition instead of being cached in a boolean flag.
function mysum_while6(A)
    acc = 0.0
    row = col = 1
    while col <= size(A, 2)
        @inbounds acc += A[row, col]
        row += 1
        if row > size(A, 1)
            row = 1
            col += 1
        end
    end
    return acc
end
Timing results after warmup:
julia> @time mysum_while1(A)
elapsed time: 0.258024361 seconds (96 bytes allocated)
4.9997446791483015e7
julia> @time mysum_while2(A)
elapsed time: 0.216438899 seconds (96 bytes allocated)
4.9997446791483015e7
julia> @time mysum_while3(A)
elapsed time: 0.157289985 seconds (96 bytes allocated)
4.9997446791483015e7
julia> @time mysum_while4(A)
elapsed time: 0.227041972 seconds (96 bytes allocated)
4.9997446791483015e7
julia> @time mysum_while5(A)
elapsed time: 0.140768946 seconds (96 bytes allocated)
4.9997446791483015e7
julia> @time mysum_while6(A)
elapsed time: 0.190532756 seconds (96 bytes allocated)
4.9997446791483015e7
So, at least on my machine
while
testing seems to help. Interestingly, the iterators in multidimensional
already use the boolean version, so it might just be a case of rewriting the way next
works. But first, can you check on your system that mysum_while5
is among the best performers?
One more:
# Variant 7: load first; the done-flag update lives at the deepest level of
# the carry branch, which is the combination the thread finds fastest.
function mysum_while7(A)
    acc = 0.0
    row = col = 1
    keepgoing = true
    while keepgoing
        @inbounds acc += A[row, col]
        if row >= size(A, 1)
            row = 1
            col += 1
            keepgoing = col <= size(A, 2)
        else
            row += 1
        end
    end
    return acc
end
is just as fast as the fastest ones above. So it really seems to come down to (1) putting the increment at the end, and (2) using a boolean for the while
testing (and updating that boolean at the deepest level of the increment testing).
The test case in Tim's StackOverflow answer is much improved with the Tuple overhaul. I believe enumerate and zip were slow previously due to nested tuples. That seems to no longer be the case after the tupocolypse! :fireworks:
Unfortunately, the main test case in this PR (for Cartesian Indexing) isn't affected.
Looks like #12090 also solved this guy!
julia> @time sumcart_manual(A); @time sumcart_iter(A); # Edit: cheating with eachindex
142.564 milliseconds (5 allocations: 176 bytes)
142.117 milliseconds (5 allocations: 176 bytes)
And the @code_llvm
is actually _simpler_ for sumcart_iter
since it's only one loop: gist with llvm output.
The Cartesian objects aren't elided for LinearSlow arrays, but I can't measure a performance difference with sparse arrays:
julia> A = sprand(10^3,10^3,.1);
julia> @time sumcart_manual(A); @time sumcart_iter(A)
35.287 milliseconds (5 allocations: 176 bytes)
35.934 milliseconds (5 allocations: 176 bytes)
50221.35105393453
Not true for me. I updated the code sample at the top; the change in eachindex
to use linear indexing, when applicable, might be confusing?
Duh. That's the second time I've done that, isn't it?
I'm not smart enough to keep track. And really, given how much you've improved Array-type operations, why would I want to??
Current situation:
julia> @btime sumcart_manual(A);
98.752 ms (1 allocation: 16 bytes)
julia> @btime sumcart_iter(A);
126.430 ms (1 allocation: 16 bytes)
Linking #18823 and https://github.com/JuliaLang/julia/issues/16035#issue-150794573.
Seems a bit worse than https://github.com/JuliaLang/julia/issues/9080#issuecomment-275629478 now.
julia> @btime sumcart_manual(A)
100.859 ms (1 allocation: 16 bytes)
5.000302140597446e7
julia> @btime sumcart_iter(A)
204.743 ms (1 allocation: 16 bytes)
5.000302140597446e7
I've commented this already in https://github.com/JuliaLang/julia/issues/15648#issuecomment-511048192. But I don't think iterator is a good abstraction to handle this. I think the approach based on foreach
/foldl
/reduce
would be much better.
Here is a minimal implementation of foreach
to handle CartesianIndices
-like iteration:
# foreach-style driver for CartesianIndices: lowers the cartesian walk to
# nested `for` loops so the *implementer* — not the caller — owns the loop.
# Axes are reversed so the first (fastest) index ends up in the innermost loop.
@inline function foreach′(f, I::CartesianIndices)
    wrapped = (args...) -> f(CartesianIndex(reverse(args)...))
    return foreach_product(wrapped, reverse(I.indices)...)
end
# Recursive case: iterate the leading axis, closing over the current value and
# recursing on the remaining axes. `f::F` forces specialization on the closure.
@inline function foreach_product(f::F, itr, rest...) where F
    @inbounds for v in itr
        @inline inner(args...) = f(v, args...)
        foreach_product(inner, rest...)
    end
end
# Base case: a single axis — apply f to every element.
@inline function foreach_product(f::F, itr) where F
    @inbounds for v in itr
        f(v)
    end
end
(Note: the order of axes in foreach_product
is reversed compared to Iterators.product
.)
Benchmark code:
using BenchmarkTools
# Copy B into A with explicit nested loops (hand-written baseline for the
# benchmark). Axes must match, as asserted up front.
function copyto_manual!(A, B)
    @assert axes(A) == axes(B)
    @inbounds for col = 1:size(A, 2)
        for row = 1:size(A, 1)
            A[row, col] = B[row, col]
        end
    end
end
# Copy B into A by handing an element-wise kernel to foreach′, which owns
# the loop over the shared index set.
function copyto_foreach!(A, B)
    kernel = function (I)
        @inbounds A[I] = B[I]
    end
    foreach′(kernel, eachindex(A, B))
end
# Copy B into A via the generic iterate-based `for` over eachindex — the slow
# path this issue is about when the index set is CartesianIndices.
function copyto_iterate!(A, B)
    @inbounds for idx in eachindex(A, B)
        A[idx] = B[idx]
    end
end
B = rand(10^2, 10^2);
A = similar(B');
Results:
julia> @btime copyto_manual!($A, $B');
6.401 μs (1 allocation: 16 bytes)
julia> @btime copyto_foreach!($A, $B');
6.243 μs (1 allocation: 16 bytes)
julia> @btime copyto_iterate!($A, $B');
16.035 μs (1 allocation: 16 bytes)
As you can see, the performance of foreach
-based code is equivalent to the manual loop and much faster than the iterate
-based code.
For implementing summation as in the OP, we need foldl
and/or reduce
. It is actually a better strategy to implement foreach
based on foldl
to keep things DRY. But I chose foreach
as the MWE as I thought it may be simpler to understand.
Love the implementation!
But I don't think iterator is a good abstraction to handle this. I think the approach based on foreach/foldl/reduce would be much better.
I am excited about the transducer approach and need to learn & use it more, but I don't think iterators are "wrong." A lot of the virtue of CartesianIndex iteration is being able to perform manipulations on the indexes for stencil operations. Is it easy to write all the operations in https://julialang.org/blog/2016/02/iteration/ in terms of transducers?
I was trying to bring up the difference between foldl
and "iterator", or more specifically the iterate
function, as the _implementation_ of, well, iteration (maybe it's confusing). In particular, I have no criticism on the _concrete interfaces_ like eachindex
, CartesianIndices
, etc. They are all fantastic! What I wanted to point out was that (1) implementing specific foldl
for them (i.e., bypassing iterate
) can yield a huge speedup gain sometimes and (2) "functional" entry points like foreach
will be more favorable once you have that.
Iterator has been _the_ way to provide customizable iterations in many languages so I can imagine my point may be very fuzzy. But actually iterate
and foldl
are equally powerful formulation of customizable iterations (or rather foldl
can be extended to be so). I think it is conceivable to have a programming language where the for
loop syntax is lowered to foldl
rather than iterate
. Of course, I'm not suggesting to do this as it would be just way too destructive. Furthermore, foldl
and iterate
are good at different things. foldl
is good at consuming collections and iterate
is good at constructing them. So, having both of them may ultimately be a good idea. I think it's like any other Julia functions; you add a performance specialization on top of a generic fallback when it is a big deal.
Is it easy to write all the operations in https://julialang.org/blog/2016/02/iteration/ in terms of transducers?
(BTW, just a bit of terminology nitpick (sorry!). You may be using "transducers" as a broad term where I do some functional stuff, but what I'm presenting here is nothing to do with transducers _per se_ and instead it's about foldl
. Transducers are tools for using foldl
; just like Base.Iterators
is a tool kit for using for
loop.)
Transforming for
loop to foldl
or foreach
is straightforward. You can just replace for x in xs
to foreach(xs) do x
if there is no state (as in copyto!
). When you have state, you can put it in the accumulator of foldl
:
# Track the running minimum and maximum over xs with a plain loop, starting
# from the element type's extreme sentinels.
function extrema_loop(xs)
    lo = typemax(eltype(xs))
    hi = typemin(eltype(xs))
    for v in xs
        lo = min(lo, v)
        hi = max(hi, v)
    end
    return (lo, hi)
end
becomes
# The same computation phrased as a foldl: the (min, max) pair travels in the
# accumulator instead of in mutable locals.
function extrema_foldl(xs)
    lo = typemax(eltype(xs))
    hi = typemin(eltype(xs))
    return foldl(xs, init=(lo, hi)) do (lo, hi), v
        (min(lo, v), max(hi, v))
    end
end
But I think more important question is how to expose the algorithms in the blog post in reusable APIs and how to optimize them. Maybe a generalized version of boxcar3
is a good example. You can define foreach_boxcar(f, A::AbstractArray, ::Val{width})
such that boxcar3
can be written as
# 3x3x… boxcar (moving-average) filter expressed through foreach_boxcar:
# each output element is the mean of the width-3 block centered at I
# (clamped at the edges by foreach_boxcar).
# Fix: the original declared `where {F}` but never used F in the signature —
# a vestigial, unused type variable; it is removed here.
function boxcar3(A::AbstractArray)
    out = similar(A)
    foreach_boxcar(A, Val(3)) do I, block
        out[I] = mean(block)
    end
    return out
end
This API makes writing other variants (say using median
instead of mean
) very easy.
An implementation of foreach_boxcar
is something like this
using StaticArrays
# Extract the compile-time payload carried by a Val instance.
function _valof(::Val{x}) where {x}
    return x
end
# Apply `f(I, block)` at every index I of A, where `block` is the
# width×…×width neighborhood of A centered at I (clamped to the array at the
# borders). Runs in two phases so the interior can pass a fixed-shape SArray
# while the edges fall back to views of varying shape — demonstrating the
# "implementer controls the loop" pattern advocated in this thread.
function foreach_boxcar(f, A::AbstractArray{<:Any, N}, width::Val = Val(3)) where {N}
# For simplicity:
@assert isodd(_valof(width))
@assert all(s -> s >= _valof(width), size(A))
R = CartesianIndices(A)
# Unit step in every dimension; I-I1:I+I1 spans the centered block.
I1 = oneunit(first(R))
# Phase 1: Loop over the "main" part (w/o edges).
main = ntuple(N) do d
k = _valof(width) ÷ 2
k+1:size(A, d)-k
end
# Pass a static array to `f`, assuming that `width` is small:
satype = SArray{Tuple{ntuple(_ -> _valof(width), N)...}, eltype(A)}
for I in R[main...]
f(I, satype(view(A, I-I1:I+I1)))
end
# Note: We can use `foreach(R[main...]) do I` for better
# performance.
# Phase 2: Loop over the edges.
Ifirst, Ilast = first(R), last(R)
# Enumerate the 3^N - 1 edge sectors via a (-1, 0, 1) tag per dimension;
# the all-zero tag is the interior already handled in phase 1.
for kinds in Iterators.product(ntuple(_ -> (-1, 0, 1), N)...)
all(iszero, kinds) && continue
ranges = ntuple(N) do d
k = _valof(width) ÷ 2
if kinds[d] == -1 # left edge
1:k
elseif kinds[d] == 1 # right edge
size(A, d)-k+1:size(A, d)
else # main
k+1:size(A, d)-k
end
end
for I in R[ranges...]
# Clamp the block to the array bounds at the border.
J = max(Ifirst, I-I1):min(Ilast, I+I1)
# Pass a view, as the shapes are heterogeneous:
f(I, view(A, J))
end
end
end
I did a bit of optimization that uses SArray
while looping over the "main" part. This is to demonstrate that, unlike iterate
, you can have heterogeneous "element types" (where element is a pair (I, block)
) without invoking type-instability in the functional approach. This is because you can call f
in multiple code locations.
The important point is that it's the _implementer of foreach
_ who controls the loop. When you expose this with iterate
it's the other way around; i.e., the caller controls the loop. You can do _a lot_ of optimization if you control the loop. In foreach_boxcar
, I can pass SArray
without type instability because I can split the loop in to phases. In foreach_product
, I can "lower" Cartesian iteration to nested for loop because I control the loop. Furthermore, this pattern is composable; I could use foreach_product
from foreach_boxcar
.
No, I understand that inversion of control can be very useful. But notice that in the key portion of your code here you're using iteration, not foldl
. That was my only point. Some things are easier to write using iteration, so it's not a "bad abstraction" (you're using it yourself).
for
loop syntax is just sugar. I "ended up" using iterate
because Julia's for
loop is happened to be lowered to it. It is nothing to do with iterate
being "good" or "bad." As I said, it is possible to have a language where for
loop is lowered to foldl
. In such a language, you'd write foldl
for basic concepts like UnitRange
using, e.g., while
loop. You don't _need_ to have iterate
in such a language. It's a trivial statement, but, if for
loops were lowered to foldl
in Julia, my code didn't use iterate
(although then there is no need to use foreach
or foldl
directly like this). Another way to put this: I could, in principle, write foreach_boxcar
using while
loop plus some abstractions built on top it.
Interestingly, the performance regression seems to go away when adding @simd
to the Cartesian examples, even when it should be irrelevant, as in the copyto!
case:
julia> @btime copyto_iterate!($A, $B');
13.590 μs (1 allocation: 16 bytes)
julia> @btime copyto_manual!($A, $B');
3.086 μs (1 allocation: 16 bytes)
julia> @btime copyto_foreach!($A, $B');
3.348 μs (3 allocations: 80 bytes)
julia> function copyto_iterate!(A, B)
@inbounds @simd for I in eachindex(A, B)
A[I] = B[I]
end
end
copyto_iterate! (generic function with 1 method)
julia> @btime copyto_iterate!($A, $B');
3.012 μs (1 allocation: 16 bytes)
Or, the example from the opening post:
julia> A = rand(256,20);
julia> function sumcart_manual(A::AbstractMatrix)
s = 0.0
@inbounds for j = 1:size(A,2), i = 1:size(A,1)
s += A[i,j]
end
s
end
sumcart_manual (generic function with 1 method)
julia> function sumcart_iter(A)
s = 0.0
@inbounds for I in CartesianIndices(size(A))
s += A[I]
end
s
end
sumcart_iter (generic function with 1 method)
julia> @btime sumcart_manual($A)
4.432 μs (0 allocations: 0 bytes)
2556.470985732226
julia> @btime sumcart_iter($A)
4.640 μs (0 allocations: 0 bytes)
2556.470985732226
julia> function sumcart_manual(A::AbstractMatrix)
s = 0.0
@inbounds for j = 1:size(A,2); @simd for i = 1:size(A,1)
s += A[i,j]
end; end
s
end
sumcart_manual (generic function with 1 method)
julia> function sumcart_iter(A)
s = 0.0
@inbounds @simd for I in CartesianIndices(size(A))
s += A[I]
end
s
end
sumcart_iter (generic function with 1 method)
julia> @btime sumcart_manual($A)
333.601 ns (0 allocations: 0 bytes)
2556.470985732225
julia> @btime sumcart_iter($A)
332.655 ns (0 allocations: 0 bytes)
2556.470985732225
Because of these methods, @simd
converts the loop into the equivalent of
R = CartesianIndices(A)
for Itail in CartesianIndices(Base.tail(R.indices)), i in R.indices[1]
s += A[i, Itail]
end
By splitting out the first loop you mitigate but don't actually fix the problem. You should still be able to detect it if, for example, A
is 3-dimensional and the first dimension is very short (e.g., size 1 or 2). Note that the magnitude of the effect is very processor-dependent.
Last time I investigated, the real problem with performance in the non-@simd
version is that there are a minimum of two branches per iteration: one to test whether we need to perform a "carry" on the iterator, and another to decide whether iterate
returned nothing
and the loop should terminate (if you have to perform multiple carries it's even worse). Theoretically the optimizer should be able to realize that iterate
won't return nothing
if it doesn't carry, and then reorder the operations, but that doesn't seem to happen. As @tkf says, that optimization happens almost automatically for foldl
because by calling front
you can do that peeling one iterator at a time (#35036), so you never even need to implement a carry operation.
But we have to fix this anyway, and I don't think it will be hard. I was hoping it would arise from a more general optimization but perhaps it's time to add a compiler pass that detects calls to iterate(::CartesianIndex{N})
and rewrites the following code block using a set of N
nested loops. I should just sit down and do it, it might be an interesting experiment in the context of evaluating deeper integration with LoopVectorization.
If it is easy to fix I agree it's a good idea to do this since it would make existing code faster. I assumed that it was a hard problem since it has been opened for a long time.
But I think it's still fair to say it becomes almost impossible to define efficient iterate
when the collections are "complex enough" (e.g., sparse arrays, B-trees). From this point of view, it's a good idea to stop recommending iterate
when writing generic functions. Though I suppose this discussion has to happen elsewhere to keep the scope of this issue simple.
I was curious how easy it is so I had a quick go at it. This improves the performance although it doesn't bring us to the performance of the nested loops:
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 66a22b7005..e3306c1033 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -345,7 +345,7 @@ module IteratorsMD
end
@inline function iterate(iter::CartesianIndices, state)
valid, I = __inc(state.I, first(iter).I, last(iter).I)
- valid || return nothing
+ valid === Val(true) || return nothing
return CartesianIndex(I...), CartesianIndex(I...)
end
@@ -356,15 +356,18 @@ module IteratorsMD
end
# increment post check to avoid integer overflow
- @inline __inc(::Tuple{}, ::Tuple{}, ::Tuple{}) = false, ()
+ @inline __inc(::Tuple{}, ::Tuple{}, ::Tuple{}) = Val(false), ()
@inline function __inc(state::Tuple{Int}, start::Tuple{Int}, stop::Tuple{Int})
- valid = state[1] < stop[1]
- return valid, (state[1]+1,)
+ if state[1] < stop[1]
+ return Val(true), (state[1]+1,)
+ else
+ return Val(false), (state[1]+1,)
+ end
end
@inline function __inc(state, start, stop)
if state[1] < stop[1]
- return true, (state[1]+1, tail(state)...)
+ return Val(true), (state[1]+1, tail(state)...)
end
valid, I = __inc(tail(state), tail(start), tail(stop))
return valid, (start[1], I...)
Before:
julia> @btime sumcart_manual($A)
129.077 ms (0 allocations: 0 bytes)
5.000437863394746e7
julia> @btime sumcart_iter($A)
145.541 ms (0 allocations: 0 bytes)
5.000437863394746e7
After:
julia> @btime sumcart_iter($A)
135.652 ms (29490 allocations: 617.03 KiB)
5.000437863394746e7
I'm not sure why it allocates a lot, though. The output of @code_typed optimize=true sumcart_iter(A)
looks clean to me.
This gives us the performance very close to the manual nested loop:
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 66a22b7005..209c3707b8 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -344,30 +344,34 @@ module IteratorsMD
iterfirst, iterfirst
end
@inline function iterate(iter::CartesianIndices, state)
- valid, I = __inc(state.I, first(iter).I, last(iter).I)
- valid || return nothing
+ I = __inc(state.I, first(iter).I, last(iter).I)
+ I === nothing && return nothing
return CartesianIndex(I...), CartesianIndex(I...)
end
# increment & carry
@inline function inc(state, start, stop)
- _, I = __inc(state, start, stop)
+ I = __inc(state, start, stop)
return CartesianIndex(I...)
end
# increment post check to avoid integer overflow
- @inline __inc(::Tuple{}, ::Tuple{}, ::Tuple{}) = false, ()
+ @inline __inc(::Tuple{}, ::Tuple{}, ::Tuple{}) = nothing
@inline function __inc(state::Tuple{Int}, start::Tuple{Int}, stop::Tuple{Int})
- valid = state[1] < stop[1]
- return valid, (state[1]+1,)
+ if state[1] < stop[1]
+ return (state[1]+1,)
+ else
+ return nothing
+ end
end
@inline function __inc(state, start, stop)
if state[1] < stop[1]
- return true, (state[1]+1, tail(state)...)
+ return (state[1]+1, tail(state)...)
end
- valid, I = __inc(tail(state), tail(start), tail(stop))
- return valid, (start[1], I...)
+ I = __inc(tail(state), tail(start), tail(stop))
+ I === nothing && return nothing
+ return (start[1], I...)
end
# 0-d cartesian ranges are special-cased to iterate once and only once
After:
julia> @btime sumcart_iter($A)
133.548 ms (0 allocations: 0 bytes)
5.000437863394746e7
Not sure where extra ~7 ms is coming from, though (the manual case is 126--129 ms in my laptop).
So incredible to watch this close before my eyes! I have been waiting almost 6 years for this. You are a hero, @johnnychen94!
Most helpful comment
So incredible to watch this close before my eyes! I have been waiting almost 6 years for this. You are a hero, @johnnychen94!