I'm a bit confused why generic_matvecmul! doesn't use @inbounds
for the forlloops, as it makes it 5x faster:
julia> import LinearAlgebra: lapack_size
julia> function generic_matvecmul!(C::AbstractVector{R}, tA, A::AbstractVecOrMat, B::AbstractVector) where R
mB = length(B)
mA, nA = lapack_size(tA, A)
if mB != nA
throw(DimensionMismatch("matrix A has dimensions ($mA,$nA), vector B has length $mB"))
end
if mA != length(C)
throw(DimensionMismatch("result C has length $(length(C)), needs length $mA"))
end
Astride = size(A, 1)
if tA == 'T' # fastest case
for k = 1:mA
aoffs = (k-1)*Astride
if mB == 0
s = zero(R)
else
s = zero(A[aoffs + 1]*B[1] + A[aoffs + 1]*B[1])
end
for i = 1:nA
s += transpose(A[aoffs+i]) * B[i]
end
C[k] = s
end
elseif tA == 'C'
for k = 1:mA
aoffs = (k-1)*Astride
if mB == 0
s = zero(R)
else
s = zero(A[aoffs + 1]*B[1] + A[aoffs + 1]*B[1])
end
for i = 1:nA
s += A[aoffs + i]'B[i]
end
C[k] = s
end
else # tA == 'N'
@inbounds for i = 1:mA
if mB == 0
C[i] = zero(R)
else
C[i] = zero(A[i]*B[1] + A[i]*B[1])
end
end
@inbounds for k = 1:mB
aoffs = (k-1)*Astride
b = B[k]
for i = 1:mA
C[i] += A[aoffs + i] * b
end
end
end
C
end
generic_matvecmul! (generic function with 1 method)
julia> n = 1000; A = randn(n,n); x = randn(n); y = randn(n);
julia> @time generic_matvecmul!(y, 'N', A, x);
0.000530 seconds (4 allocations: 160 bytes)
julia> @time LinearAlgebra.generic_matvecmul!(y, 'N', A, x);
0.001840 seconds (4 allocations: 160 bytes)
Regarding the benchmarks, I imagine you performed warmup runs but omitted them from the above (and are aware of BenchmarkTools)? :)
The conclusion to https://github.com/JuliaLang/julia/issues/20469 suggests that judicious use of @inbounds
in such generic methods should be alright.
OK, fine, 3x speedup:
julia> @btime generic_matvecmul!(y, 'N', A, x);
284.819 渭s (0 allocations: 0 bytes)
julia> @btime LinearAlgebra.generic_matvecmul!(y, 'N', A, x);
840.900 渭s (0 allocations: 0 bytes)
Most helpful comment
OK, fine, 3x speedup: