Julia: Multiply and add instructions on loop reduction variables not combined with LLVM >= 8.0.1

Created on 13 Dec 2019  ·  6 Comments  ·  Source: JuliaLang/julia

Example code:

# Reduction that sums a[i] * b[i] over every index of `a` and returns the sum.
# This is the `@simd for` variant of the reproducer.
function dotf(a,b)
    s = 0.0  # reduction variable; starts as a Float64 zero
    # NOTE(review): the loop only iterates the indices of `a`, and @inbounds is
    # applied, so `b` is assumed to be valid at each of those indices -- nothing
    # here checks that.
    @inbounds @simd for i ∈ eachindex(a)
        s += a[i] * b[i]  # multiply followed by add into the reduction variable
    end
    s
end
# Same reduction as `dotf`, written as a manual `while` loop that reads the
# elements through raw pointers, with @fastmath applied to the update statement.
function dotw(a,b)
    # s: accumulator; ptra/ptrb: raw pointers obtained from a and b;
    # n: zero-based element counter; N: number of elements in a.
    # NOTE(review): only length(a) bounds the loop, so b is assumed to hold at
    # least N elements. The raw pointers presumably do not keep a and b alive on
    # their own -- confirm callers hold references (e.g. GC.@preserve).
    s = 0.0; ptra = pointer(a); ptrb = pointer(b); n = 0; N = length(a)
    while n < N
        # Byte offset of element n is (element size) * n for each array.
        @fastmath s += unsafe_load(ptra + sizeof(eltype(a))*n) * unsafe_load(ptrb + sizeof(eltype(b))*n)
        n += 1
    end
    s
end
# Inputs for the reproducer: two random vectors of 400 elements each.
a = rand(400); b = rand(400);
# Show the native assembly, then the LLVM IR, generated for each function
# when called with these arguments (debuginfo=:none is passed to each macro).
@code_native debuginfo=:none dotf(a,b)
@code_native debuginfo=:none dotw(a,b)
@code_llvm debuginfo=:none dotf(a,b)
@code_llvm debuginfo=:none dotw(a,b)

With the following version (I also tested with LLVM 8.0.1 and got the same result):

julia> versioninfo()
Julia Version 1.4.0-DEV.597
Commit e12b054* (2019-12-13 09:36 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.0 (ORCJIT, haswell)
Environment:
  JULIA_NUM_THREADS = 24

this yields

#julia> @code_native debuginfo=:none dotf(a,b)
        .text
        movq    24(%rdi), %rax
        testq   %rax, %rax
        jle     L44
        movq    %rax, %rcx
        sarq    $63, %rcx
        andnq   %rax, %rcx, %rax
        movq    (%rdi), %rcx
        movq    (%rsi), %rdx
        cmpq    $16, %rax
        jae     L49
        vxorpd  %xmm0, %xmm0, %xmm0
        xorl    %esi, %esi
        jmp     L192
L44:
        vxorps  %xmm0, %xmm0, %xmm0
        retq
L49:
        movabsq $9223372036854775792, %rsi # imm = 0x7FFFFFFFFFFFFFF0
        andq    %rax, %rsi
        vxorpd  %xmm0, %xmm0, %xmm0
        xorl    %edi, %edi
        vxorpd  %xmm1, %xmm1, %xmm1
        vxorpd  %xmm2, %xmm2, %xmm2
        vxorpd  %xmm3, %xmm3, %xmm3
L80:
        vmovupd (%rcx,%rdi,8), %ymm4
        vmovupd 32(%rcx,%rdi,8), %ymm5
        vmovupd 64(%rcx,%rdi,8), %ymm6
        vmovupd 96(%rcx,%rdi,8), %ymm7
        vmulpd  (%rdx,%rdi,8), %ymm4, %ymm4
        vaddpd  %ymm4, %ymm0, %ymm0
        vmulpd  32(%rdx,%rdi,8), %ymm5, %ymm4
        vaddpd  %ymm4, %ymm1, %ymm1
        vmulpd  64(%rdx,%rdi,8), %ymm6, %ymm4
        vmulpd  96(%rdx,%rdi,8), %ymm7, %ymm5
        vaddpd  %ymm4, %ymm2, %ymm2
        vaddpd  %ymm5, %ymm3, %ymm3
        addq    $16, %rdi
        cmpq    %rdi, %rsi
        jne     L80
        vaddpd  %ymm0, %ymm1, %ymm0
        vaddpd  %ymm0, %ymm2, %ymm0
        vaddpd  %ymm0, %ymm3, %ymm0
        vextractf128    $1, %ymm0, %xmm1
        vaddpd  %xmm1, %xmm0, %xmm0
        vpermilpd       $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
        vaddsd  %xmm1, %xmm0, %xmm0
        cmpq    %rsi, %rax
        je      L211
        nopl    (%rax)
L192:
        vmovsd  (%rcx,%rsi,8), %xmm1    # xmm1 = mem[0],zero
        vfmadd231sd     (%rdx,%rsi,8), %xmm1, %xmm0 # xmm0 = (xmm1 * mem) + xmm0
        incq    %rsi
        cmpq    %rax, %rsi
        jb      L192
L211:
        vzeroupper
        retq
        nopw    (%rax,%rax)

#julia> @code_native debuginfo=:none dotw(a,b)
        .text
        movq    8(%rdi), %rax
        testq   %rax, %rax
        jle     L32
        movq    (%rdi), %rcx
        movq    (%rsi), %rdx
        cmpq    $16, %rax
        jae     L37
        vxorpd  %xmm0, %xmm0, %xmm0
        xorl    %esi, %esi
        jmp     L176
L32:
        vxorps  %xmm0, %xmm0, %xmm0
        retq
L37:
        movq    %rax, %rsi
        andq    $-16, %rsi
        vxorpd  %xmm0, %xmm0, %xmm0
        xorl    %edi, %edi
        vxorpd  %xmm1, %xmm1, %xmm1
        vxorpd  %xmm2, %xmm2, %xmm2
        vxorpd  %xmm3, %xmm3, %xmm3
        nop
L64:
        vmovupd (%rdx,%rdi,8), %ymm4
        vmovupd 32(%rdx,%rdi,8), %ymm5
        vmovupd 64(%rdx,%rdi,8), %ymm6
        vmovupd 96(%rdx,%rdi,8), %ymm7
        vmulpd  (%rcx,%rdi,8), %ymm4, %ymm4
        vaddpd  %ymm0, %ymm4, %ymm0
        vmulpd  32(%rcx,%rdi,8), %ymm5, %ymm4
        vaddpd  %ymm1, %ymm4, %ymm1
        vmulpd  64(%rcx,%rdi,8), %ymm6, %ymm4
        vmulpd  96(%rcx,%rdi,8), %ymm7, %ymm5
        vaddpd  %ymm2, %ymm4, %ymm2
        vaddpd  %ymm3, %ymm5, %ymm3
        addq    $16, %rdi
        cmpq    %rdi, %rsi
        jne     L64
        vaddpd  %ymm0, %ymm1, %ymm0
        vaddpd  %ymm0, %ymm2, %ymm0
        vaddpd  %ymm0, %ymm3, %ymm0
        vextractf128    $1, %ymm0, %xmm1
        vaddpd  %xmm1, %xmm0, %xmm0
        vpermilpd       $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
        vaddsd  %xmm1, %xmm0, %xmm0
        cmpq    %rsi, %rax
        je      L195
        nopl    (%rax)
L176:
        vmovsd  (%rdx,%rsi,8), %xmm1    # xmm1 = mem[0],zero
        vfmadd231sd     (%rcx,%rsi,8), %xmm1, %xmm0 # xmm0 = (xmm1 * mem) + xmm0
        incq    %rsi
        cmpq    %rsi, %rax
        jne     L176
L195:
        vzeroupper
        retq
        nopw    (%rax,%rax)

The LLVM IR clearly shows the contract/fast flags. In the case of the @simd for loop:

  %30 = fmul contract <4 x double> %wide.load, %wide.load16
  %31 = fmul contract <4 x double> %wide.load13, %wide.load17
  %32 = fmul contract <4 x double> %wide.load14, %wide.load18
  %33 = fmul contract <4 x double> %wide.load15, %wide.load19
  %34 = fadd fast <4 x double> %vec.phi, %30
  %35 = fadd fast <4 x double> %vec.phi10, %31
  %36 = fadd fast <4 x double> %vec.phi11, %32
  %37 = fadd fast <4 x double> %vec.phi12, %33

and the @fastmath while loop:

  %31 = fmul fast <4 x double> %wide.load14, %wide.load
  %32 = fmul fast <4 x double> %wide.load15, %wide.load11
  %33 = fmul fast <4 x double> %wide.load16, %wide.load12
  %34 = fmul fast <4 x double> %wide.load17, %wide.load13
  %35 = fadd fast <4 x double> %31, %vec.phi
  %36 = fadd fast <4 x double> %32, %vec.phi8
  %37 = fadd fast <4 x double> %33, %vec.phi9
  %38 = fadd fast <4 x double> %34, %vec.phi10

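Yet in neither case are the vector multiply and add contracted into a fused multiply-add (only the scalar remainder loops use vfmadd231sd).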
With

julia> versioninfo()
Julia Version 1.3.1-pre.0
Commit b42f4ab* (2019-11-26 17:58 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-6.0.1 (ORCJIT, haswell)
Environment:
  JULIA_NUM_THREADS = 24

we get

julia> @code_native debuginfo=:none dotf(a,b)
        .text
        movq    24(%rdi), %rax
        testq   %rax, %rax
        jle     L32
        movq    (%rdi), %rcx
        movq    (%rsi), %rdx
        cmpq    $16, %rax
        jae     L40
        vxorpd  %xmm0, %xmm0, %xmm0
        xorl    %esi, %esi
        jmp     L176
L32:
        vxorps  %xmm0, %xmm0, %xmm0
        vzeroupper
        retq
L40:
        movq    %rax, %rsi
        andq    $-16, %rsi
        vxorpd  %xmm0, %xmm0, %xmm0
        xorl    %edi, %edi
        vxorpd  %xmm1, %xmm1, %xmm1
        vxorpd  %xmm2, %xmm2, %xmm2
        vxorpd  %xmm3, %xmm3, %xmm3
        nopw    %cs:(%rax,%rax)
L80:
        vmovupd (%rcx,%rdi,8), %ymm4
        vmovupd 32(%rcx,%rdi,8), %ymm5
        vmovupd 64(%rcx,%rdi,8), %ymm6
        vmovupd 96(%rcx,%rdi,8), %ymm7
        vfmadd231pd     (%rdx,%rdi,8), %ymm4, %ymm0
        vfmadd231pd     32(%rdx,%rdi,8), %ymm5, %ymm1
        vfmadd231pd     64(%rdx,%rdi,8), %ymm6, %ymm2
        vfmadd231pd     96(%rdx,%rdi,8), %ymm7, %ymm3
        addq    $16, %rdi
        cmpq    %rdi, %rsi
        jne     L80
        vaddpd  %ymm0, %ymm1, %ymm0
        vaddpd  %ymm0, %ymm2, %ymm0
        vaddpd  %ymm0, %ymm3, %ymm0
        vextractf128    $1, %ymm0, %xmm1
        vaddpd  %ymm1, %ymm0, %ymm0
        vhaddpd %ymm0, %ymm0, %ymm0
        cmpq    %rsi, %rax
        je      L196
        nopw    (%rax,%rax)
L176:
        vmovsd  (%rcx,%rsi,8), %xmm1    # xmm1 = mem[0],zero
        vfmadd231sd     (%rdx,%rsi,8), %xmm1, %xmm0
        addq    $1, %rsi
        cmpq    %rax, %rsi
        jb      L176
L196:
        vzeroupper
        retq
        nopl    (%rax,%rax)

julia> @code_native debuginfo=:none dotw(a,b)
        .text
        movq    8(%rdi), %rax
        testq   %rax, %rax
        jle     L32
        movq    (%rdi), %rcx
        movq    (%rsi), %rdx
        cmpq    $16, %rax
        jae     L40
        vxorpd  %xmm0, %xmm0, %xmm0
        xorl    %esi, %esi
        jmp     L176
L32:
        vxorps  %xmm0, %xmm0, %xmm0
        vzeroupper
        retq
L40:
        movq    %rax, %rsi
        andq    $-16, %rsi
        vxorpd  %xmm0, %xmm0, %xmm0
        xorl    %edi, %edi
        vxorpd  %xmm1, %xmm1, %xmm1
        vxorpd  %xmm2, %xmm2, %xmm2
        vxorpd  %xmm3, %xmm3, %xmm3
        nopw    %cs:(%rax,%rax)
L80:
        vmovupd (%rdx,%rdi,8), %ymm4
        vmovupd 32(%rdx,%rdi,8), %ymm5
        vmovupd 64(%rdx,%rdi,8), %ymm6
        vmovupd 96(%rdx,%rdi,8), %ymm7
        vfmadd231pd     (%rcx,%rdi,8), %ymm4, %ymm0
        vfmadd231pd     32(%rcx,%rdi,8), %ymm5, %ymm1
        vfmadd231pd     64(%rcx,%rdi,8), %ymm6, %ymm2
        vfmadd231pd     96(%rcx,%rdi,8), %ymm7, %ymm3
        addq    $16, %rdi
        cmpq    %rdi, %rsi
        jne     L80
        vaddpd  %ymm0, %ymm1, %ymm0
        vaddpd  %ymm0, %ymm2, %ymm0
        vaddpd  %ymm0, %ymm3, %ymm0
        vextractf128    $1, %ymm0, %xmm1
        vaddpd  %ymm1, %ymm0, %ymm0
        vhaddpd %ymm0, %ymm0, %ymm0
        cmpq    %rsi, %rax
        je      L196
        nopw    (%rax,%rax)
L176:
        vmovsd  (%rdx,%rsi,8), %xmm1    # xmm1 = mem[0],zero
        vfmadd231sd     (%rcx,%rsi,8), %xmm1, %xmm0
        addq    $1, %rsi
        cmpq    %rsi, %rax
        jne     L176
L196:
        vzeroupper
        retq
        nopl    (%rax,%rax)

Both versions show essentially the same @code_llvm, so I'd guess something changed on the LLVM-side? Maybe the createCombineMulAddPass changed somehow?

I'm not sure it's an LLVM regression, because Clang 9.0.0 doesn't seem to have this issue.

external dependencies performance upstream

Most helpful comment

Opened a patch against upstream: https://reviews.llvm.org/D71495

All 6 comments

The fact that the fmul is marked contract means that CombineMulAddPass did run correctly, since this is a backend optimization.

Clang sets the function attribute unsafe-fp-math=true when compiling with -Ofast. Setting that attribute for the Julia version causes
contraction to happen, and removing it from the Clang-produced IR stops it. https://godbolt.org/z/jBjkU5

Looking at DAGCombiner, it seems that both the fmul and the fadd need to be marked contract for the fusion to happen without unsafe-fp-math=true.
This is rather odd, since fast is supposed to imply contract: https://llvm.org/docs/LangRef.html#fast-math-flags

; │┌ @ float.jl:405 within `*'
    %29 = fmul contract <4 x double> %wide.load, %wide.load16
    %30 = fmul contract <4 x double> %wide.load13, %wide.load17
    %31 = fmul contract <4 x double> %wide.load14, %wide.load18
    %32 = fmul contract <4 x double> %wide.load15, %wide.load19
; │└
; │┌ @ float.jl:401 within `+'
    %33 = fadd contract <4 x double> %vec.phi, %29
    %34 = fadd contract <4 x double> %vec.phi10, %30
    %35 = fadd contract <4 x double> %vec.phi11, %31
    %36 = fadd contract <4 x double> %vec.phi12, %32
; │└

See https://godbolt.org/z/whUcJK and try switching from llc 6.0.0 to llc 7.0.0.

Interesting.
So, I take it that @code_llvm optimize=true shows llvm after all the passes in the jitlayers have run, and because profitability of combining is target-dependent, this waits until back end specific lowering?
Even though some backend-specifics (vector width) already show up?

fast not implying contract sounds like an llvm bug. Are you filing a bug report with LLVM (the documentation doesn't seem ambiguous in stating the fast "flag implies all of the others."), or getting Julia to add the contract flag with @simd (and also with @fastmath?)?

I'm wondering if I should update SIMDPirates.jl to use the contract flag in places where I currently used fast.

Yes, code_llvm optimize=false is what Julia emits, and code_llvm optimize=true is after the (mostly target-independent) middle end, which includes everything we set up in jitlayers.cpp.

I am working on localising the bug within LLVM. Several places assume that fast implies contract, so the cause is likely a spot where the flags weren't copied or set up correctly.

I'm wondering if I should update SIMDPirates.jl to use the contract flag in places where I currently used fast.

No.

Opened a patch against upstream: https://reviews.llvm.org/D71495

Was this page helpful?
0 / 5 - 0 ratings

Related issues

StefanKarpinski picture StefanKarpinski  ·  3 Comments

yurivish picture yurivish  ·  3 Comments

manor picture manor  ·  3 Comments

iamed2 picture iamed2  ·  3 Comments

musm picture musm  ·  3 Comments