Rust: Can't optimize loop assignment into memset

Created on 23 Oct 2017  路  12Comments  路  Source: rust-lang/rust

https://godbolt.org/g/9uWCF8

pub fn memzero(data: &mut [u8]) {
    for i in 0..data.len() {
        data[i] = 0;
    }
}
example::memzero:
  push rbp
  mov rbp, rsp
  test rsi, rsi
  je .LBB0_13
  cmp rsi, 31
  jbe .LBB0_2
  mov rax, rsi
  and rax, -32
  je .LBB0_2
  lea r8, [rax - 32]
  mov ecx, r8d
  shr ecx, 5
  inc ecx
  and rcx, 7
  je .LBB0_5
  neg rcx
  xor edx, edx
  xorps xmm0, xmm0
.LBB0_7:
  movups xmmword ptr [rdi + rdx], xmm0
  movups xmmword ptr [rdi + rdx + 16], xmm0
  add rdx, 32
  inc rcx
  jne .LBB0_7
  jmp .LBB0_8
.LBB0_2:
  xor eax, eax
.LBB0_12:
  mov byte ptr [rdi + rax], 0
  inc rax
  cmp rax, rsi
  jb .LBB0_12
.LBB0_13:
  pop rbp
  ret
.LBB0_5:
  xor edx, edx
.LBB0_8:
  cmp r8, 224
  jb .LBB0_11
  mov rcx, rax
  sub rcx, rdx
  lea rdx, [rdi + rdx + 240]
  xorps xmm0, xmm0
.LBB0_10:
  movups xmmword ptr [rdx - 240], xmm0
  movups xmmword ptr [rdx - 224], xmm0
  movups xmmword ptr [rdx - 208], xmm0
  movups xmmword ptr [rdx - 192], xmm0
  movups xmmword ptr [rdx - 176], xmm0
  movups xmmword ptr [rdx - 160], xmm0
  movups xmmword ptr [rdx - 144], xmm0
  movups xmmword ptr [rdx - 128], xmm0
  movups xmmword ptr [rdx - 112], xmm0
  movups xmmword ptr [rdx - 96], xmm0
  movups xmmword ptr [rdx - 80], xmm0
  movups xmmword ptr [rdx - 64], xmm0
  movups xmmword ptr [rdx - 48], xmm0
  movups xmmword ptr [rdx - 32], xmm0
  movups xmmword ptr [rdx - 16], xmm0
  movups xmmword ptr [rdx], xmm0
  add rdx, 256
  add rcx, -256
  jne .LBB0_10
.LBB0_11:
  cmp rax, rsi
  jne .LBB0_12
  jmp .LBB0_13

This is normal before rustc 1.20, which is a performance regression.

C-enhancement I-slow P-high T-compiler regression-from-stable-to-stable

Most helpful comment

However, there is an LLVM problem behind all of this:

After #43595, before loop optimizations, we have this code:

; Function Attrs: uwtable
define void @_ZN7suspect7memzero17h76c06c0c84a550b5E(i8* nocapture nonnull %data.ptr, i64 %data.meta) {
start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  br label %bb2.i

bb2.i:                                            ; preds = %bb2.i.preheader, %bb7
  %iter.sroa.0.010 = phi i64 [ %3, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %iter.sroa.0.010, i64 1) #5
  %2 = extractvalue { i64, i1 } %1, 1
  br i1 %2, label %bb5.loopexit, label %bb7

bb5.loopexit:                                     ; preds = %bb7, %bb2.i
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %3 = extractvalue { i64, i1 } %1, 0
  %4 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  store i8 0, i8* %4, align 1
  %5 = icmp ult i64 %3, %data.meta
  br i1 %5, label %bb2.i, label %bb5.loopexit
}
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

indvars removes the add with overflow, but leaves behind a br i1 false:

; ModuleID = '<stdin>'
source_filename = "<stdin>"

define void @_ZN7suspect7memzero17h76c06c0c84a550b5E(i8* nocapture nonnull %data.ptr, i64 %data.meta) {
start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  br label %bb2.i

bb2.i:                                            ; preds = %bb7, %bb2.i.preheader
  %iter.sroa.0.010 = phi i64 [ %1, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = add nuw i64 %iter.sroa.0.010, 1
  br i1 false, label %bb5.loopexit, label %bb7

bb5.loopexit:                                     ; preds = %bb7, %bb2.i
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %2 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  store i8 0, i8* %2, align 1
  %3 = icmp ult i64 %1, %data.meta
  br i1 %3, label %bb2.i, label %bb5.loopexit
}

; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #0

attributes #0 = { nounwind readnone }

Which the immediately succeeding loop idiom recognition can't remove. Replacing the br i1 false with a direct branch:

start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  br label %bb2.i

bb2.i:                                            ; preds = %bb7, %bb2.i.preheader
  %iter.sroa.0.010 = phi i64 [ %1, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = add nuw i64 %iter.sroa.0.010, 1
  br label %bb7 ; MANUALLY CHANGED

bb5.loopexit:                                     ; preds = %bb7, %bb2.i
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %2 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  store i8 0, i8* %2, align 1
  %3 = icmp ult i64 %1, %data.meta
  br i1 %3, label %bb2.i, label %bb5.loopexit
}

; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #0

attributes #0 = { nounwind readnone }

optimizes to a memset:

; ModuleID = '<stdin>'
source_filename = "<stdin>"

define void @_ZN7suspect7memzero17h76c06c0c84a550b5E(i8* nocapture nonnull %data.ptr, i64 %data.meta) {
start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  call void @llvm.memset.p0i8.i64(i8* %data.ptr, i8 0, i64 %data.meta, i32 1, i1 false)
  br label %bb2.i

bb2.i:                                            ; preds = %bb7, %bb2.i.preheader
  %iter.sroa.0.010 = phi i64 [ %1, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = add nuw i64 %iter.sroa.0.010, 1
  br label %bb7

bb5.loopexit:                                     ; preds = %bb7
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %2 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  %3 = icmp ult i64 %1, %data.meta
  br i1 %3, label %bb2.i, label %bb5.loopexit
}

; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #0

; Function Attrs: argmemonly nounwind
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #1

attributes #0 = { nounwind readnone }
attributes #1 = { argmemonly nounwind }

This means that LLVM's LoopIdiomRecognize needs to recognize that br i1 false always branches to the second arm, or that some CFG cleanup pass needs to run in-between. This is not the first time I encounter br i1 false in LLVM IR, so this is probably a problem that needs a deeper solution.

All 12 comments

I think it has just inlined the memset call. Do you have a benchmark to show if there is actual regression in time?

BTW the following still compiled to a memset call.

for r in data {
    *r = 0;
}

https://gist.github.com/quininer/361b79cd3396ca23a9de3fbc09da1b8a

running 3 tests
test bench_for  ... bench:          30 ns/iter (+/- 0)
test bench_for2 ... bench:       3,222 ns/iter (+/- 121)
test bench_set  ... bench:          29 ns/iter (+/- 1)

Thank you!

Can we bisect this? Seems like it would be somewhat painful, but probably the best way to find the problem.

cc @dotdash -- any chance you want to swoop in a fix this? =)

cc @Mark-Simulacrum @est31 -- do you folks have any nifty way to bisect this? I guess since it doesn't error it would be difficult, unless we added some sort of snooping thing. We could do it by hand, presumably.

triage: P-high

We will revisit next week.

Rust 1.20 doesn't feature the bug. Rust 1.21 does.

My setup was like (deleting the target dir in each step):

cargo rustc --release -- --emit asm
! cat target/release/deps/*.s | rg memset

Bisection output:

Searching in 545 commits; about 10 steps
INFO:bisect: tested a83c3e777 from Sat, 23 Sep 2017 13:13:15 +0000: test failed: true
INFO:bisect: tested 3e9646123 from Sun, 27 Aug 2017 01:41:45 +0000: test failed: true
INFO:bisect: tested e40dc66f4 from Wed, 16 Aug 2017 01:16:37 +0000: test failed: true
INFO:bisect: tested 38bdbb7cf from Fri, 11 Aug 2017 13:04:59 +0000: test failed: true
INFO:bisect: tested 3f977baf3 from Wed,  9 Aug 2017 04:03:49 +0000: test failed: true
INFO:bisect: tested c5e2051f0 from Tue,  8 Aug 2017 02:08:23 +0000: test failed: false
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value: Error(Utils(Msg("unable to download sha ddc02deb07d609fc14d119622a53c55eca3e534e triple x86_64-unknown-linux-gnu module rustc")), State { next_error: None, backtrace: None })', /checkout/src/libcore/result.rs:906:4
note: Run with `RUST_BACKTRACE=1` for a backtrace.

Can't get any closer because the regression is apparently beyond the deletion event horizon :/. 90 days is roughly 3 months, so it would make sense. Not entirely sure though because c5e2051f0 was available...

Either way, this is the commit range I could pinpoint it to: c5e2051f0...3f977baf3

Most suspicious commit:

43595 - Add an overflow check in the Iter::next() impl for Range<_> to help with vectorization.

Confirmed that #43595 is at fault - reverting it on top of master fixes the bug.

However, there is an LLVM problem behind all of this:

After #43595, before loop optimizations, we have this code:

; Function Attrs: uwtable
define void @_ZN7suspect7memzero17h76c06c0c84a550b5E(i8* nocapture nonnull %data.ptr, i64 %data.meta) {
start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  br label %bb2.i

bb2.i:                                            ; preds = %bb2.i.preheader, %bb7
  %iter.sroa.0.010 = phi i64 [ %3, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %iter.sroa.0.010, i64 1) #5
  %2 = extractvalue { i64, i1 } %1, 1
  br i1 %2, label %bb5.loopexit, label %bb7

bb5.loopexit:                                     ; preds = %bb7, %bb2.i
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %3 = extractvalue { i64, i1 } %1, 0
  %4 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  store i8 0, i8* %4, align 1
  %5 = icmp ult i64 %3, %data.meta
  br i1 %5, label %bb2.i, label %bb5.loopexit
}
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

indvars removes the add with overflow, but leaves behind a br i1 false:

; ModuleID = '<stdin>'
source_filename = "<stdin>"

define void @_ZN7suspect7memzero17h76c06c0c84a550b5E(i8* nocapture nonnull %data.ptr, i64 %data.meta) {
start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  br label %bb2.i

bb2.i:                                            ; preds = %bb7, %bb2.i.preheader
  %iter.sroa.0.010 = phi i64 [ %1, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = add nuw i64 %iter.sroa.0.010, 1
  br i1 false, label %bb5.loopexit, label %bb7

bb5.loopexit:                                     ; preds = %bb7, %bb2.i
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %2 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  store i8 0, i8* %2, align 1
  %3 = icmp ult i64 %1, %data.meta
  br i1 %3, label %bb2.i, label %bb5.loopexit
}

; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #0

attributes #0 = { nounwind readnone }

Which the immediately succeeding loop idiom recognition can't remove. Replacing the br i1 false with a direct branch:

start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  br label %bb2.i

bb2.i:                                            ; preds = %bb7, %bb2.i.preheader
  %iter.sroa.0.010 = phi i64 [ %1, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = add nuw i64 %iter.sroa.0.010, 1
  br label %bb7 ; MANUALLY CHANGED

bb5.loopexit:                                     ; preds = %bb7, %bb2.i
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %2 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  store i8 0, i8* %2, align 1
  %3 = icmp ult i64 %1, %data.meta
  br i1 %3, label %bb2.i, label %bb5.loopexit
}

; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #0

attributes #0 = { nounwind readnone }

optimizes to a memset:

; ModuleID = '<stdin>'
source_filename = "<stdin>"

define void @_ZN7suspect7memzero17h76c06c0c84a550b5E(i8* nocapture nonnull %data.ptr, i64 %data.meta) {
start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  call void @llvm.memset.p0i8.i64(i8* %data.ptr, i8 0, i64 %data.meta, i32 1, i1 false)
  br label %bb2.i

bb2.i:                                            ; preds = %bb7, %bb2.i.preheader
  %iter.sroa.0.010 = phi i64 [ %1, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = add nuw i64 %iter.sroa.0.010, 1
  br label %bb7

bb5.loopexit:                                     ; preds = %bb7
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %2 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  %3 = icmp ult i64 %1, %data.meta
  br i1 %3, label %bb2.i, label %bb5.loopexit
}

; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #0

; Function Attrs: argmemonly nounwind
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #1

attributes #0 = { nounwind readnone }
attributes #1 = { argmemonly nounwind }

This means that LLVM's LoopIdiomRecognize needs to recognize that br i1 false always branches to the second arm, or that some CFG cleanup pass needs to run in-between. This is not the first time I encounter br i1 false in LLVM IR, so this is probably a problem that needs a deeper solution.

LLVM patch that fixes this:

diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 260b2283b31..baa38d94d17 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -1,3 +1,4 @@
+
 //===- PassManagerBuilder.cpp - Build Standard Pass -----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
@@ -315,6 +316,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   MPM.add(createCFGSimplificationPass());
   addInstructionCombiningPass(MPM);
   MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars
+  MPM.add(createCFGSimplificationPass());     // Clean up after IndVarSimplify
   MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
   MPM.add(createLoopDeletionPass());          // Delete dead loops
   if (EnableLoopInterchange) {

OTOH, adding random LLVM passes all around is bad for compilation time, so I'm not sure how much do we want this

Discussed in @rust-lang/compiler meeting: seems like we should assemble a PR and do a perf run.

Was this page helpful?
0 / 5 - 0 ratings