Rust: Can't optimize loop assignment into memset

Created on 23 Oct 2017 · 12Comments · Source: rust-lang/rust

pub fn memzero(data: &mut [u8]) {
    for i in 0..data.len() {
        data[i] = 0;
    }
}

example::memzero:
  push rbp
  mov rbp, rsp
  test rsi, rsi
  je .LBB0_13
  cmp rsi, 31
  jbe .LBB0_2
  mov rax, rsi
  and rax, -32
  je .LBB0_2
  lea r8, [rax - 32]
  mov ecx, r8d
  shr ecx, 5
  inc ecx
  and rcx, 7
  je .LBB0_5
  neg rcx
  xor edx, edx
  xorps xmm0, xmm0
.LBB0_7:
  movups xmmword ptr [rdi + rdx], xmm0
  movups xmmword ptr [rdi + rdx + 16], xmm0
  add rdx, 32
  inc rcx
  jne .LBB0_7
  jmp .LBB0_8
.LBB0_2:
  xor eax, eax
.LBB0_12:
  mov byte ptr [rdi + rax], 0
  inc rax
  cmp rax, rsi
  jb .LBB0_12
.LBB0_13:
  pop rbp
  ret
.LBB0_5:
  xor edx, edx
.LBB0_8:
  cmp r8, 224
  jb .LBB0_11
  mov rcx, rax
  sub rcx, rdx
  lea rdx, [rdi + rdx + 240]
  xorps xmm0, xmm0
.LBB0_10:
  movups xmmword ptr [rdx - 240], xmm0
  movups xmmword ptr [rdx - 224], xmm0
  movups xmmword ptr [rdx - 208], xmm0
  movups xmmword ptr [rdx - 192], xmm0
  movups xmmword ptr [rdx - 176], xmm0
  movups xmmword ptr [rdx - 160], xmm0
  movups xmmword ptr [rdx - 144], xmm0
  movups xmmword ptr [rdx - 128], xmm0
  movups xmmword ptr [rdx - 112], xmm0
  movups xmmword ptr [rdx - 96], xmm0
  movups xmmword ptr [rdx - 80], xmm0
  movups xmmword ptr [rdx - 64], xmm0
  movups xmmword ptr [rdx - 48], xmm0
  movups xmmword ptr [rdx - 32], xmm0
  movups xmmword ptr [rdx - 16], xmm0
  movups xmmword ptr [rdx], xmm0
  add rdx, 256
  add rcx, -256
  jne .LBB0_10
.LBB0_11:
  cmp rax, rsi
  jne .LBB0_12
  jmp .LBB0_13

This is normal before rustc 1.20, which is a performance regression.

C-enhancement I-slow P-high T-compiler regression-from-stable-to-stable

Source

quininer

Most helpful comment

However, there is an LLVM problem behind all of this:

After #43595, before loop optimizations, we have this code:

; Function Attrs: uwtable
define void @_ZN7suspect7memzero17h76c06c0c84a550b5E(i8* nocapture nonnull %data.ptr, i64 %data.meta) {
start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  br label %bb2.i

bb2.i:                                            ; preds = %bb2.i.preheader, %bb7
  %iter.sroa.0.010 = phi i64 [ %3, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %iter.sroa.0.010, i64 1) #5
  %2 = extractvalue { i64, i1 } %1, 1
  br i1 %2, label %bb5.loopexit, label %bb7

bb5.loopexit:                                     ; preds = %bb7, %bb2.i
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %3 = extractvalue { i64, i1 } %1, 0
  %4 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  store i8 0, i8* %4, align 1
  %5 = icmp ult i64 %3, %data.meta
  br i1 %5, label %bb2.i, label %bb5.loopexit
}
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

indvars removes the add with overflow, but leaves behind a br i1 false:

; ModuleID = '<stdin>'
source_filename = "<stdin>"

define void @_ZN7suspect7memzero17h76c06c0c84a550b5E(i8* nocapture nonnull %data.ptr, i64 %data.meta) {
start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  br label %bb2.i

bb2.i:                                            ; preds = %bb7, %bb2.i.preheader
  %iter.sroa.0.010 = phi i64 [ %1, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = add nuw i64 %iter.sroa.0.010, 1
  br i1 false, label %bb5.loopexit, label %bb7

bb5.loopexit:                                     ; preds = %bb7, %bb2.i
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %2 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  store i8 0, i8* %2, align 1
  %3 = icmp ult i64 %1, %data.meta
  br i1 %3, label %bb2.i, label %bb5.loopexit
}

; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #0

attributes #0 = { nounwind readnone }

Which the immediately succeeding loop idiom recognition can't remove. Replacing the br i1 false with a direct branch:

start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  br label %bb2.i

bb2.i:                                            ; preds = %bb7, %bb2.i.preheader
  %iter.sroa.0.010 = phi i64 [ %1, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = add nuw i64 %iter.sroa.0.010, 1
  br label %bb7 ; MANUALLY CHANGED

bb5.loopexit:                                     ; preds = %bb7, %bb2.i
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %2 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  store i8 0, i8* %2, align 1
  %3 = icmp ult i64 %1, %data.meta
  br i1 %3, label %bb2.i, label %bb5.loopexit
}

; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #0

attributes #0 = { nounwind readnone }

optimizes to a memset:

; ModuleID = '<stdin>'
source_filename = "<stdin>"

define void @_ZN7suspect7memzero17h76c06c0c84a550b5E(i8* nocapture nonnull %data.ptr, i64 %data.meta) {
start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  call void @llvm.memset.p0i8.i64(i8* %data.ptr, i8 0, i64 %data.meta, i32 1, i1 false)
  br label %bb2.i

bb2.i:                                            ; preds = %bb7, %bb2.i.preheader
  %iter.sroa.0.010 = phi i64 [ %1, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = add nuw i64 %iter.sroa.0.010, 1
  br label %bb7

bb5.loopexit:                                     ; preds = %bb7
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %2 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  %3 = icmp ult i64 %1, %data.meta
  br i1 %3, label %bb2.i, label %bb5.loopexit
}

; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #0

; Function Attrs: argmemonly nounwind
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #1

attributes #0 = { nounwind readnone }
attributes #1 = { argmemonly nounwind }

This means that LLVM's LoopIdiomRecognize needs to recognize that br i1 false always branches to the second arm, or that some CFG cleanup pass needs to run in-between. This is not the first time I encounter br i1 false in LLVM IR, so this is probably a problem that needs a deeper solution.

arielb1 on 10 Nov 2017

❤4

All 12 comments

I think it has just inlined the memset call. Do you have a benchmark to show if there is actual regression in time?

BTW the following still compiled to a memset call.

for r in data {
    *r = 0;
}

kennytm on 23 Oct 2017

👍1

https://gist.github.com/quininer/361b79cd3396ca23a9de3fbc09da1b8a

running 3 tests
test bench_for  ... bench:          30 ns/iter (+/- 0)
test bench_for2 ... bench:       3,222 ns/iter (+/- 121)
test bench_set  ... bench:          29 ns/iter (+/- 1)

quininer on 23 Oct 2017

Thank you!

kennytm on 23 Oct 2017

Can we bisect this? Seems like it would be somewhat painful, but probably the best way to find the problem.

cc @dotdash -- any chance you want to swoop in a fix this? =)

cc @Mark-Simulacrum @est31 -- do you folks have any nifty way to bisect this? I guess since it doesn't error it would be difficult, unless we added some sort of snooping thing. We could do it by hand, presumably.

nikomatsakis on 9 Nov 2017

triage: P-high

nikomatsakis on 9 Nov 2017

We will revisit next week.

nikomatsakis on 9 Nov 2017

Rust 1.20 doesn't feature the bug. Rust 1.21 does.

My setup was like (deleting the target dir in each step):

cargo rustc --release -- --emit asm
! cat target/release/deps/*.s | rg memset

Bisection output:

Searching in 545 commits; about 10 steps
INFO:bisect: tested a83c3e777 from Sat, 23 Sep 2017 13:13:15 +0000: test failed: true
INFO:bisect: tested 3e9646123 from Sun, 27 Aug 2017 01:41:45 +0000: test failed: true
INFO:bisect: tested e40dc66f4 from Wed, 16 Aug 2017 01:16:37 +0000: test failed: true
INFO:bisect: tested 38bdbb7cf from Fri, 11 Aug 2017 13:04:59 +0000: test failed: true
INFO:bisect: tested 3f977baf3 from Wed,  9 Aug 2017 04:03:49 +0000: test failed: true
INFO:bisect: tested c5e2051f0 from Tue,  8 Aug 2017 02:08:23 +0000: test failed: false
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value: Error(Utils(Msg("unable to download sha ddc02deb07d609fc14d119622a53c55eca3e534e triple x86_64-unknown-linux-gnu module rustc")), State { next_error: None, backtrace: None })', /checkout/src/libcore/result.rs:906:4
note: Run with `RUST_BACKTRACE=1` for a backtrace.

Can't get any closer because the regression is apparently beyond the deletion event horizon :/. 90 days is roughly 3 months, so it would make sense. Not entirely sure though because c5e2051f0 was available...

Either way, this is the commit range I could pinpoint it to: c5e2051f0...3f977baf3

est31 on 9 Nov 2017

Most suspicious commit:

43595 - Add an overflow check in the Iter::next() impl for Range<_> to help with vectorization.

arielb1 on 9 Nov 2017

Confirmed that #43595 is at fault - reverting it on top of master fixes the bug.

arielb1 on 9 Nov 2017

However, there is an LLVM problem behind all of this:

After #43595, before loop optimizations, we have this code:

; Function Attrs: uwtable
define void @_ZN7suspect7memzero17h76c06c0c84a550b5E(i8* nocapture nonnull %data.ptr, i64 %data.meta) {
start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  br label %bb2.i

bb2.i:                                            ; preds = %bb2.i.preheader, %bb7
  %iter.sroa.0.010 = phi i64 [ %3, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %iter.sroa.0.010, i64 1) #5
  %2 = extractvalue { i64, i1 } %1, 1
  br i1 %2, label %bb5.loopexit, label %bb7

bb5.loopexit:                                     ; preds = %bb7, %bb2.i
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %3 = extractvalue { i64, i1 } %1, 0
  %4 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  store i8 0, i8* %4, align 1
  %5 = icmp ult i64 %3, %data.meta
  br i1 %5, label %bb2.i, label %bb5.loopexit
}
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

indvars removes the add with overflow, but leaves behind a br i1 false:

; ModuleID = '<stdin>'
source_filename = "<stdin>"

define void @_ZN7suspect7memzero17h76c06c0c84a550b5E(i8* nocapture nonnull %data.ptr, i64 %data.meta) {
start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  br label %bb2.i

bb2.i:                                            ; preds = %bb7, %bb2.i.preheader
  %iter.sroa.0.010 = phi i64 [ %1, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = add nuw i64 %iter.sroa.0.010, 1
  br i1 false, label %bb5.loopexit, label %bb7

bb5.loopexit:                                     ; preds = %bb7, %bb2.i
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %2 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  store i8 0, i8* %2, align 1
  %3 = icmp ult i64 %1, %data.meta
  br i1 %3, label %bb2.i, label %bb5.loopexit
}

; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #0

attributes #0 = { nounwind readnone }

Which the immediately succeeding loop idiom recognition can't remove. Replacing the br i1 false with a direct branch:

start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  br label %bb2.i

bb2.i:                                            ; preds = %bb7, %bb2.i.preheader
  %iter.sroa.0.010 = phi i64 [ %1, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = add nuw i64 %iter.sroa.0.010, 1
  br label %bb7 ; MANUALLY CHANGED

bb5.loopexit:                                     ; preds = %bb7, %bb2.i
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %2 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  store i8 0, i8* %2, align 1
  %3 = icmp ult i64 %1, %data.meta
  br i1 %3, label %bb2.i, label %bb5.loopexit
}

; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #0

attributes #0 = { nounwind readnone }

optimizes to a memset:

; ModuleID = '<stdin>'
source_filename = "<stdin>"

define void @_ZN7suspect7memzero17h76c06c0c84a550b5E(i8* nocapture nonnull %data.ptr, i64 %data.meta) {
start:
  %0 = icmp eq i64 %data.meta, 0
  br i1 %0, label %bb5, label %bb2.i.preheader

bb2.i.preheader:                                  ; preds = %start
  call void @llvm.memset.p0i8.i64(i8* %data.ptr, i8 0, i64 %data.meta, i32 1, i1 false)
  br label %bb2.i

bb2.i:                                            ; preds = %bb7, %bb2.i.preheader
  %iter.sroa.0.010 = phi i64 [ %1, %bb7 ], [ 0, %bb2.i.preheader ]
  %1 = add nuw i64 %iter.sroa.0.010, 1
  br label %bb7

bb5.loopexit:                                     ; preds = %bb7
  br label %bb5

bb5:                                              ; preds = %bb5.loopexit, %start
  ret void

bb7:                                              ; preds = %bb2.i
  %2 = getelementptr inbounds i8, i8* %data.ptr, i64 %iter.sroa.0.010
  %3 = icmp ult i64 %1, %data.meta
  br i1 %3, label %bb2.i, label %bb5.loopexit
}

; Function Attrs: nounwind readnone
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #0

; Function Attrs: argmemonly nounwind
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #1

attributes #0 = { nounwind readnone }
attributes #1 = { argmemonly nounwind }

arielb1 on 10 Nov 2017

❤4

LLVM patch that fixes this:

diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 260b2283b31..baa38d94d17 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -1,3 +1,4 @@
+
 //===- PassManagerBuilder.cpp - Build Standard Pass -----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
@@ -315,6 +316,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   MPM.add(createCFGSimplificationPass());
   addInstructionCombiningPass(MPM);
   MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars
+  MPM.add(createCFGSimplificationPass());     // Clean up after IndVarSimplify
   MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
   MPM.add(createLoopDeletionPass());          // Delete dead loops
   if (EnableLoopInterchange) {

OTOH, adding random LLVM passes all around is bad for compilation time, so I'm not sure how much do we want this

arielb1 on 15 Nov 2017

Discussed in @rust-lang/compiler meeting: seems like we should assemble a PR and do a perf run.

nikomatsakis on 16 Nov 2017

Was this page helpful?

0 / 5 - 0 ratings

Related issues

Comments for macro expansions

Robbepop · 3Comments

Unhelpful diagnostic when using proc macros 2.0 without #![feature(use_extern_macros)]

lambda-fairy · 3Comments

Internal compiler error: cannot relate bound region (likely caused by `conservative_impl_trait`)

jmegaffin · 3Comments

Integer division/remainder rounding behavior should be documented

wthrowe · 3Comments

Callback parameter names are missing from rustdoc

dtolnay · 3Comments