Rust: AVR: interrupt code broken by unused alloca

Created on 14 Aug 2020 · 6Comments · Source: rust-lang/rust

I've run into an issue with code that uses interrupts on AVR. I'm not sure if this applies to any other platform.

I've attempted to minimize the example, but there's still a fair amount here, largely because I'm having trouble coming up with good, simple ways to observe the chip's behavior. It's written for the ATmega2560 (and specifically the Arduino Mega2560 board), but should be easily adapted to other AVR MCUs.

This code works fine, flashing the built-in LED on my Arduino Mega2560:

#![no_std]
#![no_main]
#![feature(abi_avr_interrupt, llvm_asm)]

extern crate panic_halt;
use core::cell::UnsafeCell;
use avr_device::atmega2560;

static FAIL: RacyUnsafeCell<Result<(), ()>> = RacyUnsafeCell::new(Err(()));

#[repr(transparent)]
pub struct RacyUnsafeCell<T>(UnsafeCell<T>);

unsafe impl<T: Sync> Sync for RacyUnsafeCell<T> {}

impl<T> RacyUnsafeCell<T> {
    pub const fn new(x: T) -> Self {
        RacyUnsafeCell(UnsafeCell::new(x))
    }

    pub fn get(&self) -> *mut T {
        self.0.get()
    }
}

#[no_mangle]
pub unsafe extern "avr-interrupt" fn __vector_9() {
    match *FAIL.get() {
        Ok(t) => t,
        Err(_) => unwrap_failed(&()),
    }
}

fn unwrap_failed(error: &dyn core::fmt::Debug) -> ! {
    panic!("{:?}", error)
}

#[export_name = "main"]
pub extern "C" fn main() -> ! {
    unsafe {
        *FAIL.get() = Ok(());
    }

    let dp = atmega2560::Peripherals::take().unwrap();

    // output for LED & PCINT0
    dp.PORTB.ddrb.write(|w| unsafe { w.bits(0b1000_0001) });
    dp.PORTB.portb.write(|w| unsafe { w.bits(0b1000_0001) });

    // enable PCINT0
    dp.EXINT.pcicr.write(|w| w.pcie().bits(1));
    dp.EXINT.pcmsk0.write(|w| w.pcint().bits(1));

    unsafe {
        llvm_asm!("sei" :::: "volatile");
    }

    loop {
        for _ in 0..50 {
            let mut u = 0xffff;
            unsafe {
                llvm_asm!(
                    "1: sbiw $0,1\n\tbrne 1b"
                    : "=w"(u)
                    : "0"(u)
                    :
                    : "volatile"
                );
            }
        }

        dp.PORTB.portb.write(|w| unsafe { w.bits(0) });

        for _ in 0..50 {
            let mut u = 0xffff;
            unsafe {
                llvm_asm!(
                    "1: sbiw $0,1\n\tbrne 1b"
                    : "=w"(u)
                    : "0"(u)
                    :
                    : "volatile"
                );
            }
        }

        dp.PORTB.portb.write(|w| unsafe { w.bits(0b1000_0001) });
    }
}

Making a single change to unwrap the actual error causes it to fail. It now turns the LED on once and then makes no further discernible progress. Notably, that means it's not even reaching the first update to the PCINT0 pin, which is mystifying.

$ diff examples/min-repro-working.rs examples/min-repro-broken.rs 
30c30
<         Err(_) => unwrap_failed(&()),
---
>         Err(e) => unwrap_failed(&e),

The function unwrap_failed shouldn't ever be called, as the interrupts are not enabled until after Ok(()) is written to the static. (RacyUnsafeCell is used here just to prove that there's no shenanigans).

I don't have any fancy in-circuit debugging tools, so it's hard for me to know what's happening on the metal, but I do have diff, which I've ran against the LLVM IR from these two programs. As far as I can tell, the only meaningful difference is that the broken program issues an alloca at the start of the ISR:

; Function Attrs: nofree norecurse nounwind optsize
define avr_signalcc  void @__vector_9() unnamed_addr addrspace(1) #0 {
start:
  %e = alloca {}, align 1
  %.b = load i1, i1* @_ZN16min_repro_broken4FAIL17h1f0c05d9b688f3e0E.0.0, align 1
  br i1 %.b, label %bb4, label %bb2

bb2:                                              ; preds = %start
; call min_repro_broken::unwrap_failed
  call fastcc addrspace(1) void @_ZN16min_repro_broken13unwrap_failed17h382387c4357ef48cE({}* nonnull align 1 %e)
  unreachable

bb4:                                              ; preds = %start
  ret void
}

versus

; Function Attrs: nofree norecurse nounwind optsize
define avr_signalcc  void @__vector_9() unnamed_addr addrspace(1) #0 {
start:
  %.b = load i1, i1* @_ZN17min_repro_working4FAIL17h3f0200d99301bf56E.0.0, align 1
  br i1 %.b, label %bb4, label %bb2

bb2:                                              ; preds = %start
; call min_repro_working::unwrap_failed
  tail call fastcc addrspace(1) void @_ZN17min_repro_working13unwrap_failed17h0e47ae32571d11baE()
  unreachable

bb4:                                              ; preds = %start
  ret void
}

I've pretty much run out of places I know to look for what could be causing this issue, or ways to productively continue investigating. Any pointers would be greatly appreciated!

My nightly was previously a few weeks old, I've updated to latest and the issue persists.

C-bug O-AVR T-compiler requires-nightly

Source

couchand

Most helpful comment

An upstream fix has been posted as D87735.

couchand on 19 Sep 2020

👍2 ❤1

All 6 comments

I have a bit more information. After going over the generated assembler for the ISR with a fine-toothed comb, I've uncovered an apparent miscompilation. Here is the complete text of the ISR in the problematic program (lines starting with an asterisk are common with the good version):

__vector_9:
*   push    r0
*   push    r1
*   in  r0, 63
*   push    r0
*   clr r0
*   push    r24
    push    r25
    push    r28
    push    r29
    in  r28, 61
    in  r29, 62
    sbiw    r28, 1
    in  r0, 63
    cli
    out 62, r29
    out 63, r0
    out 61, r28
*   lds r24, _ZN16min_repro_broken4FAIL17hb143df4aedca2a54E.0.0
*   cpi r24, 0
*   breq    LBB0_2
    pop r29
    pop r28
    pop r25
*   pop r24
*   pop r0
*   out 63, r0
    adiw    r28, 1
    in  r0, 63
    cli
    out 62, r29
    out 63, r0
    out 61, r28
*   pop r1
*   pop r0
*   reti

First, let's examine the sequence corresponding to the alloca:

    push    r25
    push    r28
    push    r29
    in  r28, 61
    in  r29, 62
    sbiw    r28, 1
    in  r0, 63
    cli
    out 62, r29
    out 63, r0
    out 61, r28

We use r28 and r29 to store the stack pointer and decrement it by one. We use r0 to store the status register. There are a few fishy things going on here:

r25 is saved, even though it doesn't appear here.
the status register isn't saved until after the stack pointer is decremented.
we clear interrupts needlessly.

Now let's look look at the sequence of deallocation just before exiting the ISR (again, lines starting with an asterisk are common with the good version):

    pop r29
    pop r28
    pop r25
*   pop r24
*   pop r0
*   out 63, r0
    adiw    r28, 1
    in  r0, 63
    cli
    out 62, r29
    out 63, r0
    out 61, r28

So we restore the registers clobbered by stack pointer math, and we increment the stack pointer and restore it, but we do those two steps in the wrong order.

The stack pointer increment needs to happen before we restore the old value of the registers used to calculate it.
And again, we restore the status register to a value saved after the increment, rather than before.
And once more we clear interrupts needlessly.

But we do go ahead and restore r25 even though it never got clobbered.

Fun stuff.

couchand on 14 Aug 2020

Well, now that I know exactly what to look for, I've been able to minimize the example much more effectively. Here is a minimal reproduction of the bug:

#![no_std]
#![no_main]
#![feature(abi_avr_interrupt)]

#[no_mangle]
pub unsafe extern "avr-interrupt" fn __vector_0() {
    let mut item = 0u8;
    use_var(&mut item);
}

#[inline(never)]
pub fn use_var(item: &mut u8) { *item = 0; }

#[export_name = "main"]
pub extern "C" fn main() -> ! { loop {} }

#[panic_handler]
fn panic(_: &core::panic::PanicInfo) -> ! { loop {} }

Compiling for an AVR target produces this reasonable LLVM IR for the interrupt service routine:

; Function Attrs: nounwind writeonly
define avr_signalcc  void @__vector_0() unnamed_addr addrspace(1) #0 {
start:
  %item = alloca i8, align 1
  call addrspace(1) void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull %item)
  store i8 0, i8* %item, align 1
; call codegen_bug_2::use_var
  call fastcc addrspace(1) void @_ZN13codegen_bug_27use_var17h8e808e1926aec402E(i8* nonnull align 1 dereferenceable(1) %item)
  call addrspace(1) void @llvm.lifetime.end.p0i8(i64 1, i8* nonnull %item)
  ret void
}

and this broken assembler:

    .section    .text.__vector_0,"ax",@progbits
    .globl  __vector_0
    .p2align    1
    .type   __vector_0,@function
__vector_0:
    push    r0
    push    r1
    in  r0, 63
    push    r0
    clr r0
    push    r24
    push    r25
    push    r28
    push    r29
    in  r28, 61
    in  r29, 62
    sbiw    r28, 1
    in  r0, 63
    cli
    out 62, r29
    out 63, r0
    out 61, r28
    ldi r24, 0
    std Y+1, r24
    movw    r24, r28
    adiw    r24, 1
    call    _ZN13codegen_bug_27use_var17h8e808e1926aec402E
    pop r29
    pop r28
    pop r25
    pop r24
    pop r0
    out 63, r0
    adiw    r28, 1
    in  r0, 63
    cli
    out 62, r29
    out 63, r0
    out 61, r28
    pop r1
    pop r0
    reti
.Lfunc_end0:
    .size   __vector_0, .Lfunc_end0-__vector_0

The LLVM is simple and reasonable, but the assembler is not a true representation of the semantics of the IR. This leads me to believe it is a bug in LLVM.

I've only seen it appear in a function with the avr_signalcc/avr-interrupt calling convention, but it does show up reliably for any allocas within an ISR.

I guess it's time to start digging into the LLVM source.

couchand on 20 Aug 2020

I've identified the issue, it has been reported upstream as Bug 47253. The issue is in the stack frame lowering code.

couchand on 20 Aug 2020

An upstream fix has been posted as D87735.

couchand on 19 Sep 2020

👍2 ❤1

The upstream patch has been accepted, and once it merges I'll open a PR to cherry-pick it into Rust's LLVM fork.

couchand on 30 Sep 2020

🎉1 👍1

I create example with some activity in interrupt: https://github.com/no111u3/m48_robo_rust/blob/master/examples/timer_compare.rs.

It corrupt run than I build in debug and disaster interrupt logic than I build in release mode.
Some updates: I build this code with different optimization and has success run with use - opt-level=2 in release mode.