From f86dab5ff3e9c35e9e68b798b5c4faa08d390085 Mon Sep 17 00:00:00 2001
From: Per Lindgren <per.lindgren@ltu.se>
Date: Fri, 21 Jan 2022 21:49:45 +0100
Subject: [PATCH] Added support for SRP based scheduling for armv6m

---
 CHANGELOG.md                           |   1 +
 ci/expected/complex.run                |  47 ++++++++
 examples/complex.rs                    | 132 ++++++++++++++++++++++
 macros/src/codegen.rs                  |   2 +-
 macros/src/codegen/assertions.rs       |  32 +++++-
 macros/src/codegen/shared_resources.rs |  33 ++++++
 macros/src/codegen/util.rs             |   1 +
 src/export.rs                          | 148 ++++++++++++++++++++++---
 ui/v6m-interrupt-not-enough.rs_no      |  54 +++++++++
 9 files changed, 434 insertions(+), 16 deletions(-)
 create mode 100644 ci/expected/complex.run
 create mode 100644 examples/complex.rs
 create mode 100644 ui/v6m-interrupt-not-enough.rs_no

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 094275735b..f05aeeaf71 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ For each category, *Added*, *Changed*, *Fixed* add new entries at the top!
 - Rework branch structure, release/vVERSION
 - Cargo clippy in CI
 - Use rust-cache Github Action
+- Support for NVIC-based SRP scheduling for armv6m.
 - CI changelog entry enforcer
 - `examples/periodic-at.rs`, an example of a periodic timer without accumulated drift.
 - `examples/periodic-at2.rs`, an example of a periodic process with two tasks, with offset timing.
diff --git a/ci/expected/complex.run b/ci/expected/complex.run
new file mode 100644
index 0000000000..5df884dabd
--- /dev/null
+++ b/ci/expected/complex.run
@@ -0,0 +1,47 @@
+init
+idle p0 started
+t2 p4 called 1 time
+enter lock s4 0
+t3 p4 exit
+idle enter lock s3 0
+idle pend t0
+idle pend t1
+idle pend t2
+t2 p4 called 2 times
+enter lock s4 1
+t3 p4 exit
+idle still in lock s3 0
+t1 p3 called 1 time
+t1 enter lock s4 2
+t1 pend t0
+t1 pend t2
+t1 still in lock s4 2
+t2 p4 called 3 times
+enter lock s4 2
+t3 p4 exit
+t1 p3 exit
+t0 p2 called 1 time
+t0 p2 exit
+
+back in idle
+enter lock s2 0
+idle pend t0
+idle pend t1
+t1 p3 called 2 times
+t1 enter lock s4 3
+t1 pend t0
+t1 pend t2
+t1 still in lock s4 3
+t2 p4 called 4 times
+enter lock s4 3
+t3 p4 exit
+t1 p3 exit
+idle pend t2
+t2 p4 called 5 times
+enter lock s4 4
+t3 p4 exit
+idle still in lock s2 0
+t0 p2 called 2 times
+t0 p2 exit
+
+idle exit
diff --git a/examples/complex.rs b/examples/complex.rs
new file mode 100644
index 0000000000..e5cf6dbea3
--- /dev/null
+++ b/examples/complex.rs
@@ -0,0 +1,132 @@
+//! examples/complex.rs
+
+#![deny(unsafe_code)]
+#![deny(warnings)]
+#![no_main]
+#![no_std]
+
+use panic_semihosting as _;
+
+#[rtic::app(device = lm3s6965)]
+mod app {
+    // Example pending hardware tasks from inside locks on resources with ceilings 2, 3 and 4.
+    use cortex_m_semihosting::{debug, hprintln};
+    use lm3s6965::Interrupt;
+
+    #[shared]
+    struct Shared {
+        s2: u32, // used by idle and t0 (p2): ceiling 2
+        s3: u32, // used by idle, t0 (p2) and t1 (p3): ceiling 3
+        s4: u32, // used by t1 (p3) and t2 (p4): ceiling 4
+    }
+
+    #[local]
+    struct Local {}
+
+    #[init]
+    fn init(_: init::Context) -> (Shared, Local, init::Monotonics) {
+        hprintln!("init").unwrap();
+
+        (
+            Shared {
+                s2: 0,
+                s3: 0,
+                s4: 0,
+            },
+            Local {},
+            init::Monotonics(),
+        )
+    }
+
+    #[idle(shared = [s2, s3])]
+    fn idle(mut cx: idle::Context) -> ! {
+        hprintln!("idle p0 started").ok();
+        rtic::pend(Interrupt::GPIOC); // t2 p4, pended outside of any lock
+        cx.shared.s3.lock(|s| {
+            hprintln!("idle enter lock s3 {}", s).ok();
+            hprintln!("idle pend t0").ok();
+            rtic::pend(Interrupt::GPIOA); // t0 p2, shares s3 (ceiling 3)
+            hprintln!("idle pend t1").ok();
+            rtic::pend(Interrupt::GPIOB); // t1 p3, shares s3 (ceiling 3)
+            hprintln!("idle pend t2").ok();
+            rtic::pend(Interrupt::GPIOC); // t2 p4, does not share s3
+            hprintln!("idle still in lock s3 {}", s).ok();
+        });
+        hprintln!("\nback in idle").ok();
+
+        cx.shared.s2.lock(|s| {
+            hprintln!("enter lock s2 {}", s).ok();
+            hprintln!("idle pend t0").ok();
+            rtic::pend(Interrupt::GPIOA); // t0 p2, shares s2 (ceiling 2)
+            hprintln!("idle pend t1").ok();
+            rtic::pend(Interrupt::GPIOB); // t1 p3, does not share s2
+            hprintln!("idle pend t2").ok();
+            rtic::pend(Interrupt::GPIOC); // t2 p4, does not share s2
+            hprintln!("idle still in lock s2 {}", s).ok();
+        });
+        hprintln!("\nidle exit").ok();
+
+        debug::exit(debug::EXIT_SUCCESS); // Exit QEMU simulator
+
+        loop {
+            cortex_m::asm::nop();
+        }
+    }
+
+    #[task(binds = GPIOA, priority = 2, local = [times: u32 = 0], shared = [s2, s3])]
+    fn t0(cx: t0::Context) {
+        // Safe access to local `static mut` variable
+        *cx.local.times += 1;
+
+        hprintln!(
+            "t0 p2 called {} time{}",
+            *cx.local.times,
+            if *cx.local.times > 1 { "s" } else { "" }
+        )
+        .ok();
+        hprintln!("t0 p2 exit").ok();
+    }
+
+    #[task(binds = GPIOB, priority = 3, local = [times: u32 = 0], shared = [s3, s4])]
+    fn t1(mut cx: t1::Context) {
+        // Safe access to local `static mut` variable
+        *cx.local.times += 1;
+
+        hprintln!(
+            "t1 p3 called {} time{}",
+            *cx.local.times,
+            if *cx.local.times > 1 { "s" } else { "" }
+        )
+        .ok();
+
+        cx.shared.s4.lock(|s| {
+            hprintln!("t1 enter lock s4 {}", s).ok();
+            hprintln!("t1 pend t0").ok();
+            rtic::pend(Interrupt::GPIOA); // t0 p2, lower priority than t1 (p3)
+            hprintln!("t1 pend t2").ok();
+            rtic::pend(Interrupt::GPIOC); // t2 p4, shares s4 (ceiling 4)
+            hprintln!("t1 still in lock s4 {}", s).ok();
+        });
+
+        hprintln!("t1 p3 exit").ok();
+    }
+
+    #[task(binds = GPIOC, priority = 4, local = [times: u32 = 0], shared = [s4])]
+    fn t2(mut cx: t2::Context) {
+        // Safe access to local `static mut` variable
+        *cx.local.times += 1;
+
+        hprintln!(
+            "t2 p4 called {} time{}",
+            *cx.local.times,
+            if *cx.local.times > 1 { "s" } else { "" }
+        )
+        .unwrap();
+
+        cx.shared.s4.lock(|s| {
+            hprintln!("enter lock s4 {}", s).ok();
+            *s += 1;
+        });
+        hprintln!("t3 p4 exit").ok(); // NOTE(review): prints "t3" although this is t2; ci/expected/complex.run expects this text
+    }
+}
diff --git a/macros/src/codegen.rs b/macros/src/codegen.rs
index f5cae34a72..01be1d5787 100644
--- a/macros/src/codegen.rs
+++ b/macros/src/codegen.rs
@@ -28,7 +28,7 @@ pub fn app(app: &App, analysis: &Analysis, extra: &Extra) -> TokenStream2 {
     let mut user = vec![];
 
     // Generate the `main` function
-    let assertion_stmts = assertions::codegen(app, analysis);
+    let assertion_stmts = assertions::codegen(app, analysis, extra);
 
     let pre_init_stmts = pre_init::codegen(app, analysis, extra);
 
diff --git a/macros/src/codegen/assertions.rs b/macros/src/codegen/assertions.rs
index a8a4491bdf..36ab036445 100644
--- a/macros/src/codegen/assertions.rs
+++ b/macros/src/codegen/assertions.rs
@@ -1,11 +1,11 @@
 use proc_macro2::TokenStream as TokenStream2;
 use quote::quote;
 
-use crate::analyze::Analysis;
+use crate::{analyze::Analysis, check::Extra, codegen::util};
 use rtic_syntax::ast::App;
 
 /// Generates compile-time assertions that check that types implement the `Send` / `Sync` traits
-pub fn codegen(app: &App, analysis: &Analysis) -> Vec<TokenStream2> {
+pub fn codegen(app: &App, analysis: &Analysis, extra: &Extra) -> Vec<TokenStream2> {
     let mut stmts = vec![];
 
     for ty in &analysis.send_types {
@@ -21,5 +21,33 @@ pub fn codegen(app: &App, analysis: &Analysis) -> Vec<TokenStream2> {
         stmts.push(quote!(rtic::export::assert_monotonic::<#ty>();));
     }
 
+    let device = &extra.device;
+    let arm_v6_checks: Vec<_> = app
+        .hardware_tasks
+        .iter()
+        .filter_map(|(_, task)| {
+            if !util::is_exception(&task.args.binds) {
+                let interrupt_name = &task.args.binds;
+                Some(quote!(assert!((#device::Interrupt::#interrupt_name as u32) < 32);))
+            } else {
+                None
+            }
+        })
+        .collect();
+
+    let const_check = quote! {
+        const _CONST_CHECK: () = {
+            if rtic::export::is_armv6() {
+                #(#arm_v6_checks)*
+            } else {
+                // TODO: Add armv7 checks here
+            }
+        };
+
+        let _ = _CONST_CHECK;
+    };
+
+    stmts.push(const_check);
+
     stmts
 }
diff --git a/macros/src/codegen/shared_resources.rs b/macros/src/codegen/shared_resources.rs
index 9e45cff973..a016e4538d 100644
--- a/macros/src/codegen/shared_resources.rs
+++ b/macros/src/codegen/shared_resources.rs
@@ -105,5 +105,38 @@ pub fn codegen(
         })
     };
 
+    // Computing mapping of used interrupts to masks
+    let interrupt_ids = analysis.interrupts.iter().map(|(p, (id, _))| (p, id));
+
+    use std::collections::HashMap;
+    let mut masks: HashMap<u8, _> = std::collections::HashMap::new();
+    let device = &extra.device;
+
+    for p in 0..3 {
+        masks.insert(p, quote!(0));
+    }
+
+    for (&priority, name) in interrupt_ids.chain(app.hardware_tasks.values().flat_map(|task| {
+        if !util::is_exception(&task.args.binds) {
+            Some((&task.args.priority, &task.args.binds))
+        } else {
+            // TODO: exceptions not implemented
+            None
+        }
+    })) {
+        let name = quote!(#device::Interrupt::#name as u32);
+        if let Some(v) = masks.get_mut(&(priority - 1)) {
+            *v = quote!(#v | 1 << #name);
+        };
+    }
+
+    let mut mask_arr: Vec<(_, _)> = masks.iter().collect();
+    mask_arr.sort_by_key(|(k, _v)| *k);
+    let mask_arr: Vec<_> = mask_arr.iter().map(|(_, v)| v).collect();
+
+    mod_app.push(quote!(
+        const MASKS: [u32; 3] = [#(#mask_arr),*];
+    ));
+
     (mod_app, mod_resources)
 }
diff --git a/macros/src/codegen/util.rs b/macros/src/codegen/util.rs
index 6a07732c34..4a29754bc8 100644
--- a/macros/src/codegen/util.rs
+++ b/macros/src/codegen/util.rs
@@ -52,6 +52,7 @@ pub fn impl_mutex(
                         #priority,
                         CEILING,
                         #device::NVIC_PRIO_BITS,
+                        &MASKS,
                         f,
                     )
                 }
diff --git a/src/export.rs b/src/export.rs
index 838ae8435e..ed51a9e9f7 100644
--- a/src/export.rs
+++ b/src/export.rs
@@ -102,6 +102,19 @@ impl Priority {
     }
 }
 
+/// Const helper: returns `true` when compiled with `cfg(armv6m)`, `false` otherwise
+pub const fn is_armv6() -> bool {
+    #[cfg(not(armv6m))]
+    {
+        false
+    }
+
+    #[cfg(armv6m)]
+    {
+        true
+    }
+}
+
 #[inline(always)]
 pub fn assert_send<T>()
 where
@@ -123,13 +136,40 @@ where
 {
 }
 
-/// Lock the resource proxy by setting the BASEPRI
-/// and running the closure with interrupt::free
+/// Lock implementation using BASEPRI and global Critical Section (CS)
 ///
 /// # Safety
 ///
-/// Writing to the BASEPRI
-/// Dereferencing a raw pointer
+/// The system ceiling is raised from current to ceiling
+/// by either
+/// - raising the BASEPRI to the ceiling value, or
+/// - disable all interrupts in case we want to
+///   mask interrupts with maximum priority
+///
+/// Dereferencing a raw pointer inside CS
+///
+/// The priority.set/priority.get can safely be outside the CS
+/// as being a context local cell (not affected by preemptions).
+/// It is merely used in order to omit masking in case
+/// current priority >= ceiling.
+///
+/// Lock Efficiency:
+/// Experiments validate (sub)-zero cost for CS implementation
+/// (Sub)-zero as:
+/// - Either zero OH (lock optimized out), or
+/// - Amounting to an optimal assembly implementation
+///   - The BASEPRI value is folded to a constant at compile time
+///   - CS entry, single assembly instruction to write BASEPRI
+///   - CS exit, single assembly instruction to write BASEPRI
+///   - priority.set/get optimized out (their effect not)
+/// - On par or better than any handwritten implementation of SRP
+///
+/// Limitations:
+/// The current implementation reads/writes BASEPRI once
+/// even in some edge cases where this may be omitted.
+/// Total OH per task is max 2 clock cycles, negligible in practice
+/// but can in theory be fixed.
+///
 #[cfg(armv7m)]
 #[inline(always)]
 pub unsafe fn lock<T, R>(
@@ -137,6 +177,7 @@ pub unsafe fn lock<T, R>(
     priority: &Priority,
     ceiling: u8,
     nvic_prio_bits: u8,
+    _mask: &[u32; 3],
     f: impl FnOnce(&mut T) -> R,
 ) -> R {
     let current = priority.get();
@@ -160,13 +201,50 @@ pub unsafe fn lock<T, R>(
     }
 }
 
-/// Lock the resource proxy by setting the PRIMASK
-/// and running the closure with ``interrupt::free``
+/// Lock implementation using interrupt masking
 ///
 /// # Safety
 ///
-/// Writing to the PRIMASK
-/// Dereferencing a raw pointer
+/// The system ceiling is raised from current to ceiling
+/// by computing a 32 bit `mask` (1 bit per interrupt)
+/// 1: ceiling >= priority > current
+/// 0: else
+///
+/// On CS entry, `clear_enable_mask(mask)` disables interrupts
+/// On CS exit,  `set_enable_mask(mask)` re-enables interrupts
+///
+/// The priority.set/priority.get can safely be outside the CS
+/// as being a context local cell (not affected by preemptions).
+/// It is merely used in order to omit masking in case
+/// current priority >= ceiling.
+///
+/// Dereferencing a raw pointer is done safely inside the CS
+///
+/// Lock Efficiency:
+/// Early experiments validate (sub)-zero cost for CS implementation
+/// (Sub)-zero as:
+/// - Either zero OH (lock optimized out), or
+/// - Amounting to an optimal assembly implementation
+///   - if ceiling == (1 << nvic_prio_bits)
+///     - we execute the closure in a global critical section (interrupt free)
+///     - CS entry cost, single write to core register
+///     - CS exit cost, single write to core register
+///   else
+///     - The `mask` value is folded to a constant at compile time
+///     - CS entry, single write of the 32 bit `mask` to the `icer` register
+///     - CS exit, single write of the 32 bit `mask` to the `iser` register
+/// - priority.set/get optimized out (their effect not)
+/// - On par or better than any hand written implementation of SRP
+///
+/// Limitations:
+/// Current implementation does not allow for tasks with shared resources
+/// to be bound to exception handlers, as these cannot be masked in HW.
+///
+/// Possible solutions:
+/// - Mask exceptions by global critical sections (interrupt::free)
+/// - Temporary lower exception priority
+///
+/// These possible solutions are set goals for future work
 #[cfg(not(armv7m))]
 #[inline(always)]
 pub unsafe fn lock<T, R>(
@@ -174,20 +252,64 @@ pub unsafe fn lock<T, R>(
     priority: &Priority,
     ceiling: u8,
     _nvic_prio_bits: u8,
+    masks: &[u32; 3],
     f: impl FnOnce(&mut T) -> R,
 ) -> R {
     let current = priority.get();
-
     if current < ceiling {
-        priority.set(u8::max_value());
-        let r = interrupt::free(|_| f(&mut *ptr));
-        priority.set(current);
-        r
+        if ceiling >= 4 {
+            // safe to manipulate outside critical section
+            priority.set(ceiling);
+            // execute closure under protection of raised system ceiling
+            let r = interrupt::free(|_| f(&mut *ptr));
+            // safe to manipulate outside critical section
+            priority.set(current);
+            r
+        } else {
+            // safe to manipulate outside critical section
+            priority.set(ceiling);
+            let mask = compute_mask(current, ceiling, masks);
+            clear_enable_mask(mask);
+
+            // execute closure under protection of raised system ceiling
+            let r = f(&mut *ptr);
+
+            set_enable_mask(mask);
+
+            // safe to manipulate outside critical section
+            priority.set(current);
+            r
+        }
     } else {
+        // execute closure without raising system ceiling
         f(&mut *ptr)
     }
 }
 
+#[cfg(not(armv7m))]
+#[inline(always)]
+fn compute_mask(from_prio: u8, to_prio: u8, masks: &[u32; 3]) -> u32 {
+    let mut res = 0; // OR of masks[from_prio..to_prio]; the macro stores priority p at index p - 1
+    masks[from_prio as usize..to_prio as usize] // slice indexing panics unless from_prio <= to_prio <= 3
+        .iter()
+        .for_each(|m| res |= m);
+    res
+}
+
+// Enables interrupts: writes `mask` (1 bit per interrupt) to the NVIC `iser[0]` register
+#[cfg(not(armv7m))]
+#[inline(always)]
+unsafe fn set_enable_mask(mask: u32) {
+    (*NVIC::ptr()).iser[0].write(mask)
+}
+
+// Disables interrupts: writes `mask` (1 bit per interrupt) to the NVIC `icer[0]` register
+#[cfg(not(armv7m))]
+#[inline(always)]
+unsafe fn clear_enable_mask(mask: u32) {
+    (*NVIC::ptr()).icer[0].write(mask)
+}
+
 #[inline]
 #[must_use]
 pub fn logical2hw(logical: u8, nvic_prio_bits: u8) -> u8 {
diff --git a/ui/v6m-interrupt-not-enough.rs_no b/ui/v6m-interrupt-not-enough.rs_no
new file mode 100644
index 0000000000..3fbf3cf7bf
--- /dev/null
+++ b/ui/v6m-interrupt-not-enough.rs_no
@@ -0,0 +1,54 @@
+//! v6m-interrupt-not-enough.rs_no (not run atm)
+//!
+//! Expected behavior:
+//! should pass
+//! > cargo build --example m0_perf_err  --target thumbv7m-none-eabi --release
+//!
+//! should fail
+//! > cargo build --example m0_perf_err  --target thumbv6m-none-eabi --release
+//! Compiling cortex-m-rtic v1.0.0 (/home/pln/rust/rtic/cortex-m-rtic)
+//! error[E0308]: mismatched types
+//! --> examples/m0_perf_err.rs:25:1
+//!  |
+//! 25 | #[rtic::app(device = lm3s6965)]
+//!  | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ expected an array with a fixed size of 4 elements, found one with 5 elements
+//!  |
+//!  = note: this error originates in the attribute macro `rtic::app` (in Nightly builds, run with -Z macro-backtrace for more info)
+
+#![deny(unsafe_code)]
+#![deny(warnings)]
+#![no_main]
+#![no_std]
+
+use panic_semihosting as _;
+
+#[rtic::app(device = lm3s6965)]
+mod app {
+    // Minimal app with one hardware task at priority 5; per the file header the v6m build should fail.
+    use cortex_m_semihosting::debug;
+
+    #[shared]
+    struct Shared {}
+
+    #[local]
+    struct Local {}
+
+    #[init]
+    fn init(_: init::Context) -> (Shared, Local, init::Monotonics) {
+        (Shared {}, Local {}, init::Monotonics())
+    }
+
+    #[inline(never)]
+    #[idle]
+    fn idle(_cx: idle::Context) -> ! {
+        debug::exit(debug::EXIT_SUCCESS); // Exit QEMU simulator
+
+        loop {
+            cortex_m::asm::nop();
+        }
+    }
+
+    // priority too high for v6m
+    #[task(binds = GPIOA, priority = 5)]
+    fn t0(_cx: t0::Context) {}
+}