From 61e5798e3a8aca767b9f0953f27e4e28a901576d Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Fri, 30 Jan 2026 01:19:10 +0800 Subject: [PATCH] Add BASEPRI zero-latency ISR support This implements BASEPRI-based critical sections to enable zero-latency ISRs (priority 0x0-0x2) that can preempt kernel operations. It adds cycle-accurate interrupt latency measurement infrastructure using DWT. - Add BASEPRI primitives (irq_kernel_critical_enter/exit) to irq.h - Convert scheduler critical sections from PRIMASK to BASEPRI - Convert IPC critical sections from PRIMASK to BASEPRI - Add TCB validation in IPC to prevent use-after-free during callbacks - Add irq_system_state tracking and in_isr_context() helper --- include/platform/irq-latency.h | 125 ++++++++++++++++++++ include/platform/irq.h | 76 +++++++++++- kernel/build.mk | 3 +- kernel/ipc.c | 31 +++-- kernel/kdb-latency.c | 67 +++++++++++ kernel/kdb.c | 10 ++ kernel/sched.c | 52 ++++---- platform/build.mk | 3 +- platform/irq-latency.c | 209 +++++++++++++++++++++++++++++++++ platform/irq.c | 6 + 10 files changed, 546 insertions(+), 36 deletions(-) create mode 100644 include/platform/irq-latency.h create mode 100644 kernel/kdb-latency.c create mode 100644 platform/irq-latency.c diff --git a/include/platform/irq-latency.h b/include/platform/irq-latency.h new file mode 100644 index 00000000..66d96641 --- /dev/null +++ b/include/platform/irq-latency.h @@ -0,0 +1,125 @@ +/* Copyright (c) 2026 The F9 Microkernel Project. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. 
+ */ + +#ifndef PLATFORM_IRQ_LATENCY_H_ +#define PLATFORM_IRQ_LATENCY_H_ + +#include <stdint.h> + +/* DWT (Data Watchpoint and Trace) registers for cycle counting */ +#define DWT_CTRL ((volatile uint32_t *) 0xE0001000) +#define DWT_CYCCNT ((volatile uint32_t *) 0xE0001004) +#define DWT_CTRL_CYCCNTENA (1 << 0) + +#define DEMCR ((volatile uint32_t *) 0xE000EDFC) +#define DEMCR_TRCENA (1 << 24) + +/** + * @file irq-latency.h + * @brief Interrupt latency measurement and profiling infrastructure + * + * Provides cycle-accurate latency tracking for zero-latency ISRs and + * standard IRQs. Enables validation of BASEPRI-based zero-latency + * interrupt performance (<10 cycle target). + * + * Usage: + * 1. Call latency_sample_start() at ISR entry + * 2. Call latency_sample_end(priority, irq_num) at ISR exit + * 3. View statistics via KDB 'L' command + */ + +/** + * Latency statistics per interrupt priority level. + */ +typedef struct { + uint32_t count; /* Number of samples */ + uint32_t min; /* Minimum latency (cycles) */ + uint32_t max; /* Maximum latency (cycles) */ + uint32_t sum; /* Sum for average calculation */ + uint32_t avg; /* Average latency (cycles) */ +} latency_stats_t; + +/** + * Get current cycle count from DWT_CYCCNT. + * Returns 0 if DWT is not enabled. + */ +static inline uint32_t get_cycle_count(void) +{ + return *DWT_CYCCNT; +} + +/** + * Enable DWT cycle counter for latency measurements. + * Called during system initialization. + */ +void latency_init(void); + +/** + * Record latency sample for an interrupt. + * + * @param priority Interrupt priority (0x0-0xF) + * @param irq_num IRQ number (-15 to 239) + * @param cycles Measured latency in cycles + */ +void latency_record(uint8_t priority, int16_t irq_num, uint32_t cycles); + +/** + * Get latency statistics for a priority level. 
+ * + * @param priority Interrupt priority (0x0-0xF) + * @return Pointer to statistics structure + */ +const latency_stats_t *latency_get_stats(uint8_t priority); + +/** + * Get a best-effort atomic snapshot of latency statistics. + * + * Uses relaxed atomics only; intended for diagnostic reads outside ISR + * context. Returns 1 on success, 0 on invalid input. + */ +int latency_get_stats_snapshot(uint8_t priority, latency_stats_t *out); + +/** + * Reset all latency statistics. + */ +void latency_reset(void); + +/** + * Get interrupt number from IPSR. + * Returns 0 for thread mode, 1-15 for exceptions, 16+ for IRQs. + */ +static inline uint32_t get_irq_number(void) +{ + uint32_t ipsr; + __asm__ __volatile__("mrs %0, ipsr" : "=r"(ipsr)); + return ipsr & 0x1FF; +} + +/** + * Latency measurement helper - call at ISR entry. + * Returns timestamp for latency_sample_end(). + */ +static inline uint32_t latency_sample_start(void) +{ + return get_cycle_count(); +} + +/** + * Latency measurement helper - call at ISR exit. 
+ * + * @param start_cycles Timestamp from latency_sample_start() + * @param priority Interrupt priority level + * @param irq_num IRQ number from IPSR + */ +static inline void latency_sample_end(uint32_t start_cycles, + uint8_t priority, + int16_t irq_num) +{ + uint32_t end_cycles = get_cycle_count(); + uint32_t elapsed = end_cycles - start_cycles; + latency_record(priority, irq_num, elapsed); +} + +#endif /* PLATFORM_IRQ_LATENCY_H_ */ diff --git a/include/platform/irq.h b/include/platform/irq.h index 6a057c1e..efeeed58 100644 --- a/include/platform/irq.h +++ b/include/platform/irq.h @@ -15,6 +15,34 @@ void irq_init(void); +/* + * Interrupt Priority Levels (ARM Cortex-M 4-bit priorities) + */ +#define IRQ_PRIO_ZERO_LATENCY_MAX 0x2 /* Highest priority, never masked */ +#define IRQ_PRIO_SYSTICK 0x3 /* System timer */ +#define IRQ_PRIO_KERNEL_MASK 0x40 /* BASEPRI mask (0x4 << 4) */ +#define IRQ_PRIO_USER_DEFAULT 0x8 /* Default user IRQ priority */ +#define IRQ_PRIO_LOWEST 0xF /* SVCall, PendSV */ + +/* + * System state tracking for ISR context. + * 0 = Thread mode (PSP), 1+ = Handler mode (MSP, tracks nesting depth). + */ +extern volatile uint32_t irq_system_state; + +/* + * Fast ISR context check using hardware IPSR register. + * Returns: true if currently in exception handler, false if in thread mode. + * Zero overhead: Single MRS instruction, no memory access or race conditions. + */ +static inline bool in_isr_context(void) +{ + return IPSR() != 0; +} + +/* + * PRIMASK-based critical sections (blocks ALL interrupts). + */ static inline void irq_disable(void) { __asm__ __volatile__("cpsid i" ::: "memory"); @@ -45,6 +73,53 @@ static inline void irq_restore_flags(uint32_t flags) __asm__ __volatile__("msr primask, %0" ::"r"(flags) : "memory"); } +/* + * BASEPRI-based critical sections (blocks interrupts >= priority level). + * Zero-latency ISRs at priority 0x0-0x2 can preempt kernel critical sections. 
+ */ +static inline void irq_disable_below(uint8_t priority) +{ + uint32_t basepri = (priority << 4) & 0xFF; + __asm__ __volatile__("msr basepri, %0" ::"r"(basepri) : "memory"); +} + +static inline void irq_enable_all(void) +{ + __asm__ __volatile__("msr basepri, %0" ::"r"(0) : "memory"); +} + +static inline uint32_t irq_save_basepri(uint8_t priority) +{ + uint32_t prev_basepri; + uint32_t new_basepri = (priority << 4) & 0xFF; + __asm__ __volatile__( + "mrs %0, basepri\n\t" + "msr basepri, %1" + : "=r"(prev_basepri) + : "r"(new_basepri) + : "memory"); + return prev_basepri; +} + +static inline void irq_restore_basepri(uint32_t basepri) +{ + __asm__ __volatile__("msr basepri, %0" ::"r"(basepri) : "memory"); +} + +/* + * Kernel critical section (masks interrupts >= 0x4, allows 0x0-0x3). + * Use this as the default for scheduler, IPC, and memory operations. + */ +static inline uint32_t irq_kernel_critical_enter(void) +{ + return irq_save_basepri(IRQ_PRIO_KERNEL_MASK >> 4); +} + +static inline void irq_kernel_critical_exit(uint32_t basepri) +{ + irq_restore_basepri(basepri); +} + static inline void irq_svc(void) { __asm__ __volatile__("svc #0"); @@ -242,7 +317,6 @@ extern volatile uint32_t __irq_saved_regs[8]; request_schedule(); \ irq_return(); \ } - extern volatile tcb_t *current; #endif /* PLATFORM_IRQ_H_ */ diff --git a/kernel/build.mk b/kernel/build.mk index ac5d8707..7aa68255 100644 --- a/kernel/build.mk +++ b/kernel/build.mk @@ -21,7 +21,8 @@ kernel-y = \ interrupt.o KDB-$(CONFIG_KDB) = \ - kdb.o + kdb.o \ + kdb-latency.o KPROBES-$(CONFIG_KPROBES) = \ kprobes.o diff --git a/kernel/ipc.c b/kernel/ipc.c index e494acd2..a6340c90 100644 --- a/kernel/ipc.c +++ b/kernel/ipc.c @@ -214,18 +214,19 @@ static void do_ipc(tcb_t *from, tcb_t *to) * CONSTRAINT: Callback MUST NOT destroy its own TCB. 
*/ if (to->ipc_notify && to->notify_pending && to->notify_depth < 3) { - uint32_t irq_flags; + uint32_t basepri; uint8_t generation_before; notify_handler_t callback; /* Atomically increment depth and capture generation. - * IRQ masking prevents race with nested interrupt-driven IPC. + * BASEPRI masking prevents race with nested interrupt-driven IPC. + * Zero-latency ISRs (0x0-0x2) can still preempt during this operation. */ - irq_flags = irq_save_flags(); + basepri = irq_kernel_critical_enter(); to->notify_depth++; generation_before = to->notify_generation; callback = to->ipc_notify; - irq_restore_flags(irq_flags); + irq_kernel_critical_exit(basepri); /* Recursion protection: prevent unbounded callback nesting. * Max depth 3 allows: serial → network → timer notification chains. @@ -245,11 +246,27 @@ static void do_ipc(tcb_t *from, tcb_t *to) /* Atomically decrement depth only if TCB still valid. * Generation counter detects TCB destruction during callback. * If TCB was destroyed, skip depth decrement (would be use-after-free). + * + * SAFETY: We must verify 'to' is still a valid TCB before accessing it. + * Search thread_map to confirm the pointer hasn't been freed and + * reused. */ - irq_flags = irq_save_flags(); - if (to->notify_generation == generation_before) + basepri = irq_kernel_critical_enter(); + + /* Verify TCB is still valid by checking thread_map */ + int tcb_valid = 0; + for (int i = 1; i < thread_count; ++i) { + if (thread_map[i] == to) { + tcb_valid = 1; + break; + } + } + + /* Only decrement if TCB is valid AND generation hasn't changed */ + if (tcb_valid && to->notify_generation == generation_before) to->notify_depth--; - irq_restore_flags(irq_flags); + + irq_kernel_critical_exit(basepri); /* Check for preemption after notification. * Callback may have made higher-priority threads runnable. 
diff --git a/kernel/kdb-latency.c b/kernel/kdb-latency.c new file mode 100644 index 00000000..6749c70d --- /dev/null +++ b/kernel/kdb-latency.c @@ -0,0 +1,67 @@ +/* Copyright (c) 2026 The F9 Microkernel Project. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include +#include + +/** + * KDB command: Display interrupt latency statistics. + * + * Shows min/avg/max latency for each priority level, highlighting + * zero-latency ISRs (0x0-0x2) and standard user IRQs. + */ +void kdb_show_latency(void) +{ + int i; + latency_stats_t stats; + int has_data = 0; + + dbg_printf(DL_KDB, "\n=== Interrupt Latency Statistics ===\n"); + dbg_printf(DL_KDB, "Prio Type Count Min Avg Max\n"); + dbg_printf(DL_KDB, "---- ---------------- ------ ----- ----- -----\n"); + + for (i = 0; i < 16; i++) { + if (!latency_get_stats_snapshot(i, &stats)) + continue; + + if (stats.count == 0) + continue; + + has_data = 1; + + const char *type; + if (i <= 0x2) + type = "Zero-latency ISR"; + else if (i == 0x3) + type = "SysTick"; + else if (i <= 0xE) + type = "User IRQ"; + else + type = "SVCall/PendSV"; + + stats.avg = stats.count > 0 ? (stats.sum / stats.count) : 0; + dbg_printf(DL_KDB, "0x%X %-16s %6u %5u %5u %5u\n", i, type, + stats.count, stats.min, stats.avg, stats.max); + } + + if (!has_data) { + dbg_printf(DL_KDB, "(No latency samples recorded yet)\n"); + } + + dbg_printf(DL_KDB, "\nNotes:\n"); + dbg_printf(DL_KDB, " - Zero-latency ISRs (0x0-0x2) target <10 cycles\n"); + dbg_printf(DL_KDB, " - User IRQs (0x4-0xE) masked during kernel ops\n"); + dbg_printf(DL_KDB, " - Use 'r' to reset statistics\n"); + dbg_printf(DL_KDB, "\n"); +} + +/** + * KDB command: Reset latency statistics. 
+ */ +void kdb_reset_latency(void) +{ + latency_reset(); + dbg_printf(DL_KDB, "Latency statistics reset.\n"); +} diff --git a/kernel/kdb.c b/kernel/kdb.c index e711f587..502eaa2b 100644 --- a/kernel/kdb.c +++ b/kernel/kdb.c @@ -30,6 +30,8 @@ extern void kdb_dump_as(void); extern void kdb_show_sampling(void); extern void kdb_show_tickless_verify(void); extern void kdb_dump_notifications(void); +extern void kdb_show_latency(void); +extern void kdb_reset_latency(void); struct kdb_t kdb_functions[] = { {.option = 'K', @@ -84,6 +86,14 @@ struct kdb_t kdb_functions[] = { .menuentry = "show tickless scheduling stat", .function = kdb_show_tickless_verify}, #endif + {.option = 'L', + .name = "LATENCY", + .menuentry = "show interrupt latency", + .function = kdb_show_latency}, + {.option = 'r', + .name = "RESET LATENCY", + .menuentry = "reset latency statistics", + .function = kdb_reset_latency}, /* Insert KDB functions here */ }; diff --git a/kernel/sched.c b/kernel/sched.c index a3f6af4c..24e27b7a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -100,25 +100,25 @@ void sched_enqueue(tcb_t *thread) { uint8_t prio; tcb_t *head; - uint32_t flags; + uint32_t basepri; if (!thread) return; - flags = irq_save_flags(); + basepri = irq_kernel_critical_enter(); /* Strict invariant: only runnable threads in ready queues * Check inside critical section to prevent race with state changes */ if (thread->state != T_RUNNABLE) { - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); panic("SCHED: Enqueueing non-runnable thread %t (state %d)\n", thread->t_globalid, thread->state); } /* Don't double-enqueue */ if (sched_is_queued(thread)) { - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); return; } @@ -147,7 +147,7 @@ void sched_enqueue(tcb_t *thread) /* Bitmap already set - no update needed */ } - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); } /** @@ -159,16 +159,16 @@ void sched_dequeue(tcb_t *thread) { uint8_t prio; tcb_t *prev, *next; - 
uint32_t flags; + uint32_t basepri; if (!thread) return; - flags = irq_save_flags(); + basepri = irq_kernel_critical_enter(); /* Not in queue */ if (!sched_is_queued(thread)) { - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); return; } @@ -197,7 +197,7 @@ void sched_dequeue(tcb_t *thread) /* Mark as not queued */ sched_link_init(thread); - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); } /** @@ -210,15 +210,15 @@ void sched_yield(void) tcb_t *curr = thread_current(); uint8_t prio; tcb_t *head; - uint32_t flags; + uint32_t basepri; if (!curr) return; - flags = irq_save_flags(); + basepri = irq_kernel_critical_enter(); if (!sched_is_queued(curr)) { - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); return; } @@ -245,7 +245,7 @@ void sched_yield(void) ready_queue[prio] = head->sched_link.next; } - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); } /** @@ -268,15 +268,15 @@ tcb_t *schedule_select(void) uint32_t prio; tcb_t *thread; tcb_t *curr; - uint32_t flags; + uint32_t basepri; - flags = irq_save_flags(); + basepri = irq_kernel_critical_enter(); /* CLZ returns 32 if bitmap is 0 (no branches needed) */ prio = clz32(ready_bitmap); if (prio >= SCHED_PRIORITY_LEVELS) { - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); /* Not reached: idle thread should always be runnable */ panic("SCHED: Empty ready_bitmap (idle missing)\n"); return NULL; @@ -286,7 +286,7 @@ tcb_t *schedule_select(void) /* Safety check for consistency */ if (!thread) { - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); panic("SCHED: Inconsistent bitmap/queue at prio %d\n", prio); return NULL; } @@ -310,7 +310,7 @@ tcb_t *schedule_select(void) "SCHED: PTS defer prio %d (curr %t thresh %d)\n", prio, curr->t_globalid, curr->preempt_threshold); - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); return curr; /* Continue running current thread */ } } @@ -320,7 +320,7 @@ tcb_t *schedule_select(void) */ 
preempted_bitmap &= ~(1UL << (31 - prio)); - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); /* Strict invariant: queued threads are always runnable */ return thread; } @@ -335,7 +335,7 @@ tcb_t *schedule_select(void) */ void sched_set_priority(tcb_t *thread, uint8_t new_prio) { - uint32_t flags; + uint32_t basepri; int was_queued; if (!thread) @@ -344,11 +344,11 @@ void sched_set_priority(tcb_t *thread, uint8_t new_prio) if (new_prio >= SCHED_PRIORITY_LEVELS) new_prio = SCHED_PRIO_IDLE; - flags = irq_save_flags(); + basepri = irq_kernel_critical_enter(); /* Check if priority actually changes */ if (thread->priority == new_prio) { - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); return; } @@ -364,7 +364,7 @@ void sched_set_priority(tcb_t *thread, uint8_t new_prio) if (was_queued) sched_enqueue(thread); - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); } /** @@ -389,7 +389,7 @@ int sched_preemption_change(tcb_t *thread, uint8_t new_threshold, uint8_t *old_threshold) { - uint32_t flags; + uint32_t basepri; uint8_t old_thresh; int should_reschedule = 0; @@ -410,7 +410,7 @@ int sched_preemption_change(tcb_t *thread, return -1; } - flags = irq_save_flags(); + basepri = irq_kernel_critical_enter(); /* Save old threshold (return user-set value) */ old_thresh = thread->user_preempt_threshold; @@ -456,7 +456,7 @@ int sched_preemption_change(tcb_t *thread, } } - irq_restore_flags(flags); + irq_kernel_critical_exit(basepri); /* Trigger reschedule if needed (outside critical section) */ if (should_reschedule) { diff --git a/platform/build.mk b/platform/build.mk index dd4ba431..b2fdc77b 100644 --- a/platform/build.mk +++ b/platform/build.mk @@ -21,7 +21,8 @@ platform-y = \ debug_device.o \ mpu.o \ spinlock.o \ - irq.o + irq.o \ + irq-latency.o platform-$(CONFIG_DEBUG_DEV_UART) += debug_uart.o platform-$(CONFIG_DEBUG_DEV_RAM) += debug_ram.o diff --git a/platform/irq-latency.c b/platform/irq-latency.c new file mode 100644 index 
00000000..802e8a19 --- /dev/null +++ b/platform/irq-latency.c @@ -0,0 +1,209 @@ +/* Copyright (c) 2026 The F9 Microkernel Project. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + */ + +#include +#include +#include +#include +#include + +/** + * Latency statistics per priority level (0x0-0xF = 16 levels). + */ +static latency_stats_t latency_stats[16]; + +/** + * Enable DWT cycle counter for latency measurements. + * + * Note: Called at INIT_LEVEL_PLATFORM with interrupts disabled. + * Safe to reset statistics with plain writes (no atomic operations needed). + */ +void latency_init(void) +{ + uint32_t test_before, test_after; + + /* Enable DWT if not already enabled */ + if (!(*DEMCR & DEMCR_TRCENA)) { + *DEMCR |= DEMCR_TRCENA; /* Enable trace and debug */ + } + + /* Enable cycle counter */ + *DWT_CTRL |= DWT_CTRL_CYCCNTENA; + + /* Reset cycle counter */ + *DWT_CYCCNT = 0; + + /* Verify DWT functionality (helpful diagnostic for QEMU vs hardware) */ + test_before = *DWT_CYCCNT; + for (volatile int i = 0; i < 100; i++) + ; /* Small busy loop */ + test_after = *DWT_CYCCNT; + + if (test_after == test_before) { + /* QEMU: DWT not emulated, cycle counter stays at 0 */ + dbg_printf(DL_KDB, + "IRQ latency profiling enabled (DWT not available)\n"); + } else { + /* Hardware: DWT working, show delta to confirm */ + dbg_printf(DL_KDB, + "IRQ latency profiling enabled (DWT cycle counter active, " + "test delta=%u)\n", + test_after - test_before); + } + + /* Reset statistics - inline to avoid irq_save_flags during early boot */ + for (int i = 0; i < 16; i++) { + latency_stats[i].count = 0; + latency_stats[i].min = 0; + latency_stats[i].max = 0; + latency_stats[i].sum = 0; + latency_stats[i].avg = 0; + } +} + +/* + * Note: DWT cycle counter not emulated in QEMU (reads return 0). + * On real hardware (STM32F4), DWT provides cycle-accurate latency measurement. 
+ * System boots safely in both environments - no hang. + */ +INIT_HOOK(latency_init, INIT_LEVEL_PLATFORM); + +/** + * Record latency sample for an interrupt. + * + * CRITICAL: This function is called from ISRs, including zero-latency ISRs + * (priority 0x0-0x2). It MUST NOT use PRIMASK or any operation that blocks + * zero-latency interrupts. Uses atomic operations for lock-free updates. + * + * Note: min/max updates use atomic compare-exchange loops to ensure + * consistency even under heavy preemption from other zero-latency ISRs. + */ +void latency_record(uint8_t priority, int16_t irq_num, uint32_t cycles) +{ + latency_stats_t *stats; + uint32_t old_min, old_max; + + (void) irq_num; + + /* Validate priority (0x0-0xF) */ + if (priority >= 16) + return; + + /* Ignore obviously bogus samples (wraparound or >1M cycles). + * 1M cycles at 168MHz is ~6ms, which is a reasonable upper bound + * for most real-time ISRs. + */ + if (cycles == 0 || cycles > 1000000) + return; + + stats = &latency_stats[priority]; + + /* Atomic updates for count and sum (lock-free, no PRIMASK). */ + __atomic_add_fetch(&stats->count, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&stats->sum, cycles, __ATOMIC_RELAXED); + + /* Update min/max using atomic compare-exchange loops. + * These ensure that even under heavy preemption, we never miss a new + * min/max value. 
+ */ + + /* Update min */ + old_min = __atomic_load_n(&stats->min, __ATOMIC_RELAXED); + while ((old_min == 0 || cycles < old_min) && + !__atomic_compare_exchange_n(&stats->min, &old_min, cycles, 0, + __ATOMIC_RELAXED, __ATOMIC_RELAXED)) { + /* If CAS failed, old_min was updated by another ISR; loop and retry */ + } + + /* Update max */ + old_max = __atomic_load_n(&stats->max, __ATOMIC_RELAXED); + while (cycles > old_max && + !__atomic_compare_exchange_n(&stats->max, &old_max, cycles, 0, + __ATOMIC_RELAXED, __ATOMIC_RELAXED)) { + /* If CAS failed, old_max was updated by another ISR; loop and retry */ + } +} + +/** + * Get latency statistics for a priority level. + * + * Calculates average lazily on read to avoid division in ISR hot path. + */ +const latency_stats_t *latency_get_stats(uint8_t priority) +{ + latency_stats_t *stats; + + if (priority >= 16) + return NULL; + + stats = &latency_stats[priority]; + + /* NOTE: This returns live stats. Callers must use atomic loads or + * prefer latency_get_stats_snapshot() for a stable read. + */ + return stats; +} + +/** + * Get a best-effort atomic snapshot of latency statistics for a priority. + * + * Uses only relaxed atomics (single-core). We retry if count changes during + * the read. Because count and sum are updated separately, this provides a + * consistent snapshot in the common case but is still best-effort. + */ +int latency_get_stats_snapshot(uint8_t priority, latency_stats_t *out) +{ + uint32_t count_before; + uint32_t count_after; + + if (!out || priority >= 16) + return 0; + + /* + * Retry loop ensures all fields (count, sum, min, max) are from + * the same snapshot generation. Reading count before and after + * ensures no ISR updated the stats during our reads. 
+ */ + do { + count_before = + __atomic_load_n(&latency_stats[priority].count, __ATOMIC_RELAXED); + out->sum = + __atomic_load_n(&latency_stats[priority].sum, __ATOMIC_RELAXED); + out->min = + __atomic_load_n(&latency_stats[priority].min, __ATOMIC_RELAXED); + out->max = + __atomic_load_n(&latency_stats[priority].max, __ATOMIC_RELAXED); + count_after = + __atomic_load_n(&latency_stats[priority].count, __ATOMIC_RELAXED); + } while (count_before != count_after); + + out->count = count_after; + + /* avg is computed by the caller to avoid shared writes. */ + out->avg = 0; + + return 1; +} + +/** + * Reset all latency statistics. + */ +void latency_reset(void) +{ + uint32_t flags; + int i; + + flags = irq_save_flags(); + + for (i = 0; i < 16; i++) { + latency_stats[i].count = 0; + latency_stats[i].min = 0; + latency_stats[i].max = 0; + latency_stats[i].sum = 0; + latency_stats[i].avg = 0; + } + + irq_restore_flags(flags); +} diff --git a/platform/irq.c b/platform/irq.c index 79eb2d96..6afac92f 100644 --- a/platform/irq.c +++ b/platform/irq.c @@ -13,6 +13,12 @@ */ volatile uint32_t __irq_saved_regs[8]; +/* + * System state tracking for ISR context. + * 0 = Thread mode (PSP), 1+ = Handler mode (MSP). + */ +volatile uint32_t irq_system_state = 0; + void irq_init(void) { /* Set all 4-bit to pre-emption priority bit */