From 8568ecd1dd75b670bd73315f582c6183bcba5424 Mon Sep 17 00:00:00 2001 From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> Date: Wed, 17 Jun 2026 23:02:14 -0700 Subject: [PATCH] fix(guest): give page faults their own exception stack A guest exception handler runs on the IST1 stack. If the handler writes a copy-on-write page, the first write faults. The page fault also uses IST1, so the CPU resets RSP to the top of IST1 and writes the fault frame over the live handler frame. The handler then returns to a bad address and the guest aborts. The bug stays latent until an exception handler writes a copy-on-write page. It surfaced when a memory layout change moved a counter that an existing handler increments onto a page that stays copy-on-write after a snapshot. The increment then faulted while the handler ran and crashed the guest. Send page faults to their own IST2 stack so a fault inside a handler keeps the handler frame intact. The page-fault stack uses the second of the two scratch pages already reserved at the top of the region. Add a regression test, exception_handler_nested_page_fault. It installs a handler that writes a copy-on-write page, then triggers int3. Without the fix the guest aborts with a page fault. With the fix it returns 0. Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> --- src/hyperlight_common/src/layout.rs | 2 + .../src/arch/amd64/prim_alloc.rs | 5 +- .../src/arch/amd64/exception/entry.rs | 14 ++++-- .../src/arch/amd64/init.rs | 22 +++++++-- .../src/arch/amd64/machine.rs | 17 ++++++- src/hyperlight_host/tests/integration_test.rs | 25 ++++++++++ src/tests/rust_guests/simpleguest/src/main.rs | 48 +++++++++++++++++++ 7 files changed, 121 insertions(+), 12 deletions(-) diff --git a/src/hyperlight_common/src/layout.rs b/src/hyperlight_common/src/layout.rs index 69ecdb6ef..83b6540f7 100644 --- a/src/hyperlight_common/src/layout.rs +++ b/src/hyperlight_common/src/layout.rs @@ -26,6 +26,8 @@ pub const SCRATCH_TOP_ALLOCATOR_OFFSET: u64 = 0x10; pub const SCRATCH_TOP_SNAPSHOT_PT_GPA_BASE_OFFSET: u64 = 0x18; pub const SCRATCH_TOP_SNAPSHOT_GENERATION_OFFSET: u64 = 0x20; pub const SCRATCH_TOP_EXN_STACK_OFFSET: u64 = 0x30; +/// Top of the page-fault exception stack, one page below the top of scratch memory. +pub const SCRATCH_TOP_PF_EXN_STACK_OFFSET: u64 = 0x1000; pub fn scratch_base_gpa(size: usize) -> u64 { (MAX_GPA - size + 1) as u64 diff --git a/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs b/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs index cfaad9a0b..3392fd19e 100644 --- a/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs +++ b/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs @@ -31,8 +31,9 @@ pub unsafe fn alloc_phys_pages(n: u64) -> u64 { x = inout(reg) x ); } - // Set aside two pages at the top of the scratch region for the - // exception stack, shared state, etc + // Set aside two pages at the top of the scratch region. The top + // page holds shared metadata and the general exception stack. The + // page below it holds the page-fault exception stack. let max_avail = hyperlight_common::layout::MAX_GPA - hyperlight_common::vmem::PAGE_SIZE * 2; if x.checked_add(nbytes) .is_none_or(|xx| xx >= max_avail as u64) diff --git a/src/hyperlight_guest_bin/src/arch/amd64/exception/entry.rs b/src/hyperlight_guest_bin/src/arch/amd64/exception/entry.rs index 87f89f15c..576b7bd9e 100644 --- a/src/hyperlight_guest_bin/src/arch/amd64/exception/entry.rs +++ b/src/hyperlight_guest_bin/src/arch/amd64/exception/entry.rs @@ -22,7 +22,9 @@ use core::arch::{asm, global_asm}; use hyperlight_common::outb::Exception; use super::super::context; -use super::super::machine::{IDT, IdtEntry, IdtPointer, ProcCtrl}; +use super::super::machine::{ + IDT, IST_GENERAL_EXCEPTION, IST_PAGE_FAULT, IdtEntry, IdtPointer, ProcCtrl, +}; unsafe extern "C" { // Exception handlers @@ -174,12 +176,16 @@ global_asm!( pub(in super::super) fn init_idt(pc: *mut ProcCtrl) { let idt = unsafe { &raw mut (*pc).idt }; - let set_idt_entry = |idx, handler: unsafe extern "C" fn()| { + let set_idt_entry_ist = |idx, handler: unsafe extern "C" fn(), ist: u8| { let handler_addr = handler as *const () as u64; unsafe { - (&raw mut (*idt).entries[idx as usize]).write_volatile(IdtEntry::new(handler_addr)); + (&raw mut (*idt).entries[idx as usize]) + .write_volatile(IdtEntry::new_with_ist(handler_addr, ist)); } }; + let set_idt_entry = |idx, handler: unsafe extern "C" fn()| { + set_idt_entry_ist(idx, handler, IST_GENERAL_EXCEPTION) + }; set_idt_entry(Exception::DivideByZero, _do_excp0); // Divide by zero set_idt_entry(Exception::Debug, _do_excp1); // Debug set_idt_entry(Exception::NonMaskableInterrupt, _do_excp2); // Non-maskable interrupt @@ -194,7 +200,7 @@ pub(in super::super) fn init_idt(pc: *mut ProcCtrl) { set_idt_entry(Exception::SegmentNotPresent, _do_excp11); // Segment Not Present set_idt_entry(Exception::StackSegmentFault, _do_excp12); // Stack-Segment Fault set_idt_entry(Exception::GeneralProtectionFault, _do_excp13); // General Protection Fault - set_idt_entry(Exception::PageFault, _do_excp14); // Page Fault + set_idt_entry_ist(Exception::PageFault, _do_excp14, IST_PAGE_FAULT); // Page Fault (own IST stack) set_idt_entry(Exception::Reserved, _do_excp15); // Reserved set_idt_entry(Exception::X87FloatingPointException, _do_excp16); // x87 Floating-Point Exception set_idt_entry(Exception::AlignmentCheck, _do_excp17); // Alignment Check diff --git a/src/hyperlight_guest_bin/src/arch/amd64/init.rs b/src/hyperlight_guest_bin/src/arch/amd64/init.rs index 073bd3a2f..912bc5d5d 100644 --- a/src/hyperlight_guest_bin/src/arch/amd64/init.rs +++ b/src/hyperlight_guest_bin/src/arch/amd64/init.rs @@ -79,10 +79,19 @@ unsafe fn init_gdt(pc: *mut ProcCtrl) { } } -/// Hyperlight's TSS contains only a single IST entry, which is used -/// to set up the stack switch to the exception stack whenever we take -/// an exception (including page faults, which are important, since -/// the fault might be due to needing to grow the stack!) +/// Hyperlight's TSS provides two IST stacks. The CPU switches to one +/// when an exception is taken, so a handler always runs on a known-good +/// stack. This matters because a fault can mean the main stack needs to +/// grow. +/// +/// * `ist1` is the general exception stack. +/// * `ist2` is the page-fault stack. +/// +/// Page faults get a separate stack because they can nest inside +/// another exception. A handler running on `ist1` may write a +/// copy-on-write page, which raises a page fault. The CPU delivers that +/// fault on `ist2`, so each one has its own stack and the handler +/// resumes once the fault is serviced. /// /// This function sets up the TSS and then points the processor at the /// system segment descriptor, initialized in [`init_gdt`] above, @@ -96,6 +105,11 @@ unsafe fn init_tss(pc: *mut ProcCtrl) { - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + 1; ist1_ptr.write_volatile(exn_stack.to_ne_bytes()); + let ist2_ptr = &raw mut (*tss_ptr).ist2 as *mut [u8; 8]; + let pf_exn_stack = hyperlight_common::layout::MAX_GVA as u64 + - hyperlight_common::layout::SCRATCH_TOP_PF_EXN_STACK_OFFSET + + 1; + ist2_ptr.write_volatile(pf_exn_stack.to_ne_bytes()); asm!( "ltr ax", in("ax") core::mem::offset_of!(HyperlightGDT, tss), diff --git a/src/hyperlight_guest_bin/src/arch/amd64/machine.rs b/src/hyperlight_guest_bin/src/arch/amd64/machine.rs index cde8118e3..b4facd6d0 100644 --- a/src/hyperlight_guest_bin/src/arch/amd64/machine.rs +++ b/src/hyperlight_guest_bin/src/arch/amd64/machine.rs @@ -20,6 +20,12 @@ use hyperlight_common::vmem::{BasicMapping, MappingKind, PAGE_SIZE}; use super::layout::PROC_CONTROL_GVA; +/// IDT gate IST index for general exceptions. Selects [`TSS::ist1`]. +pub(super) const IST_GENERAL_EXCEPTION: u8 = 1; +/// IDT gate IST index for page faults. Selects [`TSS::ist2`], the +/// page-fault stack. See the TSS setup in `init.rs` for why. +pub(super) const IST_PAGE_FAULT: u8 = 2; + /// Entry in the Global Descriptor Table (GDT) /// For reference, see page 3-10 Vol. 3A of Intel 64 and IA-32 /// Architectures Software Developer's Manual, figure 3-8 @@ -117,7 +123,7 @@ pub(super) struct TSS { _rsp2: u64, _rsvd1: [u8; 8], pub(super) ist1: u64, - _ist2: u64, + pub(super) ist2: u64, _ist3: u64, _ist4: u64, _ist5: u64, @@ -127,6 +133,7 @@ pub(super) struct TSS { } const _: () = assert!(mem::size_of::() == 0x64); const _: () = assert!(mem::offset_of!(TSS, ist1) == 0x24); +const _: () = assert!(mem::offset_of!(TSS, ist2) == 0x2c); /// An entry in the Interrupt Descriptor Table (IDT) /// For reference, see page 7-20 Vol. 3A of Intel 64 and IA-32 @@ -154,10 +161,16 @@ const _: () = assert!(mem::size_of::() == 0x10); impl IdtEntry { pub(super) fn new(handler: u64) -> Self { + Self::new_with_ist(handler, IST_GENERAL_EXCEPTION) + } + + /// Build an IDT gate that switches to IST stack `ist` (1-based, one + /// of `TSS::ist1..ist7`) when the vector is taken. + pub(super) fn new_with_ist(handler: u64, ist: u8) -> Self { Self { offset_low: (handler & 0xFFFF) as u16, selector: 0x08, // Kernel Code Segment - interrupt_stack_table_offset: 1, + interrupt_stack_table_offset: ist, type_attr: 0x8E, // 0x8E = 10001110b // 1 00 0 1101 diff --git a/src/hyperlight_host/tests/integration_test.rs b/src/hyperlight_host/tests/integration_test.rs index 6b5a7f8e3..49f9d2cf2 100644 --- a/src/hyperlight_host/tests/integration_test.rs +++ b/src/hyperlight_host/tests/integration_test.rs @@ -1674,6 +1674,31 @@ fn exception_handler_installation_and_validation() { }); } +/// A guest exception handler writes a copy-on-write page, which faults while +/// the handler runs on the exception stack. Page faults use their own stack, +/// so the handler frame survives and the guest resumes. +#[test] +fn exception_handler_nested_page_fault() { + with_rust_sandbox(|mut sandbox| { + let count: i32 = sandbox.call("GetExceptionHandlerCallCount", ()).unwrap(); + assert_eq!(count, 0, "Handler should not have been called yet"); + + sandbox + .call::<()>("InstallCowFaultingHandler", 3i32) + .unwrap(); + + // The handler faults as it runs. The guest resumes from int3 and returns 0. + let trigger_result: i32 = sandbox.call("TriggerInt3Bare", ()).unwrap(); + assert_eq!( + trigger_result, 0, + "Guest should resume after the nested page fault" + ); + + let count: i32 = sandbox.call("GetExceptionHandlerCallCount", ()).unwrap(); + assert_eq!(count, 1, "Handler should have been called once"); + }); +} + /// Tests that an exception can be properly handled even when the heap is exhausted. /// The guest function fills the heap completely, then triggers a ud2 exception. /// This validates that the exception handling path does not require heap allocations. diff --git a/src/tests/rust_guests/simpleguest/src/main.rs b/src/tests/rust_guests/simpleguest/src/main.rs index acc176052..1e58d1cbe 100644 --- a/src/tests/rust_guests/simpleguest/src/main.rs +++ b/src/tests/rust_guests/simpleguest/src/main.rs @@ -187,6 +187,54 @@ fn trigger_int3() -> i32 { 0 } +/// Page-aligned probe written from [`cow_faulting_exception_handler`]. +/// Its page stays copy-on-write after the snapshot, so the handler's +/// first write faults while the handler runs on the exception stack. +#[repr(align(4096))] +struct CowFaultProbe([u64; 512]); +static mut COW_FAULT_PROBE: CowFaultProbe = CowFaultProbe([0; 512]); + +/// Handler that faults while it runs by writing a copy-on-write page. +fn cow_faulting_exception_handler( + exception_number: u64, + _exception_info: *mut ExceptionInfo, + _context: *mut Context, + _page_fault_address: u64, +) -> bool { + HANDLER_INVOCATION_COUNT.fetch_add(1, Ordering::SeqCst); + + // INT3 is exception vector 3 + assert_eq!(exception_number, 3); + + // First write to this page faults, here on the exception stack. + unsafe { + let probe = &raw mut COW_FAULT_PROBE.0; + core::ptr::write_volatile(&mut (*probe)[0], TEST_R10_VALUE); + } + + // Return true to resume execution. + true +} + +/// Install [`cow_faulting_exception_handler`] for a vector. +#[guest_function("InstallCowFaultingHandler")] +fn install_cow_faulting_handler(vector: i32) { + hyperlight_guest_bin::exception::arch::HANDLERS[vector as usize].store( + cow_faulting_exception_handler as *const () as usize as u64, + Ordering::Release, + ); +} + +/// Trigger an INT3 breakpoint (vector 3). Pairs with +/// [`install_cow_faulting_handler`]. +#[guest_function("TriggerInt3Bare")] +fn trigger_int3_bare() -> i32 { + unsafe { + core::arch::asm!("int3"); + } + 0 +} + #[guest_function("EchoFloat")] fn echo_float(value: f32) -> f32 { value