From f1c1a635bee28d948a10cf0cae49bc5b9e7068ad Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Tue, 5 May 2026 05:42:32 +0800
Subject: [PATCH] Move mmap gap hints for per-guest isolation

Two static globals in src/syscall/mem.c, mmap_r{w,x}_gap_hint, described a
specific guest's region layout and would alias if two guest_t instances ever
coexisted in one process (test harnesses, future multi-VM use).

Move both fields into guest_t. guest_reset zeroes them;
guest_{init,init_from_shm} zero them via memset. The public
mmap_reset_hints() helper and its callers in src/syscall/exec.c (right after
guest_reset) and src/runtime/fork-state.c (right after syscall_init in
fork_ipc_recv_fd_table) are no longer needed and removed.

find_free_gap loses const on its guest_t argument to mutate the per-guest
hints; it is static so the change has no cross-TU API impact. The four
munmap/mremap rewind sites point at g->mmap_*_gap_hint instead.

Audited the rest of the module-level state in src/. Host-derived caches
(sysinfo, host_port, totalram, getloadavg, cached uname/groups/affinity/
rlimits) stay process-global by design: any future second guest in this
process would share the same host stats. Other process-scoped tables
(proc_table, next_guest_pid, pidfd/inotify/netlink/sysv-ipc/abstract-socket
tables, futex buckets, thread_table, signal state, procemu temp dirs, log
state) remain global because the current architecture is one VM per macOS
process (HVF restriction) and fork uses posix_spawn into a fresh process.
Documented the sysinfo cache inline as intentionally process-scoped.
--- src/core/guest.c | 2 ++ src/core/guest.h | 20 +++++++++---- src/runtime/fork-state.c | 1 - src/syscall/abi.h | 3 -- src/syscall/exec.c | 1 - src/syscall/mem.c | 65 ++++++++++++++++++---------------------- src/syscall/sys.c | 14 +++++++-- 7 files changed, 58 insertions(+), 48 deletions(-) diff --git a/src/core/guest.c b/src/core/guest.c index ea4258e..632a033 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -796,6 +796,8 @@ void guest_reset(guest_t *g) g->mmap_end = MMAP_INITIAL_END; g->mmap_rx_next = MMAP_RX_BASE; g->mmap_rx_end = MMAP_RX_INITIAL_END; + g->mmap_rw_gap_hint = 0; + g->mmap_rx_gap_hint = 0; g->ttbr0 = 0; g->need_tlbi = false; diff --git a/src/core/guest.h b/src/core/guest.h index 6e57623..b21aaa4 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -127,22 +127,32 @@ typedef struct { typedef struct { void *host_base; /* Host pointer to allocated guest memory */ int shm_fd; /* File fd backing host_base for CoW fork (-1 if MAP_ANON) */ + uint64_t guest_size; /* Total size (determined by IPA capacity) */ uint64_t ipa_base; /* IPA base for hv_vm_map (GUEST_IPA_BASE) */ uint64_t mmap_limit; /* Max mmap address (computed from guest_size) */ - uint64_t - interp_base; /* Dynamic linker load base (computed from guest_size) */ + + uint64_t interp_base; /* Dynamic linker load base (from guest_size) */ uint64_t pt_pool_next; /* Next free page table page in pool */ uint64_t brk_base; /* Initial brk (set after ELF load) */ uint64_t brk_current; /* Current brk position */ uint64_t stack_base; /* Bottom of stack region (dynamic, above brk) */ uint64_t stack_top; /* Top of stack (stack grows down from here) */ - uint64_t mmap_next; /* RW mmap high-water mark for fork IPC snapshots */ - uint64_t mmap_end; /* Current page-table-covered RW mmap limit */ + + uint64_t mmap_next; /* RW mmap high-water mark for fork IPC snapshots */ + uint64_t mmap_end; /* Current page-table-covered RW mmap limit */ /* RX mmap high-water mark serialized through fork IPC. 
*/ uint64_t mmap_rx_next; uint64_t mmap_rx_end; /* Current page-table-covered RX mmap limit */ - uint64_t ttbr0; /* TTBR0 value (IPA of L0 page table) */ + /* Gap-finder allocator hints. First free GPA past the last successful mmap + * in each region; munmap and mremap rewind the hint when a lower address is + * freed. mprotect does not, since permission changes do not free address + * space. Per-guest so multi-guest test harnesses (or any future second VM + * in the same process) cannot cross-pollute each other's allocator state. + */ + uint64_t mmap_rw_gap_hint, mmap_rx_gap_hint; + + uint64_t ttbr0; /* TTBR0 value (IPA of L0 page table) */ bool need_tlbi; /* Signal shim to flush TLB after page table changes */ hv_vcpu_t vcpu; /* vCPU handle */ hv_vcpu_exit_t *exit; /* vCPU exit info */ diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c index 5fb530d..b40c324 100644 --- a/src/runtime/fork-state.c +++ b/src/runtime/fork-state.c @@ -285,7 +285,6 @@ int fork_ipc_recv_fd_table(int ipc_fd, guest_t *g) } syscall_init(); - mmap_reset_hints(); if (num_fds > FD_TABLE_SIZE) { log_error("fork-child: num_fds %u exceeds FD_TABLE_SIZE", num_fds); diff --git a/src/syscall/abi.h b/src/syscall/abi.h index 286fef7..578253c 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -688,9 +688,6 @@ static inline void sock_opt_clear(fd_entry_t *e) /* Initialize the syscall subsystem (FD table, etc.) */ void syscall_init(void); -/* Reset mmap gap-finder hints after execve. */ -void mmap_reset_hints(void); - /* Dispatch a syscall. Reads X8 (nr) and X0-X5 (args) from vCPU registers. * Writes result back to X0. Sets *exit_code if the process should exit. * Returns 0 to continue, 1 to exit. diff --git a/src/syscall/exec.c b/src/syscall/exec.c index 52f109d..3ec2870 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -447,7 +447,6 @@ int64_t sys_execve(hv_vcpu_t vcpu, * kernel exec failure after its point of no return. 
*/ guest_reset(g); - mmap_reset_hints(); /* The replacement image must not inherit process-wide shutdown requests * from the old thread group. diff --git a/src/syscall/mem.c b/src/syscall/mem.c index 14a57cd..9a02a62 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -32,16 +32,19 @@ pthread_mutex_t mmap_lock = PTHREAD_MUTEX_INITIALIZER; /* Lock order: 1 */ /* Gap-finding allocator for mmap. * - * find_free_gap_inner() scans guest_t.regions[] (sorted) for the first free - * gap of length bytes within [min_addr, max_addr). Replaces a bump - * allocator so munmap'd ranges become reusable (critical for runtimes that - * reserve, trim, and re-reserve in the same address window). + * find_free_gap_inner() scans guest_t.regions[] (sorted) for the first free gap + * of length bytes within [min_addr, max_addr). Replaces a bump allocator so + * munmap'd ranges become reusable (critical for runtimes that reserve, trim, + * and re-reserve in the same address window). * - * The cached hints below amortize the O(n) scan to O(1) for sequential - * allocations: after each success, the hint is set to the end of the - * allocation. munmap resets the hint when freeing a region before it. + * Per-guest hints (mmap_rw_gap_hint / mmap_rx_gap_hint in guest_t) amortize the + * O(n) scan to O(1) for sequential allocations: after each success the hint is + * set to the end of the allocation. munmap/mremap rewinds the hint when a lower + * address is freed. Stored in guest_t so multiple guest instances in one + * process (test harnesses, future multi-VM use) cannot cross-pollute each + * other's allocator state. Reset to 0 by guest_init, guest_init_from_shm (via + * memset), and guest_reset. */ -static uint64_t mmap_rw_gap_hint = 0, mmap_rx_gap_hint = 0; typedef struct { uint64_t start, end; @@ -103,16 +106,6 @@ static int dup_region_backing_fd(const guest_region_t *region) return dup(region->backing_fd); } -/* Reset mmap gap hints after execve. 
Without this, the gap-finder starts - * searching past the previous binary's allocations, wasting address space - * and potentially causing issues with the new dynamic linker. - */ -void mmap_reset_hints(void) -{ - mmap_rw_gap_hint = 0; - mmap_rx_gap_hint = 0; -} - static uint64_t find_free_gap_inner(const guest_t *g, uint64_t length, uint64_t min_addr, @@ -152,14 +145,14 @@ static uint64_t find_free_gap_inner(const guest_t *g, * mmap activity. A miss falls back to the region base so holes reopened by * munmap are still reusable. */ -static uint64_t find_free_gap(const guest_t *g, +static uint64_t find_free_gap(guest_t *g, uint64_t length, uint64_t min_addr, uint64_t max_addr) { /* RX and RW mappings advance independently, so keep separate hints. */ uint64_t *hint = - (min_addr < MMAP_BASE) ? &mmap_rx_gap_hint : &mmap_rw_gap_hint; + (min_addr < MMAP_BASE) ? &g->mmap_rx_gap_hint : &g->mmap_rw_gap_hint; /* Try cached hint first (only if within the valid range) */ if (*hint >= min_addr && *hint < max_addr) { @@ -771,10 +764,10 @@ int64_t sys_mremap(guest_t *g, memset((uint8_t *) g->host_base + tail_off, 0, tail_end - tail_off); guest_region_remove(g, tail_off, tail_end); guest_invalidate_ptes(g, tail_off, tail_end); - if (tail_off < mmap_rw_gap_hint) - mmap_rw_gap_hint = tail_off; - if (tail_off < mmap_rx_gap_hint) - mmap_rx_gap_hint = tail_off; + if (tail_off < g->mmap_rw_gap_hint) + g->mmap_rw_gap_hint = tail_off; + if (tail_off < g->mmap_rx_gap_hint) + g->mmap_rx_gap_hint = tail_off; return (int64_t) old_addr; } @@ -844,10 +837,10 @@ int64_t sys_mremap(guest_t *g, memset((uint8_t *) g->host_base + old_off, 0, old_size); guest_region_remove(g, old_off, old_off + old_size); guest_invalidate_ptes(g, old_off, old_off + old_size); - if (old_off < mmap_rw_gap_hint) - mmap_rw_gap_hint = old_off; - if (old_off < mmap_rx_gap_hint) - mmap_rx_gap_hint = old_off; + if (old_off < g->mmap_rw_gap_hint) + g->mmap_rw_gap_hint = old_off; + if (old_off < g->mmap_rx_gap_hint) + 
g->mmap_rx_gap_hint = old_off; } if (guest_region_add_ex_owned( @@ -987,10 +980,10 @@ int64_t sys_mremap(guest_t *g, memset((uint8_t *) g->host_base + old_off, 0, old_size); guest_region_remove(g, old_off, old_off + old_size); guest_invalidate_ptes(g, old_off, old_off + old_size); - if (old_off < mmap_rw_gap_hint) - mmap_rw_gap_hint = old_off; - if (old_off < mmap_rx_gap_hint) - mmap_rx_gap_hint = old_off; + if (old_off < g->mmap_rw_gap_hint) + g->mmap_rw_gap_hint = old_off; + if (old_off < g->mmap_rx_gap_hint) + g->mmap_rx_gap_hint = old_off; /* Track new region */ if (guest_region_add_ex_owned( @@ -1211,10 +1204,10 @@ int64_t sys_munmap(guest_t *g, uint64_t addr, uint64_t length) memset((uint8_t *) g->host_base + zstart, 0, zend - zstart); } guest_region_remove(g, unmap_off, end); - if (unmap_off < mmap_rw_gap_hint) - mmap_rw_gap_hint = unmap_off; - if (unmap_off < mmap_rx_gap_hint) - mmap_rx_gap_hint = unmap_off; + if (unmap_off < g->mmap_rw_gap_hint) + g->mmap_rw_gap_hint = unmap_off; + if (unmap_off < g->mmap_rx_gap_hint) + g->mmap_rx_gap_hint = unmap_off; } } return 0; diff --git a/src/syscall/sys.c b/src/syscall/sys.c index 4d540f4..80a007f 100644 --- a/src/syscall/sys.c +++ b/src/syscall/sys.c @@ -36,8 +36,8 @@ static int cached_ngroups = -1; static const linux_utsname_t cached_uname = { .sysname = "Linux", .nodename = "elfuse", - /* Kernel version: match the lima aarch64 VM kernel to avoid - * version-gated feature detection mismatches in userspace. + /* Kernel version: match the lima aarch64 VM kernel to avoid version-gated + * feature detection mismatches in userspace. */ .release = "6.17.0-20-generic", .version = "#20-Ubuntu SMP PREEMPT_DYNAMIC", @@ -45,6 +45,16 @@ static const linux_utsname_t cached_uname = { .domainname = "(none)", }; static const uint8_t cached_affinity_mask[256] = {1}, zero_block[256] = {0}; + +/* sysinfo cache. 
+ * + * Process-scoped by intent: the cache mirrors the host's view (totalram from + * sysctl(HW_MEMSIZE), free pages from host_statistics64, getloadavg). Even if + * multiple guest_t instances ever coexist in one process they share the same + * host stats, so a single rwlock-protected cache refreshed at most once per + * second is the right shape. Audited under TODO "Static state testability + * audit" -- intentionally NOT moved into guest_t. + */ static pthread_once_t sysinfo_once = PTHREAD_ONCE_INIT; static pthread_rwlock_t sysinfo_lock = PTHREAD_RWLOCK_INITIALIZER; static time_t cached_boottime_sec = 0;