From fb5e2539c4199fdc2ae6758ffc20b839e4d5cdfd Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Tue, 5 May 2026 03:31:10 +0800 Subject: [PATCH] Use 4KiB L3 pages for mixed-permission 2MiB block guest_build_page_tables OR-merged perms when multiple regions shared a 2MiB block, collapsing small static binaries' .text RX + .data RW + heap RW into one RWX block. The pair-only post-build fixup in bootstrap.c only handled adjacent array pairs, and exec.c had no fixup at all so execve into a small binary left the merged RWX state. The new finalize_block_perms at the end of guest_build_page_tables splits any 2MiB block where the input regions disagree on perms or leave gaps, applies the union of perms per 4KiB page, and leaves uncovered pages invalid. Idempotent across overlapping regions, so no visited-set is needed. Bundled adjustments: VDSO declared as an explicit boot region in bootstrap.c and exec.c (it previously rode the shim's RX block); MAX_REGIONS / MAX_BOOT_REGIONS bumped to 8 + 2*ELF_MAX_SEGMENTS and silent segment truncation in execve turned into a fatal abort; build_boot_regions / load_interpreter / finalize_block_perms switched to bool returns where the only meaningful values are success/failure; BLOCK_2MB family and \d+(KB|MB|GB|TB) literals swept to IEC binary prefixes (BLOCK_2MIB, KiB/MiB/GiB/TiB) across src/ and tests/. Lowercase 'kB' in /proc/meminfo and /proc/self/status emulation kept verbatim for Linux ABI compat. --- src/core/bootstrap.c | 79 ++++---- src/core/bootstrap.h | 6 +- src/core/elf.c | 2 +- src/core/guest.c | 344 +++++++++++++++++++++++---------- src/core/guest.h | 58 +++--- src/core/shim.S | 2 +- src/core/stack.c | 2 +- src/core/vdso.c | 6 +- src/core/vdso.h | 11 +- src/debug/gdbstub.c | 2 +- src/hvutil.h | 2 +- src/main.c | 2 +- src/runtime/forkipc.c | 2 +- src/runtime/proctitle.c | 2 +- src/runtime/proctitle.h | 2 +- src/runtime/thread.c | 12 +- src/runtime/thread.h | 12 +- src/syscall/exec.c | 50 +++-- src/syscall/fs.c | 8 +- src/syscall/inotify.c | 2 +- src/syscall/io.c | 4 +- src/syscall/mem.c | 34 ++-- src/syscall/net-absock.h | 6 +- src/syscall/proc-state.h | 6 +- src/syscall/proc.c | 13 +- src/syscall/sys.c | 6 +- src/utils.h | 10 +- tests/test-cow-fork.c | 6 +- tests/test-futex-pi.c | 2 +- tests/test-guard-page.c | 6 +- tests/test-large-io-boundary.c | 18 +- tests/test-madvise.c | 4 +- tests/test-mremap.c | 2 +- tests/test-multi-vcpu.c | 32 +-- tests/test-perf.sh | 4 +- tests/test-rwx.c | 58 +++--- tests/test-stress.c | 6 +- tests/test-thread.c | 2 +- 38 files changed, 494 insertions(+), 331 deletions(-) diff --git a/src/core/bootstrap.c b/src/core/bootstrap.c index b608146..e7912fb 100644 --- a/src/core/bootstrap.c +++ b/src/core/bootstrap.c @@ -1,4 +1,4 @@ -/* Guest bootstrap helpers for elfuse +/* Guest bootstrap helpers * * Copyright 2026 elfuse contributors * SPDX-License-Identifier: Apache-2.0 @@ -30,7 +30,10 @@ #include "debug/log.h" -#define MAX_BOOT_REGIONS 32 +/* Worst case: 7 fixed regions (shim, shim-data, vDSO, brk, stack, mmap RX, mmap + * RW) plus up to ELF_MAX_SEGMENTS for both the executable and the interpreter. 
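+ * (The constant is 8 rather than 7 to leave one spare slot. For scale, with
+ * a hypothetical ELF_MAX_SEGMENTS of 16 this is 40 entries, well under a
+ * KiB of mem_region_t storage on the stack.)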
+ */ +#define MAX_BOOT_REGIONS (8 + 2 * ELF_MAX_SEGMENTS) static bool append_boot_region(mem_region_t *regions, int *nregions, @@ -83,12 +86,12 @@ static void log_initial_page_tables(const guest_t *g, uint64_t ttbr0) } } -static int load_interpreter(guest_t *g, - const char *sysroot, - guest_bootstrap_t *boot) +static bool load_interpreter(guest_t *g, + const char *sysroot, + guest_bootstrap_t *boot) { if (boot->elf_info.interp_path[0] == '\0') - return 0; + return true; elf_resolve_interp(sysroot, boot->elf_info.interp_path, boot->interp_resolved, sizeof(boot->interp_resolved)); @@ -96,20 +99,20 @@ static int load_interpreter(guest_t *g, if (elf_load(boot->interp_resolved, &boot->interp_info) < 0) { log_error("failed to load interpreter: %s", boot->interp_resolved); - return -1; + return false; } if (boot->interp_info.e_machine != EM_AARCH64) { log_error("interpreter has unsupported machine type %u: %s", boot->interp_info.e_machine, boot->interp_resolved); - return -1; + return false; } boot->interp_base = g->interp_base; if (elf_map_segments(&boot->interp_info, boot->interp_resolved, g->host_base, g->guest_size, boot->interp_base) < 0) { log_error("failed to map interpreter segments"); - return -1; + return false; } log_debug( @@ -117,20 +120,27 @@ static int load_interpreter(guest_t *g, (unsigned long long) boot->interp_base, (unsigned long long) (boot->interp_info.entry + boot->interp_base), boot->interp_info.num_segments); - return 0; + return true; } -static int build_boot_regions(mem_region_t *regions, - int *nregions, - guest_t *g, - const guest_bootstrap_t *boot, - size_t shim_bin_len) +static bool build_boot_regions(mem_region_t *regions, + int *nregions, + guest_t *g, + const guest_bootstrap_t *boot, + size_t shim_bin_len) { + /* The vDSO trampolines live in the same 2MiB block as the shim. They must + * appear in the region set so finalize_block_perms validates and grants RX + * to the vDSO page when splitting the block; otherwise vdso_build cannot + * write into it through guest_ptr. 
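+ * (Concretely, that first 2MiB block is [0x000000, 0x200000): the vDSO page
+ * at VDSO_BASE = 0xF000, the page-table pool at [PT_POOL_BASE, PT_POOL_END),
+ * and the shim text from SHIM_BASE = 0x100000.)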
+ */ if (!append_boot_region(regions, nregions, SHIM_BASE, SHIM_BASE + shim_bin_len, MEM_PERM_RX) || !append_boot_region(regions, nregions, SHIM_DATA_BASE, - SHIM_DATA_BASE + BLOCK_2MB, MEM_PERM_RW)) { - return -1; + SHIM_DATA_BASE + BLOCK_2MIB, MEM_PERM_RW) || + !append_boot_region(regions, nregions, VDSO_BASE, VDSO_BASE + VDSO_SIZE, + MEM_PERM_RX)) { + return false; } for (int i = 0; i < boot->elf_info.num_segments; i++) { @@ -140,7 +150,7 @@ static int build_boot_regions(mem_region_t *regions, boot->elf_info.segments[i].gpa + boot->elf_info.segments[i].memsz + boot->elf_load_base, elf_pf_to_prot(boot->elf_info.segments[i].flags))) { - return -1; + return false; } } @@ -151,7 +161,7 @@ static int build_boot_regions(mem_region_t *regions, boot->interp_info.segments[i].gpa + boot->interp_info.segments[i].memsz + boot->interp_base, elf_pf_to_prot(boot->interp_info.segments[i].flags))) { - return -1; + return false; } } @@ -163,12 +173,12 @@ static int build_boot_regions(mem_region_t *regions, MMAP_RX_INITIAL_END, MEM_PERM_RX) || !append_boot_region(regions, nregions, MMAP_BASE, MMAP_INITIAL_END, MEM_PERM_RW)) { - return -1; + return false; } g->mmap_rx_end = MMAP_RX_INITIAL_END; g->mmap_end = MMAP_INITIAL_END; - return 0; + return true; } int guest_bootstrap_prepare(guest_t *g, @@ -214,7 +224,7 @@ int guest_bootstrap_prepare(guest_t *g, } *guest_initialized = true; - log_debug("IPA size: %u bits (%lluGB primary)", g->ipa_bits, + log_debug("IPA size: %u bits (%llu GiB primary)", g->ipa_bits, (unsigned long long) (g->guest_size / (1024ULL * 1024 * 1024))); boot->elf_load_base = (boot->elf_info.e_type == ET_DYN) ? PIE_LOAD_BASE : 0; @@ -229,15 +239,15 @@ int guest_bootstrap_prepare(guest_t *g, g->brk_base = BRK_BASE_DEFAULT; g->brk_current = g->brk_base; - g->stack_top = ALIGN_UP(g->brk_base, BLOCK_2MB) + STACK_SIZE; + g->stack_top = ALIGN_UP(g->brk_base, BLOCK_2MIB) + STACK_SIZE; if (g->stack_top < STACK_TOP_DEFAULT) g->stack_top = STACK_TOP_DEFAULT; g->stack_base = g->stack_top - STACK_SIZE; - if (load_interpreter(g, sysroot, boot) < 0) + if (!load_interpreter(g, sysroot, boot)) return -1; - if (shim_bin_len > BLOCK_2MB) { + if (shim_bin_len > BLOCK_2MIB) { log_error("shim binary too large (%zu bytes)", shim_bin_len); return -1; } @@ -252,7 +262,7 @@ int guest_bootstrap_prepare(guest_t *g, boot->interp_base); sys_icache_invalidate((uint8_t *) g->host_base + SHIM_BASE, shim_bin_len); - if (build_boot_regions(regions, &nregions, g, boot, shim_bin_len) < 0) { + if (!build_boot_regions(regions, &nregions, g, boot, shim_bin_len)) { log_error("too many memory regions (%d >= %d)", nregions, MAX_BOOT_REGIONS); return -1; @@ -263,25 +273,12 @@ int guest_bootstrap_prepare(guest_t *g, log_error("failed to build page tables"); return -1; } - - for (int i = 1; i < nregions; i++) { - uint64_t prev_block = (regions[i - 1].gpa_end - 1) & ~(BLOCK_2MB - 1); - uint64_t curr_block = regions[i].gpa_start & ~(BLOCK_2MB - 1); - if (prev_block == curr_block && - regions[i - 1].perms != regions[i].perms && - guest_split_block(g, curr_block) == 0) { - guest_update_perms(g, regions[i - 1].gpa_start, - regions[i - 1].gpa_end, regions[i - 1].perms); - guest_update_perms(g, regions[i].gpa_start, regions[i].gpa_end, - regions[i].perms); - } - } g->need_tlbi = true; guest_region_add(g, SHIM_BASE, SHIM_BASE + shim_bin_len, LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0, "[shim]"); - guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MB, + guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MIB, 
LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0, "[shim-data]"); @@ -386,7 +383,7 @@ int guest_bootstrap_create_vcpu(guest_t *g, uint64_t shim_ipa = guest_ipa(g, SHIM_BASE); uint64_t entry_ipa = guest_ipa(g, boot->entry_point); uint64_t sp_ipa = guest_ipa(g, boot->stack_pointer); - uint64_t el1_sp = guest_ipa(g, SHIM_DATA_BASE + BLOCK_2MB); + uint64_t el1_sp = guest_ipa(g, SHIM_DATA_BASE + BLOCK_2MIB); hv_vcpu_t vcpu; hv_vcpu_exit_t *vexit; diff --git a/src/core/bootstrap.h b/src/core/bootstrap.h index 0939f95..e2ce4c4 100644 --- a/src/core/bootstrap.h +++ b/src/core/bootstrap.h @@ -1,11 +1,11 @@ -#pragma once - -/* Guest bootstrap helpers for elfuse +/* Guest bootstrap helpers * * Copyright 2026 elfuse contributors * SPDX-License-Identifier: Apache-2.0 */ +#pragma once + #include #include #include diff --git a/src/core/elf.c b/src/core/elf.c index c8837a5..316ad7c 100644 --- a/src/core/elf.c +++ b/src/core/elf.c @@ -97,7 +97,7 @@ int elf_load(const char *path, elf_info_t *info) fclose(f); return -1; } - /* Linux kernel caps program headers at 64KB. Reject pathological inputs + /* Linux kernel caps program headers at 64KiB. Reject pathological inputs * before allocating to avoid attacker-controlled large allocations. */ if ((size_t) ehdr.e_phnum * ehdr.e_phentsize > 65536) { diff --git a/src/core/guest.c b/src/core/guest.c index 83c5b16..ea4258e 100644 --- a/src/core/guest.c +++ b/src/core/guest.c @@ -6,11 +6,11 @@ * * Identity-mapped guest memory: GVA == GPA == offset into host_base. * The guest address space size is determined by the VM's configured IPA width - * (capped at 40-bit = 1TB): 64GB for native aarch64 on M2 (36-bit), 1TB for M3+ - * (40-bit). Reserved via mmap(MAP_ANON); macOS demand-pages physical memory on - * first touch, so only used pages consume RAM. The slab is mapped RWX to + * (capped at 40-bit = 1TiB): 64GiB for native aarch64 on M2 (36-bit), 1TiB for + * M3+ (40-bit). Reserved via mmap(MAP_ANON); macOS demand-pages physical memory + * on first touch, so only used pages consume RAM. The slab is mapped RWX to * Hypervisor.framework. The guest's own page tables (built here) enforce - * per-region permissions using 2MB block descriptors, which are mandatory for + * per-region permissions using 2MiB block descriptors, which are mandatory for * transparent misaligned access. Page tables can be extended at runtime via * guest_extend_page_tables(). * @@ -21,12 +21,12 @@ * created on demand when mprotect changes PROT_NONE to an accessible * permission. 
* - * Page table format: AArch64 4KB granule, up to 4-level: - * L0 entry covers 512GB: multiple entries for >512GB address spaces - * L1 entry covers 1GB: either block or table pointing to L2 - * L2 entry covers 2MB: block descriptors with final permissions - * L3 entry covers 4KB: optional, created by guest_split_block() for - * mixed permissions within a 2MB block (W^X) + * Page table format: AArch64 4KiB granule, up to 4-level: + * L0 entry covers 512GiB: multiple entries for >512GiB address spaces + * L1 entry covers 1GiB: either block or table pointing to L2 + * L2 entry covers 2MiB: block descriptors with final permissions + * L3 entry covers 4KiB: optional, created by guest_split_block() for mixed + * permissions within a 2MiB block (W^X) */ #include @@ -57,11 +57,11 @@ static void guest_region_clear(guest_t *g); #define PT_AP_RW_EL0 (1ULL << 6) /* AP[2:1]=01: RW at EL1, RW at EL0 */ #define PT_AP_RO (3ULL << 6) /* AP[2:1]=11: RO at EL1, RO at EL0 */ -/* PAGE_SIZE / ALIGN_2MB_* live in utils.h; BLOCK_2MB lives in core/guest.h. */ +/* PAGE_SIZE / ALIGN_2MB_* live in utils.h; BLOCK_2MIB lives in core/guest.h. */ #define PAGE_SIZE GUEST_PAGE_SIZE -#define BLOCK_1GB (1ULL * 1024 * 1024 * 1024) +#define BLOCK_1GIB (1ULL * 1024 * 1024 * 1024) -/* Mask to extract the physical address from a 2MB L2 block descriptor */ +/* Mask to extract the physical address from a 2MiB L2 block descriptor */ #define L2_BLOCK_ADDR_MASK 0xFFFFFFE00000ULL /* Forward declaration (defined in the page table section below) */ @@ -77,7 +77,7 @@ static pthread_mutex_t pt_lock = PTHREAD_MUTEX_INITIALIZER; /* Lock order: 2 */ /* Track whether the 80% warning has been emitted (avoid log spam) */ static bool pt_pool_warned = false; -/* Allocate a zeroed 4KB page from the page table pool. +/* Allocate a zeroed 4KiB page from the page table pool. * Returns GPA of the page, or 0 on pool exhaustion. * Acquires pt_lock internally. Caller typically holds mmap_lock. */ @@ -136,8 +136,8 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) g->mmap_rx_next = MMAP_RX_BASE; /* Query the maximum IPA size supported by the hardware/kernel. macOS 15+ - * on Apple Silicon reports 40 bits (1TB). Older versions or fallback - * yields 36 bits (64GB). + * on Apple Silicon reports 40 bits (1TiB). Older versions or fallback + * yields 36 bits (64GiB). */ uint32_t max_ipa = 0; hv_vm_config_get_max_ipa_size(&max_ipa); @@ -157,7 +157,7 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) vm_ipa = 36; /* Primary buffer size: use the VM's configured IPA width (capped at - * 40-bit = 1TB). macOS demand-pages the host reservation, so only touched + * 40-bit = 1TiB). macOS demand-pages the host reservation, so only touched * pages cost physical memory. */ uint32_t buf_bits = (vm_ipa > 40) ? 40 : vm_ipa; @@ -168,17 +168,17 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) g->ipa_bits = vm_ipa; /* Compute dynamic layout limits from primary buffer size. - * interp_base: last 4GB (dynamic linker load address) - * mmap_limit: last 8GB reserved (max mmap RW address) - * For 64GB: interp=60GB, mmap_limit=56GB - * For 1TB: interp=1020GB, mmap_limit=1016GB + * interp_base: last 4GiB (dynamic linker load address) + * mmap_limit: last 8GiB reserved (max mmap RW address) + * For 64GiB: interp=60GiB, mmap_limit=56GiB + * For 1TiB: interp=1020GiB, mmap_limit=1016GiB */ g->interp_base = g->guest_size - 0x100000000ULL; g->mmap_limit = g->guest_size - 0x200000000ULL; /* Reserve primary address space via mmap(MAP_ANON). 
macOS demand-pages * this: physical pages are allocated only on first touch, so reserving up - * to 1TB costs nothing until pages are actually used. Do NOT memset + * to 1TiB costs nothing until pages are actually used. Do NOT memset * because that would touch all pages and defeat demand paging. */ g->host_base = @@ -261,14 +261,14 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits) ret = hv_vm_map(g->host_base, GUEST_IPA_BASE, size, HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC); if (ret != HV_SUCCESS && buf_bits > max_ipa) { - /* 1TB primary map failed; fall back to hardware-default buffer. + /* 1TiB primary map failed; fall back to hardware-default buffer. * This handles undocumented HVF limits on primary buffer size. * Close shm_fd since the fallback uses anonymous memory (the file is no * longer mapped to host_base, so CoW fork cannot work). */ log_info( - "guest: hv_vm_map %lluGB failed (%d), " - "retrying with %u-bit (%lluGB)", + "guest: hv_vm_map %llu GiB failed (%d), " + "retrying with %u-bit (%llu GiB)", (unsigned long long) (size >> 30), (int) ret, max_ipa, 1ULL << (max_ipa - 30)); munmap(g->host_base, size); @@ -372,7 +372,7 @@ int guest_init_from_shm(guest_t *g, } log_debug( - "guest: CoW fork: mapped %lluGB from shm " + "guest: CoW fork: mapped %llu GiB from shm " "(ipa=%u bits)", (unsigned long long) (size / (1024ULL * 1024 * 1024)), ipa_bits); @@ -416,7 +416,7 @@ typedef struct { /* Per-thread GVA TLB cache. * * Single-entry translation cache: avoids 3-4 pointer chases through the page - * table on repeated accesses to the same 2MB block (or 4KB page if L3-split). + * table on repeated accesses to the same 2MiB block (or 4KiB page if L3-split). * Validated by an atomic generation counter in guest_t that is bumped on every * page table modification. */ @@ -424,7 +424,7 @@ static _Thread_local struct { const guest_t *owner; /* Which guest_t this entry belongs to */ uint64_t base_gva; /* Block/page-aligned GVA */ uint64_t base_gpa; /* Corresponding GPA offset */ - uint64_t size; /* 2MB or 4KB (0 = invalid) */ + uint64_t size; /* 2MiB or 4KiB (0 = invalid) */ int perms; /* Cached permissions */ uint64_t gen; /* guest_t.pt_gen at population time */ } gva_tlb; @@ -452,7 +452,7 @@ static int gva_translate_perm(const guest_t *g, uint64_t base = g->ipa_base; const uint64_t *l0 = pt_at(g, g->ttbr0 - base); - unsigned l0_idx = (unsigned) (gva / (512ULL * BLOCK_1GB)); + unsigned l0_idx = (unsigned) (gva / (512ULL * BLOCK_1GIB)); if (l0_idx >= 512 || !(l0[l0_idx] & PT_VALID)) return -1; @@ -460,7 +460,7 @@ static int gva_translate_perm(const guest_t *g, if (l1_ipa < base || l1_ipa - base >= g->guest_size) return -1; const uint64_t *l1 = pt_at(g, l1_ipa - base); - unsigned l1_idx = (unsigned) ((gva / BLOCK_1GB) % 512); + unsigned l1_idx = (unsigned) ((gva / BLOCK_1GIB) % 512); if (!(l1[l1_idx] & PT_VALID)) return -1; @@ -468,12 +468,12 @@ static int gva_translate_perm(const guest_t *g, if (l2_ipa < base || l2_ipa - base >= g->guest_size) return -1; const uint64_t *l2 = pt_at(g, l2_ipa - base); - unsigned l2_idx = (unsigned) ((gva / BLOCK_2MB) % 512); + unsigned l2_idx = (unsigned) ((gva / BLOCK_2MIB) % 512); if (!(l2[l2_idx] & PT_VALID)) return -1; if (l2[l2_idx] & PT_TABLE) { - /* L3 page descriptor: 4KB granularity. */ + /* L3 page descriptor: 4KiB granularity. 
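 * (The index math mirrors the levels above: l3_idx = (gva % BLOCK_2MIB) /
 * PAGE_SIZE, i.e. VA bits [20:12].)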
*/ uint64_t l3_ipa = l2[l2_idx] & 0xFFFFFFFFF000ULL; if (l3_ipa < base || l3_ipa - base >= g->guest_size) return -1; @@ -496,7 +496,7 @@ static int gva_translate_perm(const guest_t *g, out->gpa = gpa; out->chunk = PAGE_SIZE - (gva & (PAGE_SIZE - 1)); - /* Populate TLB cache for this 4KB page */ + /* Populate TLB cache for this 4KiB page */ gva_tlb.owner = g; gva_tlb.base_gva = gva & ~(PAGE_SIZE - 1); gva_tlb.base_gpa = page_ipa - base; @@ -506,7 +506,7 @@ static int gva_translate_perm(const guest_t *g, return 0; } - /* L2 block descriptor: 2MB granularity. */ + /* L2 block descriptor: 2MiB granularity. */ int perms = desc_to_perms(l2[l2_idx]); if ((perms & required_perms) != required_perms) return -1; @@ -514,18 +514,18 @@ static int gva_translate_perm(const guest_t *g, uint64_t block_ipa = l2[l2_idx] & L2_BLOCK_ADDR_MASK; if (block_ipa < base) return -1; - uint64_t gpa = (block_ipa - base) + (gva & (BLOCK_2MB - 1)); + uint64_t gpa = (block_ipa - base) + (gva & (BLOCK_2MIB - 1)); if (gpa >= g->guest_size) return -1; out->gpa = gpa; - out->chunk = BLOCK_2MB - (gva & (BLOCK_2MB - 1)); + out->chunk = BLOCK_2MIB - (gva & (BLOCK_2MIB - 1)); - /* Populate TLB cache for this 2MB block */ + /* Populate TLB cache for this 2MiB block */ gva_tlb.owner = g; - gva_tlb.base_gva = gva & ~(BLOCK_2MB - 1); + gva_tlb.base_gva = gva & ~(BLOCK_2MIB - 1); gva_tlb.base_gpa = block_ipa - base; - gva_tlb.size = BLOCK_2MB; + gva_tlb.size = BLOCK_2MIB; gva_tlb.perms = perms; gva_tlb.gen = gen; return 0; @@ -588,7 +588,7 @@ static void *gva_resolve_perm(const guest_t *g, { /* Always walk page tables to enforce permissions. The guest slab is * identity-mapped (GVA == GPA == offset), but L2 block descriptors carry - * permission bits and L3 page tables have per-4KB permissions after + * permission bits and L3 page tables have per-4KiB permissions after * guest_split_block. Skipping the walk would bypass W^X enforcement for * all normal guest addresses. */ @@ -755,7 +755,7 @@ int guest_read_str_small(const guest_t *g, uint64_t gva, char *dst, size_t max) void guest_reset(guest_t *g) { - /* Zero only actually-used memory regions. With a potentially 1TB address + /* Zero only actually-used memory regions. With a potentially 1TiB address * space, memset of the entire range would fault in all demand-paged memory * for no benefit. PROT_NONE regions (e.g., a managed runtime's heap * reservation) were never written to, so they're already in the MAP_ANON @@ -783,7 +783,7 @@ void guest_reset(guest_t *g) * callers; shim regions are added AFTER reset by the exec path) */ memset((uint8_t *) g->host_base + SHIM_BASE, 0, - SHIM_DATA_BASE + BLOCK_2MB - SHIM_BASE); + SHIM_DATA_BASE + BLOCK_2MIB - SHIM_BASE); /* Reset allocation state */ guest_pt_gen_bump(g); @@ -826,10 +826,10 @@ int guest_get_used_regions(const guest_t *g, n++; } - /* Shim data/stack (full 2MB block) */ + /* Shim data/stack (full 2MiB block) */ if (n < max) { out[n].offset = SHIM_DATA_BASE; - out[n].size = BLOCK_2MB; + out[n].size = BLOCK_2MIB; n++; } @@ -1263,7 +1263,7 @@ static void guest_region_clear(guest_t *g) /* Page table builder. */ -/* Build block descriptor for a 2MB block at the given GPA with perms. */ +/* Build block descriptor for a 2MiB block at the given GPA with perms. 
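 * (perms map to AP[2:1] per the defines above: R/RX use PT_AP_RO (11),
 * RW/RWX use PT_AP_RW_EL0 (01); XN is set whenever MEM_PERM_X is absent.
 * See make_l3_desc below for the 4KiB variant of the same layout.)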
*/ static uint64_t make_block_desc(uint64_t gpa, int perms) { uint64_t desc = (gpa & L2_BLOCK_ADDR_MASK) /* PA bits */ @@ -1289,6 +1289,144 @@ static uint64_t make_block_desc(uint64_t gpa, int perms) return desc; } +/* Convert mixed-permission and partially-covered 2MiB blocks into L3 4KiB + * pages. + * + * The block-emit loop in guest_build_page_tables uses 2MiB block descriptors + * and OR-merges permissions when multiple regions touch the same block. The + * merge is correct only when every region in the block agrees on perms AND the + * union of those regions covers the entire block; otherwise it leaves + * over-permissive PTEs (e.g. .text RX + .data RW + heap RW in one 2MiB block + * collapses to RWX) and grants access to gap pages that should fault. + * + * For each unique 2MiB block touched by the input regions, this pass either + * keeps the block descriptor in place (single-perm full coverage) or splits it + * into 512 L3 pages, invalidates the lot, and re-validates each region's pages + * with the correct perms. Pages with no region coverage stay invalid, matching + * Linux semantics for inter-segment gaps in small static binaries. + */ +static bool finalize_block_perms(guest_t *g, const mem_region_t *regions, int n) +{ + /* Walk every 2MiB block touched by any region. Blocks shared by multiple + * regions are processed multiple times; the underlying split / invalidate / + * re-validate sequence is idempotent (guest_split_block is a no-op once + * the L2 entry is a table descriptor; guest_invalidate_ptes + per-region + * guest_update_perms produce the same final L3 state on every pass), so + * dedup is an optimization the heap-region scale (~127 blocks for the + * default brk window) does not justify against a fixed-size visited set. + */ + for (int r = 0; r < n; r++) { + uint64_t r_block_lo = ALIGN_2MIB_DOWN(regions[r].gpa_start); + uint64_t r_block_hi = ALIGN_2MIB_UP(regions[r].gpa_end); + + for (uint64_t b = r_block_lo; b < r_block_hi; b += BLOCK_2MIB) { + /* Walk all regions touching this block. Track perm uniformity and + * collect them into idx[] sorted by start so coverage can be + * checked with a single sweep. + */ + int idx[GUEST_MAX_REGIONS]; + int nidx = 0; + int first_perm = -1; + bool same_perm = true; + + for (int s = 0; s < n; s++) { + if (regions[s].gpa_end <= b || + regions[s].gpa_start >= b + BLOCK_2MIB) + continue; + if (first_perm < 0) + first_perm = regions[s].perms; + else if (regions[s].perms != first_perm) + same_perm = false; + + int pos = nidx; + while (pos > 0 && + regions[idx[pos - 1]].gpa_start > regions[s].gpa_start) { + idx[pos] = idx[pos - 1]; + pos--; + } + idx[pos] = s; + nidx++; + } + + /* Coverage sweep: regions are sorted by start, so the union covers + * the block iff each region begins at or before the running + * high-water mark. + */ + uint64_t covered_until = b; + bool full_coverage = true; + for (int i = 0; i < nidx; i++) { + uint64_t cs = regions[idx[i]].gpa_start; + uint64_t ce = regions[idx[i]].gpa_end; + if (cs > covered_until) { + full_coverage = false; + break; + } + if (ce > covered_until) + covered_until = ce; + } + if (covered_until < b + BLOCK_2MIB) + full_coverage = false; + + /* Single perm covering the whole block: the existing 2MiB + * descriptor is already correct. + */ + if (same_perm && full_coverage) + continue; + + /* Split into L3 pages, invalidate the lot, then rebuild the block + * from per-page unions. 
This preserves the required permission + * union when adjacent ELF segments share a 4KiB page after + * page-granularity rounding. + */ + if (guest_split_block(g, b) < 0) + return false; + if (guest_invalidate_ptes(g, b, b + BLOCK_2MIB) < 0) + return false; + + int page_perms[BLOCK_2MIB / PAGE_SIZE] = {0}; + for (int i = 0; i < nidx; i++) { + uint64_t s_start = regions[idx[i]].gpa_start; + uint64_t s_end = regions[idx[i]].gpa_end; + uint64_t apply_start = (s_start > b) ? s_start : b; + uint64_t apply_end = + (s_end < b + BLOCK_2MIB) ? s_end : b + BLOCK_2MIB; + /* Page-align to 4KiB so partially covered pages are recreated + * with the union of all overlapping segment permissions. + */ + apply_start = ALIGN_DOWN(apply_start, PAGE_SIZE); + apply_end = PAGE_ALIGN_UP(apply_end); + if (apply_end > b + BLOCK_2MIB) + apply_end = b + BLOCK_2MIB; + + for (uint64_t pa = apply_start; pa < apply_end; + pa += PAGE_SIZE) { + unsigned page_idx = (unsigned) ((pa - b) / PAGE_SIZE); + page_perms[page_idx] |= regions[idx[i]].perms; + } + } + + for (int i = 0; i < (int) ARRAY_SIZE(page_perms);) { + int perms = page_perms[i]; + int run_start = i; + + while (i < (int) ARRAY_SIZE(page_perms) && + page_perms[i] == perms) + i++; + if (!perms) + continue; + + uint64_t run_gpa_start = b + (uint64_t) run_start * PAGE_SIZE; + uint64_t run_gpa_end = b + (uint64_t) i * PAGE_SIZE; + if (guest_update_perms(g, run_gpa_start, run_gpa_end, perms) < + 0) + return false; + } + } + } + + return true; +} + uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) { uint64_t base = g->ipa_base; @@ -1300,20 +1438,20 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) uint64_t *l0 = pt_at(g, l0_gpa); - /* For each region, determine which 2MB blocks need mapping. + /* For each region, determine which 2MiB blocks need mapping. * Identity-mapped: VA == GPA, so L0/L1/L2 indices and the block * descriptor output address are both derived from gpa_start + ipa_base. 
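 * (Worked example: for a region at gpa_start = 0x200000, lookup_addr =
 * base + 0x200000 and the indices are VA bits [47:39], [38:30] and [29:21],
 * i.e. lookup_addr / 512GiB, (lookup_addr % 512GiB) / 1GiB and
 * (lookup_addr % 1GiB) / 2MiB.)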
*/ for (int r = 0; r < n; r++) { - uint64_t gpa_start = ALIGN_2MB_DOWN(regions[r].gpa_start); - uint64_t gpa_end = ALIGN_2MB_UP(regions[r].gpa_end); + uint64_t gpa_start = ALIGN_2MIB_DOWN(regions[r].gpa_start); + uint64_t gpa_end = ALIGN_2MIB_UP(regions[r].gpa_end); int perms = regions[r].perms; - for (uint64_t gpa = gpa_start; gpa < gpa_end; gpa += BLOCK_2MB) { + for (uint64_t gpa = gpa_start; gpa < gpa_end; gpa += BLOCK_2MIB) { uint64_t lookup_addr = base + gpa; - /* L0 index: which 512GB slot this VA falls in */ - unsigned l0_idx = (unsigned) (lookup_addr / (512ULL * BLOCK_1GB)); + /* L0 index: which 512GiB slot this VA falls in */ + unsigned l0_idx = (unsigned) (lookup_addr / (512ULL * BLOCK_1GIB)); if (l0_idx >= 512) { log_error("guest: VA 0x%llx out of L0 range", (unsigned long long) lookup_addr); @@ -1330,9 +1468,9 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) uint64_t l1_ipa = l0[l0_idx] & 0xFFFFFFFFF000ULL; uint64_t *l1 = pt_at(g, l1_ipa - base); - /* L1 index within the 512GB L0 entry (from VA) */ + /* L1 index within the 512GiB L0 entry (from VA) */ unsigned l1_idx = - (unsigned) ((lookup_addr % (512ULL * BLOCK_1GB)) / BLOCK_1GB); + (unsigned) ((lookup_addr % (512ULL * BLOCK_1GIB)) / BLOCK_1GIB); if (l1_idx >= 512) { log_error("guest: VA 0x%llx out of L1 range", (unsigned long long) lookup_addr); @@ -1347,19 +1485,19 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) l1[l1_idx] = (base + l2_gpa) | PT_VALID | PT_TABLE; } - /* L2 table for this 1GB region (stored in host at gpa offset) */ + /* L2 table for this 1GiB region (stored in host at gpa offset) */ uint64_t l2_ipa = l1[l1_idx] & 0xFFFFFFFFF000ULL; uint64_t l2_gpa_off = l2_ipa - base; uint64_t *l2 = pt_at(g, l2_gpa_off); - /* L2 index: which 2MB block within the 1GB region (from VA) */ + /* L2 index: which 2MiB block within the 1GiB region (from VA) */ unsigned l2_idx = - (unsigned) ((lookup_addr % BLOCK_1GB) / BLOCK_2MB); + (unsigned) ((lookup_addr % BLOCK_1GIB) / BLOCK_2MIB); /* If block already mapped, merge permissions (most permissive). * Use a local variable for the merged perms. Do NOT modify the * outer perms variable, which would leak accumulated permissions - * to subsequent 2MB blocks in the same region. + * to subsequent 2MiB blocks in the same region. */ int block_perms = perms; if (l2[l2_idx] & PT_BLOCK) { @@ -1380,11 +1518,18 @@ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n) /* Store TTBR0 for later use by guest_extend_page_tables */ uint64_t ttbr0 = base + l0_gpa; g->ttbr0 = ttbr0; + + /* Convert blocks shared by regions with mixed perms or partial coverage + * into L3 4KiB pages so each segment's permissions are honored exactly. + */ + if (!finalize_block_perms(g, regions, n)) + return 0; + guest_pt_gen_bump(g); return ttbr0; } -/* Extend page tables to cover [start, end) with 2MB block descriptors. +/* Extend page tables to cover [start, end) with 2MiB block descriptors. * Walks the existing L0->L1 structure (from g->ttbr0) and allocates new * L2 tables as needed. This is safe to call while the vCPU is paused * (during HVC #5 handling). 
Sets g->need_tlbi so the shim flushes the @@ -1401,14 +1546,14 @@ int guest_extend_page_tables(guest_t *g, uint64_t l0_gpa_off = g->ttbr0 - base; uint64_t *l0 = pt_at(g, l0_gpa_off); - /* Walk 2MB blocks in [start, end) */ - uint64_t addr_start = ALIGN_2MB_DOWN(start), addr_end = ALIGN_2MB_UP(end); + /* Walk 2MiB blocks in [start, end) */ + uint64_t addr_start = ALIGN_2MIB_DOWN(start), addr_end = ALIGN_2MIB_UP(end); - for (uint64_t addr = addr_start; addr < addr_end; addr += BLOCK_2MB) { + for (uint64_t addr = addr_start; addr < addr_end; addr += BLOCK_2MIB) { uint64_t ipa = base + addr; - /* L0 index: which 512GB slot (>512GB addresses need L0[1]+) */ - unsigned l0_idx = (unsigned) (ipa / (512ULL * BLOCK_1GB)); + /* L0 index: which 512GiB slot (>512GiB addresses need L0[1]+) */ + unsigned l0_idx = (unsigned) (ipa / (512ULL * BLOCK_1GIB)); if (l0_idx >= 512) { log_error("guest: IPA 0x%llx out of L0 range in extend", (unsigned long long) ipa); @@ -1426,7 +1571,8 @@ int guest_extend_page_tables(guest_t *g, uint64_t l1_ipa = l0[l0_idx] & 0xFFFFFFFFF000ULL; uint64_t *l1 = pt_at(g, l1_ipa - base); - unsigned l1_idx = (unsigned) ((ipa % (512ULL * BLOCK_1GB)) / BLOCK_1GB); + unsigned l1_idx = + (unsigned) ((ipa % (512ULL * BLOCK_1GIB)) / BLOCK_1GIB); if (l1_idx >= 512) { log_error("guest: IPA 0x%llx out of L1 range in extend", (unsigned long long) ipa); @@ -1445,7 +1591,7 @@ int guest_extend_page_tables(guest_t *g, uint64_t l2_ipa = l1[l1_idx] & 0xFFFFFFFFF000ULL; uint64_t *l2 = pt_at(g, l2_ipa - base); - unsigned l2_idx = (unsigned) ((ipa % BLOCK_1GB) / BLOCK_2MB); + unsigned l2_idx = (unsigned) ((ipa % BLOCK_1GIB) / BLOCK_2MIB); /* Only map if not already mapped */ if (!(l2[l2_idx] & PT_BLOCK)) { @@ -1465,7 +1611,7 @@ int guest_extend_page_tables(guest_t *g, */ #define PT_L3_PAGE (3ULL) -/* Build a 4KB L3 page descriptor with the given permissions. +/* Build a 4KiB L3 page descriptor with the given permissions. * Layout matches block descriptors (AF, SH, NS, MAIR, AP, XN) except * bits[1:0]=11 instead of 01. */ @@ -1506,26 +1652,26 @@ static uint64_t *find_l2_entry(guest_t *g, uint64_t gpa_offset) uint64_t l0_gpa_off = g->ttbr0 - base; uint64_t *l0 = pt_at(g, l0_gpa_off); - /* L0 index from actual IPA (not base), correct for >512GB */ - unsigned l0_idx = (unsigned) (ipa / (512ULL * BLOCK_1GB)); + /* L0 index from actual IPA (not base), correct for >512GiB */ + unsigned l0_idx = (unsigned) (ipa / (512ULL * BLOCK_1GIB)); if (l0_idx >= 512 || !(l0[l0_idx] & PT_VALID)) return NULL; uint64_t l1_ipa = l0[l0_idx] & 0xFFFFFFFFF000ULL; uint64_t *l1 = pt_at(g, l1_ipa - base); - unsigned l1_idx = (unsigned) ((ipa % (512ULL * BLOCK_1GB)) / BLOCK_1GB); + unsigned l1_idx = (unsigned) ((ipa % (512ULL * BLOCK_1GIB)) / BLOCK_1GIB); if (l1_idx >= 512 || !(l1[l1_idx] & PT_VALID)) return NULL; uint64_t l2_ipa = l1[l1_idx] & 0xFFFFFFFFF000ULL; uint64_t *l2 = pt_at(g, l2_ipa - base); - unsigned l2_idx = (unsigned) ((ipa % BLOCK_1GB) / BLOCK_2MB); + unsigned l2_idx = (unsigned) ((ipa % BLOCK_1GIB) / BLOCK_2MIB); return &l2[l2_idx]; } -/* Split a 2MB L2 block descriptor into 512 × 4KB L3 page descriptors. +/* Split a 2MiB L2 block descriptor into 512 × 4KiB L3 page descriptors. * The caller provides the L2 entry via find_l2_entry. * Extracts the output IPA from the existing descriptor. 
*/ @@ -1549,7 +1695,7 @@ static int split_l2_block(guest_t *g, uint64_t *l2_entry) return -1; uint64_t *l3 = pt_at(g, l3_gpa); - /* Fill 512 L3 entries with 4KB page descriptors inheriting the + /* Fill 512 L3 entries with 4KiB page descriptors inheriting the * block's permissions. Extract the output IPA from bits [47:21] * of the existing descriptor (not from the caller's address). */ @@ -1564,7 +1710,7 @@ static int split_l2_block(guest_t *g, uint64_t *l2_entry) int guest_split_block(guest_t *g, uint64_t block_gpa) { - uint64_t block_start = ALIGN_2MB_DOWN(block_gpa); + uint64_t block_start = ALIGN_2MIB_DOWN(block_gpa); uint64_t *l2_entry = find_l2_entry(g, block_start); return split_l2_block(g, l2_entry); } @@ -1580,13 +1726,13 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end) for (uint64_t addr = start; addr < end;) { uint64_t *l2_entry = find_l2_entry(g, addr); if (!l2_entry) { - /* No L2 entry (already unmapped); skip this 2MB block */ - addr = ALIGN_2MB_UP(addr + 1); + /* No L2 entry (already unmapped); skip this 2MiB block */ + addr = ALIGN_2MIB_UP(addr + 1); continue; } - uint64_t block_start = ALIGN_2MB_DOWN(addr); - uint64_t block_end = block_start + BLOCK_2MB; + uint64_t block_start = ALIGN_2MIB_DOWN(addr); + uint64_t block_end = block_start + BLOCK_2MIB; /* Not mapped at all: skip */ if (!(*l2_entry & 1)) { @@ -1594,25 +1740,25 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end) continue; } - /* Check if this is a 2MB block or already an L3 table */ + /* Check if this is a 2MiB block or already an L3 table */ if ((*l2_entry & 3) == 1) { - /* 2MB block descriptor */ + /* 2MiB block descriptor */ if (start <= block_start && end >= block_end) { - /* Invalidating the entire 2MB block: clear the L2 entry */ + /* Invalidating the entire 2MiB block: clear the L2 entry */ *l2_entry = 0; g->need_tlbi = true; addr = block_end; continue; } - /* Partial invalidation within a 2MB block: split first, + /* Partial invalidation within a 2MiB block: split first, * then invalidate individual L3 pages below. 
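 * (e.g. invalidating [block+0x1000, block+0x3000) splits the block and
 * zeroes two L3 entries; the other 510 pages keep their permissions.)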
*/ if (guest_split_block(g, block_start) < 0) return -1; } - /* L3 table: invalidate individual 4KB page descriptors */ + /* L3 table: invalidate individual 4KiB page descriptors */ uint64_t l3_ipa = *l2_entry & 0xFFFFFFFFF000ULL; uint64_t *l3 = pt_at(g, l3_ipa - base); @@ -1621,7 +1767,7 @@ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end) for (uint64_t pa = page_start; pa < page_end; pa += PAGE_SIZE) { unsigned l3_idx = - (unsigned) (((base + pa) % BLOCK_2MB) / PAGE_SIZE); + (unsigned) (((base + pa) % BLOCK_2MIB) / PAGE_SIZE); l3[l3_idx] = 0; /* Invalid descriptor */ } @@ -1644,13 +1790,13 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) for (uint64_t addr = start; addr < end;) { uint64_t *l2_entry = find_l2_entry(g, addr); if (!l2_entry) { - /* Skip unmapped 2MB blocks */ - addr = ALIGN_2MB_UP(addr + 1); + /* Skip unmapped 2MiB blocks */ + addr = ALIGN_2MIB_UP(addr + 1); continue; } - uint64_t block_start = ALIGN_2MB_DOWN(addr); - uint64_t block_end = block_start + BLOCK_2MB; + uint64_t block_start = ALIGN_2MIB_DOWN(addr); + uint64_t block_end = block_start + BLOCK_2MIB; /* Not mapped at all: skip */ if (!(*l2_entry & 1)) { @@ -1658,12 +1804,12 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) continue; } - /* Check if this is a 2MB block or already an L3 table */ + /* Check if this is a 2MiB block or already an L3 table */ if ((*l2_entry & 3) == 1) { - /* 2MB block descriptor */ + /* 2MiB block descriptor */ int old_perms = desc_to_perms(*l2_entry); - /* If the whole 2MB block changes permissions, rewrite the block + /* If the whole 2MiB block changes permissions, rewrite the block * descriptor without splitting. Extract the output IPA from the * existing descriptor, correct for both identity and non-identity * mapped regions. @@ -1678,7 +1824,7 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) continue; } - /* Partial update: split the 2MB block into L3 pages first, then + /* Partial update: split the 2MiB block into L3 pages first, then * fall through to update individual pages below. */ if (old_perms != perms) { @@ -1691,17 +1837,17 @@ int guest_update_perms(guest_t *g, uint64_t start, uint64_t end, int perms) } } - /* L3 table: update individual 4KB page descriptors */ + /* L3 table: update individual 4KiB page descriptors */ uint64_t l3_ipa = *l2_entry & 0xFFFFFFFFF000ULL; uint64_t *l3 = pt_at(g, l3_ipa - base); - /* Update pages within this 2MB block that fall in [start, end) */ + /* Update pages within this 2MiB block that fall in [start, end) */ uint64_t page_start = (addr > block_start) ? addr : block_start; uint64_t page_end = (end < block_end) ? end : block_end; for (uint64_t pa = page_start; pa < page_end; pa += PAGE_SIZE) { unsigned l3_idx = - (unsigned) (((base + pa) % BLOCK_2MB) / PAGE_SIZE); + (unsigned) (((base + pa) % BLOCK_2MIB) / PAGE_SIZE); /* Extract the existing output IPA from the L3 entry. For * non-identity mapped regions, pa is a VA not a GPA, so the builder * must use the IPA already stored in the descriptor (set by @@ -1745,14 +1891,14 @@ int guest_materialize_lazy(guest_t *g, uint64_t fault_offset) if (!region) return -1; /* Not a noreserve region */ - /* Materialize one 2MB block containing the fault address. This is + /* Materialize one 2MiB block containing the fault address. This is * the smallest granule that guest_extend_page_tables works with. 
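 * (e.g. a fault at offset 0x200201000 materializes [0x200200000,
 * 0x200400000).)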
* For the common case (sparse heap touch), materializing one block * at a time is the right trade-off: it avoids over-committing the * large reservation while keeping the fault rate manageable. */ - uint64_t block_start = fault_offset & ~(BLOCK_2MB - 1); - uint64_t block_end = block_start + BLOCK_2MB; + uint64_t block_start = fault_offset & ~(BLOCK_2MIB - 1); + uint64_t block_end = block_start + BLOCK_2MIB; /* Clamp to guest size */ if (block_end > g->guest_size) @@ -1791,9 +1937,9 @@ int guest_materialize_lazy(guest_t *g, uint64_t fault_offset) return -1; /* If this block had no page-table entry before the lazy fault, - * guest_extend_page_tables() necessarily created a full 2MB block. + * guest_extend_page_tables() necessarily created a full 2MiB block. * Split it and remove pages outside this noreserve region so holes and - * guards in the same 2MB block remain faults. Existing split blocks + * guards in the same 2MiB block remain faults. Existing split blocks * already encode neighboring mappings, so leave them intact. */ if (!had_mapping) { diff --git a/src/core/guest.h b/src/core/guest.h index ee99cb1..6e57623 100644 --- a/src/core/guest.h +++ b/src/core/guest.h @@ -6,8 +6,8 @@ * * Provides identity-mapped guest physical memory (GVA == GPA == offset into * host buffer). Buffer size is determined by the VM's configured IPA width: - * - Native aarch64 on M2 (36-bit IPA): 64GB - * - Native aarch64 on M3+ (40-bit IPA): 1TB + * - Native aarch64 on M2 (36-bit IPA): 64GiB + * - Native aarch64 on M3+ (40-bit IPA): 1TiB * * Reserved via mmap(MAP_ANON); macOS demand-pages physical memory on first * touch, so unused pages cost nothing. The slab is mapped RWX to @@ -27,49 +27,49 @@ /* Memory layout constants. * * Guest memory size is determined dynamically from the VM's IPA width - * (36-bit = 64GB on M2, 40-bit = 1TB on M3+). See guest.c for the + * (36-bit = 64GiB on M2, 40-bit = 1TiB on M3+). See guest.c for the * runtime probe that selects the correct size. */ #define PT_POOL_BASE 0x00010000ULL /* Page table pool start */ -#define PT_POOL_END 0x00100000ULL /* Page table pool end (960KB) */ -#define SHIM_BASE 0x00100000ULL /* Shim code (2MB block, RX) */ -#define SHIM_DATA_BASE 0x00200000ULL /* Shim stack/data (2MB block, RW) */ +#define PT_POOL_END 0x00100000ULL /* Page table pool end (960KiB) */ +#define SHIM_BASE 0x00100000ULL /* Shim code (2MiB block, RX) */ +#define SHIM_DATA_BASE 0x00200000ULL /* Shim stack/data (2MiB block, RW) */ #define ELF_DEFAULT_BASE 0x00400000ULL /* Typical ELF load base */ -#define PIE_LOAD_BASE 0x00400000ULL /* PIE (ET_DYN) executable base (4MB) */ -#define BRK_BASE_DEFAULT 0x01000000ULL /* Default brk start (16MB) */ +#define PIE_LOAD_BASE 0x00400000ULL /* PIE (ET_DYN) executable base (4MiB) */ +#define BRK_BASE_DEFAULT 0x01000000ULL /* Default brk start (16MiB) */ -/* 8MB stack (four 2MB blocks); unused HVF backing pages consume no RAM. */ +/* 8MiB stack (four 2MiB blocks); unused HVF backing pages consume no RAM. */ #define STACK_SIZE 0x00800000ULL -/* Used when brk_start is below 128MB; otherwise placed above brk. */ +/* Used when brk_start is below 128MiB; otherwise placed above brk. 
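 * (With BRK_BASE_DEFAULT = 16MiB, bootstrap computes ALIGN_UP(16MiB, 2MiB) +
 * STACK_SIZE = 24MiB, below this 128MiB floor, so stack_top becomes
 * STACK_TOP_DEFAULT and stack_base = 120MiB.)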
*/ #define STACK_TOP_DEFAULT 0x08000000ULL -#define STACK_GUARD_SIZE 0x00001000ULL /* 4KB guard page at bottom of stack */ +#define STACK_GUARD_SIZE 0x00001000ULL /* 4KiB guard at stack bottom */ -/* mmap RX region for PROT_EXEC; placed below 8GB to leave the high mmap +/* mmap RX region for PROT_EXEC; placed below 8GiB to leave the high mmap * region clear for runtimes that demand a specific minimum heap address. */ #define MMAP_RX_BASE 0x10000000ULL -/* Initial pre-mapped mmap RX end. Only covers the first 2MB block; +/* Initial pre-mapped mmap RX end. Only covers the first 2MiB block; * additional pages are mapped lazily by guest_extend_page_tables() * when sys_mmap needs more PROT_EXEC space. Reduces startup time * and memory pressure for small binaries that never call mmap. */ -#define MMAP_RX_INITIAL_END (MMAP_RX_BASE + 0x200000ULL) /* +2MB */ +#define MMAP_RX_INITIAL_END (MMAP_RX_BASE + 0x200000ULL) /* +2MiB */ -/* mmap RW region starts at 8GB to match real Linux address layouts. */ +/* mmap RW region starts at 8GiB to match real Linux address layouts. */ #define MMAP_BASE 0x200000000ULL -/* Initial pre-mapped mmap RW end. Only covers the first 2MB block; +/* Initial pre-mapped mmap RW end. Only covers the first 2MiB block; * additional pages are mapped lazily by guest_extend_page_tables(). */ -#define MMAP_INITIAL_END (MMAP_BASE + 0x200000ULL) /* +2MB */ +#define MMAP_INITIAL_END (MMAP_BASE + 0x200000ULL) /* +2MiB */ /* mmap_limit and interp_base are computed dynamically from guest_size * in main.c and stored in guest_t. */ -#define BLOCK_2MB (2ULL * 1024 * 1024) +#define BLOCK_2MIB (2ULL * 1024 * 1024) /* IPA base: guest memory is mapped at this IPA in the hypervisor. * All guest physical addresses = GUEST_IPA_BASE + offset. @@ -91,8 +91,8 @@ * Identity-mapped: VA == GPA. */ typedef struct { - uint64_t gpa_start; /* Output IPA/GPA (2MB aligned) */ - uint64_t gpa_end; /* Output IPA/GPA end (exclusive, 2MB aligned) */ + uint64_t gpa_start; /* Output IPA/GPA (2MiB aligned) */ + uint64_t gpa_end; /* Output IPA/GPA end (exclusive, 2MiB aligned) */ int perms; /* MEM_PERM_* flags */ } mem_region_t; @@ -261,14 +261,14 @@ int guest_read_str(const guest_t *g, uint64_t gva, char *dst, size_t max); int guest_read_str_small(const guest_t *g, uint64_t gva, char *dst, size_t max); /* Build L0->L1->L2 page tables from an array of memory regions. - * Uses 2MB block descriptors. Returns the TTBR0 value (GPA of L0 table), + * Uses 2MiB block descriptors. Returns the TTBR0 value (GPA of L0 table), * or 0 on failure. */ uint64_t guest_build_page_tables(guest_t *g, const mem_region_t *regions, int n); -/* Extend page tables to cover a new address range [start, end) with 2MB +/* Extend page tables to cover a new address range [start, end) with 2MiB * block descriptors. Reuses the existing L0->L1 table structure and * allocates new L2 tables as needed. Sets g->need_tlbi = true. * Returns 0 on success, -1 on failure. @@ -278,8 +278,8 @@ int guest_extend_page_tables(guest_t *g, uint64_t end, int perms); -/* Split a 2MB block descriptor into 512 x 4KB L3 page descriptors. - * block_gpa must be within a currently-mapped 2MB block. The block's +/* Split a 2MiB block descriptor into 512 x 4KiB L3 page descriptors. + * block_gpa must be within a currently-mapped 2MiB block. The block's * permissions are inherited by all 512 page entries. If the block is * already split (L2 entry is a table descriptor), this is a no-op. * Sets g->need_tlbi = true. Returns 0 on success, -1 on failure. 
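
Illustrative sketch (not part of this patch): how a caller is expected to
compose guest_split_block, guest_invalidate_ptes and guest_update_perms
(documented in the surrounding hunks) for an mprotect-style change.
sketch_apply_mprotect is a hypothetical name; prot_to_perms is the existing
helper used in src/syscall/mem.c.

static int sketch_apply_mprotect(guest_t *g, uint64_t start, uint64_t end,
                                 int linux_prot)
{
    /* PROT_NONE: the range must fault, so drop the descriptors. */
    if (linux_prot == 0)
        return guest_invalidate_ptes(g, start, end);
    /* guest_update_perms splits a 2MiB block on its own when [start, end)
     * covers it only partially; no explicit guest_split_block call is
     * needed on this path. */
    return guest_update_perms(g, start, end, prot_to_perms(linux_prot));
}
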
@@ -290,16 +290,16 @@ int guest_split_block(guest_t *g, uint64_t block_gpa); * Sets L2 block descriptors and L3 page descriptors to 0 (invalid), * causing translation faults on access. Used when mprotect sets * PROT_NONE; the correct behavior is for the guest to fault. - * If a 2MB block is only partially invalidated, the block is split + * If a 2MiB block is only partially invalidated, the block is split * into L3 pages first (preserving the non-invalidated pages). * Sets g->need_tlbi = true. Returns 0 on success, -1 on failure. */ int guest_invalidate_ptes(guest_t *g, uint64_t start, uint64_t end); /* Update page table permissions for the range [start, end). - * If a 2MB block needs mixed permissions (only part of it is being - * updated), the block is automatically split into 4KB L3 pages first. - * If the entire 2MB block is being updated, the block descriptor is + * If a 2MiB block needs mixed permissions (only part of it is being + * updated), the block is automatically split into 4KiB L3 pages first. + * If the entire 2MiB block is being updated, the block descriptor is * modified in place without splitting. * perms is a MEM_PERM_R/W/X combination. Sets g->need_tlbi = true. * Returns 0 on success, -1 on failure. @@ -377,7 +377,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot); /* Try to materialize a lazy (MAP_NORESERVE) page at the given offset. * Called from the data/instruction abort handler when the faulting address - * falls within a noreserve region. Creates page table entries for one 2MB + * falls within a noreserve region. Creates page table entries for one 2MiB * block containing the fault address, zeros the memory, and clears the * noreserve flag for the materialized sub-range. * Returns 0 on success (caller should TLBI and retry), -1 if the offset is not diff --git a/src/core/shim.S b/src/core/shim.S index 3b82e74..fe82f9a 100644 --- a/src/core/shim.S +++ b/src/core/shim.S @@ -169,7 +169,7 @@ _start: eret /* Exception Vector Table - * Must be 2KB (0x800) aligned. Each entry is 128 bytes (0x80). + * Must be 2KiB (0x800) aligned. Each entry is 128 bytes (0x80). * * bad_exception vectors: mov x5, #offset + b bad_exception * X5 carries the vector offset for host-side debugging. diff --git a/src/core/stack.c b/src/core/stack.c index 1ff369d..fb75916 100644 --- a/src/core/stack.c +++ b/src/core/stack.c @@ -161,7 +161,7 @@ uint64_t build_linux_stack(guest_t *g, } /* Bounds-check: Linux returns E2BIG for oversized argument/environment. - * ARG_MAX on Linux is typically 2MB; stack setup caps at reasonable stack + * ARG_MAX on Linux is typically 2MiB; stack setup caps at reasonable stack * limits. */ #define MAX_ARGS 131072 diff --git a/src/core/vdso.c b/src/core/vdso.c index 21078ba..444be88 100644 --- a/src/core/vdso.c +++ b/src/core/vdso.c @@ -23,8 +23,6 @@ #include "core/elf.h" #include "debug/log.h" -#define VDSO_SIZE 0x00001000ULL /* 4KB */ - /* ELF section header (not in core/elf.h). 
*/ typedef struct { @@ -72,7 +70,7 @@ typedef struct { * [3] __kernel_gettimeofday */ -/* Offsets within the 4KB page */ +/* Offsets within the 4KiB page */ #define VDSO_OFF_EHDR 0x000 #define VDSO_OFF_PHDR 0x040 #define VDSO_OFF_PHDR1 0x078 @@ -100,7 +98,7 @@ typedef struct { /* 6 * 16 = 96, 0x1D8 + 96 = 0x238 */ #define VDSO_OFF_SHDR 0x238 -/* 6 * 64 = 384, 0x238 + 384 = 0x3B8 (fits in 4KB) */ +/* 6 * 64 = 384, 0x238 + 384 = 0x3B8 (fits in 4KiB) */ #define VDSO_NUM_SYMS 4 #define HASH_NCHAIN (VDSO_NUM_SYMS + 1) #define HASH_NBUCKET 1 diff --git a/src/core/vdso.h b/src/core/vdso.h index cb63aa4..e3a41d5 100644 --- a/src/core/vdso.h +++ b/src/core/vdso.h @@ -14,14 +14,15 @@ #include "core/guest.h" -/* Guest address where the vDSO is placed (one 4KB page, below PT pool) */ +/* Guest address where the vDSO is placed (one 4KiB page, below PT pool) */ #define VDSO_BASE 0x0000F000ULL -#define VDSO_OFF_TEXT 0x0B0 /* Offset of .text (trampoline code) */ +#define VDSO_SIZE 0x00001000ULL /* 4KiB */ +#define VDSO_OFF_TEXT 0x0B0 /* Offset of .text (trampoline code) */ /* Build a minimal vDSO ELF image at VDSO_BASE in guest memory. - * The image contains a valid ELF header, one LOAD program header, - * SHT_DYNSYM and SHT_STRTAB sections, and a __kernel_rt_sigreturn - * symbol pointing to a small trampoline (mov x8, #139; svc #0). + * The image contains a valid ELF header, one LOAD program header, SHT_DYNSYM + * and SHT_STRTAB sections, and a __kernel_rt_sigreturn symbol pointing to + * a small trampoline (mov x8, #139; svc #0). * Returns the GVA of the ELF header (== VDSO_BASE), or 0 on failure. */ uint64_t vdso_build(guest_t *g); diff --git a/src/debug/gdbstub.c b/src/debug/gdbstub.c index 242478b..a8f3e5b 100644 --- a/src/debug/gdbstub.c +++ b/src/debug/gdbstub.c @@ -50,7 +50,7 @@ /* Constants. */ -#define GDB_PKT_BUF_SIZE ((size_t) 128 * 1024) /* Max packet size (128KB) */ +#define GDB_PKT_BUF_SIZE ((size_t) 128 * 1024) /* Max packet size (128KiB) */ #define MAX_HW_BREAKPOINTS 16 #define MAX_HW_WATCHPOINTS 16 diff --git a/src/hvutil.h b/src/hvutil.h index 5a687f8..6a211d1 100644 --- a/src/hvutil.h +++ b/src/hvutil.h @@ -63,7 +63,7 @@ (1ULL << 7) /* ITD */) /* TCR_EL1. - * 4KB granule, 48-bit VA, EPD1=1 (TTBR1 walks disabled). + * 4KiB granule, 48-bit VA, EPD1=1 (TTBR1 walks disabled). * Used by main.c (initial setup) and syscall/exec.c (exec re-init). */ #define TCR_EL1_VALUE 0x25B5903510ULL diff --git a/src/main.c b/src/main.c index 60397af..fa7ce52 100644 --- a/src/main.c +++ b/src/main.c @@ -8,7 +8,7 @@ * - A minimal EL1 shim (embedded as shim_blob.h) that provides exception * vectors and forwards SVC #0 (Linux syscalls) to the host via HVC #5. * - All system registers configured from the host before vCPU start. - * - Guest memory identity-mapped at GVA=GPA with 2MB block page tables. + * - Guest memory identity-mapped at GVA=GPA with 2MiB block page tables. * - Syscall handlers that translate Linux syscalls to macOS equivalents. * * Usage: elfuse [--verbose] [--timeout N] [--sysroot PATH] [args...] diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 60cc9de..e01e6a0 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -1020,7 +1020,7 @@ int64_t sys_clone(hv_vcpu_t vcpu, * Siblings may mmap/munmap/mprotect after resume, so the code needs a * stable copy for the IPC send. Heap-allocated because * GUEST_MAX_REGIONS * sizeof(guest_region_t) exceeds safe - * stack limits on worker threads (512KB default). + * stack limits on worker threads (512KiB default). 
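 * (Rough scale, assuming a few thousand region slots at tens of bytes each:
 * the snapshot alone can approach the whole 512KiB budget before any other
 * frames are counted.)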
*/ int nregions_snapshot = g->nregions; size_t snap_sz = (size_t) nregions_snapshot * sizeof(guest_region_t); diff --git a/src/runtime/proctitle.c b/src/runtime/proctitle.c index 10cbacf..4e296dc 100644 --- a/src/runtime/proctitle.c +++ b/src/runtime/proctitle.c @@ -1,4 +1,4 @@ -/* Process-title helpers for elfuse +/* Process-title helpers * * Copyright 2026 elfuse contributors * SPDX-License-Identifier: Apache-2.0 diff --git a/src/runtime/proctitle.h b/src/runtime/proctitle.h index d2c55c8..15c39ef 100644 --- a/src/runtime/proctitle.h +++ b/src/runtime/proctitle.h @@ -1,6 +1,6 @@ #pragma once -/* Process-title helpers for elfuse +/* Process-title helpers * * Copyright 2026 elfuse contributors * SPDX-License-Identifier: Apache-2.0 diff --git a/src/runtime/thread.c b/src/runtime/thread.c index a2c8ab5..aedddce 100644 --- a/src/runtime/thread.c +++ b/src/runtime/thread.c @@ -21,7 +21,7 @@ #include "runtime/thread.h" #include "debug/log.h" -#include "core/guest.h" /* SHIM_DATA_BASE, BLOCK_2MB, GUEST_IPA_BASE */ +#include "core/guest.h" /* SHIM_DATA_BASE, BLOCK_2MIB, GUEST_IPA_BASE */ #include "hvutil.h" /* vcpu_get_gpr, vcpu_get_sysreg */ /* From syscall/signal.h, included here directly to avoid pulling in @@ -32,8 +32,8 @@ static void thread_ptrace_init(thread_entry_t *t); -/* Top of the EL1 exception stack region (one 4KB slot per thread) */ -#define SP_EL1_TOP (GUEST_IPA_BASE + SHIM_DATA_BASE + BLOCK_2MB) +/* Top of the EL1 exception stack region (one 4KiB slot per thread) */ +#define SP_EL1_TOP (GUEST_IPA_BASE + SHIM_DATA_BASE + BLOCK_2MIB) /* Thread table. */ @@ -61,7 +61,7 @@ static _Atomic int active_thread_count = 0; /* Bitmask tracking allocated SP_EL1 slots. Bit N set = slot N in use. * MAX_THREADS=64 fits exactly in a uint64_t. Slot 0 is the main thread (top of - * shim data region); each subsequent slot is 4KB below. + * shim data region); each subsequent slot is 4KiB below. */ static uint64_t sp_el1_allocated = 0; @@ -272,8 +272,8 @@ uint64_t thread_alloc_sp_el1(void) log_error("thread: SP_EL1 slots exhausted"); } else { int slot = bit_ctz64(free_mask); - /* Main thread's SP_EL1 = IPA_BASE + SHIM_DATA_BASE + 2MB. - * Each subsequent thread is 4KB below. + /* Main thread's SP_EL1 = IPA_BASE + SHIM_DATA_BASE + 2MiB. + * Each subsequent thread is 4KiB below. */ uint64_t top = SP_EL1_TOP; sp = top - (uint64_t) slot * 4096; diff --git a/src/runtime/thread.h b/src/runtime/thread.h index 371433c..4304eaa 100644 --- a/src/runtime/thread.h +++ b/src/runtime/thread.h @@ -9,9 +9,9 @@ * threads are added via clone(CLONE_THREAD). A _Thread_local pointer provides * O(1) access to the current thread's entry from any syscall handler. * - * SP_EL1 allocation: each thread gets a 4KB EL1 exception stack carved from the - * shim data region (SHIM_DATA_BASE + 2MB). Thread 0 (main) gets the top, thread - * N gets offset -(N * 4096). + * SP_EL1 allocation: each thread gets a 4KiB EL1 exception stack carved from + * the shim data region (SHIM_DATA_BASE + 2MiB). Thread 0 (main) gets the top, + * thread N gets offset -(N * 4096). */ #pragma once @@ -156,10 +156,10 @@ int thread_active_count(void); /* Fast path: return non-zero when exactly one guest thread is active. */ int thread_is_single_active(void); -/* Allocate a per-thread SP_EL1 value. Thread N gets the Nth 4KB slot counting +/* Allocate a per-thread SP_EL1 value. Thread N gets the Nth 4KiB slot counting * down from the top of the shim data region. 
The IPA base (GUEST_IPA_BASE + - * SHIM_DATA_BASE + 2MB) is the main thread's SP_EL1; each subsequent thread - * subtracts 4KB. Returns the IPA, or 0 on failure. + * SHIM_DATA_BASE + 2MiB) is the main thread's SP_EL1; each subsequent thread + * subtracts 4KiB. Returns the IPA, or 0 on failure. */ uint64_t thread_alloc_sp_el1(void); diff --git a/src/syscall/exec.c b/src/syscall/exec.c index 7b856ef..52f109d 100644 --- a/src/syscall/exec.c +++ b/src/syscall/exec.c @@ -369,7 +369,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, * Cleanup acquires sfd_lock or inotify_lock, which must NOT be held under * fd_lock (lock ordering: fd_lock(3) < sfd_lock(5a) < inotify_lock(7)). * - * Two passes: count first, then heap-allocate. Avoids placing a ~100KB + * Two passes: count first, then heap-allocate. Avoids placing a ~100KiB * VLA on the stack (FD_TABLE_SIZE * sizeof(fd_entry_t+int)). */ int cloexec_count = 0; @@ -529,20 +529,25 @@ int64_t sys_execve(hv_vcpu_t vcpu, g->brk_current = brk_start; /* Keep exec stack placement consistent with initial process startup. */ - uint64_t stack_top = ALIGN_UP(brk_start, BLOCK_2MB); + uint64_t stack_top = ALIGN_UP(brk_start, BLOCK_2MIB); stack_top += STACK_SIZE; if (stack_top < STACK_TOP_DEFAULT) stack_top = STACK_TOP_DEFAULT; g->stack_top = stack_top; g->stack_base = stack_top - STACK_SIZE; -#define MAX_REGIONS 32 + /* Worst case: 7 fixed regions (shim, shim-data, vDSO, brk, stack, mmap RX, + * mmap RW) plus up to ELF_MAX_SEGMENTS for both the executable and the + * interpreter. Sized comfortably to keep the bounds-check loops simple + * after the point of no return. + */ +#define MAX_REGIONS (8 + 2 * ELF_MAX_SEGMENTS) mem_region_t regions[MAX_REGIONS]; int nregions = 0; - /* Fixed regions (shim, brk, stack, mmap areas): 6 entries. - * Bounds-check before each to prevent array overflow. After the point of no - * return, overflow is fatal (exit). + /* Fixed regions (shim, shim-data, vDSO, brk, stack, mmap RX, mmap RW): 7 + * entries. Bounds-check before each to prevent array overflow. After the + * point of no return, overflow is fatal (exit). */ /* Keep the shim executable-only; HVF faults on merged RWX mappings. */ @@ -555,14 +560,29 @@ int64_t sys_execve(hv_vcpu_t vcpu, /* EL1 exception handlers use this block for stack and scratch state. */ if (nregions >= MAX_REGIONS) goto too_many_regions; - regions[nregions++] = (mem_region_t) {.gpa_start = SHIM_DATA_BASE, - .gpa_end = SHIM_DATA_BASE + BLOCK_2MB, - .perms = MEM_PERM_RW}; + regions[nregions++] = + (mem_region_t) {.gpa_start = SHIM_DATA_BASE, + .gpa_end = SHIM_DATA_BASE + BLOCK_2MIB, + .perms = MEM_PERM_RW}; + + /* The vDSO sits in the same 2MiB block as the shim. The page-table builder + * splits the block into 4KiB L3 pages when its regions don't fully cover + * it, so the vDSO must appear here to keep the trampoline page valid and + * RX after rebuild. + */ + if (nregions >= MAX_REGIONS) + goto too_many_regions; + regions[nregions++] = (mem_region_t) {.gpa_start = VDSO_BASE, + .gpa_end = VDSO_BASE + VDSO_SIZE, + .perms = MEM_PERM_RX}; - /* Translate ELF p_flags into guest page permissions. */ + /* Translate ELF p_flags into guest page permissions. Silent drops would + * leave the loaded segment unmapped, so treat overflow as fatal (we are + * already past the point of no return). 
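+     * (elf_pf_to_prot maps, e.g., a PF_R|PF_X .text segment to MEM_PERM_RX
+     * and a PF_R|PF_W .data segment to MEM_PERM_RW, mirroring the boot path
+     * in bootstrap.c.)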
+ */ for (int i = 0; i < elf_info.num_segments; i++) { if (nregions >= MAX_REGIONS) - break; + goto too_many_regions; regions[nregions++] = (mem_region_t) { .gpa_start = elf_info.segments[i].gpa + elf_load_base, .gpa_end = elf_info.segments[i].gpa + elf_info.segments[i].memsz + @@ -571,11 +591,11 @@ int64_t sys_execve(hv_vcpu_t vcpu, } /* Interpreter segments use the same permission translation, shifted by - * interp_base. + * interp_base. Same fatal-overflow rule as the executable's segments. */ for (int i = 0; i < interp_info.num_segments; i++) { if (nregions >= MAX_REGIONS) - break; + goto too_many_regions; regions[nregions++] = (mem_region_t) { .gpa_start = interp_info.segments[i].gpa + interp_base, .gpa_end = interp_info.segments[i].gpa + @@ -598,7 +618,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, .perms = MEM_PERM_RW}; /* PROT_EXEC mmap allocations start in a separate RX area to preserve W^X - * with 2MB page-table blocks. + * with 2MiB page-table blocks. */ if (nregions >= MAX_REGIONS) goto too_many_regions; @@ -629,7 +649,7 @@ int64_t sys_execve(hv_vcpu_t vcpu, guest_region_add(g, SHIM_BASE, SHIM_BASE + shim_size, LINUX_PROT_READ | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE, 0, "[shim]"); - guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MB, + guest_region_add(g, SHIM_DATA_BASE, SHIM_DATA_BASE + BLOCK_2MIB, LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE, 0, "[shim-data]"); for (int i = 0; i < elf_info.num_segments; i++) { diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 8d062d3..283ef7b 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -572,7 +572,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) * macOS layout: {off_t l_start, off_t l_len, pid_t l_pid, * short l_type, short l_whence} * Use guest_read/guest_write (not guest_ptr) to safely handle - * structs that span 2MB page table block boundaries. + * structs that span 2MiB page table block boundaries. */ uint8_t lflock[32]; /* Linux struct flock is 32 bytes on aarch64 */ if (guest_read_small(g, arg, lflock, sizeof(lflock)) < 0) @@ -620,7 +620,7 @@ int64_t sys_fcntl(guest_t *g, int fd, int cmd, uint64_t arg) return 0; } case 1024: /* F_GETPIPE_SZ */ - /* macOS does not support pipe size queries; return default 64KB */ + /* macOS does not support pipe size queries; return default 64KiB */ return 65536; case 1031: /* F_SETPIPE_SZ */ /* macOS does not support pipe size setting; pretend success */ @@ -720,7 +720,7 @@ int64_t sys_getdents64(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) /* Temp buffer for dirent serialization. Max dirent64 is 280 bytes * (19-byte header + NAME_MAX=255 + null + padding to 8). Using a * stack buffer avoids guest_ptr boundary issues: guest_write() handles - * 2MB block crossings that raw memcpy into guest_ptr() cannot. + * 2MiB block crossings that raw memcpy into guest_ptr() cannot. */ uint8_t entry_buf[280]; @@ -751,7 +751,7 @@ int64_t sys_getdents64(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) lde.d_type = de->d_type; /* Serialize entry into temp buffer, then copy to guest via - * guest_write() which handles 2MB block boundary crossings. + * guest_write() which handles 2MiB block boundary crossings. 
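+     * Worked example (hypothetical name "a.txt", 5 bytes): 19-byte header
+     * + 5 name bytes + 1 NUL = 25, padded up to the next multiple of 8,
+     * so d_reclen = 32.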
*/ memcpy(entry_buf, &lde, sizeof(lde)); memcpy(entry_buf + 19, de->d_name, name_len + 1); diff --git a/src/syscall/inotify.c b/src/syscall/inotify.c index 3398aa4..cf5205e 100644 --- a/src/syscall/inotify.c +++ b/src/syscall/inotify.c @@ -650,7 +650,7 @@ int64_t inotify_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) pos += event_size; } - /* Copy event data to a local buffer (max 4KB) */ + /* Copy event data to a local buffer (max 4KiB) */ uint8_t local_buf[INOTIFY_BUFSIZE]; if (copied > 0) memcpy(local_buf, inst->event_buf, copied); diff --git a/src/syscall/io.c b/src/syscall/io.c index bfb14b1..3b40c03 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -686,7 +686,7 @@ int64_t sys_pwrite64(guest_t *g, } /* Helper: build host iovec array from guest iovec array. - * Uses guest_read for the iovec array (may cross 2MB block boundary) + * Uses guest_read for the iovec array (may cross 2MiB block boundary) * and guest_ptr_avail for each buffer (caps to contiguous bytes). * required_perms: MEM_PERM_W for readv (host writes to guest buffers), * MEM_PERM_R for writev (host reads from guest buffers). @@ -808,7 +808,7 @@ int64_t sys_readv(guest_t *g, int fd, uint64_t iov_gva, int iovcnt) if (iovcnt <= 0) return -LINUX_EINVAL; /* Use guest_read for the iov array since guest_ptr alone is unsafe - * if the array spans a 2MB block boundary. + * if the array spans a 2MiB block boundary. */ linux_iovec_t giov; if (guest_read_small(g, iov_gva, &giov, sizeof(giov)) < 0) diff --git a/src/syscall/mem.c b/src/syscall/mem.c index ad584d6..106b189 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -201,8 +201,8 @@ static int mremap_extend_range(guest_t *g, } int page_perms = prot_to_perms(prot); - uint64_t ext_start = ALIGN_DOWN(off, BLOCK_2MB); - uint64_t ext_end = ALIGN_UP(off + size, BLOCK_2MB); + uint64_t ext_start = ALIGN_DOWN(off, BLOCK_2MIB); + uint64_t ext_end = ALIGN_UP(off + size, BLOCK_2MIB); if (ext_end > g->guest_size) ext_end = g->guest_size; if (guest_extend_page_tables(g, ext_start, ext_end, page_perms) < 0) @@ -237,11 +237,11 @@ int64_t sys_brk(guest_t *g, uint64_t addr) * The brk region is initially mapped up to MMAP_RX_BASE; if it grows * past that, the mmap allocator needs to extend dynamically. */ - uint64_t brk_pt_end = ALIGN_UP(g->brk_current, BLOCK_2MB); + uint64_t brk_pt_end = ALIGN_UP(g->brk_current, BLOCK_2MIB); if (brk_pt_end < MMAP_RX_BASE) brk_pt_end = MMAP_RX_BASE; if (new_off > brk_pt_end) { - uint64_t new_end = ALIGN_UP(new_off, BLOCK_2MB); + uint64_t new_end = ALIGN_UP(new_off, BLOCK_2MIB); if (guest_extend_page_tables(g, brk_pt_end, new_end, MEM_PERM_RW) < 0) return (int64_t) ipa_brk; } @@ -426,8 +426,8 @@ int64_t sys_mmap(guest_t *g, */ int page_perms = prot_to_perms(prot); - uint64_t ext_start = ALIGN_DOWN(result_off, BLOCK_2MB); - uint64_t ext_end = ALIGN_UP(result_off + length, BLOCK_2MB); + uint64_t ext_start = ALIGN_DOWN(result_off, BLOCK_2MIB); + uint64_t ext_end = ALIGN_UP(result_off + length, BLOCK_2MIB); if (ext_end > g->guest_size) ext_end = g->guest_size; @@ -446,7 +446,7 @@ int64_t sys_mmap(guest_t *g, /* Fine-tune permissions for the exact range. Handles L3 * splitting when MAP_FIXED overlays different permissions - * onto an existing 2MB block (e.g., .data RW over .text RX). + * onto an existing 2MiB block (e.g., .data RW over .text RX). 
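+                * Hypothetical addresses for illustration: an RW overlay at
+                * 0x610000-0x612000 on an RX block based at 0x600000 splits
+                * the block into L3 pages; only the two overlaid 4KiB pages
+                * flip to RW, the rest stay RX.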
*/ guest_update_perms(g, result_off, result_off + length, page_perms); @@ -503,8 +503,8 @@ int64_t sys_mmap(guest_t *g, if (!is_fixed) { if (needs_exec && !(prot & LINUX_PROT_WRITE)) { /* PROT_EXEC without PROT_WRITE: allocate from the RX mmap region. - * Apple HVF enforces W^X on 2MB block page table entries, so - * executable mappings must be in separate 2MB blocks from writable + * Apple HVF enforces W^X on 2MiB block page table entries, so + * executable mappings must be in separate 2MiB blocks from writable * ones. The RX region at MMAP_RX_BASE is pre-mapped with execute * permission. */ @@ -512,7 +512,7 @@ int64_t sys_mmap(guest_t *g, if (result_off == UINT64_MAX) { log_debug( "mmap: RX address space exhausted " - "(len=0x%llx, limit=0x%llx, %u-bit IPA / %lluGB)", + "(len=0x%llx, limit=0x%llx, %u-bit IPA / %llu GiB)", (unsigned long long) length, (unsigned long long) g->mmap_limit, g->ipa_bits, (unsigned long long) (g->guest_size >> 30)); @@ -526,7 +526,7 @@ int64_t sys_mmap(guest_t *g, /* RW (or PROT_NONE, or PROT_READ): allocate from main mmap region. * Honor the address hint if provided and within bounds. Some * managed-runtime allocators need the heap at a specific high - * address range (e.g., ~264GB for a megablock-style map) and + * address range (e.g., ~264GiB for a megablock-style map) and * spin-retry if they get a low address instead. On real Linux, * mmap tries the hint first and falls back to any suitable address. */ @@ -543,7 +543,7 @@ int64_t sys_mmap(guest_t *g, if (result_off == UINT64_MAX) { log_debug( "mmap: RW address space exhausted " - "(len=0x%llx, limit=0x%llx, %u-bit IPA / %lluGB)", + "(len=0x%llx, limit=0x%llx, %u-bit IPA / %llu GiB)", (unsigned long long) length, (unsigned long long) g->mmap_limit, g->ipa_bits, (unsigned long long) (g->guest_size >> 30)); @@ -590,8 +590,8 @@ int64_t sys_mmap(guest_t *g, * creating entries for PROT_NONE gaps between allocations. */ if (needs_exec && !(prot & LINUX_PROT_WRITE)) { - uint64_t ext_start = ALIGN_DOWN(result_off, BLOCK_2MB); - uint64_t ext_end = ALIGN_UP(result_off + length, BLOCK_2MB); + uint64_t ext_start = ALIGN_DOWN(result_off, BLOCK_2MIB); + uint64_t ext_end = ALIGN_UP(result_off + length, BLOCK_2MIB); if (ext_end > g->mmap_limit) ext_end = g->mmap_limit; if (guest_extend_page_tables(g, ext_start, ext_end, MEM_PERM_RX) < @@ -608,8 +608,8 @@ int64_t sys_mmap(guest_t *g, if (ext_end > g->mmap_rx_end) g->mmap_rx_end = ext_end; } else { - uint64_t ext_start = ALIGN_DOWN(result_off, BLOCK_2MB); - uint64_t ext_end = ALIGN_UP(result_off + length, BLOCK_2MB); + uint64_t ext_start = ALIGN_DOWN(result_off, BLOCK_2MIB); + uint64_t ext_end = ALIGN_UP(result_off + length, BLOCK_2MIB); if (ext_end > g->mmap_limit) ext_end = g->mmap_limit; /* Preserve execute permission for RWX requests. Stage-2 @@ -1133,7 +1133,7 @@ int64_t sys_munmap(guest_t *g, uint64_t addr, uint64_t length) if (unmap_off < ELF_DEFAULT_BASE && end > PT_POOL_BASE) return -LINUX_EINVAL; - /* Invalidate PTEs first. This may need to split a 2MB block + /* Invalidate PTEs first. This may need to split a 2MiB block * which can fail if the page table pool is exhausted. Failing * before region removal keeps metadata consistent. 
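+     * (Concretely: if the split fails because the pool is exhausted,
+     * munmap can return the error with the mapping and its region entry
+     * both still intact, rather than half-unmapped.)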
     */
diff --git a/src/syscall/net-absock.h b/src/syscall/net-absock.h
index 6aec06d..4e5eae8 100644
--- a/src/syscall/net-absock.h
+++ b/src/syscall/net-absock.h
@@ -1,11 +1,11 @@
-#pragma once
-
-/* Abstract AF_UNIX emulation helpers for elfuse
+/* Abstract AF_UNIX emulation helpers
  *
  * Copyright 2026 elfuse contributors
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#pragma once
+
 #include 
 #include 
 
diff --git a/src/syscall/proc-state.h b/src/syscall/proc-state.h
index c48c246..77adeca 100644
--- a/src/syscall/proc-state.h
+++ b/src/syscall/proc-state.h
@@ -1,9 +1,9 @@
-#pragma once
-
-/* Process metadata state helpers for elfuse
+/* Process metadata state helpers
  *
  * Copyright 2026 elfuse contributors
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#pragma once
+
 void proc_state_init(void);
 
diff --git a/src/syscall/proc.c b/src/syscall/proc.c
index 9c0dc62..60adda1 100644
--- a/src/syscall/proc.c
+++ b/src/syscall/proc.c
@@ -1360,12 +1360,13 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
          * EC=0x24 data abort) and forwards the faulting address
          * here.
          *
-         * Toggling at 2MB granularity causes thrashing when the
+         * Toggling at 2MiB granularity causes thrashing when the
          * JIT writes new code and executes existing code within
-         * the same 2MB block. Instead, the code splits the 2MB
-         * block into 4KB L3 pages and toggle only the faulting 4KB
-         * page. This allows different pages within a 2MB block to
-         * have independent RW/RX permissions simultaneously.
+         * the same 2MiB block. Instead, the code splits the 2MiB
+         * block into 4KiB L3 pages and toggles only the faulting
+         * 4KiB page. This allows different pages within a 2MiB
+         * block to have independent RW/RX permissions
+         * simultaneously.
          *
          * x0 = FAR_EL1 (faulting virtual address)
          * x1 = type: 0 = exec fault -> flip to RX
@@ -1421,7 +1422,7 @@ int vcpu_run_loop(hv_vcpu_t vcpu,
                 prefix, (unsigned long long) far,
                 (type == 0) ? "RX" : "RW",
                 (unsigned long long) page_start);
-            uint64_t block_start = far & ~(BLOCK_2MB - 1);
+            uint64_t block_start = far & ~(BLOCK_2MIB - 1);
             int sr = guest_split_block(g, block_start);
             int ur = guest_update_perms(g, page_start, page_end, new_perms);
diff --git a/src/syscall/sys.c b/src/syscall/sys.c
index 65c26ad..4d540f4 100644
--- a/src/syscall/sys.c
+++ b/src/syscall/sys.c
@@ -88,7 +88,7 @@ static void sysinfo_init_cached_host_state(void)
     size_t ms_len = sizeof(memsize);
     int mib_mem[2] = {CTL_HW, HW_MEMSIZE};
     if (sysctl(mib_mem, 2, &memsize, &ms_len, NULL, 0) == 0) {
-        const uint64_t vm_ram_cap = 4094595072ULL; /* Match Lima VZ 4GB VM */
+        const uint64_t vm_ram_cap = 4094595072ULL; /* Match Lima VZ 4GiB VM */
         cached_real_memsize = memsize;
         cached_totalram = (memsize > vm_ram_cap) ? vm_ram_cap : memsize;
     }
@@ -367,8 +367,8 @@ static linux_rlimit64_t translate_host_rlimit(int resource, struct rlimit rl)
     lim.rlim_cur = (rl.rlim_cur == RLIM_INFINITY) ? UINT64_MAX : rl.rlim_cur;
     lim.rlim_max = (rl.rlim_max == RLIM_INFINITY) ? UINT64_MAX : rl.rlim_max;
 
-    /* macOS returns ~8MB-16KB for the default stack; round to Linux's
-     * conventional 8MB to keep guest userspace behavior stable.
+    /* macOS returns ~8MiB minus 16KiB for the default stack; round up to
+     * Linux's conventional 8MiB to keep guest userspace behavior stable.
      */
    if (resource == 3 /* RLIMIT_STACK */ && lim.rlim_cur > 0 &&
        lim.rlim_cur < 8388608) {
diff --git a/src/utils.h b/src/utils.h
index 153b94a..efe55c5 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -30,18 +30,18 @@
 /* Align x down to the previous multiple of a; a must be a power of two. 
*/ #define ALIGN_DOWN(x, a) ((uint64_t) (x) & ~((uint64_t) (a) - 1)) -/* The Linux ABI fixes the page size at 4KB on aarch64 regardless of the host +/* The Linux ABI fixes the page size at 4KiB on aarch64 regardless of the host * page size, so this is shared by every guest memory path (mmap, brk, * mprotect, ELF loading). */ #define GUEST_PAGE_SIZE 4096ULL #define PAGE_ALIGN_UP(x) ALIGN_UP(x, GUEST_PAGE_SIZE) -/* 2MB block alignment shared by region setup, page table walking, and stack - * placement. BLOCK_2MB itself is defined in core/guest.h. +/* 2MiB block alignment shared by region setup, page table walking, and stack + * placement. BLOCK_2MIB itself is defined in core/guest.h. */ -#define ALIGN_2MB_DOWN(x) ALIGN_DOWN(x, 2ULL * 1024 * 1024) -#define ALIGN_2MB_UP(x) ALIGN_UP(x, 2ULL * 1024 * 1024) +#define ALIGN_2MIB_DOWN(x) ALIGN_DOWN(x, 2ULL * 1024 * 1024) +#define ALIGN_2MIB_UP(x) ALIGN_UP(x, 2ULL * 1024 * 1024) /* Branchless range check: true when minx <= x < minx + size. * diff --git a/tests/test-cow-fork.c b/tests/test-cow-fork.c index 3df6ffd..8770420 100644 --- a/tests/test-cow-fork.c +++ b/tests/test-cow-fork.c @@ -170,7 +170,7 @@ static void test_mmap_isolation(void) static void test_large_cow(void) { - TEST("fork: 1MB COW integrity"); + TEST("fork: 1MiB COW integrity"); int pipefd[2]; if (pipe(pipefd) != 0) { @@ -182,7 +182,7 @@ static void test_large_cow(void) char *buf = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (buf == MAP_FAILED) { - FAIL("mmap 1MB"); + FAIL("mmap 1MiB"); return; } @@ -229,7 +229,7 @@ static void test_large_cow(void) int status; waitpid(pid, &status, 0); - EXPECT_TRUE(parent_ok && child_ok, "1MB COW integrity failed"); + EXPECT_TRUE(parent_ok && child_ok, "1MiB COW integrity failed"); munmap(buf, sz); } diff --git a/tests/test-futex-pi.c b/tests/test-futex-pi.c index 0bfe294..8c5ca60 100644 --- a/tests/test-futex-pi.c +++ b/tests/test-futex-pi.c @@ -68,7 +68,7 @@ static long raw_futex_unlock_pi(uint32_t *addr) /* Child thread for dead-owner test */ -/* Stack for child thread (8KB, 16-byte aligned) */ +/* Stack for child thread (8KiB, 16-byte aligned) */ static char child_stack_buf[8192] __attribute__((aligned(16))); /* Child: acquire PI lock, signal parent, exit WITHOUT releasing. diff --git a/tests/test-guard-page.c b/tests/test-guard-page.c index b4809ce..caf91ac 100644 --- a/tests/test-guard-page.c +++ b/tests/test-guard-page.c @@ -61,13 +61,13 @@ static void test_prot_none(void) static void test_large_mmap(void) { - TEST("mmap 64MB anonymous"); + TEST("mmap 64MiB anonymous"); size_t sz = 64UL * 1024 * 1024; void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (p == MAP_FAILED) { - FAIL("mmap 64MB failed"); + FAIL("mmap 64MiB failed"); return; } @@ -78,7 +78,7 @@ static void test_large_mmap(void) c[sz - 1] = 'C'; EXPECT_TRUE(c[0] == 'A' && c[sz / 2] == 'B' && c[sz - 1] == 'C', - "data mismatch in 64MB region"); + "data mismatch in 64MiB region"); munmap(p, sz); } diff --git a/tests/test-large-io-boundary.c b/tests/test-large-io-boundary.c index 18230d8..28b76e7 100644 --- a/tests/test-large-io-boundary.c +++ b/tests/test-large-io-boundary.c @@ -4,7 +4,7 @@ * Copyright 2025 Moritz Angermann, zw3rk pte. ltd. * SPDX-License-Identifier: Apache-2.0 * - * Tests: read/write buffers crossing 2MB L2 blocks and split L3 tables. + * Tests: read/write buffers crossing 2MiB L2 blocks and split L3 tables. 
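+ * The 6MiB mapping guarantees a 2MiB-aligned boundary within its first
+ * 2MiB, leaving at least 4MiB beyond it; starting the buffer IO_OFFSET =
+ * 12345 bytes past that boundary keeps it misaligned, so the 3MiB
+ * transfers must cross the following 2MiB boundary.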
*/ #include @@ -19,7 +19,7 @@ int passes = 0, fails = 0; -#define BLOCK_2MB (2UL * 1024 * 1024) +#define BLOCK_2MIB (2UL * 1024 * 1024) #define MAP_SIZE (6UL * 1024 * 1024) #define IO_OFFSET 12345UL #define IO_SIZE (3UL * 1024 * 1024) @@ -27,7 +27,7 @@ int passes = 0, fails = 0; static unsigned char *next_2mb_boundary(unsigned char *p) { uintptr_t addr = (uintptr_t) p; - addr = (addr + BLOCK_2MB - 1) & ~(uintptr_t) (BLOCK_2MB - 1); + addr = (addr + BLOCK_2MIB - 1) & ~(uintptr_t) (BLOCK_2MIB - 1); return (unsigned char *) addr; } @@ -53,7 +53,7 @@ static int verify_pattern(const unsigned char *buf, size_t len) return 0; } -/* Verify a repeating 4KB seed pattern across a large buffer. +/* Verify a repeating 4KiB seed pattern across a large buffer. * The seed is: seed[i] = (i * 131 + 17) for i in [0, 4096). */ static int verify_repeating_seed(const unsigned char *buf, size_t len) @@ -68,7 +68,7 @@ static int verify_repeating_seed(const unsigned char *buf, size_t len) static void test_large_write(void) { - TEST("write crosses 2MB boundary"); + TEST("write crosses 2MiB boundary"); unsigned char *map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); @@ -97,7 +97,7 @@ static void test_large_write(void) ok = 0; /* Read back the entire write and verify all bytes, including those - * spanning the 2MB page table boundary. + * spanning the 2MiB page table boundary. */ unsigned char *readback = mmap(NULL, IO_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); @@ -129,12 +129,12 @@ static void test_large_read_from_split_block(void) return; } - /* Force the first 2MB block to remain split into L3 pages while ending + /* Force the first 2MiB block to remain split into L3 pages while ending * with RW permissions, then read across the L3-to-L2 boundary. */ unsigned char *block = next_2mb_boundary(map); unsigned char *buf = block + IO_OFFSET; - void *page = block + BLOCK_2MB / 2; + void *page = block + BLOCK_2MIB / 2; if (mprotect(page, 4096, PROT_READ) != 0 || mprotect(page, 4096, PROT_READ | PROT_WRITE) != 0) { munmap(map, MAP_SIZE); @@ -170,7 +170,7 @@ static void test_large_read_from_split_block(void) ssize_t ret = read(fd, buf, IO_SIZE); ok = (ret == (ssize_t) IO_SIZE); } - /* Verify the entire read buffer, including the 2MB boundary + /* Verify the entire read buffer, including the 2MiB boundary * crossing where L3-to-L2 page table transitions happen. 
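+     * (The mprotect round-trip above left the first block split into 4KiB
+     * L3 pages with RW permissions, so the read walks L3 entries up to
+     * the next 2MiB boundary and crosses into an unsplit L2 block there.)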
*/ if (ok && verify_repeating_seed(buf, IO_SIZE) != 0) diff --git a/tests/test-madvise.c b/tests/test-madvise.c index 0b153d4..a81a9ba 100644 --- a/tests/test-madvise.c +++ b/tests/test-madvise.c @@ -212,7 +212,7 @@ static void test_advisory_hints(void) static void test_dontneed_large(void) { - TEST("MADV_DONTNEED 1MB range"); + TEST("MADV_DONTNEED 1MiB range"); size_t sz = 1024 * 1024; void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); @@ -239,7 +239,7 @@ static void test_dontneed_large(void) } } - EXPECT_TRUE(ok, "1MB range not zeroed"); + EXPECT_TRUE(ok, "1MiB range not zeroed"); munmap(p, sz); } diff --git a/tests/test-mremap.c b/tests/test-mremap.c index 5375d7f..61d2764 100644 --- a/tests/test-mremap.c +++ b/tests/test-mremap.c @@ -234,7 +234,7 @@ static void test_same_size(void) static void test_large_realloc(void) { - TEST("mremap large (256KB->512KB)"); + TEST("mremap large (256KiB->512KiB)"); size_t old_sz = 256 * 1024, new_sz = 512 * 1024; void *p = mmap(NULL, old_sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); diff --git a/tests/test-multi-vcpu.c b/tests/test-multi-vcpu.c index b3277e4..01d9301 100644 --- a/tests/test-multi-vcpu.c +++ b/tests/test-multi-vcpu.c @@ -55,9 +55,9 @@ #define PT_AP_RO (3ULL << 6) /* RO at EL0 */ #define PAGE_SIZE_4K 4096ULL -#define BLOCK_2MB (2ULL * 1024 * 1024) +#define BLOCK_2MIB (2ULL * 1024 * 1024) -/* Memory layout (16MB total, much smaller than elfuse's 32GB) */ +/* Memory layout (16MiB total, much smaller than elfuse's 32GiB) */ #define GUEST_SIZE (16ULL * 1024 * 1024) @@ -70,18 +70,18 @@ #define STACK_A_BASE 0x00A00000ULL /* EL0 stack A (RW) */ #define STACK_B_BASE 0x00C00000ULL /* EL0 stack B (RW) */ -/* vCPU-A and vCPU-B SP_EL1 (top of respective 512KB regions within shim data) +/* vCPU-A and vCPU-B SP_EL1 (top of respective 512KiB regions within shim data) */ -#define SP_EL1_A (SHIM_DATA_BASE + BLOCK_2MB) /* 0x400000 */ -#define SP_EL1_B (SHIM_DATA_BASE + BLOCK_2MB / 2) /* 0x300000 */ +#define SP_EL1_A (SHIM_DATA_BASE + BLOCK_2MIB) /* 0x400000 */ +#define SP_EL1_B (SHIM_DATA_BASE + BLOCK_2MIB / 2) /* 0x300000 */ /* vCPU-A and vCPU-B EL0 code offsets within GUEST_CODE region */ #define CODE_A_OFF 0x0000ULL -#define CODE_B_OFF 0x1000ULL /* 4KB apart */ +#define CODE_B_OFF 0x1000ULL /* 4KiB apart */ -/* EL0 stack tops (top of each 2MB region) */ -#define SP_EL0_A (STACK_A_BASE + BLOCK_2MB) /* 0xC00000 */ -#define SP_EL0_B (STACK_B_BASE + BLOCK_2MB) /* 0xE00000 */ +/* EL0 stack tops (top of each 2MiB region) */ +#define SP_EL0_A (STACK_A_BASE + BLOCK_2MIB) /* 0xC00000 */ +#define SP_EL0_B (STACK_B_BASE + BLOCK_2MIB) /* 0xE00000 */ /* System register values (from main.c) */ @@ -134,7 +134,7 @@ static uint64_t pt_alloc(vm_state_t *vm) return off; } -/* Build a 2MB block descriptor at a given GPA with RX or RW perms. */ +/* Build a 2MiB block descriptor at a given GPA with RX or RW perms. 
*/
 static uint64_t make_block(uint64_t gpa, int perm)
 {
     uint64_t desc = (gpa & 0xFFFFFFFFE00000ULL) | PT_AF | PT_SH_ISH | PT_NS |
@@ -162,10 +162,10 @@ static uint64_t build_page_tables(vm_state_t *vm, int include_tlbi_region)
     uint64_t *l0 = (uint64_t *) ((uint8_t *) vm->host_base + l0_off);
     uint64_t *l1 = (uint64_t *) ((uint8_t *) vm->host_base + l1_off);
 
-    /* L0[0] -> L1 table (all the current addresses are < 512GB) */
+    /* L0[0] -> L1 table (all the current addresses are < 512GiB) */
     l0[0] = l1_off | PT_VALID | PT_TABLE;
 
-    /* L1[0] -> L2 table (all the current addresses are < 1GB) */
+    /* L1[0] -> L2 table (all the current addresses are < 1GiB) */
     uint64_t l2_off = pt_alloc(vm);
     if (!l2_off)
         return 0;
@@ -173,7 +173,7 @@
 
     uint64_t *l2 = (uint64_t *) ((uint8_t *) vm->host_base + l2_off);
 
-    /* Map 2MB blocks. L2 index = addr / 2MB. */
+    /* Map 2MiB blocks. L2 index = addr / 2MiB. */
 
     /* Shim code (RX) at 0x100000 -> L2[0] (shares 0x0-0x1FFFFF) */
     l2[0] = make_block(0x000000, PERM_RX);
@@ -199,8 +199,8 @@
 
     /* Stack B spills into 0xE00000 (SP=0xE00000 grows down into 0xC00000
      * block), already covered by L2[6] since SP_EL0_B = 0xE00000 is top of
-     * 0xC00000 block. Actually 0xE00000 = 7 * 2MB, that's a separate block. Map
-     * it too:
+     * 0xC00000 block. 0xE00000 itself, however, is 7 * 2MiB, a separate
+     * block, so map it too:
      */
     l2[7] = make_block(0xE00000, PERM_RW);
 
@@ -475,7 +475,7 @@ static int vm_create(vm_state_t *vm)
     vm->pt_next = PT_POOL_BASE;
 
     /* Query max IPA size and configure VM (matches guest.c pattern).
-     * The test uses only 16MB, so any IPA size works; this is for
+     * The test uses only 16MiB, so any IPA size works; this is for
      * API consistency with elfuse's production code path.
      */
     uint32_t max_ipa = 0;
diff --git a/tests/test-perf.sh b/tests/test-perf.sh
index f729cfc..8175409 100755
--- a/tests/test-perf.sh
+++ b/tests/test-perf.sh
@@ -104,10 +104,10 @@ benchmark "elfuse guest wc" sh -c "'$ELFUSE' '$TOOL_BIN/wc' -l '$SRC_SUBDIR'/*.c
 echo
 
 # --- Test 4: I/O throughput — cat large file through wc ---
-printf "${YELLOW}▸ cat ~10MB | wc -l (I/O throughput)${RESET}\n"
+printf "${YELLOW}▸ cat ~10MiB | wc -l (I/O throughput)${RESET}\n"
 TMPFILE=$(mktemp)
 trap 'rm -f "$TMPFILE"' EXIT
-# Build ~10MB test file by repeating syscall.c (~100 times)
+# Build ~10MiB test file by repeating syscall.c (~100 times)
 for _ in $(seq 1 100); do cat "$SYSCALL_C" >> "$TMPFILE"; done
 TMPSIZE=$(wc -c < "$TMPFILE" | tr -d ' ')
 printf "  ${CYAN}(test file: %s bytes)${RESET}\n" "$TMPSIZE"
diff --git a/tests/test-rwx.c b/tests/test-rwx.c
index e04b75e..180e743 100644
--- a/tests/test-rwx.c
+++ b/tests/test-rwx.c
@@ -9,8 +9,8 @@
  * page table entries work at stage-1 when SCTLR_EL1.WXN=0.
  *
  * Tests:
- * 1. RWX 2MB block: L2 block descriptor with AP=RW_EL0, UXN=0, PXN=0
- * 2. RWX 4KB page: L3 page descriptor with the same RWX permissions
+ * 1. RWX 2MiB block: L2 block descriptor with AP=RW_EL0, UXN=0, PXN=0
+ * 2. RWX 4KiB page: L3 page descriptor with the same RWX permissions
 * 3. Baseline RX: Confirm execution works on a normal RX page
 * 4. 
Baseline RW: Confirm writes work on a normal RW page * @@ -65,9 +65,9 @@ #define PT_AP_RO (3ULL << 6) /* AP[2:1]=11 -> RO at EL0 */ #define PAGE_SIZE_4K 4096ULL -#define BLOCK_2MB (2ULL * 1024 * 1024) +#define BLOCK_2MIB (2ULL * 1024 * 1024) -/* Memory layout (16MB total) */ +/* Memory layout (16MiB total) */ #define GUEST_SIZE (16ULL * 1024 * 1024) @@ -75,20 +75,20 @@ #define SHIM_BASE 0x00100000ULL /* Shim code (RX) */ #define SHIM_DATA_BASE 0x00200000ULL /* Shim data / EL1 stack (RW) */ #define GUEST_CODE 0x00400000ULL /* EL0 test code (RX) */ -#define RWX_BLOCK 0x00600000ULL /* 2MB block for RWX test (test 1) */ +#define RWX_BLOCK 0x00600000ULL /* 2MiB block for RWX test (test 1) */ #define RWX_PAGE_BLOCK \ - 0x00800000ULL /* 2MB region containing RWX 4KB page (test 2) */ + 0x00800000ULL /* 2MiB region containing RWX 4KiB page (test 2) */ #define GUEST_DATA 0x00A00000ULL /* RW data (test 4 baseline) */ #define STACK_BASE 0x00C00000ULL /* EL0 stack (RW) */ -/* Within RWX_PAGE_BLOCK, the RWX 4KB page is at offset 0 */ +/* Within RWX_PAGE_BLOCK, the RWX 4KiB page is at offset 0 */ #define RWX_PAGE_ADDR RWX_PAGE_BLOCK /* EL0 stack top and SP_EL1 */ -#define SP_EL0 (STACK_BASE + BLOCK_2MB) -#define SP_EL1 (SHIM_DATA_BASE + BLOCK_2MB) +#define SP_EL0 (STACK_BASE + BLOCK_2MIB) +#define SP_EL1 (SHIM_DATA_BASE + BLOCK_2MIB) -/* Code offsets within GUEST_CODE (4KB apart for different tests) */ +/* Code offsets within GUEST_CODE (4KiB apart for different tests) */ #define CODE_TEST1 0x0000ULL /* Test 1: RWX block write+exec */ #define CODE_TEST2 0x1000ULL /* Test 2: RWX page write+exec */ #define CODE_TEST3 0x2000ULL /* Test 3: baseline RX exec */ @@ -142,27 +142,27 @@ static uint64_t pt_alloc(vm_state_t *vm) /* Descriptor builders */ -/* Common base attributes for a 2MB block or 4KB page */ +/* Common base attributes for a 2MiB block or 4KiB page */ static uint64_t common_attrs(void) { return PT_AF | PT_SH_ISH | PT_NS | PT_ATTR1; } -/* 2MB block: RX (executable, read-only at EL0) */ +/* 2MiB block: RX (executable, read-only at EL0) */ static uint64_t make_block_rx(uint64_t gpa) { return (gpa & 0xFFFFFFFFE00000ULL) | common_attrs() | PT_BLOCK | PT_AP_RO; /* UXN=0, PXN=0 -> executable */ } -/* 2MB block: RW (writable, not executable) */ +/* 2MiB block: RW (writable, not executable) */ static uint64_t make_block_rw(uint64_t gpa) { return (gpa & 0xFFFFFFFFE00000ULL) | common_attrs() | PT_BLOCK | PT_AP_RW_EL0 | PT_UXN | PT_PXN; } -/* 2MB block: RWX (writable AND executable at EL0, the test subject) */ +/* 2MiB block: RWX (writable AND executable at EL0, the test subject) */ static uint64_t make_block_rwx(uint64_t gpa) { return (gpa & 0xFFFFFFFFE00000ULL) | common_attrs() | PT_BLOCK | @@ -170,7 +170,7 @@ static uint64_t make_block_rwx(uint64_t gpa) /* UXN=0, PXN=0 -> executable; AP=01 -> writable at EL0 */ } -/* 4KB L3 page: RWX (writable AND executable at EL0) */ +/* 4KiB L3 page: RWX (writable AND executable at EL0) */ static uint64_t make_page_rwx(uint64_t gpa) { return (gpa & 0xFFFFFFFFF000ULL) | common_attrs() | PT_VALID | PT_PAGE | @@ -178,7 +178,7 @@ static uint64_t make_page_rwx(uint64_t gpa) /* UXN=0, PXN=0 -> executable; AP=01 -> writable at EL0 */ } -/* 4KB L3 page: RW (not executable) */ +/* 4KiB L3 page: RW (not executable) */ static uint64_t make_page_rw(uint64_t gpa) { return (gpa & 0xFFFFFFFFF000ULL) | common_attrs() | PT_VALID | PT_PAGE | @@ -230,7 +230,7 @@ static uint64_t build_page_tables(vm_state_t *vm) l2[3] = make_block_rwx(0x600000); /* L2[4]: Table descriptor -> L3 page table 
for Test 2. - * the code splits this 2MB block into 512 x 4KB pages. The first page + * the code splits this 2MiB block into 512 x 4KiB pages. The first page * at 0x800000 is RWX, the rest are RW (non-executable). */ { @@ -519,9 +519,9 @@ static void print_bad_exception(const vcpu_exit_t *ex) } } -/* TEST 1: RWX 2MB Block +/* TEST 1: RWX 2MiB Block * - * Stage-1 page table has a 2MB block at 0x600000 with: + * Stage-1 page table has a 2MiB block at 0x600000 with: * AP[2:1]=01 (RW at EL0), UXN=0, PXN=0 (executable) * This is a true RWX mapping. * @@ -678,10 +678,10 @@ static int test1_rwx_block(void) return result; } -/* TEST 2: RWX 4KB Page (L3 descriptor) +/* TEST 2: RWX 4KiB Page (L3 descriptor) * - * Same as test 1, but using a 4KB L3 page descriptor at 0x800000 - * instead of a 2MB L2 block descriptor. Tests whether the + * Same as test 1, but using a 4KiB L3 page descriptor at 0x800000 + * instead of a 2MiB L2 block descriptor. Tests whether the * granularity matters for W^X enforcement. */ @@ -753,13 +753,13 @@ static int test2_rwx_page(void) (unsigned long long) ex.x0, (unsigned long long) ex.x1, ex.x1 == 0 ? "exec fault -> flip to RX" : "write fault -> flip to RW"); - printf(" " YELLOW "HVF enforces W^X at stage-2 (4KB page)" RESET + printf(" " YELLOW "HVF enforces W^X at stage-2 (4KiB page)" RESET "\n"); result = -1; } else if (ex.reason == HVF_EXIT_HVC5 && ex.x0 == 42) { printf("\n " GREEN "RWX works!" RESET - " Written code executed (4KB page, x0=%llu)\n", + " Written code executed (4KiB page, x0=%llu)\n", (unsigned long long) ex.x0); result = 0; @@ -769,11 +769,11 @@ static int test2_rwx_page(void) uint32_t ec = (uint32_t) (ex.esr >> 26) & 0x3F; if (ec == 0x20) printf(" " YELLOW - "Instruction abort: W^X blocks execution (4KB page)" RESET + "Instruction abort: W^X blocks execution (4KiB page)" RESET "\n"); else if (ec == 0x24) - printf(" " YELLOW "Data abort: W^X blocks write (4KB page)" RESET - "\n"); + printf(" " YELLOW + "Data abort: W^X blocks write (4KiB page)" RESET "\n"); result = -1; } else { @@ -952,8 +952,8 @@ int main(void) } tests[] = { {"Baseline: RX execution", test3_baseline_rx}, {"Baseline: RW write", test4_baseline_rw}, - {"RWX 2MB block (write+exec)", test1_rwx_block}, - {"RWX 4KB page (write+exec)", test2_rwx_page}, + {"RWX 2MiB block (write+exec)", test1_rwx_block}, + {"RWX 4KiB page (write+exec)", test2_rwx_page}, }; int ntests = (int) ARRAY_SIZE(tests); diff --git a/tests/test-stress.c b/tests/test-stress.c index 687afd3..f81d599 100644 --- a/tests/test-stress.c +++ b/tests/test-stress.c @@ -97,7 +97,7 @@ static void test_mmap_churn(void) TEST("mmap/munmap churn (256 cycles)"); #define CHURN_CYCLES 256 -#define CHURN_SIZE (64 * 1024) /* 64KB each */ +#define CHURN_SIZE (64 * 1024) /* 64KiB each */ bool ok = true; for (int i = 0; i < CHURN_CYCLES; i++) { @@ -275,7 +275,7 @@ static void test_mprotect_cycling(void) static void test_large_mmap(void) { - TEST("large mmap (16MB)"); + TEST("large mmap (16MiB)"); size_t sz = 16 * 1024 * 1024; void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, @@ -285,7 +285,7 @@ static void test_large_mmap(void) return; } - /* Touch every page (4KB stride) */ + /* Touch every page (4KiB stride) */ volatile char *vp = (volatile char *) p; for (size_t off = 0; off < sz; off += 4096) { vp[off] = (char) (off >> 12); diff --git a/tests/test-thread.c b/tests/test-thread.c index d4f88c6..da427df 100644 --- a/tests/test-thread.c +++ b/tests/test-thread.c @@ -57,7 +57,7 @@ static void child_work(void) /* Tests */ -/* Stack for child thread 
(8KB, 16-byte aligned) */ +/* Stack for child thread (8KiB, 16-byte aligned) */ static char child_stack_buf[8192] __attribute__((aligned(16))); /* Test 1: clone(CLONE_THREAD) creates a new thread that runs concurrently */