diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index dcd6753a4c5..1eb2d83e0f2 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -31,6 +31,9 @@ { "syscall": "mincore" }, + { + "syscall": "pread64" + }, { "syscall": "writev", "comment": "Used by the VirtIO net device to write to tap" diff --git a/src/firecracker/src/api_server/mod.rs b/src/firecracker/src/api_server/mod.rs index 60daaa26639..961fc68e836 100644 --- a/src/firecracker/src/api_server/mod.rs +++ b/src/firecracker/src/api_server/mod.rs @@ -275,7 +275,7 @@ mod tests { Box::new(VmmAction::CreateSnapshot(CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), })), start_time_us, ); @@ -288,7 +288,7 @@ mod tests { Box::new(VmmAction::CreateSnapshot(CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), })), start_time_us, ); diff --git a/src/firecracker/src/api_server/parsed_request.rs b/src/firecracker/src/api_server/parsed_request.rs index f98170ccbea..478483e9ad9 100644 --- a/src/firecracker/src/api_server/parsed_request.rs +++ b/src/firecracker/src/api_server/parsed_request.rs @@ -31,6 +31,7 @@ use super::request::vsock::parse_put_vsock; use crate::api_server::request::hotplug::memory::{ parse_get_memory_hotplug, parse_patch_memory_hotplug, parse_put_memory_hotplug, }; +use crate::api_server::request::memory_info::parse_get_memory; use crate::api_server::request::serial::parse_put_serial; #[derive(Debug)] @@ -91,6 +92,7 @@ impl TryFrom<&Request> for ParsedRequest { (Method::Get, "hotplug", None) if path_tokens.next() == Some("memory") => { parse_get_memory_hotplug() } + (Method::Get, "memory", None) => parse_get_memory(path_tokens), (Method::Get, _, Some(_)) => 
method_to_error(Method::Get), (Method::Put, "actions", Some(body)) => parse_put_actions(body), (Method::Put, "balloon", Some(body)) => parse_put_balloon(body), @@ -196,6 +198,9 @@ impl ParsedRequest { &serde_json::json!({ "firecracker_version": version.as_str() }), ), VmmData::FullVmConfig(config) => Self::success_response_with_data(config), + VmmData::MemoryMappings(mappings) => Self::success_response_with_data(mappings), + VmmData::Memory(meminfo) => Self::success_response_with_data(meminfo), + VmmData::MemoryDirty(dirty) => Self::success_response_with_data(dirty), }, Err(vmm_action_error) => { let mut response = match vmm_action_error { @@ -610,6 +615,15 @@ pub mod tests { &serde_json::json!({ "firecracker_version": version.as_str() }).to_string(), 200, ), + VmmData::MemoryMappings(mappings) => { + http_response(&serde_json::to_string(mappings).unwrap(), 200) + } + VmmData::Memory(meminfo) => { + http_response(&serde_json::to_string(meminfo).unwrap(), 200) + } + VmmData::MemoryDirty(dirty) => { + http_response(&serde_json::to_string(dirty).unwrap(), 200) + } }; let response = ParsedRequest::convert_to_response(&data); response.write_all(&mut buf).unwrap(); diff --git a/src/firecracker/src/api_server/request/memory_info.rs b/src/firecracker/src/api_server/request/memory_info.rs new file mode 100644 index 00000000000..40c7f711bb0 --- /dev/null +++ b/src/firecracker/src/api_server/request/memory_info.rs @@ -0,0 +1,19 @@ +use micro_http::StatusCode; +use vmm::rpc_interface::VmmAction; + +use crate::api_server::parsed_request::{ParsedRequest, RequestError}; + +pub(crate) fn parse_get_memory<'a, T>(mut path_tokens: T) -> Result +where + T: Iterator, +{ + match path_tokens.next() { + Some("mappings") => Ok(ParsedRequest::new_sync(VmmAction::GetMemoryMappings)), + Some("dirty") => Ok(ParsedRequest::new_sync(VmmAction::GetMemoryDirty)), + Some(unknown_path) => Err(RequestError::Generic( + StatusCode::BadRequest, + format!("Unrecognized GET request path 
`{unknown_path}`"), + )), + None => Ok(ParsedRequest::new_sync(VmmAction::GetMemory)), + } +} diff --git a/src/firecracker/src/api_server/request/mod.rs b/src/firecracker/src/api_server/request/mod.rs index 9be4617bd8e..89472c52d8e 100644 --- a/src/firecracker/src/api_server/request/mod.rs +++ b/src/firecracker/src/api_server/request/mod.rs @@ -11,6 +11,7 @@ pub mod hotplug; pub mod instance_info; pub mod logger; pub mod machine_configuration; +pub mod memory_info; pub mod metrics; pub mod mmds; pub mod net; diff --git a/src/firecracker/src/api_server/request/snapshot.rs b/src/firecracker/src/api_server/request/snapshot.rs index 8284aa66287..cc7c1c28762 100644 --- a/src/firecracker/src/api_server/request/snapshot.rs +++ b/src/firecracker/src/api_server/request/snapshot.rs @@ -144,7 +144,7 @@ mod tests { let expected_config = CreateSnapshotParams { snapshot_type: SnapshotType::Diff, snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_file_path: Some(PathBuf::from("bar")), }; assert_eq!( vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some("create")).unwrap()), @@ -158,7 +158,7 @@ mod tests { let expected_config = CreateSnapshotParams { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::from("foo"), - mem_file_path: PathBuf::from("bar"), + mem_file_path: Some(PathBuf::from("bar")), }; assert_eq!( vmm_action_from_request(parse_put_snapshot(&Body::new(body), Some("create")).unwrap()), diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 0523dd9b08e..ff659b69d39 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -786,6 +786,50 @@ paths: schema: $ref: "#/definitions/Error" + /memory/mappings: + get: + summary: Gets the memory mappings with skippable pages bitmap. 
+ operationId: getMemoryMappings + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryMappingsResponse" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + + /memory: + get: + summary: Gets the memory info (resident and empty pages). + description: Returns an object with resident and empty bitmaps. The resident bitmap marks all pages that are resident. The empty bitmap marks zero pages (subset of resident pages). This is checked at the pageSize of each region. All regions must have the same page size. + operationId: getMemory + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryResponse" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + + /memory/dirty: + get: + summary: Gets the dirty guest memory + description: This returns the resident memory that has been written since last snapshot. + operationId: getDirtyMemory + responses: + 200: + description: OK + schema: + $ref: "#/definitions/MemoryDirty" + default: + description: Internal server error + schema: + $ref: "#/definitions/Error" + /version: get: summary: Gets the Firecracker version. @@ -1347,6 +1391,72 @@ definitions: description: MicroVM hypervisor build version. type: string + GuestMemoryRegionMapping: + type: object + description: Describes the region of guest memory that can be used for creating the memfile. + required: + - base_host_virt_addr + - size + - offset + - page_size + properties: + base_host_virt_addr: + type: integer + size: + description: The size of the region in bytes. + type: integer + offset: + description: The offset of the region in bytes. + type: integer + page_size: + description: The page size in bytes. + type: integer + + MemoryMappingsResponse: + type: object + description: Response containing memory region mappings. + required: + - mappings + properties: + mappings: + type: array + description: The memory region mappings. 
+ items: + $ref: "#/definitions/GuestMemoryRegionMapping" + + MemoryResponse: + type: object + description: Response containing the memory info (resident and empty pages). + required: + - resident + - empty + properties: + resident: + type: array + description: The resident bitmap as a vector of u64 values. Each bit represents if the page is resident. + items: + type: integer + format: uint64 + empty: + type: array + description: The empty bitmap as a vector of u64 values. Each bit represents if the page is zero (empty). This is a subset of the resident pages. + items: + type: integer + format: uint64 + + MemoryDirty: + type: object + description: Response containing the bitmap (one bit per page) of dirty pages of guest memory + required: + - bitmap + properties: + bitmap: + type: array + description: The dirty bitmap as a vector of u64 values. Each bit represents if the page is dirty. + items: + type: integer + format: uint64 + Logger: type: object description: @@ -1555,12 +1665,14 @@ definitions: SnapshotCreateParams: type: object required: - - mem_file_path - snapshot_path properties: mem_file_path: type: string - description: Path to the file that will contain the guest memory. + description: + Path to the file that will contain the guest memory. It is optional. + In case that a user doesn't provide a path, they are responsible to + ensure they store the microVM's memory state via external means. snapshot_path: type: string description: Path to the file that will contain the microVM state. 
diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 332b1ac3cc3..63944bfff83 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -318,6 +318,7 @@ pub fn build_microvm_for_boot( vcpus_handles: Vec::new(), vcpus_exit_evt, device_manager, + page_size: vm_resources.machine_config.huge_pages.page_size(), }; let vmm = Arc::new(Mutex::new(vmm)); @@ -518,6 +519,7 @@ pub fn build_microvm_from_snapshot( vcpus_handles: Vec::new(), vcpus_exit_evt, device_manager, + page_size: vm_resources.machine_config.huge_pages.page_size(), }; // Move vcpus to their own threads and start their state machine in the 'Paused' state. @@ -751,6 +753,7 @@ pub(crate) mod tests { use vmm_sys_util::tempfile::TempFile; use super::*; + use crate::arch::host_page_size; use crate::device_manager::tests::default_device_manager; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::generated::virtio_ids; @@ -836,6 +839,7 @@ pub(crate) mod tests { vcpus_handles: Vec::new(), vcpus_exit_evt, device_manager: default_device_manager(), + page_size: host_page_size(), } } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 30273e92c06..06144a5ddd9 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -144,13 +144,15 @@ use crate::devices::virtio::block::device::Block; use crate::devices::virtio::mem::{VIRTIO_MEM_DEV_ID, VirtioMem, VirtioMemError, VirtioMemStatus}; use crate::devices::virtio::net::Net; use crate::logger::{METRICS, MetricsError, error, info, warn}; -use crate::persist::{MicrovmState, MicrovmStateError, VmInfo}; +use crate::persist::{GuestRegionUffdMapping, MicrovmState, MicrovmStateError, VmInfo}; use crate::rate_limiter::BucketUpdate; +use crate::utils::usize_to_u64; use crate::vmm_config::instance_info::{InstanceInfo, VmState}; use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use crate::vstate::vcpu::VcpuState; pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse}; pub use 
crate::vstate::vm::Vm; +use crate::vstate::vm::mincore_bitmap; /// Shorthand type for the EventManager flavour used by Firecracker. pub type EventManager = BaseEventManager>>; @@ -254,6 +256,8 @@ pub enum VmmError { Block(#[from] BlockError), /// Balloon: {0} Balloon(#[from] BalloonError), + /// Pagemap error: {0} + Pagemap(#[from] utils::pagemap::PagemapError), /// Failed to create memory hotplug device: {0} VirtioMem(#[from] VirtioMemError), } @@ -313,6 +317,8 @@ pub struct Vmm { vcpus_exit_evt: EventFd, // Device manager device_manager: DeviceManager, + /// Page size used for backing guest memory + pub page_size: usize, } impl Vmm { @@ -690,6 +696,129 @@ impl Vmm { pub fn vm(&self) -> &Vm { &self.vm } + + /// Get the list of mappings for guest memory + pub fn guest_memory_mappings(&self, page_size: usize) -> Vec { + let mut mappings = vec![]; + let mut offset = 0; + + for region in self + .vm + .guest_memory() + .iter() + .flat_map(|region| region.plugged_slots()) + { + let size = region.slice.len(); + #[allow(deprecated)] + mappings.push(GuestRegionUffdMapping { + base_host_virt_addr: region.slice.ptr_guard_mut().as_ptr() as u64, + size, + offset, + page_size, + page_size_kib: page_size, + }); + + offset += usize_to_u64(size); + } + + mappings + } + + /// Get info regarding resident and empty pages for guest memory + pub fn guest_memory_info(&self, page_size: usize) -> Result<(Vec, Vec), VmmError> { + let mut resident = vec![]; + let mut empty = vec![]; + let zero_page = vec![0u8; page_size]; + + for mem_slot in self + .vm + .guest_memory() + .iter() + .flat_map(|region| region.plugged_slots()) + { + debug_assert!(mem_slot.slice.len().is_multiple_of(page_size)); + debug_assert!( + (mem_slot.slice.ptr_guard_mut().as_ptr() as usize).is_multiple_of(page_size) + ); + + let len = mem_slot.slice.len(); + let nr_pages = len / page_size; + let addr = mem_slot.slice.ptr_guard_mut().as_ptr(); + let mut curr_empty = vec![0u64; nr_pages.div_ceil(64)]; + let curr_resident = 
mincore_bitmap(addr, mem_slot.slice.len(), page_size)?; + + for page_idx in 0..nr_pages { + if (curr_resident[page_idx / 64] & (1u64 << (page_idx % 64))) == 0 { + continue; + } + + // SAFETY: `addr` points to a memory region that is `nr_pages * page_size` long. + let curr_addr = unsafe { addr.add(page_idx * page_size) }; + + // SAFETY: both addresses are valid and they point to a memory region + // that is (at least) `page_size` long + let ret = unsafe { + libc::memcmp( + curr_addr.cast::(), + zero_page.as_ptr().cast::(), + page_size, + ) + }; + + if ret == 0 { + curr_empty[page_idx / 64] |= 1u64 << (page_idx % 64); + } + } + + resident.extend_from_slice(&curr_resident); + empty.extend_from_slice(&curr_empty); + } + + Ok((resident, empty)) + } + + /// Get dirty pages bitmap for guest memory + pub fn get_dirty_memory(&self, page_size: usize) -> Result, VmmError> { + let pagemap = utils::pagemap::PagemapReader::new(page_size)?; + let mut dirty_bitmap = vec![]; + + for mem_slot in self + .vm + .guest_memory() + .iter() + .flat_map(|region| region.plugged_slots()) + { + let base_addr = mem_slot.slice.ptr_guard_mut().as_ptr() as usize; + let len = mem_slot.slice.len(); + let nr_pages = len / page_size; + + // Use mincore_bitmap to get resident pages at guest page size granularity + let resident_bitmap = vstate::vm::mincore_bitmap(base_addr as *mut u8, len, page_size)?; + + // TODO: if we don't support UFFD/async WP, we can completely skip this bit. For the + // time being, we always do. + // + // Build dirty bitmap: check pagemap only for pages that mincore reports resident. + // This way we reduce the amount of times we read out of /proc//pagemap. + let mut slot_bitmap = vec![0u64; nr_pages.div_ceil(64)]; + for page_idx in 0..nr_pages { + // Check if page is resident in the bitmap. + // TODO: These operations (add to bitmap, check for presence, etc.) merit their own + // implementation, somewhere within a bitmap type). 
+ let is_resident = (resident_bitmap[page_idx / 64] & (1u64 << (page_idx % 64))) != 0; + if is_resident { + let virt_addr = base_addr + (page_idx * page_size); + if pagemap.is_page_dirty(virt_addr)? { + slot_bitmap[page_idx / 64] |= 1u64 << (page_idx % 64); + } + } + } + + dirty_bitmap.extend_from_slice(&slot_bitmap); + } + + Ok(dirty_bitmap) + } } /// Process the content of the MPIDR_EL1 register in order to be able to pass it to KVM diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index ba2608070c6..3baf9e358e0 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -161,8 +161,10 @@ pub fn create_snapshot( snapshot_state_to_file(&microvm_state, &params.snapshot_path)?; - vmm.vm - .snapshot_memory_to_file(&params.mem_file_path, params.snapshot_type)?; + if let Some(mem_file_path) = params.mem_file_path.as_ref() { + vmm.vm + .snapshot_memory_to_file(mem_file_path, params.snapshot_type, vmm.page_size)?; + } // We need to mark queues as dirty again for all activated devices. The reason we // do it here is that we don't mark pages as dirty during runtime diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index fdd0862a9d4..c517fe9a307 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -28,8 +28,9 @@ use crate::vmm_config::balloon::{ use crate::vmm_config::boot_source::{BootSourceConfig, BootSourceConfigError}; use crate::vmm_config::drive::{BlockDeviceConfig, BlockDeviceUpdateConfig, DriveError}; use crate::vmm_config::entropy::{EntropyDeviceConfig, EntropyDeviceError}; -use crate::vmm_config::instance_info::InstanceInfo; +use crate::vmm_config::instance_info::{InstanceInfo, VmState}; use crate::vmm_config::machine_config::{MachineConfig, MachineConfigError, MachineConfigUpdate}; +use crate::vmm_config::meminfo::{MemoryDirty, MemoryMapingsResponse, MemoryResponse}; use crate::vmm_config::memory_hotplug::{ MemoryHotplugConfig, MemoryHotplugConfigError, MemoryHotplugSizeUpdate, }; @@ -146,6 +147,12 @@ pub 
enum VmmAction { /// Update the microVM configuration (memory & vcpu) using `VmUpdateConfig` as input. This /// action can only be called before the microVM has booted. UpdateMachineConfiguration(MachineConfigUpdate), + /// Get the guest memory mappings to host memory + GetMemoryMappings, + /// Get guest memory resident and empty pages information + GetMemory, + /// Get guest memory dirty pages information + GetMemoryDirty, } /// Wrapper for all errors associated with VMM actions. @@ -197,6 +204,8 @@ pub enum VmmActionError { OperationNotSupportedPostBoot, /// The requested operation is not supported before starting the microVM. OperationNotSupportedPreBoot, + /// The requested operation is not supported while the microVM is running. + OperationNotSupportedWhileRunning, /// Start microvm error: {0} StartMicrovm(#[from] StartMicrovmError), /// Vsock config error: {0} @@ -228,6 +237,12 @@ pub enum VmmData { VirtioMemStatus(VirtioMemStatus), /// The status of the virtio-balloon hinting run HintingStatus(HintingStatus), + /// The guest memory mapping information. + MemoryMappings(MemoryMapingsResponse), + /// The guest memory resident and empty pages information + Memory(MemoryResponse), + /// The guest memory dirty pages information + MemoryDirty(MemoryDirty), } /// Trait used for deduplicating the MMDS request handling across the two ApiControllers. 
@@ -495,7 +510,10 @@ impl<'a> PrebootApiController<'a> { | UpdateNetworkInterface(_) | StartFreePageHinting(_) | GetFreePageHintingStatus - | StopFreePageHinting => Err(VmmActionError::OperationNotSupportedPreBoot), + | StopFreePageHinting + | GetMemoryMappings + | GetMemory + | GetMemoryDirty => Err(VmmActionError::OperationNotSupportedPreBoot), #[cfg(target_arch = "x86_64")] SendCtrlAltDel => Err(VmmActionError::OperationNotSupportedPreBoot), } @@ -771,6 +789,9 @@ impl RuntimeApiController { .update_memory_hotplug_size(cfg.requested_size_mib) .map(|_| VmmData::Empty) .map_err(VmmActionError::MemoryHotplugUpdate), + GetMemoryMappings => self.get_guest_memory_mappings(), + GetMemory => self.get_guest_memory_info(), + GetMemoryDirty => self.get_dirty_memory_info(), // Operations not allowed post-boot. ConfigureBootSource(_) | ConfigureLogger(_) @@ -937,6 +958,54 @@ impl RuntimeApiController { .map_err(NetworkInterfaceError::DeviceUpdate) .map_err(VmmActionError::NetworkConfig) } + + /// Get guest memory mappings + fn get_guest_memory_mappings(&self) -> Result { + let start_us = get_time_us(ClockType::Monotonic); + + let vmm = self.vmm.lock().expect("Poisoned lock"); + let page_size = self.vm_resources.machine_config.huge_pages.page_size(); + let mappings = vmm.guest_memory_mappings(page_size); + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get memory mappings' VMM action took {} us.", elapsed_time_us); + + Ok(VmmData::MemoryMappings(MemoryMapingsResponse { mappings })) + } + + /// Get resident and empty pages information for guest memory + fn get_guest_memory_info(&self) -> Result { + let start_us = get_time_us(ClockType::Monotonic); + + let vmm = self.vmm.lock().expect("Poisoned lock"); + let page_size = self.vm_resources.machine_config.huge_pages.page_size(); + let (resident, empty) = vmm.guest_memory_info(page_size)?; + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get memory info' VMM action 
took {} us.", elapsed_time_us); + + Ok(VmmData::Memory(MemoryResponse { resident, empty })) + } + + /// Get dirty pages information for guest memory + fn get_dirty_memory_info(&self) -> Result { + let start_us = get_time_us(ClockType::Monotonic); + + let vmm = self.vmm.lock().expect("Poisoned lock"); + + // Check if VM is paused + if vmm.instance_info.state != VmState::Paused { + return Err(VmmActionError::OperationNotSupportedWhileRunning); + } + + let page_size = self.vm_resources.machine_config.huge_pages.page_size(); + let bitmap = vmm.get_dirty_memory(page_size)?; + + let elapsed_time_us = get_time_us(ClockType::Monotonic) - start_us; + info!("'get dirty memory' VMM action took {} us.", elapsed_time_us); + + Ok(VmmData::MemoryDirty(MemoryDirty { bitmap })) + } } #[cfg(test)] @@ -1243,7 +1312,7 @@ mod tests { CreateSnapshotParams { snapshot_type: SnapshotType::Full, snapshot_path: PathBuf::new(), - mem_file_path: PathBuf::new(), + mem_file_path: Some(PathBuf::new()), }, ))); #[cfg(target_arch = "x86_64")] diff --git a/src/vmm/src/utils/mod.rs b/src/vmm/src/utils/mod.rs index 1288abef0ba..6718100e2cf 100644 --- a/src/vmm/src/utils/mod.rs +++ b/src/vmm/src/utils/mod.rs @@ -9,6 +9,8 @@ pub mod net; pub mod signal; /// Module with state machine pub mod sm; +/// Module with pagemap utilities +pub mod pagemap; use std::fs::{File, OpenOptions}; use std::num::Wrapping; diff --git a/src/vmm/src/utils/pagemap.rs b/src/vmm/src/utils/pagemap.rs new file mode 100644 index 00000000000..fff9e1f5cb2 --- /dev/null +++ b/src/vmm/src/utils/pagemap.rs @@ -0,0 +1,115 @@ +//! Utilities for reading /proc/self/pagemap to track dirty pages. 
+ +#![allow(clippy::cast_possible_wrap)] + +use std::fs::File; +use std::os::unix::io::AsRawFd; + +use crate::arch::host_page_size; + +const PAGEMAP_ENTRY_SIZE: usize = 8; + +/// Errors related to pagemap operations +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PagemapError { + /// Failed to open /proc/self/pagemap: {0} + OpenPagemap(#[source] std::io::Error), + /// Failed to read pagemap entry: {0} + ReadEntry(#[source] std::io::Error), + /// Failed to open /proc/self/clear_refs: {0} + OpenClearRefs(#[source] std::io::Error), + /// Failed to clear soft-dirty bits: {0} + ClearSoftDirty(#[source] std::io::Error), +} + +/// Represents a single entry in /proc/pid/pagemap. +/// +/// Each virtual page has an 8-byte entry with the following layout: +/// - Bits 0-54: Page frame number (PFN) if present +/// - Bit 55: Page is soft-dirty (written to since last clear) +/// - Bit 56: Page is exclusively mapped +/// - Bit 57: Page is write-protected via userfaultfd +/// - Bit 58: Unused +/// - Bit 59-60: Unused +/// - Bit 61: Page is file-page or shared-anon +/// - Bit 62: Page is swapped +/// - Bit 63: Page is present in RAM +#[derive(Debug, Clone, Copy)] +pub struct PagemapEntry { + raw: u64, +} + +impl PagemapEntry { + /// Create a PagemapEntry from bytes (native-endian) + pub fn from_bytes(bytes: [u8; 8]) -> Self { + Self { + raw: u64::from_ne_bytes(bytes), + } + } + + /// Check if page is write-protected via userfaultfd + pub fn is_write_protected(&self) -> bool { + (self.raw & (1u64 << 57)) != 0 + } + + /// Check if page is present in RAM (bit 63) + pub fn is_present(&self) -> bool { + (self.raw & (1u64 << 63)) != 0 + } +} + +/// Reader for /proc/self/pagemap +#[derive(Debug)] +pub struct PagemapReader { + pagemap_fd: File, +} + +impl PagemapReader { + /// Create a new PagemapReader + pub fn new(_page_size: usize) -> Result { + let pagemap_fd = File::open("/proc/self/pagemap").map_err(PagemapError::OpenPagemap)?; + + Ok(Self { pagemap_fd }) + } + + 
/// Check if a single page is dirty (write-protected bit cleared). + /// + /// Checks the first host page (4K) of the guest page at the given address. + /// For huge pages, all host pages within the huge page typically have the same + /// dirty status, so sampling the first is sufficient. + /// + /// # Arguments + /// * `virt_addr` - Virtual address of the page to check + /// + /// # Returns + /// True if the page is present and write-protected bit is cleared (dirty). + pub fn is_page_dirty(&self, virt_addr: usize) -> Result { + // Pagemap always uses host (4K) page size + let host_page_size = host_page_size(); + + // Calculate offset for this virtual page (using host page size) + let host_vpn = virt_addr / host_page_size; + let offset = (host_vpn * PAGEMAP_ENTRY_SIZE) as i64; + + let mut entry_bytes = [0u8; 8]; + + // SAFETY: pread is safe as long as the fd is valid and the buffer is properly sized + let ret = unsafe { + libc::pread( + self.pagemap_fd.as_raw_fd(), + entry_bytes.as_mut_ptr().cast(), + PAGEMAP_ENTRY_SIZE, + offset, + ) + }; + + if ret != PAGEMAP_ENTRY_SIZE as isize { + return Err(PagemapError::ReadEntry(std::io::Error::last_os_error())); + } + + let entry = PagemapEntry::from_bytes(entry_bytes); + + // Page must be present and the write_protected bit cleared (indicating it was written to) + Ok(entry.is_present() && !entry.is_write_protected()) + } +} diff --git a/src/vmm/src/vmm_config/meminfo.rs b/src/vmm/src/vmm_config/meminfo.rs new file mode 100644 index 00000000000..693ece6b4d4 --- /dev/null +++ b/src/vmm/src/vmm_config/meminfo.rs @@ -0,0 +1,29 @@ +use serde::Serialize; + +use crate::persist::GuestRegionUffdMapping; + +/// Serializable struct that contains information about guest's memory mappings +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize)] +pub struct MemoryMapingsResponse { + /// Vector with mappings from guest physical to host virtual memory + pub mappings: Vec, +} + +/// Information about guest memory resident and 
pages that are all-0s +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize)] +pub struct MemoryResponse { + /// Bitmap for resident pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page is present in the resident memory set + pub resident: Vec, + /// Bitmap for empty pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page is empty (all 0s). + pub empty: Vec, +} + +/// Information about dirty guest memory pages +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize)] +pub struct MemoryDirty { + /// Bitmap for dirty pages. The bitmap is encoded as a vector of u64 values. + /// Each bit represents whether a page has been written since the last snapshot. + pub bitmap: Vec, +} diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs index 9a4c104ce3a..c593b3ec0dc 100644 --- a/src/vmm/src/vmm_config/mod.rs +++ b/src/vmm/src/vmm_config/mod.rs @@ -20,6 +20,8 @@ pub mod entropy; pub mod instance_info; /// Wrapper for configuring the memory and CPU of the microVM. pub mod machine_config; +/// Wrapper for getting memory-related information. +pub mod meminfo; /// Wrapper for configuring memory hotplug. pub mod memory_hotplug; /// Wrapper for configuring the metrics. diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index 13a87ba30c4..dcd1d5b2268 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -44,7 +44,7 @@ pub struct CreateSnapshotParams { /// Path to the file that will contain the microVM state. pub snapshot_path: PathBuf, /// Path to the file that will contain the guest memory. 
- pub mem_file_path: PathBuf, + pub mem_file_path: Option, } /// Allows for changing the mapping between tap devices and host devices diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 83e899eff1d..0cca1eb02d6 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -305,7 +305,7 @@ impl Vm { } /// Retrieves the KVM dirty bitmap for each of the guest's memory regions. - pub fn get_dirty_bitmap(&self) -> Result { + pub fn get_dirty_bitmap(&self, page_size: usize) -> Result { self.guest_memory() .iter() .flat_map(|region| region.plugged_slots()) @@ -318,6 +318,7 @@ impl Vm { None => mincore_bitmap( mem_slot.slice.ptr_guard_mut().as_ptr(), mem_slot.slice.len(), + page_size, )?, }; Ok((mem_slot.slot, bitmap)) @@ -335,6 +336,7 @@ impl Vm { &self, mem_file_path: &Path, snapshot_type: SnapshotType, + page_size: usize, ) -> Result<(), CreateSnapshotError> { use self::CreateSnapshotError::*; @@ -377,7 +379,7 @@ impl Vm { match snapshot_type { SnapshotType::Diff => { - let dirty_bitmap = self.get_dirty_bitmap()?; + let dirty_bitmap = self.get_dirty_bitmap(page_size)?; self.guest_memory().dump_dirty(&mut file, &dirty_bitmap)?; } SnapshotType::Full => { @@ -503,7 +505,11 @@ impl Vm { /// Use `mincore(2)` to overapproximate the dirty bitmap for the given memslot. To be used /// if a diff snapshot is requested, but dirty page tracking wasn't enabled. -fn mincore_bitmap(addr: *mut u8, len: usize) -> Result, VmError> { +pub(crate) fn mincore_bitmap( + addr: *mut u8, + len: usize, + page_size: usize, +) -> Result, VmError> { // TODO: Once Host 5.10 goes out of support, we can make this more robust and work on // swap-enabled systems, by doing mlock2(MLOCK_ONFAULT)/munlock() in this function (to // force swapped-out pages to get paged in, so that mincore will consider them incore). 
@@ -513,8 +519,11 @@ fn mincore_bitmap(addr: *mut u8, len: usize) -> Result, VmError> { // Mincore always works at PAGE_SIZE granularity, even if the VMA we are dealing with // is a hugetlbfs VMA (e.g. to report a single hugepage as "present", mincore will // give us 512 4k markers with the lowest bit set). - let page_size = host_page_size(); - let mut mincore_bitmap = vec![0u8; len / page_size]; + let host_page_size = host_page_size(); + let mut mincore_bitmap = vec![0u8; len / host_page_size]; + // The bitmap we return though tracks pages in terms of the actually used page size. In + // the case of a hugetlbfs VMA, we just need to check if the first of the reported pages + // is present. let mut bitmap = vec![0u64; (len / page_size).div_ceil(64)]; // SAFETY: The safety invariants of GuestRegionMmap ensure that region.as_ptr() is a valid @@ -529,7 +538,8 @@ fn mincore_bitmap(addr: *mut u8, len: usize) -> Result, VmError> { return Err(VmError::Mincore(vmm_sys_util::errno::Error::last())); } - for (page_idx, b) in mincore_bitmap.iter().enumerate() { + let step = page_size / host_page_size; + for (page_idx, b) in mincore_bitmap.iter().step_by(step).enumerate() { bitmap[page_idx / 64] |= (*b as u64 & 0x1) << (page_idx as u64 % 64); } diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 6a5e6a08a14..a7a4a8c1d73 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -235,7 +235,7 @@ fn verify_create_snapshot( let snapshot_params = CreateSnapshotParams { snapshot_type, snapshot_path: snapshot_file.as_path().to_path_buf(), - mem_file_path: memory_file.as_path().to_path_buf(), + mem_file_path: Some(memory_file.as_path().to_path_buf()), }; controller