diff --git a/CHANGELOG.md b/CHANGELOG.md index 213ecff4be2..fd2589935a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,10 @@ and this project adheres to supported snapshot version format. This change renders all previous Firecracker snapshots (up to Firecracker version v1.6.0) incompatible with the current Firecracker version. +- [#4449](https://github.com/firecracker-microvm/firecracker/pull/4449): Added + information about page size to the payload Firecracker sends to the UFFD + handler. Each memory region object now contains a `page_size_kib` field. See + also the [hugepages documentation](docs/hugepages.md). ### Fixed diff --git a/docs/hugepages.md b/docs/hugepages.md index a5105cc802b..0b4b994e04f 100644 --- a/docs/hugepages.md +++ b/docs/hugepages.md @@ -33,6 +33,11 @@ microVMs backed with huge pages can only be restored via UFFD. Lastly, note that even for guests backed by huge pages, differential snapshots will always track write accesses to guest memory at 4K granularity. +When restoring snapshots via UFFD, Firecracker will send the configured page +size (in bytes, carried in the `page_size_kib` field) for each memory region as part of the initial handshake, as +described in our documentation on +[UFFD-assisted snapshot-restore](snapshotting/handling-page-faults-on-snapshot-resume.md). + ## Known Limitations Currently, hugetlbfs support is mutually exclusive with the following diff --git a/docs/snapshotting/handling-page-faults-on-snapshot-resume.md b/docs/snapshotting/handling-page-faults-on-snapshot-resume.md index d699c5d24ee..7145f675c6c 100644 --- a/docs/snapshotting/handling-page-faults-on-snapshot-resume.md +++ b/docs/snapshotting/handling-page-faults-on-snapshot-resume.md @@ -91,7 +91,8 @@ Firecracker and the page fault handler. ![](../images/uffd_flow3.png) - Firecracker passes the userfault file descriptor and the guest memory layout - to the page fault handler process through the socket. + (e.g. 
dimensions of each memory region, and their [page size](../hugepages.md) + in bytes) to the page fault handler process through the socket. ![](../images/uffd_flow4.png) @@ -106,7 +107,7 @@ Firecracker and the page fault handler. happens, the page fault handler issues `UFFDIO_COPY` to load the previously mmaped file contents into the correspondent memory region. -After Firecracker sends the payload (i.e mem mappings and file descriptor), no +After Firecracker sends the payload (i.e. mem mappings and file descriptor), no other communication happens on the UDS socket (or otherwise) between Firecracker and the page fault handler process. @@ -161,7 +162,7 @@ connect/send data. ### Example An example of a handler process can be found -[here](../../src/firecracker/examples/uffd/valid_4k_handler.rs). The process is +[here](../../src/firecracker/examples/uffd/valid_handler.rs). The process is designed to tackle faults on a certain address by loading into memory the entire region that the address belongs to, but users can choose any other behavior that suits their use case best. 
diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 1da89fe698e..b9b92d556d1 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -50,16 +50,12 @@ serde_json = "1.0.113" tracing = ["log-instrument", "seccompiler/tracing", "utils/tracing", "vmm/tracing"] [[example]] -name = "uffd_malicious_4k_handler" -path = "examples/uffd/malicious_4k_handler.rs" +name = "uffd_malicious_handler" +path = "examples/uffd/malicious_handler.rs" [[example]] -name = "uffd_valid_4k_handler" -path = "examples/uffd/valid_4k_handler.rs" - -[[example]] -name = "uffd_valid_2m_handler" -path = "examples/uffd/valid_2m_handler.rs" +name = "uffd_valid_handler" +path = "examples/uffd/valid_handler.rs" [[example]] name = "uffd_fault_all_handler" diff --git a/src/firecracker/examples/uffd/fault_all_handler.rs b/src/firecracker/examples/uffd/fault_all_handler.rs index 1ab22ada680..31ce68a97bc 100644 --- a/src/firecracker/examples/uffd/fault_all_handler.rs +++ b/src/firecracker/examples/uffd/fault_all_handler.rs @@ -11,7 +11,6 @@ use std::fs::File; use std::os::unix::net::UnixListener; use uffd_utils::{Runtime, UffdHandler}; -use utils::get_page_size; fn main() { let mut args = std::env::args(); @@ -24,13 +23,8 @@ fn main() { let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path"); let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); - // Populate a single page from backing memory file. - // This is just an example, probably, with the worst-case latency scenario, - // of how memory can be loaded in guest RAM. - let len = get_page_size().unwrap(); // page size does not matter, we fault in everything on the first fault - let mut runtime = Runtime::new(stream, file); - runtime.run(len, |uffd_handler: &mut UffdHandler| { + runtime.run(|uffd_handler: &mut UffdHandler| { // Read an event from the userfaultfd. 
let event = uffd_handler .read_event() diff --git a/src/firecracker/examples/uffd/malicious_4k_handler.rs b/src/firecracker/examples/uffd/malicious_handler.rs similarity index 95% rename from src/firecracker/examples/uffd/malicious_4k_handler.rs rename to src/firecracker/examples/uffd/malicious_handler.rs index 157d3d7e147..9af94e057aa 100644 --- a/src/firecracker/examples/uffd/malicious_4k_handler.rs +++ b/src/firecracker/examples/uffd/malicious_handler.rs @@ -23,7 +23,7 @@ fn main() { let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); let mut runtime = Runtime::new(stream, file); - runtime.run(4096, |uffd_handler: &mut UffdHandler| { + runtime.run(|uffd_handler: &mut UffdHandler| { // Read an event from the userfaultfd. let event = uffd_handler .read_event() diff --git a/src/firecracker/examples/uffd/uffd_utils.rs b/src/firecracker/examples/uffd/uffd_utils.rs index d517f785e19..f5a5773e115 100644 --- a/src/firecracker/examples/uffd/uffd_utils.rs +++ b/src/firecracker/examples/uffd/uffd_utils.rs @@ -30,6 +30,8 @@ pub struct GuestRegionUffdMapping { pub size: usize, /// Offset in the backend file/buffer where the region contents are. pub offset: u64, + /// The configured page size for this memory region, in bytes (despite the `_kib` suffix). 
+ pub page_size_kib: usize, } #[derive(Debug, Clone, Copy)] @@ -49,18 +51,13 @@ pub struct MemRegion { #[derive(Debug)] pub struct UffdHandler { pub mem_regions: Vec, - page_size: usize, + pub page_size: usize, backing_buffer: *const u8, uffd: Uffd, } impl UffdHandler { - pub fn from_unix_stream( - stream: &UnixStream, - page_size: usize, - backing_buffer: *const u8, - size: usize, - ) -> Self { + pub fn from_unix_stream(stream: &UnixStream, backing_buffer: *const u8, size: usize) -> Self { let mut message_buf = vec![0u8; 1024]; let (bytes_read, file) = stream .recv_with_fd(&mut message_buf[..]) @@ -73,6 +70,8 @@ impl UffdHandler { let mappings = serde_json::from_str::>(&body) .expect("Cannot deserialize memory mappings."); let memsize: usize = mappings.iter().map(|r| r.size).sum(); + // Page size is the same for all memory regions, so just grab the first one + let page_size = mappings.first().unwrap().page_size_kib; // Make sure memory size matches backing data size. assert_eq!(memsize, size); @@ -214,7 +213,7 @@ impl Runtime { /// When uffd is polled, page fault is handled by /// calling `pf_event_dispatch` with corresponding /// uffd object passed in. 
- pub fn run(&mut self, page_size: usize, pf_event_dispatch: impl Fn(&mut UffdHandler)) { + pub fn run(&mut self, pf_event_dispatch: impl Fn(&mut UffdHandler)) { let mut pollfds = vec![]; // Poll the stream for incoming uffds @@ -249,7 +248,6 @@ impl Runtime { // Handle new uffd from stream let handler = UffdHandler::from_unix_stream( &self.stream, - page_size, self.backing_memory, self.backing_memory_size, ); @@ -330,7 +328,7 @@ mod tests { let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); // Update runtime with actual runtime let runtime = uninit_runtime.write(Runtime::new(stream, file)); - runtime.run(4096, |_: &mut UffdHandler| {}); + runtime.run(|_: &mut UffdHandler| {}); }); // wait for runtime thread to initialize itself @@ -343,6 +341,7 @@ mod tests { base_host_virt_addr: 0, size: 0x1000, offset: 0, + page_size_kib: 4096, }]; let dummy_memory_region_json = serde_json::to_string(&dummy_memory_region).unwrap(); @@ -375,6 +374,7 @@ mod tests { base_host_virt_addr: 0, size: 0, offset: 0, + page_size_kib: 4096, }]; let error_memory_region_json = serde_json::to_string(&error_memory_region).unwrap(); stream diff --git a/src/firecracker/examples/uffd/valid_2m_handler.rs b/src/firecracker/examples/uffd/valid_2m_handler.rs deleted file mode 100644 index d824ca01f55..00000000000 --- a/src/firecracker/examples/uffd/valid_2m_handler.rs +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Provides functionality for a userspace page fault handler -//! which loads the whole region from the backing memory file -//! when a page fault occurs. 
- -mod uffd_utils; - -use std::fs::File; -use std::os::unix::net::UnixListener; - -use uffd_utils::{MemPageState, Runtime, UffdHandler}; - -fn main() { - let mut args = std::env::args(); - let uffd_sock_path = args.nth(1).expect("No socket path given"); - let mem_file_path = args.next().expect("No memory file given"); - - let file = File::open(mem_file_path).expect("Cannot open memfile"); - - // Get Uffd from UDS. We'll use the uffd to handle PFs for Firecracker. - let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path"); - let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); - - // Populate a single page from backing memory file. - // This is just an example, probably, with the worst-case latency scenario, - // of how memory can be loaded in guest RAM. - let len = 2 * 1024 * 1024; - - let mut runtime = Runtime::new(stream, file); - runtime.run(len, |uffd_handler: &mut UffdHandler| { - // Read an event from the userfaultfd. - let event = uffd_handler - .read_event() - .expect("Failed to read uffd_msg") - .expect("uffd_msg not ready"); - - // We expect to receive either a Page Fault or Removed - // event (if the balloon device is enabled). - match event { - userfaultfd::Event::Pagefault { addr, .. 
} => uffd_handler.serve_pf(addr.cast(), len), - userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings( - start as u64, - end as u64, - MemPageState::Removed, - ), - _ => panic!("Unexpected event on userfaultfd"), - } - }); -} diff --git a/src/firecracker/examples/uffd/valid_4k_handler.rs b/src/firecracker/examples/uffd/valid_handler.rs similarity index 79% rename from src/firecracker/examples/uffd/valid_4k_handler.rs rename to src/firecracker/examples/uffd/valid_handler.rs index 1f752f141f1..cfc5faf432c 100644 --- a/src/firecracker/examples/uffd/valid_4k_handler.rs +++ b/src/firecracker/examples/uffd/valid_handler.rs @@ -11,7 +11,6 @@ use std::fs::File; use std::os::unix::net::UnixListener; use uffd_utils::{MemPageState, Runtime, UffdHandler}; -use utils::get_page_size; fn main() { let mut args = std::env::args(); @@ -24,13 +23,8 @@ fn main() { let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path"); let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); - // Populate a single page from backing memory file. - // This is just an example, probably, with the worst-case latency scenario, - // of how memory can be loaded in guest RAM. - let len = get_page_size().unwrap(); - let mut runtime = Runtime::new(stream, file); - runtime.run(len, |uffd_handler: &mut UffdHandler| { + runtime.run(|uffd_handler: &mut UffdHandler| { // Read an event from the userfaultfd. let event = uffd_handler .read_event() @@ -40,7 +34,9 @@ fn main() { // We expect to receive either a Page Fault or Removed // event (if the balloon device is enabled). match event { - userfaultfd::Event::Pagefault { addr, .. } => uffd_handler.serve_pf(addr.cast(), len), + userfaultfd::Event::Pagefault { addr, .. 
} => { + uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) + } userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings( start as u64, end as u64, diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 0a3c912f8ce..e1290ca8492 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -93,7 +93,7 @@ pub struct MicrovmState { /// E.g. Guest memory contents for a region of `size` bytes can be found in the /// backend at `offset` bytes from the beginning, and should be copied/populated /// into `base_host_address`. -#[derive(Clone, Debug, Serialize)] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] pub struct GuestRegionUffdMapping { /// Base host virtual address where the guest memory contents for this /// region should be copied/populated. @@ -102,6 +102,8 @@ pub struct GuestRegionUffdMapping { pub size: usize, /// Offset in the backend file/buffer where the region contents are. pub offset: u64, + /// The configured page size for this memory region, in bytes (despite the `_kib` suffix). + pub page_size_kib: usize, } /// Errors related to saving and restoring Microvm state. 
@@ -514,7 +516,8 @@ fn guest_memory_from_uffd( enable_balloon: bool, huge_pages: HugePageConfig, ) -> Result<(GuestMemoryMmap, Option), GuestMemoryFromUffdError> { - let guest_memory = GuestMemoryMmap::from_state(None, mem_state, track_dirty_pages, huge_pages)?; + let (guest_memory, backend_mappings) = + create_guest_memory(mem_state, track_dirty_pages, huge_pages)?; let mut uffd_builder = UffdBuilder::new(); @@ -531,23 +534,43 @@ fn guest_memory_from_uffd( .create() .map_err(GuestMemoryFromUffdError::Create)?; + for mem_region in guest_memory.iter() { + uffd.register(mem_region.as_ptr().cast(), mem_region.size() as _) + .map_err(GuestMemoryFromUffdError::Register)?; + } + + send_uffd_handshake(mem_uds_path, &backend_mappings, &uffd)?; + + Ok((guest_memory, Some(uffd))) +} + +fn create_guest_memory( + mem_state: &GuestMemoryState, + track_dirty_pages: bool, + huge_pages: HugePageConfig, +) -> Result<(GuestMemoryMmap, Vec), GuestMemoryFromUffdError> { + let guest_memory = GuestMemoryMmap::from_state(None, mem_state, track_dirty_pages, huge_pages)?; let mut backend_mappings = Vec::with_capacity(guest_memory.num_regions()); for (mem_region, state_region) in guest_memory.iter().zip(mem_state.regions.iter()) { - let host_base_addr = mem_region.as_ptr(); - let size = mem_region.size(); - - uffd.register(host_base_addr.cast(), size as _) - .map_err(GuestMemoryFromUffdError::Register)?; backend_mappings.push(GuestRegionUffdMapping { - base_host_virt_addr: host_base_addr as u64, - size, + base_host_virt_addr: mem_region.as_ptr() as u64, + size: mem_region.size(), offset: state_region.offset, + page_size_kib: huge_pages.page_size_kib(), }); } + Ok((guest_memory, backend_mappings)) +} + +fn send_uffd_handshake( + mem_uds_path: &Path, + backend_mappings: &[GuestRegionUffdMapping], + uffd: &impl AsRawFd, +) -> Result<(), GuestMemoryFromUffdError> { // This is safe to unwrap() because we control the contents of the vector // (i.e GuestRegionUffdMapping entries). 
- let backend_mappings = serde_json::to_string(&backend_mappings).unwrap(); + let backend_mappings = serde_json::to_string(backend_mappings).unwrap(); let socket = UnixStream::connect(mem_uds_path)?; socket.send_with_fd( @@ -585,11 +608,13 @@ fn guest_memory_from_uffd( uffd.as_raw_fd(), )?; - Ok((guest_memory, Some(uffd))) + Ok(()) } #[cfg(test)] mod tests { + use std::os::unix::net::UnixListener; + use utils::tempfile::TempFile; use super::*; @@ -604,6 +629,7 @@ mod tests { use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::tests::default_config; + use crate::vstate::memory::GuestMemoryRegionState; use crate::Vmm; fn default_vmm_with_devices() -> Vmm { @@ -697,4 +723,65 @@ mod tests { microvm_state.device_states ) } + + #[test] + fn test_create_guest_memory() { + let mem_state = GuestMemoryState { + regions: vec![GuestMemoryRegionState { + base_address: 0, + size: 0x20000, + offset: 0x10000, + }], + }; + + let (_, uffd_regions) = + create_guest_memory(&mem_state, false, HugePageConfig::None).unwrap(); + + assert_eq!(uffd_regions.len(), 1); + assert_eq!(uffd_regions[0].size, 0x20000); + assert_eq!(uffd_regions[0].offset, 0x10000); + assert_eq!( + uffd_regions[0].page_size_kib, + HugePageConfig::None.page_size_kib() + ); + } + + #[test] + fn test_send_uffd_handshake() { + let uffd_regions = vec![ + GuestRegionUffdMapping { + base_host_virt_addr: 0, + size: 0x100000, + offset: 0, + page_size_kib: HugePageConfig::None.page_size_kib(), + }, + GuestRegionUffdMapping { + base_host_virt_addr: 0x100000, + size: 0x200000, + offset: 0, + page_size_kib: HugePageConfig::Hugetlbfs2M.page_size_kib(), + }, + ]; + + let uds_path = TempFile::new().unwrap(); + let uds_path = uds_path.as_path(); + std::fs::remove_file(uds_path).unwrap(); + + let listener = UnixListener::bind(uds_path).expect("Cannot bind to socket path"); + + send_uffd_handshake(uds_path, &uffd_regions, &std::io::stdin()).unwrap(); + 
+ let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); + + let mut message_buf = vec![0u8; 1024]; + let (bytes_read, _) = stream + .recv_with_fd(&mut message_buf[..]) + .expect("Cannot recv_with_fd"); + message_buf.resize(bytes_read, 0); + + let deserialized: Vec = + serde_json::from_slice(&message_buf).unwrap(); + + assert_eq!(uffd_regions, deserialized); + } } diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index b012cb2c2c5..14aae758453 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -84,6 +84,14 @@ impl HugePageConfig { pub fn is_hugetlbfs(&self) -> bool { matches!(self, HugePageConfig::Hugetlbfs2M) } + + /// Gets the page size of this [`HugePageConfig`], in bytes (despite the `_kib` suffix). + pub fn page_size_kib(&self) -> usize { + match self { + HugePageConfig::None => 4096, + HugePageConfig::Hugetlbfs2M => 2 * 1024 * 1024, + } + } } impl From for Option { diff --git a/tests/conftest.py b/tests/conftest.py index 0149ceecf66..0a87cb455e0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -229,7 +229,7 @@ def uffd_handler_paths(): """Build UFFD handler binaries.""" handlers = { f"{handler}_handler": build_tools.get_example(f"uffd_{handler}_handler") - for handler in ["malicious_4k", "valid_4k", "valid_2m", "fault_all"] + for handler in ["malicious", "valid", "fault_all"] } yield handlers diff --git a/tests/integration_tests/functional/test_uffd.py b/tests/integration_tests/functional/test_uffd.py index 8d70cedff46..6e7e96552a8 100644 --- a/tests/integration_tests/functional/test_uffd.py +++ b/tests/integration_tests/functional/test_uffd.py @@ -110,7 +110,7 @@ def test_valid_handler(uvm_plain, snapshot, uffd_handler_paths): # Spawn page fault handler process. 
_pf_handler = spawn_pf_handler( - vm, uffd_handler_paths["valid_4k_handler"], snapshot.mem + vm, uffd_handler_paths["valid_handler"], snapshot.mem ) vm.restore_from_snapshot(snapshot, resume=True, uffd_path=SOCKET_PATH) @@ -144,7 +144,7 @@ def test_malicious_handler(uvm_plain, snapshot, uffd_handler_paths): # Spawn page fault handler process. _pf_handler = spawn_pf_handler( - vm, uffd_handler_paths["malicious_4k_handler"], snapshot.mem + vm, uffd_handler_paths["malicious_handler"], snapshot.mem ) # We expect Firecracker to freeze while resuming from a snapshot diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index 40817ffec3e..e50cc588563 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -109,7 +109,7 @@ def test_hugetlbfs_snapshot( # Spawn page fault handler process. _pf_handler = spawn_pf_handler( - vm, uffd_handler_paths["valid_2m_handler"], snapshot.mem + vm, uffd_handler_paths["valid_handler"], snapshot.mem ) vm.restore_from_snapshot(snapshot, resume=True, uffd_path=SOCKET_PATH) @@ -164,7 +164,7 @@ def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain, uffd_handler_paths) # Spawn page fault handler process. _pf_handler = spawn_pf_handler( - vm, uffd_handler_paths["valid_2m_handler"], snapshot_merged.mem + vm, uffd_handler_paths["valid_handler"], snapshot_merged.mem ) vm.restore_from_snapshot(snapshot_merged, resume=True, uffd_path=SOCKET_PATH)