diff --git a/Cargo.lock b/Cargo.lock index 8a1e1f08c3..af10c55a23 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -934,6 +934,15 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +[[package]] +name = "crc32c" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" +dependencies = [ + "rustc_version", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -1665,6 +1674,26 @@ dependencies = [ "vm_resource", ] +[[package]] +name = "disklayer_vhdx" +version = "0.0.0" +dependencies = [ + "anyhow", + "async-trait", + "blocking", + "disk_backend", + "disk_backend_resources", + "disk_layered", + "guestmem", + "inspect", + "pal_async", + "scsi_buffers", + "tempfile", + "thiserror 2.0.16", + "vhdx", + "vm_resource", +] + [[package]] name = "dissimilar" version = "1.0.10" @@ -1688,9 +1717,9 @@ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "elfcore" -version = "2.0.1" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e254a61387a9d5706e00576e8ddc08705a8ce3f2d306280459ece426378f94f2" +checksum = "0fdaa3d1c27119b3394513f4596894a40cd53cb4acec7fce636a9ca0c4abb171" dependencies = [ "libc", "nix 0.31.2", @@ -4768,9 +4797,9 @@ dependencies = [ [[package]] name = "ntapi" -version = "0.4.3" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" dependencies = [ "winapi", ] @@ -5474,6 +5503,7 @@ dependencies = [ "disk_backend_resources", "disk_vhd1", "disk_vhdmp", + "disklayer_vhdx", "fs-err", "get_resources", "hypervisor_resources", @@ -5532,6 +5562,7 @@ dependencies = [ "disk_vhdmp", "disklayer_ram", "disklayer_sqlite", + "disklayer_vhdx", "gdma", "guest_crash_device", "guest_emulation_device", @@ -8779,6 +8810,26 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "vhdx" +version = "0.0.0" +dependencies = [ + "bitfield-struct 0.11.0", + "bitvec", + "crc32c", + "event-listener", + "futures", + "guid", + "mesh", + "pal_async", + "parking_lot", + "tempfile", + "thiserror 2.0.16", + "tracing", + "windows 0.62.0", + "zerocopy", +] + [[package]] name = "vhost_user_backend" version = "0.0.0" diff --git a/Cargo.toml b/Cargo.toml index 6184da3a66..ab831f75d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -308,6 +308,7 @@ disk_delay = { path = "vm/devices/storage/disk_delay" } disk_prwrap = { path = "vm/devices/storage/disk_prwrap" } disk_striped = { path = "vm/devices/storage/disk_striped" } disk_vhd1 = { path = "vm/devices/storage/disk_vhd1" } +disklayer_vhdx = { path = "vm/devices/storage/disklayer_vhdx" } disk_vhdmp = { path = "vm/devices/storage/disk_vhdmp" } disklayer_ram = { path = "vm/devices/storage/disklayer_ram" } disklayer_sqlite = { path = "vm/devices/storage/disklayer_sqlite" } @@ -325,6 +326,7 @@ storvsc_driver = { path = "vm/devices/storage/storvsc_driver" } storvsp = { path = "vm/devices/storage/storvsp" } storvsp_protocol = { path = "vm/devices/storage/storvsp_protocol" } storvsp_resources = { path = "vm/devices/storage/storvsp_resources" } +vhdx = { path = "vm/devices/storage/vhdx" } device_emulators = { path = "vm/devices/support/device_emulators" } fuse = { path = "vm/devices/support/fs/fuse" } lx = { 
path = "vm/devices/support/fs/lx" } @@ -455,6 +457,7 @@ cc = "1.2.34" cfg-if = "1" clap = "4.2" constant_time_eq = "0.3" +crc32c = "0.6" crc32fast = { version = "1.3.2", default-features = false } criterion = { version = "0.7", default-features = false } crossterm = { version = "0.29.0", default-features = false } diff --git a/Guide/src/SUMMARY.md b/Guide/src/SUMMARY.md index b9533b7948..077def0349 100644 --- a/Guide/src/SUMMARY.md +++ b/Guide/src/SUMMARY.md @@ -123,6 +123,7 @@ - [Serial]() - [Graphics and Input]() - [Storage](./reference/backends/storage.md) + - [VHDX Parser](./reference/backends/vhdx.md) - [Networking](./reference/backends/networking.md) - [Consomme](./reference/backends/consomme.md) - [Architecture](./reference/architecture.md) diff --git a/Guide/src/reference/architecture/devices/storage.md b/Guide/src/reference/architecture/devices/storage.md index e71fabe898..2e3fc0cbc7 100644 --- a/Guide/src/reference/architecture/devices/storage.md +++ b/Guide/src/reference/architecture/devices/storage.md @@ -172,6 +172,7 @@ For the OpenHCL settings model (`StorageController`, `Lun`, `PhysicalDevice`), s | FileDisk | [`disk_file`](https://openvmm.dev/rustdoc/linux/disk_file/index.html) | Host file | Cross-platform | Simplest backend | | Vhd1Disk | [`disk_vhd1`](https://openvmm.dev/rustdoc/linux/disk_vhd1/index.html) | VHD1 fixed file | Cross-platform | Parses VHD footer | | VhdmpDisk | `disk_vhdmp` | Windows vhdmp driver | Windows | Dynamic/differencing VHD/VHDX | +| VhdxDisk | [`vhdx`](../../backends/vhdx.md) | VHDX file | Cross-platform | Pure-Rust VHDX parser | | BlobDisk | [`disk_blob`](https://openvmm.dev/rustdoc/linux/disk_blob/index.html) | HTTP / Azure Blob | Cross-platform | Read-only, HTTP range requests | | BlockDeviceDisk | [`disk_blockdevice`](https://openvmm.dev/rustdoc/linux/disk_blockdevice/index.html) | Linux block device | Linux | io_uring, resize via uevent, PR passthrough | | NvmeDisk | [`disk_nvme`](https://openvmm.dev/rustdoc/linux/disk_nvme/index.html) | Physical NVMe (VFIO) | Linux/Windows | User-mode NVMe driver, resize via AEN | diff --git a/Guide/src/reference/backends/storage.md b/Guide/src/reference/backends/storage.md index 96e60e4ce8..065cb0f1cb 100644 --- a/Guide/src/reference/backends/storage.md +++ b/Guide/src/reference/backends/storage.md @@ -16,6 +16,7 @@ blob, or a layered composition of multiple backends. | FileDisk | [`disk_file`](https://openvmm.dev/rustdoc/linux/disk_file/index.html) | Host file | Cross-platform | Simplest backend. Blocking I/O via `unblock()`. | | Vhd1Disk | [`disk_vhd1`](https://openvmm.dev/rustdoc/linux/disk_vhd1/index.html) | VHD1 fixed file | Cross-platform | Parses VHD footer for geometry. | | VhdmpDisk | `disk_vhdmp` | Windows vhdmp driver | Windows | Dynamic and differencing VHD/VHDX. | +| VhdxDisk | [`vhdx`](vhdx.md) | VHDX file | Cross-platform | Pure-Rust VHDX parser. Dynamic, fixed, and differencing. | | BlobDisk | [`disk_blob`](https://openvmm.dev/rustdoc/linux/disk_blob/index.html) | HTTP / Azure Blob | Cross-platform | Read-only. HTTP range requests. | | BlockDeviceDisk | [`disk_blockdevice`](https://openvmm.dev/rustdoc/linux/disk_blockdevice/index.html) | Linux block device or file | Linux | io_uring, resize via uevent, PR passthrough. Default for raw files on Linux in both OpenHCL and OpenVMM. | | NvmeDisk | [`disk_nvme`](https://openvmm.dev/rustdoc/linux/disk_nvme/index.html) | Physical NVMe (VFIO) | Linux/Windows | User-mode NVMe driver. Resize via AEN. 
| diff --git a/Guide/src/reference/backends/vhdx.md b/Guide/src/reference/backends/vhdx.md new file mode 100644 index 0000000000..2f0ebad334 --- /dev/null +++ b/Guide/src/reference/backends/vhdx.md @@ -0,0 +1,109 @@ +# VHDX parser + +The `vhdx` crate (`vm/devices/storage/vhdx/`) is a pure-Rust +implementation of the +[VHDX format specification](https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-vhdx/). +It supports dynamic, fixed, and differencing VHDX virtual hard disk +files on all platforms — no Windows APIs or kernel drivers required. + +## Features + +- **Create** and **open** VHDX files (read-only or writable) +- Dynamic block allocation with four-priority free space management +- Write-ahead log (WAL) for crash-consistent metadata updates +- Sector bitmap tracking for partially-present (differencing) blocks +- Block trim/unmap with multiple modes (file space, free space, zero, + transparent, soft-anchor removal) +- Concurrent flush coalescing +- Parent locator parsing for differencing disk chains + +## Architecture + +A VHDX file stores a virtual disk as a collection of fixed-size data +blocks (default 2 MiB) tracked by a Block Allocation Table (BAT). +The crate's write path uses a three-stage pipeline for crash +consistency: + +```text +┌───────────┐ commit ┌──────────┐ apply ┌────────────┐ +│ Cache │ ──────────►│ Log Task │ ─────────►│ Apply Task │ +│ (dirty │ dirty │ (WAL │ logged │ (final │ +│ pages) │ pages │ writer) │ pages │ offsets) │ +└───────────┘ └──────────┘ └────────────┘ +``` + +1. The **cache** accumulates dirty 4 KiB metadata pages (BAT entries, + sector bitmap bits). When the dirty count reaches a threshold or + `flush()` is called, pages are committed to the log task. +2. The **log task** writes WAL entries to the circular log region in + the VHDX file. On crash, `replay_log()` restores metadata from + the WAL. +3. The **apply task** writes logged pages to their final file offsets. + +Backpressure is managed by a permit semaphore that limits in-flight +pages. A flush sequencer coalesces concurrent flush requests so at +most one file flush is in progress at a time. + +## Lifecycle + +```rust,ignore +// Create a new empty VHDX file. +create::create(&file, &mut params).await?; + +// Open for writing. +let vhdx = VhdxFile::open(file) + .block_alignment(2 * 1024 * 1024) + .writable(&spawner) + .await?; + +// Resolve a read — returns file-level ranges. +let mut ranges = Vec::new(); +let guard = vhdx.resolve_read(offset, len, &mut ranges).await?; +// ... perform file I/O at the returned offsets ... +drop(guard); + +// Resolve a write — returns file-level ranges + I/O guard. +let mut ranges = Vec::new(); +let guard = vhdx.resolve_write(offset, len, &mut ranges).await?; +// ... write data at the returned offsets ... +guard.complete().await?; + +// Flush to stable storage. +vhdx.flush().await?; + +// Clean close (clears log GUID). +vhdx.close().await?; +``` + +## I/O model + +The crate separates **metadata I/O** from **payload I/O**. + +Metadata I/O (headers, BAT pages, sector bitmaps, WAL entries) is +handled internally through the `AsyncFile` trait — the caller provides +an `AsyncFile` implementation at open time and never thinks about +metadata again. + +Payload I/O (guest data reads and writes) is the caller's +responsibility. `resolve_read()` and `resolve_write()` translate +virtual disk offsets into file-level byte ranges (`ReadRange` / +`WriteRange`). 
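+
+As an illustration, here is a condensed sketch of how a caller might
+consume the resolved read ranges (this mirrors what the `disklayer_vhdx`
+layer introduced in this change does; `read_from_file` and
+`fill_with_zeros` are placeholders for the caller's own I/O path):
+
+```rust,ignore
+for range in &ranges {
+    match *range {
+        // Allocated data: read it from the VHDX file at the resolved offset.
+        ReadRange::Data { guest_offset, length, file_offset } => {
+            read_from_file(file_offset, length, guest_offset)?;
+        }
+        // Definitively zero: fill the destination buffer with zeros.
+        ReadRange::Zero { guest_offset, length } => {
+            fill_with_zeros(guest_offset, length)?;
+        }
+        // Not present in this file: the caller decides what to do; the
+        // layered-disk integration leaves these sectors for the next layer
+        // (the parent) to supply.
+        ReadRange::Unmapped { .. } => {}
+    }
+}
+drop(guard); // releases the per-block I/O references held during the read
+```
+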
The caller performs its own data I/O at those offsets +using whatever mechanism it prefers (io_uring, standard file I/O, +etc.), then finalizes metadata via the returned I/O guard. This +separation lets the caller use a different, potentially more +performant I/O path for bulk data without the crate imposing any +particular strategy. + +- The `vhdx` crate provides the low-level VHDX format implementation + and I/O resolution API. For OpenVMM integration, the `disklayer_vhdx` + crate supplies a `LayerIo`-compatible backend used in the layered + disk storage pipeline. +- For differencing disks, the `vhdx` crate parses parent locator + metadata, while `disklayer_vhdx::chain::open_vhdx_chain` walks and + opens parent chains automatically. + +## Related pages + +- [Storage backends](./storage.md) — catalog of all storage backends +- [Storage pipeline](../architecture/devices/storage.md) — how + frontends, backends, and layers connect diff --git a/Guide/src/reference/openvmm/management/cli.md b/Guide/src/reference/openvmm/management/cli.md index a042451178..a9bdb1a291 100644 --- a/Guide/src/reference/openvmm/management/cli.md +++ b/Guide/src/reference/openvmm/management/cli.md @@ -17,7 +17,7 @@ as well as the generated CLI help (via `cargo run -- --help`). pass `--hv`. The `DISK` argument can be: * A flat binary disk image * A VHD file with an extension of .vhd (Windows host only) - * A VHDX file with an extension of .vhdx (Windows host only) + * A VHDX file with an extension of .vhdx On Linux, raw files and block devices use the `disk_blockdevice` backend (io_uring-based async I/O) by default. Append `;direct` to the path to diff --git a/Guide/src/user_guide/openvmm/run.md b/Guide/src/user_guide/openvmm/run.md index 5d5883e26c..605ee5e39b 100644 --- a/Guide/src/user_guide/openvmm/run.md +++ b/Guide/src/user_guide/openvmm/run.md @@ -133,12 +133,14 @@ docs. The file `windows.vhdx` can be any format of VHD(X). -Note that OpenVMM does not currently support using dynamic VHD/VHDX files on -Linux hosts. Unless you have a fixed VHD1 image, you will need to convert the -image to raw format, using the following command: +VHDX files (dynamic, fixed, and differencing) are supported on non-Windows +platforms via the pure-Rust [`vhdx`](../../reference/backends/vhdx.md) +parser. On Windows, `.vhdx` files use the native kernel-mode VHD path +instead. Fixed VHD1 images work on all platforms. Dynamic and differencing VHD1 +files are **not** supported — convert them to VHDX first: -```shell -qemu-img convert -f vhdx -O raw windows.vhdx windows.img +```bash +qemu-img convert -f vpc -O vhdx dynamic.vhd converted.vhdx ``` Also, note the use of `memdiff`, which creates a memory-backed "differencing @@ -157,7 +159,7 @@ integration tests. First, build the test artifacts from Linux or WSL using `vmm-tests-run --build-only`. 
The IGVM must be built on Linux: -```shell +```bash cargo xflowey vmm-tests-run --build-only --dir --target windows-x64 ``` diff --git a/openvmm/openvmm_helpers/Cargo.toml b/openvmm/openvmm_helpers/Cargo.toml index 6ebefa738d..60c3a6e4c9 100644 --- a/openvmm/openvmm_helpers/Cargo.toml +++ b/openvmm/openvmm_helpers/Cargo.toml @@ -9,6 +9,7 @@ rust-version.workspace = true [dependencies] disk_backend_resources.workspace = true disk_vhd1.workspace = true +disklayer_vhdx.workspace = true get_resources.workspace = true hypervisor_resources.workspace = true openvmm_defs.workspace = true diff --git a/openvmm/openvmm_helpers/src/disk.rs b/openvmm/openvmm_helpers/src/disk.rs index b1d45f016a..dbaf852a7a 100644 --- a/openvmm/openvmm_helpers/src/disk.rs +++ b/openvmm/openvmm_helpers/src/disk.rs @@ -34,8 +34,12 @@ pub struct OpenDiskOptions { /// Opens the resources needed for using a disk from a file at `path`. /// /// If the file ends with .vhd and is a fixed VHD1, it will be opened using -/// the user-mode VHD parser. Otherwise, if the file ends with .vhd or -/// .vhdx, the file will be opened using the kernel-mode VHD parser. +/// the user-mode VHD parser. Otherwise, if the file ends with .vhd, the +/// file will be opened using the kernel-mode VHD parser (Windows only). +/// +/// If the file ends with .vhdx, the kernel-mode VHD parser is used on +/// Windows. On Linux, the pure-Rust VHDX parser is used, with automatic +/// parent-locator walking for differencing chains. pub async fn open_disk_type( path: &Path, options: OpenDiskOptions, @@ -91,7 +95,10 @@ pub async fn open_disk_type( )) } #[cfg(not(windows))] - anyhow::bail!("VHDX not supported on Linux"); + { + ensure_no_direct(".vhdx")?; + disklayer_vhdx::chain::open_vhdx_chain(path, read_only).await? + } } Some("iso") if !read_only => { anyhow::bail!("iso file cannot be opened as read/write") diff --git a/openvmm/openvmm_resources/Cargo.toml b/openvmm/openvmm_resources/Cargo.toml index 33aea95551..cb34d64783 100644 --- a/openvmm/openvmm_resources/Cargo.toml +++ b/openvmm/openvmm_resources/Cargo.toml @@ -44,6 +44,7 @@ disk_file.workspace = true disk_layered.workspace = true disk_prwrap.workspace = true disk_vhd1.workspace = true +disklayer_vhdx.workspace = true disklayer_ram.workspace = true disklayer_sqlite = { workspace = true, optional = true } diff --git a/openvmm/openvmm_resources/src/lib.rs b/openvmm/openvmm_resources/src/lib.rs index 81e40e4103..eabb8c873b 100644 --- a/openvmm/openvmm_resources/src/lib.rs +++ b/openvmm/openvmm_resources/src/lib.rs @@ -66,6 +66,7 @@ vm_resource::register_static_resolvers! { disklayer_ram::resolver::RamDiskLayerResolver, #[cfg(feature = "disklayer_sqlite")] disklayer_sqlite::resolver::SqliteDiskLayerResolver, + disklayer_vhdx::resolver::VhdxDiskLayerResolver, // PCI devices gdma::resolver::GdmaDeviceResolver, diff --git a/vm/devices/storage/disk_backend_resources/src/layer.rs b/vm/devices/storage/disk_backend_resources/src/layer.rs index 08aa58bc8a..eab80c6a71 100644 --- a/vm/devices/storage/disk_backend_resources/src/layer.rs +++ b/vm/devices/storage/disk_backend_resources/src/layer.rs @@ -74,3 +74,16 @@ pub struct SqliteAutoCacheDiskLayerHandle { impl ResourceId for SqliteAutoCacheDiskLayerHandle { const ID: &'static str = "sqlite-autocache"; } + +/// Handle for a VHDX disk layer. +#[derive(MeshPayload)] +pub struct VhdxDiskLayerHandle { + /// The open file handle for the VHDX file. + pub file: std::fs::File, + /// Whether to open the VHDX as read-only. 
+ pub read_only: bool, +} + +impl ResourceId for VhdxDiskLayerHandle { + const ID: &'static str = "vhdx"; +} diff --git a/vm/devices/storage/disklayer_vhdx/Cargo.toml b/vm/devices/storage/disklayer_vhdx/Cargo.toml new file mode 100644 index 0000000000..e93c77eed1 --- /dev/null +++ b/vm/devices/storage/disklayer_vhdx/Cargo.toml @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +[package] +name = "disklayer_vhdx" +edition.workspace = true +rust-version.workspace = true + +[dependencies] +anyhow.workspace = true +async-trait.workspace = true +blocking.workspace = true +disk_backend.workspace = true +disk_backend_resources.workspace = true +disk_layered.workspace = true +guestmem.workspace = true +inspect.workspace = true +scsi_buffers.workspace = true +thiserror.workspace = true +vhdx.workspace = true +vm_resource.workspace = true + +[dev-dependencies] +pal_async.workspace = true +tempfile.workspace = true + +[lints] +workspace = true diff --git a/vm/devices/storage/disklayer_vhdx/src/chain.rs b/vm/devices/storage/disklayer_vhdx/src/chain.rs new file mode 100644 index 0000000000..0c6597ca77 --- /dev/null +++ b/vm/devices/storage/disklayer_vhdx/src/chain.rs @@ -0,0 +1,314 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VHDX chain helpers. +//! +//! Functions for opening one or more VHDX files as a +//! [`LayeredDiskHandle`](disk_backend_resources::LayeredDiskHandle) ready for +//! resource resolution. + +use anyhow::Context; +use disk_backend_resources::DiskLayerDescription; +use disk_backend_resources::LayeredDiskHandle; +use disk_backend_resources::layer::VhdxDiskLayerHandle; +use std::path::Path; +use vm_resource::IntoResource; +use vm_resource::Resource; +use vm_resource::kind::DiskHandleKind; + +/// Open a single VHDX file as a [`LayeredDiskHandle`] with one layer. +/// +/// Use this for base (non-differencing) VHDX files. For differencing chains, +/// use [`open_vhdx_chain_explicit`] or [`open_vhdx_chain`]. +/// +/// The file is opened for read+write unless `read_only` is true. +pub fn open_vhdx_single(path: &Path, read_only: bool) -> anyhow::Result> { + let file = std::fs::OpenOptions::new() + .read(true) + .write(!read_only) + .open(path)?; + + Ok(Resource::new(LayeredDiskHandle::single_layer( + VhdxDiskLayerHandle { file, read_only }, + ))) +} + +/// Open a VHDX differencing chain from an explicit list of file paths. +/// +/// `paths` must be ordered from **leaf** (child, index 0) to **base** +/// (parent, last index). The leaf is opened for read+write (unless +/// `read_only` is true); all parent files are opened read-only. +/// +/// Returns a [`LayeredDiskHandle`] with layers ordered top (leaf) to +/// bottom (base), matching the order expected by +/// [`LayeredDisk`](disk_layered::LayeredDisk). 
+/// +/// # Errors +/// +/// Returns an error if: +/// - `paths` is empty +/// - Any file cannot be opened +/// +/// # Example +/// +/// ```no_run +/// # use disklayer_vhdx::chain::open_vhdx_chain_explicit; +/// # use std::path::Path; +/// let resource = open_vhdx_chain_explicit( +/// &[Path::new("child.vhdx"), Path::new("base.vhdx")], +/// false, +/// ).unwrap(); +/// ``` +pub fn open_vhdx_chain_explicit( + paths: &[&Path], + read_only: bool, +) -> anyhow::Result> { + anyhow::ensure!(!paths.is_empty(), "vhdx chain must have at least one file"); + + let layers: Vec = paths + .iter() + .enumerate() + .map(|(i, path)| { + let is_leaf = i == 0; + let layer_read_only = !is_leaf || read_only; + + let file = std::fs::OpenOptions::new() + .read(true) + .write(!layer_read_only) + .open(path) + .with_context(|| format!("failed to open vhdx layer {}: {}", i, path.display()))?; + + let handle = VhdxDiskLayerHandle { + file, + read_only: layer_read_only, + }; + + Ok(DiskLayerDescription { + layer: handle.into_resource(), + read_cache: false, + write_through: false, + }) + }) + .collect::>>()?; + + Ok(Resource::new(LayeredDiskHandle { layers })) +} + +/// Open a VHDX differencing chain by auto-walking parent locators. +/// +/// Starting from the file at `path`, reads each VHDX file's parent locator +/// to discover the next parent in the chain, continuing until a base +/// (non-differencing) disk is found. +/// +/// The leaf file is opened for read+write (unless `read_only` is true); +/// all parent files are opened read-only. +/// +/// Parent path resolution order: +/// 1. `relative_path` — resolved relative to the child's directory +/// 2. `absolute_win32_path` — absolute path (platform-dependent) +/// 3. `volume_path` — volume GUID path (Windows-specific) +/// +/// # Errors +/// +/// Returns an error if: +/// - The leaf file cannot be opened or parsed +/// - A parent locator specifies no usable path +/// - A parent file cannot be found at any of the locator paths +/// - The chain exceeds a reasonable depth limit (detect cycles) +pub async fn open_vhdx_chain( + path: &Path, + read_only: bool, +) -> anyhow::Result> { + // Reasonable depth limit to detect cycles or absurdly long chains. + const MAX_CHAIN_DEPTH: usize = 256; + + let mut paths: Vec = vec![path.to_path_buf()]; + let mut current_path = path.to_path_buf(); + + loop { + if paths.len() > MAX_CHAIN_DEPTH { + anyhow::bail!( + "vhdx chain exceeds maximum depth of {} — possible cycle", + MAX_CHAIN_DEPTH + ); + } + + // Open the current file read-only just to read metadata. + // The actual read-write open happens later via open_vhdx_chain_explicit. + let bf = crate::io::BlockingFile::open(¤t_path, true) + .with_context(|| format!("failed to open vhdx file: {}", current_path.display()))?; + let vhdx = vhdx::VhdxFile::open(bf) + .read_only() + .await + .with_context(|| format!("failed to parse vhdx file: {}", current_path.display()))?; + + if !vhdx.has_parent() { + // Base disk — chain is complete. + break; + } + + // Read the parent locator. + let locator = vhdx + .parent_locator() + .await + .with_context(|| { + format!( + "failed to read parent locator from: {}", + current_path.display() + ) + })? + .context("differencing disk has no parent locator")?; + + let parent_paths = locator.parent_paths(); + let child_dir = current_path.parent().unwrap_or_else(|| Path::new(".")); + + // Try to resolve the parent path in order of preference. 
+ let parent_path = resolve_parent_path(child_dir, &parent_paths).with_context(|| { + format!( + "could not find parent for vhdx file: {}", + current_path.display() + ) + })?; + + paths.push(parent_path.clone()); + current_path = parent_path; + } + + // Convert PathBufs to Path references for open_vhdx_chain_explicit. + let path_refs: Vec<&Path> = paths.iter().map(|p| p.as_path()).collect(); + open_vhdx_chain_explicit(&path_refs, read_only) +} + +/// Try to resolve a parent path from the locator's well-known keys. +/// +/// Tries paths in order: relative_path, absolute_win32_path, volume_path. +/// Returns the first path that exists on disk, or an error if none work. +fn resolve_parent_path( + child_dir: &Path, + parent_paths: &vhdx::ParentPaths, +) -> anyhow::Result { + let mut candidates: Vec = Vec::new(); + + // 1. Relative path — resolve relative to the child's directory. + if let Some(ref rel) = parent_paths.relative_path { + // VHDX relative paths use Windows separators (backslash). + // Normalize to the platform's separator. + let normalized: String = rel + .chars() + .map(|c| { + if c == '\\' { + std::path::MAIN_SEPARATOR + } else { + c + } + }) + .collect(); + // Strip leading ".\" or "./" if present. + let stripped = normalized + .strip_prefix(&format!(".{}", std::path::MAIN_SEPARATOR)) + .unwrap_or(&normalized); + candidates.push(child_dir.join(stripped)); + } + + // 2. Absolute Win32 path (Windows-specific). + if cfg!(windows) { + if let Some(ref abs) = parent_paths.absolute_win32_path { + candidates.push(std::path::PathBuf::from(abs)); + } + } + + // 3. Volume path (Windows-specific). + if cfg!(windows) { + if let Some(ref vol) = parent_paths.volume_path { + candidates.push(std::path::PathBuf::from(vol)); + } + } + + for candidate in &candidates { + if candidate.exists() { + return Ok(candidate.clone()); + } + } + + if candidates.is_empty() { + anyhow::bail!("parent locator contains no path entries"); + } + + // None of the candidates exist. Report all attempted paths. 
+ let tried: Vec = candidates.iter().map(|p| p.display().to_string()).collect(); + anyhow::bail!("parent not found at any locator path: {}", tried.join(", ")) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn open_single_creates_one_layer() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + let path2 = path.clone(); + pal_async::DefaultPool::run_with(|_driver| async move { + let bf = crate::io::BlockingFile::open(&path2, false).unwrap(); + let mut params = vhdx::CreateParams { + disk_size: 1024 * 1024, + ..Default::default() + }; + vhdx::create(&bf, &mut params).await.unwrap(); + }); + + let resource = open_vhdx_single(&path, false).unwrap(); + let _ = resource; + } + + #[test] + fn explicit_chain_empty_errors() { + let result = open_vhdx_chain_explicit(&[], false); + assert!(result.is_err()); + } + + #[test] + fn explicit_chain_single_path() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("base.vhdx"); + + let path2 = path.clone(); + pal_async::DefaultPool::run_with(|_driver| async move { + let bf = crate::io::BlockingFile::open(&path2, false).unwrap(); + let mut params = vhdx::CreateParams { + disk_size: 1024 * 1024, + ..Default::default() + }; + vhdx::create(&bf, &mut params).await.unwrap(); + }); + + let resource = open_vhdx_chain_explicit(&[path.as_path()], false).unwrap(); + let _ = resource; + } + + #[test] + fn explicit_chain_missing_file_errors() { + let result = open_vhdx_chain_explicit(&[Path::new("nonexistent.vhdx")], false); + assert!(result.is_err()); + } + + #[pal_async::async_test] + async fn auto_walk_base_disk() { + // Create a base (non-differencing) VHDX, then auto-walk it. + // Should produce a single-layer chain. + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("base.vhdx"); + + let bf = crate::io::BlockingFile::open(&path, false).unwrap(); + let mut params = vhdx::CreateParams { + disk_size: 1024 * 1024, + ..Default::default() + }; + vhdx::create(&bf, &mut params).await.unwrap(); + drop(bf); + + let resource = open_vhdx_chain(&path, false).await.unwrap(); + let _ = resource; + } +} diff --git a/vm/devices/storage/disklayer_vhdx/src/io.rs b/vm/devices/storage/disklayer_vhdx/src/io.rs new file mode 100644 index 0000000000..a696bcd3ca --- /dev/null +++ b/vm/devices/storage/disklayer_vhdx/src/io.rs @@ -0,0 +1,217 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Async file I/O backends for the `vhdx` crate. +//! +//! [`BlockingFile`] implements [`vhdx::AsyncFile`] using `blocking::unblock` +//! and positional I/O (pread/pwrite on Unix, seek_read/seek_write on Windows). +//! No Mutex is needed — `Arc` with positional I/O is inherently safe +//! for concurrent access. + +use std::borrow::Borrow; +use std::fs; +use std::io; +use std::path::Path; +use std::sync::Arc; +use vhdx::AsyncFile; + +/// Platform-specific positional read. +#[cfg(unix)] +fn file_read_at(file: &fs::File, buf: &mut [u8], offset: u64) -> io::Result { + std::os::unix::fs::FileExt::read_at(file, buf, offset) +} + +/// Platform-specific positional read. +#[cfg(windows)] +fn file_read_at(file: &fs::File, buf: &mut [u8], offset: u64) -> io::Result { + std::os::windows::fs::FileExt::seek_read(file, buf, offset) +} + +/// Platform-specific positional write. +#[cfg(unix)] +fn file_write_at(file: &fs::File, buf: &[u8], offset: u64) -> io::Result { + std::os::unix::fs::FileExt::write_at(file, buf, offset) +} + +/// Platform-specific positional write. 
+#[cfg(windows)] +fn file_write_at(file: &fs::File, buf: &[u8], offset: u64) -> io::Result { + std::os::windows::fs::FileExt::seek_write(file, buf, offset) +} + +/// Read exactly `buf.len()` bytes at `offset`, looping on short reads. +fn read_exact_at(file: &fs::File, mut buf: &mut [u8], mut offset: u64) -> io::Result<()> { + while !buf.is_empty() { + let n = file_read_at(file, buf, offset)?; + if n == 0 { + return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "short read")); + } + offset += n as u64; + buf = &mut buf[n..]; + } + Ok(()) +} + +/// Write exactly `buf.len()` bytes at `offset`, looping on short writes. +fn write_exact_at(file: &fs::File, mut buf: &[u8], mut offset: u64) -> io::Result<()> { + while !buf.is_empty() { + let n = file_write_at(file, buf, offset)?; + if n == 0 { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + "failed to write any bytes", + )); + } + offset += n as u64; + buf = &buf[n..]; + } + Ok(()) +} + +/// A concrete [`AsyncFile`] backed by `Arc`. +/// +/// Uses positional I/O so no seek state or Mutex is needed. Multiple I/Os +/// can be dispatched concurrently. Each operation runs on the `blocking` +/// crate's thread pool via `blocking::unblock`. +#[derive(Clone)] +pub struct BlockingFile { + file: Arc, +} + +impl BlockingFile { + /// Wrap an existing open file. + pub fn new(file: fs::File) -> Self { + Self { + file: Arc::new(file), + } + } + + /// Open a file at the given path. + /// + /// If `read_only`, the file is opened for reading only. + /// Otherwise, it is opened for reading, writing, and creation. + pub fn open(path: &Path, read_only: bool) -> io::Result { + let file = fs::OpenOptions::new() + .read(true) + .write(!read_only) + .create(!read_only) + .open(path)?; + Ok(Self::new(file)) + } + + /// Returns a clone of the inner `Arc`. + /// + /// Useful when the caller needs to perform additional file operations + /// (e.g., data I/O on resolved ranges in the LayerIo implementation). 
+ pub fn clone_arc(&self) -> Arc { + self.file.clone() + } +} + +impl AsyncFile for BlockingFile { + type Buffer = Vec; + + fn alloc_buffer(&self, len: usize) -> Vec { + vec![0u8; len] + } + + async fn read_into(&self, offset: u64, buf: Vec) -> Result, io::Error> { + let file = self.file.clone(); + blocking::unblock(move || { + let mut buf = buf; + read_exact_at(&file, &mut buf, offset)?; + Ok(buf) + }) + .await + } + + async fn write_from( + &self, + offset: u64, + buf: impl Borrow> + Send + 'static, + ) -> Result<(), io::Error> { + let file = self.file.clone(); + blocking::unblock(move || write_exact_at(&file, buf.borrow().as_ref(), offset)).await + } + + async fn flush(&self) -> Result<(), io::Error> { + let file = self.file.clone(); + blocking::unblock(move || file.sync_all()).await + } + + async fn file_size(&self) -> Result { + Ok(self.file.metadata()?.len()) + } + + async fn set_file_size(&self, size: u64) -> Result<(), io::Error> { + let file = self.file.clone(); + blocking::unblock(move || file.set_len(size)).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pal_async::async_test; + + #[async_test] + async fn round_trip_read_write() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.bin"); + + // Create file, write data, read back + let bf = BlockingFile::open(&path, false).unwrap(); + // Set a size + bf.set_file_size(4096).await.unwrap(); + + let write_data = Arc::new(vec![0xAB_u8; 512]); + bf.write_from(0, write_data.clone()).await.unwrap(); + bf.write_from(1024, write_data.clone()).await.unwrap(); + + let read_buf = bf.read_into(0, vec![0u8; 512]).await.unwrap(); + assert_eq!(read_buf, *write_data); + + let read_buf = bf.read_into(1024, vec![0u8; 512]).await.unwrap(); + assert_eq!(read_buf, *write_data); + + // Verify gap is zeros + let read_buf = bf.read_into(512, vec![0u8; 512]).await.unwrap(); + assert_eq!(read_buf, vec![0u8; 512]); + + // Verify file_size + assert_eq!(bf.file_size().await.unwrap(), 4096); + } + + #[async_test] + async fn flush_succeeds() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.bin"); + let bf = BlockingFile::open(&path, false).unwrap(); + bf.set_file_size(4096).await.unwrap(); + bf.flush().await.unwrap(); + } + + #[async_test] + async fn open_with_vhdx() { + // Create a VHDX in memory, write to disk, open with BlockingFile, + // validate VhdxFile::open works + use vhdx::VhdxFile; + + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + // Step 1: Create via BlockingFile + let bf = BlockingFile::open(&path, false).unwrap(); + let mut params = vhdx::CreateParams { + disk_size: 1024 * 1024, // 1 MiB + ..Default::default() + }; + vhdx::create(&bf, &mut params).await.unwrap(); + + // Step 2: Re-open and validate + let bf = BlockingFile::open(&path, false).unwrap(); + let vhdx = VhdxFile::open(bf).read_only().await.unwrap(); + assert_eq!(vhdx.disk_size(), 1024 * 1024); + assert_eq!(vhdx.logical_sector_size(), 512); + } +} diff --git a/vm/devices/storage/disklayer_vhdx/src/lib.rs b/vm/devices/storage/disklayer_vhdx/src/lib.rs new file mode 100644 index 0000000000..e580785551 --- /dev/null +++ b/vm/devices/storage/disklayer_vhdx/src/lib.rs @@ -0,0 +1,491 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VHDX disk layer for OpenVMM. +//! +//! Provides a cross-platform, pure-Rust VHDX backend for the layered disk +//! stack. Uses the `vhdx` crate for format parsing and the `disk_layered` +//! 
crate's `LayerIo` trait for integration. +//! +//! # Modules +//! +//! - [`io`] — `BlockingFile`: async file I/O via `blocking::unblock` + +#![forbid(unsafe_code)] + +pub mod chain; +pub mod io; +pub mod resolver; + +use disk_backend::DiskError; +use disk_backend::UnmapBehavior; +use disk_layered::LayerIo; +use disk_layered::SectorMarker; +use guestmem::MemoryRead; +use guestmem::MemoryWrite; +use inspect::Inspect; +use io::BlockingFile; +use scsi_buffers::RequestBuffers; +use vhdx::AsyncFile; +use vhdx::ReadRange; +use vhdx::VhdxFile; +use vhdx::WriteRange; + +/// A VHDX disk layer implementing [`LayerIo`]. +/// +/// Bridges the `vhdx` crate's range-based I/O model ([`ReadRange`]/[`WriteRange`]) +/// to the `disk_layered` crate's buffer-based I/O model ([`RequestBuffers`] + +/// [`SectorMarker`]). +#[derive(Inspect)] +pub struct VhdxLayer { + #[inspect(skip)] + vhdx: VhdxFile, + #[inspect(skip)] + file: BlockingFile, + sector_size: u32, + physical_sector_size: u32, + sector_count: u64, + block_size: u32, + has_parent: bool, + read_only: bool, +} + +impl VhdxLayer { + /// Create a `VhdxLayer` from an open `VhdxFile` and a clone of the + /// `BlockingFile` used to open it. + /// + /// `file` must be a clone of the `BlockingFile` that was passed to + /// `VhdxFile::open`. Both share the same `Arc`, so data I/O + /// on resolved ranges goes to the same underlying file descriptor. + pub fn new(vhdx: VhdxFile, file: BlockingFile, read_only: bool) -> Self { + let sector_size = vhdx.logical_sector_size(); + let physical_sector_size = vhdx.physical_sector_size(); + let sector_count = vhdx.disk_size() / sector_size as u64; + let block_size = vhdx.block_size(); + let has_parent = vhdx.has_parent(); + Self { + vhdx, + file, + sector_size, + physical_sector_size, + sector_count, + block_size, + has_parent, + read_only, + } + } +} + +/// Convert a [`vhdx::VhdxIoError`] to a [`DiskError`]. +fn vhdx_to_disk_error(e: vhdx::VhdxIoError) -> DiskError { + match e.kind() { + vhdx::VhdxIoErrorKind::ReadOnly => DiskError::ReadOnly, + vhdx::VhdxIoErrorKind::InvalidInput => DiskError::InvalidInput, + vhdx::VhdxIoErrorKind::InvalidSector => DiskError::IllegalBlock, + _ => DiskError::Io(std::io::Error::other(e)), + } +} + +impl LayerIo for VhdxLayer { + fn layer_type(&self) -> &str { + "vhdx" + } + + fn sector_count(&self) -> u64 { + self.sector_count + } + + fn sector_size(&self) -> u32 { + self.sector_size + } + + fn disk_id(&self) -> Option<[u8; 16]> { + Some(self.vhdx.page_83_data().into()) + } + + fn physical_sector_size(&self) -> u32 { + self.physical_sector_size + } + + fn is_fua_respected(&self) -> bool { + true + } + + fn is_logically_read_only(&self) -> bool { + self.read_only + } + + fn optimal_unmap_sectors(&self) -> u32 { + self.block_size / self.sector_size + } + + fn unmap_behavior(&self) -> UnmapBehavior { + if self.has_parent { + UnmapBehavior::Unspecified + } else { + UnmapBehavior::Zeroes + } + } + + async fn sync_cache(&self) -> Result<(), DiskError> { + self.vhdx.flush().await.map_err(vhdx_to_disk_error) + } + + async fn read( + &self, + buffers: &RequestBuffers<'_>, + sector: u64, + mut marker: SectorMarker<'_>, + ) -> Result<(), DiskError> { + let offset = sector * self.sector_size as u64; + let len = buffers.len() as u32; + + // Resolve the read into file-level ranges. + let mut ranges = Vec::new(); + let guard = self + .vhdx + .resolve_read(offset, len, &mut ranges) + .await + .map_err(vhdx_to_disk_error)?; + + // Process each range. 
+ for range in &ranges { + match *range { + ReadRange::Data { + guest_offset, + length, + file_offset, + } => { + let buf_offset = (guest_offset - offset) as usize; + + // Read from the VHDX file into an owned buffer (zero-copy I/O). + let buf = self.file.alloc_buffer(length as usize); + let buf = self + .file + .read_into(file_offset, buf) + .await + .map_err(DiskError::Io)?; + + // Write data into the request buffers at the correct position. + buffers + .subrange(buf_offset, length as usize) + .writer() + .write(buf.as_ref())?; + + // Mark these sectors as present. + let start_sector = guest_offset / self.sector_size as u64; + let sector_count = length as u64 / self.sector_size as u64; + marker.set_range(start_sector..start_sector + sector_count); + } + ReadRange::Zero { + guest_offset, + length, + } => { + let buf_offset = (guest_offset - offset) as usize; + + // Zero this portion of the request buffers. + buffers + .subrange(buf_offset, length as usize) + .writer() + .zero(length as usize)?; + + // Mark these sectors as present (they are definitively zero). + let start_sector = guest_offset / self.sector_size as u64; + let sector_count = length as u64 / self.sector_size as u64; + marker.set_range(start_sector..start_sector + sector_count); + } + ReadRange::Unmapped { .. } => { + // Do NOT mark these sectors. LayeredDisk will read from + // the next layer down in the stack. + } + } + } + + // Drop the guard (decrements per-block refcounts). + drop(guard); + + Ok(()) + } + + async fn write( + &self, + buffers: &RequestBuffers<'_>, + sector: u64, + fua: bool, + ) -> Result<(), DiskError> { + let offset = sector * self.sector_size as u64; + let len = buffers.len() as u32; + + // Resolve the write into file-level ranges. + let mut ranges = Vec::new(); + let guard = self + .vhdx + .resolve_write(offset, len, &mut ranges) + .await + .map_err(vhdx_to_disk_error)?; + + // Process each range. + for range in &ranges { + match *range { + WriteRange::Data { + guest_offset, + length, + file_offset, + } => { + let buf_offset = (guest_offset - offset) as usize; + + // Read data from the request buffers into an owned buffer. + let mut buf = self.file.alloc_buffer(length as usize); + buffers + .subrange(buf_offset, length as usize) + .reader() + .read(buf.as_mut())?; + + // Write to the VHDX file at the resolved offset (zero-copy I/O). + self.file + .write_from(file_offset, buf) + .await + .map_err(DiskError::Io)?; + } + WriteRange::Zero { + file_offset, + length, + } => { + // Write zeros to the file at the given offset + // (for newly-allocated block padding). + self.file + .zero_range(file_offset, length as u64) + .await + .map_err(DiskError::Io)?; + } + } + } + + // Complete the write (commits TFP blocks, updates sector bitmaps). + guard.complete().await.map_err(vhdx_to_disk_error)?; + + // If FUA, flush to stable storage. + if fua { + self.vhdx.flush().await.map_err(vhdx_to_disk_error)?; + } + + Ok(()) + } + + async fn unmap( + &self, + sector: u64, + count: u64, + _block_level_only: bool, + _next_is_zero: bool, + ) -> Result<(), DiskError> { + let offset = sector * self.sector_size as u64; + let length = count * self.sector_size as u64; + + // Use TrimMode::Zero for base disks (unmapped reads will return zero), + // TrimMode::MakeTransparent for diff disks (reads fall through to parent). 
+ let mode = if self.has_parent { + vhdx::TrimMode::MakeTransparent + } else { + vhdx::TrimMode::Zero + }; + + self.vhdx + .trim(vhdx::TrimRequest::new(mode, offset, length)) + .await + .map_err(vhdx_to_disk_error) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use disk_backend::DiskIo; + use disk_layered::DiskLayer; + use disk_layered::LayerConfiguration; + use disk_layered::LayeredDisk; + use guestmem::GuestMemory; + use pal_async::DefaultDriver; + use pal_async::async_test; + use scsi_buffers::OwnedRequestBuffers; + use vhdx::VhdxFile; + + /// Create a VHDX file at the given path and return a `VhdxLayer`. + async fn create_and_open_layer(path: &std::path::Path, driver: &DefaultDriver) -> VhdxLayer { + // Create a 1 MiB VHDX. + let bf = BlockingFile::open(path, false).unwrap(); + let mut params = vhdx::CreateParams { + disk_size: 1024 * 1024, + ..Default::default() + }; + vhdx::create(&bf, &mut params).await.unwrap(); + + // Re-open and wrap as VhdxLayer. + let bf = BlockingFile::open(path, false).unwrap(); + let bf2 = bf.clone(); + let vhdx = VhdxFile::open(bf).writable(&driver).await.unwrap(); + VhdxLayer::new(vhdx, bf2, false) + } + + /// Wrap a VhdxLayer in a single-layer LayeredDisk. + async fn wrap_in_layered_disk(layer: VhdxLayer) -> LayeredDisk { + LayeredDisk::new( + false, + vec![LayerConfiguration { + layer: DiskLayer::new(layer), + write_through: false, + read_cache: false, + }], + ) + .await + .unwrap() + } + + #[async_test] + async fn read_empty_disk_via_layer(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + let layer = create_and_open_layer(&path, &driver).await; + + // Verify metadata. + assert_eq!(layer.sector_size(), 512); + assert_eq!(layer.sector_count(), 1024 * 1024 / 512); + assert_eq!(layer.layer_type(), "vhdx"); + assert!(!layer.is_logically_read_only()); + + let disk = wrap_in_layered_disk(layer).await; + + // Read sector 0 — empty disk should return all zeros. + let mem = GuestMemory::allocate(512); + let owned = OwnedRequestBuffers::linear(0, 512, true); + disk.read_vectored(&owned.buffer(&mem), 0).await.unwrap(); + + let mut buf = vec![0u8; 512]; + mem.read_at(0, &mut buf).unwrap(); + assert_eq!(buf, vec![0u8; 512]); + } + + #[async_test] + async fn write_and_read_back(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + let layer = create_and_open_layer(&path, &driver).await; + let disk = wrap_in_layered_disk(layer).await; + + // Write a known pattern to sector 0. + let mem = GuestMemory::allocate(512); + let pattern: Vec = (0..512u16).map(|i| (i % 251) as u8).collect(); + mem.write_at(0, &pattern).unwrap(); + let owned = OwnedRequestBuffers::linear(0, 512, false); + disk.write_vectored(&owned.buffer(&mem), 0, false) + .await + .unwrap(); + + // Read back sector 0, verify data matches. + let owned = OwnedRequestBuffers::linear(0, 512, true); + disk.read_vectored(&owned.buffer(&mem), 0).await.unwrap(); + let mut buf = vec![0u8; 512]; + mem.read_at(0, &mut buf).unwrap(); + assert_eq!(buf, pattern); + + // Read sector 1 — should be zero. 
+ disk.read_vectored(&owned.buffer(&mem), 1).await.unwrap(); + mem.read_at(0, &mut buf).unwrap(); + assert_eq!(buf, vec![0u8; 512]); + } + + #[async_test] + async fn sync_cache_works(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + let layer = create_and_open_layer(&path, &driver).await; + let disk = wrap_in_layered_disk(layer).await; + + disk.sync_cache().await.unwrap(); + } + + #[async_test] + async fn write_close_reopen_read(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + // Create and write data + { + let layer = create_and_open_layer(&path, &driver).await; + let disk = wrap_in_layered_disk(layer).await; + + let mem = GuestMemory::allocate(512); + let pattern: Vec = (0..512u16).map(|i| (i % 251) as u8).collect(); + mem.write_at(0, &pattern).unwrap(); + let owned = OwnedRequestBuffers::linear(0, 512, false); + disk.write_vectored(&owned.buffer(&mem), 0, false) + .await + .unwrap(); + + // Flush to ensure data is on disk + disk.sync_cache().await.unwrap(); + } + + // Re-open and read back + { + let bf = BlockingFile::open(&path, true).unwrap(); + let bf2 = bf.clone(); + let vhdx = VhdxFile::open(bf) + .allow_replay(true) + .read_only() + .await + .unwrap(); + let layer = VhdxLayer::new(vhdx, bf2, true); + let disk = LayeredDisk::new( + true, + vec![LayerConfiguration { + layer: DiskLayer::new(layer), + write_through: false, + read_cache: false, + }], + ) + .await + .unwrap(); + + let mem = GuestMemory::allocate(512); + let owned = OwnedRequestBuffers::linear(0, 512, true); + disk.read_vectored(&owned.buffer(&mem), 0).await.unwrap(); + + let mut buf = vec![0u8; 512]; + mem.read_at(0, &mut buf).unwrap(); + let expected: Vec = (0..512u16).map(|i| (i % 251) as u8).collect(); + assert_eq!(buf, expected); + } + } + + #[async_test] + async fn multi_sector_write_and_read(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.vhdx"); + + let layer = create_and_open_layer(&path, &driver).await; + let disk = wrap_in_layered_disk(layer).await; + + // Write 4 KiB (8 sectors) starting at sector 0 + let len = 4096usize; + let mem = GuestMemory::allocate(len); + let pattern: Vec = (0..len).map(|i| (i % 137) as u8).collect(); + mem.write_at(0, &pattern).unwrap(); + + let owned = OwnedRequestBuffers::linear(0, len, false); + disk.write_vectored(&owned.buffer(&mem), 0, false) + .await + .unwrap(); + + // Read back and verify + let owned = OwnedRequestBuffers::linear(0, len, true); + disk.read_vectored(&owned.buffer(&mem), 0).await.unwrap(); + + let mut buf = vec![0u8; len]; + mem.read_at(0, &mut buf).unwrap(); + assert_eq!(buf, pattern); + } +} diff --git a/vm/devices/storage/disklayer_vhdx/src/resolver.rs b/vm/devices/storage/disklayer_vhdx/src/resolver.rs new file mode 100644 index 0000000000..3d30dbe377 --- /dev/null +++ b/vm/devices/storage/disklayer_vhdx/src/resolver.rs @@ -0,0 +1,65 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Resource resolver for VHDX disk layers. 
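+//!
+//! A rough usage sketch (illustrative only; it mirrors how this change wires
+//! things up in `openvmm_resources` and in `disklayer_vhdx::chain`):
+//!
+//! ```ignore
+//! // Register the resolver in the static resolver list.
+//! vm_resource::register_static_resolvers! {
+//!     disklayer_vhdx::resolver::VhdxDiskLayerResolver,
+//! }
+//!
+//! // Describe a single-layer VHDX disk. Resolving the handle opens the
+//! // file with the pure-Rust `vhdx` parser and yields a `ResolvedDiskLayer`.
+//! let file = std::fs::File::open("disk.vhdx")?;
+//! let disk = disk_backend_resources::LayeredDiskHandle::single_layer(
+//!     disk_backend_resources::layer::VhdxDiskLayerHandle { file, read_only: true },
+//! );
+//! ```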
+ +use crate::VhdxLayer; +use crate::io::BlockingFile; +use async_trait::async_trait; +use disk_backend_resources::layer::VhdxDiskLayerHandle; +use disk_layered::resolve::ResolveDiskLayerParameters; +use disk_layered::resolve::ResolvedDiskLayer; +use thiserror::Error; +use vhdx::VhdxFile; +use vm_resource::AsyncResolveResource; +use vm_resource::ResourceResolver; +use vm_resource::declare_static_async_resolver; +use vm_resource::kind::DiskLayerHandleKind; + +/// Resolver for [`VhdxDiskLayerHandle`]. +pub struct VhdxDiskLayerResolver; + +declare_static_async_resolver!( + VhdxDiskLayerResolver, + (DiskLayerHandleKind, VhdxDiskLayerHandle) +); + +/// Errors from resolving a VHDX disk layer. +#[derive(Debug, Error)] +pub enum ResolveVhdxError { + /// Failed to open the VHDX file. + #[error("failed to open vhdx")] + Open(#[source] vhdx::OpenError), +} + +#[async_trait] +impl AsyncResolveResource for VhdxDiskLayerResolver { + type Output = ResolvedDiskLayer; + type Error = ResolveVhdxError; + + async fn resolve( + &self, + _resolver: &ResourceResolver, + resource: VhdxDiskLayerHandle, + input: ResolveDiskLayerParameters<'_>, + ) -> Result { + let read_only = resource.read_only || input.read_only; + let file = BlockingFile::new(resource.file); + let file2 = file.clone(); + let vhdx = if read_only { + VhdxFile::open(file) + .read_only() + .await + .map_err(ResolveVhdxError::Open)? + } else { + let driver = input.driver_source.simple(); + VhdxFile::open(file) + .writable(&driver) + .await + .map_err(ResolveVhdxError::Open)? + }; + Ok(ResolvedDiskLayer::new(VhdxLayer::new( + vhdx, file2, read_only, + ))) + } +} diff --git a/vm/devices/storage/vhdx/Cargo.toml b/vm/devices/storage/vhdx/Cargo.toml new file mode 100644 index 0000000000..2cc7f05a97 --- /dev/null +++ b/vm/devices/storage/vhdx/Cargo.toml @@ -0,0 +1,34 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +[package] +name = "vhdx" +edition.workspace = true +rust-version.workspace = true + +[dependencies] +bitfield-struct.workspace = true +bitvec = { workspace = true, features = ["std"] } +crc32c.workspace = true +event-listener.workspace = true +futures.workspace = true +guid.workspace = true +mesh.workspace = true +pal_async.workspace = true +parking_lot = { workspace = true, features = ["arc_lock"] } +thiserror.workspace = true +tracing.workspace = true +zerocopy.workspace = true + +[target.'cfg(windows)'.dev-dependencies] +windows = { workspace = true, features = [ + "Win32_Storage_Vhd", + "Win32_Foundation", + "Win32_Storage_FileSystem", + "Win32_System_IO", + "Win32_System_Threading", +] } +tempfile.workspace = true + +[lints] +workspace = true diff --git a/vm/devices/storage/vhdx/src/apply_task.rs b/vm/devices/storage/vhdx/src/apply_task.rs new file mode 100644 index 0000000000..14966a1f5a --- /dev/null +++ b/vm/devices/storage/vhdx/src/apply_task.rs @@ -0,0 +1,95 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Apply task — writes logged pages to their final file offsets. +//! +//! The apply task receives [`ApplyBatch`] items from the +//! [log task](crate::log_task) via a mesh channel. For each batch, it +//! writes all pages to their final file offsets, **releases log permits** +//! (via [`LogPermits`](crate::log_permits::LogPermits)), and publishes +//! `applied_lsn` with the flush sequence number (FSN) needed to make +//! the writes durable. +//! +//! The apply task does **not** flush. Flushing is driven by consumers +//! who need durability: +//! 
- The log task flushes when it needs to advance the log tail +//! (on `LogFull` or graceful close). +//! - [`VhdxFile::flush()`](crate::open::VhdxFile::flush) flushes for crash safety. +//! +//! Both callers use +//! [`FlushSequencer::flush_through()`](crate::flush::FlushSequencer::flush_through) +//! with the FSN from the watermark, which coalesces naturally. + +use crate::AsyncFile; +use crate::flush::FlushSequencer; +use crate::log_permits::LogPermits; +use crate::log_task::LogData; +use crate::log_task::Lsn; +use crate::lsn_watermark::LsnWatermark; +use crate::open::FailureFlag; +use std::sync::Arc; + +/// A batch of page-aligned data that has been logged and needs to be applied +/// (written to their final file offsets). +pub(crate) struct ApplyBatch { + /// The data to write. + pub data: Vec>, + /// The LSN of the log entry that contains these pages. + pub lsn: Lsn, +} + +/// Run the apply task main loop. +/// +/// Receives batches from the log task, writes pages to their final +/// file offsets, releases log permits, and publishes `applied_lsn` +/// with the FSN needed for durability. +pub(crate) async fn run_apply_task( + mut rx: mesh::Receiver>, + file: Arc, + flush_sequencer: Arc, + applied_lsn: Arc, + log_permits: Arc, + failure_flag: Arc, +) { + loop { + let batch = match rx.recv().await { + Ok(batch) => batch, + Err(_) => { + // Channel closed — log task shut down. Exit. + break; + } + }; + + let lsn = batch.lsn; + let mut page_count = 0; + + // Write each range to its final file offset (zero-copy via Arc). + for entry in batch.data { + page_count += entry.page_count(); + let (file_offset, data) = entry.into_parts(); + if let Err(e) = file.write_from(file_offset, data).await { + tracing::error!( + "VHDX apply task: write error at offset {:#x}: {e}", + file_offset + ); + log_permits.fail(format!("apply write failed: {e}")); + applied_lsn.fail(format!("apply write failed: {e}")); + failure_flag.set(&e); + return; + } + } + + // The pages have been dropped by this point, so it's safe to release + // the permits for this batch. + log_permits.release(page_count); + + // Capture the FSN *after* the writes. Flushing through this FSN + // will make all the writes above durable. We don't flush here — + // the log task or VhdxFile::flush() will do it when needed. + let fsn = flush_sequencer.current_fsn(); + + // Publish (lsn, fsn): "pages through this LSN are at their final + // offsets; flush through this FSN to make them durable." + applied_lsn.advance(lsn, fsn); + } +} diff --git a/vm/devices/storage/vhdx/src/bat.rs b/vm/devices/storage/vhdx/src/bat.rs new file mode 100644 index 0000000000..1e5514745b --- /dev/null +++ b/vm/devices/storage/vhdx/src/bat.rs @@ -0,0 +1,1395 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! BAT (Block Allocation Table) lookup and management. +//! +//! Provides on-demand BAT entry lookup through the [`PageCache`], computing +//! the correct BAT page offset for any given block number. Handles the +//! interleaving of payload block entries with sector bitmap entries. 
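+//!
+//! For orientation, the interleaving places one sector bitmap entry after
+//! every `chunk_ratio` payload entries (per MS-VHDX). A sketch of the index
+//! arithmetic this implies (illustrative only; the actual lookup in this
+//! module goes through [`PageCache`] pages):
+//!
+//! ```ignore
+//! // `b` = payload block number, `c` = chunk number, `r` = chunk_ratio
+//! // (payload blocks per sector bitmap block).
+//! fn payload_bat_index(b: u64, r: u64) -> u64 {
+//!     b + b / r
+//! }
+//! fn sector_bitmap_bat_index(c: u64, r: u64) -> u64 {
+//!     c * (r + 1) + r
+//! }
+//! ```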
+ +use crate::AsyncFile; +use crate::cache::PageCache; +use crate::cache::PageKey; +use crate::cache::WriteMode; +use crate::create::ceil_div; +use crate::create::chunk_block_count; +use crate::error::CorruptionType; +use crate::error::OpenError; +use crate::error::OpenErrorInner; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::flush::Fsn; +use crate::format::BatEntry; +use crate::format::BatEntryState; +use crate::format::CACHE_PAGE_SIZE; +use crate::format::ENTRIES_PER_BAT_PAGE; +use crate::format::MB1; +use bitfield_struct::bitfield; +use std::sync::atomic::AtomicU16; +use std::sync::atomic::AtomicU32; +use std::sync::atomic::Ordering; +use zerocopy::IntoBytes; + +use crate::space::EofState; +use crate::space::FreeSpaceTracker; +use zerocopy::FromBytes; + +/// Cache tag for BAT region pages. +pub(crate) const BAT_TAG: u8 = 0; + +/// Size of a sector bitmap block in bytes (1 MiB). +pub(crate) const SECTOR_BITMAP_BLOCK_SIZE: u32 = 1024 * 1024; + +/// Per-block I/O refcount packed into a `u16`. +/// +/// Layout: +/// - Bit 15 (`TRIM_PENDING_BIT`): set by trim to block new I/O acquisitions. +/// - Bits 0-14: I/O reference count (0..32767). +/// +/// Valid states: +/// - `0x0000` — idle, no I/O, no trim. +/// - `0x0001..MAX_IO_REFCOUNT` — active I/O refcount. +/// - `TRIM_PENDING` (`0x8000`) — trim pending, I/Os drained, ready to claim. +/// - `0x8001..0xFFFE` — trim pending + draining I/Os. +/// - `TRIM_CLAIMED` (`0xFFFF`) — trim owns the block exclusively. +/// +/// The pending bit gives trim **writer priority**: once set, no new I/O +/// can increment the refcount, preventing livelock from a steady I/O stream. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +struct IoBlockRef(u16); + +impl IoBlockRef { + /// High bit: trim is pending (blocks new I/O). + const TRIM_PENDING_BIT: u16 = 0x8000; + /// Maximum I/O refcount (bits 0-14 all set). + const MAX_IO_REFCOUNT: u16 = 0x7FFF; + const FREE: Self = Self(0); + /// Trim pending, all I/Os drained — ready to finish claiming. + const TRIM_PENDING: Self = Self(Self::TRIM_PENDING_BIT); + /// Trim has exclusively claimed the block. + const TRIM_CLAIMED: Self = Self(u16::MAX); + + /// The I/O refcount (bits 0-14), ignoring the trim-pending bit. + fn io_count(self) -> u16 { + self.0 & Self::MAX_IO_REFCOUNT + } + + /// Whether the trim-pending bit is set. + fn trim_pending(self) -> bool { + self.0 & Self::TRIM_PENDING_BIT != 0 + } + + /// Whether new I/O acquisitions should be blocked. + /// + /// True when the trim-pending bit is set OR the I/O count is at + /// the maximum (would overflow into the pending bit). + fn blocks_new_io(self) -> bool { + self.0 >= Self::MAX_IO_REFCOUNT + } +} + +pub(crate) struct Bat { + /// Number of data blocks (payload blocks) in the disk. + pub data_block_count: u32, + /// Number of sector bitmap blocks (chunks). Zero if no parent. + pub sector_bitmap_block_count: u32, + /// Chunk ratio: number of data blocks per sector bitmap entry. + pub chunk_ratio: u32, + /// Block size in bytes. + pub block_size: u32, + /// Whether the disk has a parent (differencing). + pub has_parent: bool, + + /// One `AtomicU32` per payload block (indexed by block number). + /// Each stores a [`BlockMapping`] bitfield. Lock-free: individual + /// entries are read/written atomically without a shared lock. + payload_mappings: Vec, + /// One `AtomicU32` per sector bitmap block (indexed by chunk number). + sector_bitmap_mappings: Vec, + + /// Per-payload-block I/O refcounts (see [`IoBlockRef`] for layout). 
+ io_refcounts: Vec, + + /// Notified whenever a block's refcount changes in a way that could + /// unblock a waiter: I/O count reaching zero (unblocks trim), + /// trim releasing a claim (unblocks I/O), or I/O count dropping + /// below the overflow threshold (unblocks I/O). + refcount_event: event_listener::Event, +} + +/// In-memory BAT entry. Compact 32-bit representation used in the in-memory +/// BAT array (not on disk). +/// +/// Layout: state (3 bits) | transitioning_to_fully_present (1 bit) | file_megabyte (28 bits) +/// +/// The 28-bit `file_megabyte` field supports files up to 2^28 MB = 256 TB. +#[bitfield(u32)] +#[derive(PartialEq, Eq)] +pub(crate) struct BlockMapping { + /// Block state (same values as BatEntryState). + #[bits(3)] + state: u8, + /// Set during allocation: space has been allocated but data I/O may still + /// be in flight. Other writers to this block must wait. + #[bits(1)] + pub transitioning_to_fully_present: bool, + /// File offset in megabytes. + #[bits(28)] + pub file_megabyte: u32, +} + +impl BlockMapping { + fn supported_bat_entry_bits() -> u64 { + u64::from( + BatEntry::new() + .with_state(0b111) + .with_file_offset_mb((1_u64 << 44) - 1), + ) + } + + /// File byte offset (converts the megabyte field to bytes). + pub fn file_offset(self) -> u64 { + self.file_megabyte() as u64 * MB1 + } + + /// Parse the block state. + /// + /// Panics if the raw state is invalid — this is an internal invariant + /// since states are validated at BAT load time and only set to known + /// values at runtime. + pub fn bat_state(self) -> BatEntryState { + BatEntryState::from_raw(self.state()).expect("InternalBlockMapping has invalid state") + } + + pub fn with_bat_state(self, state: BatEntryState) -> Self { + self.with_state(state as u8) + } + + /// Whether this mapping is soft-anchored: unmapped or undefined + /// with a non-zero file offset retained for potential reuse. + pub fn is_soft_anchored(self) -> bool { + let state = self.bat_state(); + matches!(state, BatEntryState::Unmapped | BatEntryState::Undefined) + && self.file_megabyte() != 0 + } + + /// Create a [`BlockMapping`] from an on-disk [`BatEntry`]. + /// + /// Validates the entry state and file offset. For non-differencing + /// disks (`has_parent == false`), normalizes `PartiallyPresent` to + /// `FullyPresent` at load time. + pub fn from_bat_entry(entry: BatEntry, has_parent: bool) -> Result { + if u64::from(entry) & !Self::supported_bat_entry_bits() != 0 { + return Err(CorruptionType::ReservedBatEntryFieldNonzero.into()); + } + let raw_state = entry.state(); + let mut bat_state = + BatEntryState::from_raw(raw_state).ok_or(CorruptionType::InvalidBlockState)?; + // Normalize PartiallyPresent → FullyPresent for non-diff disks. + if !has_parent && bat_state == BatEntryState::PartiallyPresent { + bat_state = BatEntryState::FullyPresent; + } + let file_mb = entry.file_offset_mb(); + if file_mb > 0x0FFF_FFFF { + return Err((CorruptionType::InvalidBlockState).into()); + } + Ok(BlockMapping::new() + .with_bat_state(bat_state) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(file_mb as u32)) + } + + /// Create a [`BlockMapping`] from an on-disk SBM [`BatEntry`]. + /// + /// Validates the entry state and file offset. Normalizes + /// `PartiallyPresent` to `FullyPresent` (compatibility). 
+ pub fn from_sbm_bat_entry(entry: BatEntry) -> Result { + if u64::from(entry) & !Self::supported_bat_entry_bits() != 0 { + return Err(CorruptionType::ReservedBatEntryFieldNonzero.into()); + } + let raw_state = entry.state(); + + let mut bat_state = + BatEntryState::from_raw(raw_state).ok_or(CorruptionType::InvalidBlockState)?; + if bat_state == BatEntryState::PartiallyPresent { + bat_state = BatEntryState::FullyPresent; + } + let file_mb = entry.file_offset_mb(); + if file_mb > 0x0FFF_FFFF { + return Err((CorruptionType::InvalidBlockState).into()); + } + Ok(BlockMapping::new() + .with_bat_state(bat_state) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(file_mb as u32)) + } +} + +/// Block type discriminator for BAT entries. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum BlockType { + /// A data payload block. + Payload, + /// A sector bitmap block (differencing disks only). + SectorBitmap, +} + +impl Bat { + /// Create a new BAT manager from parsed metadata. + /// + /// Computes chunk ratio, data block count, and sector bitmap block count. + pub fn new( + disk_size: u64, + block_size: u32, + logical_sector_size: u32, + has_parent: bool, + bat_length: u32, + ) -> Result { + let chunk_ratio = chunk_block_count(block_size, logical_sector_size); + if chunk_ratio == 0 { + return Err((CorruptionType::InvalidBlockSize).into()); + } + + let data_block_count = ceil_div(disk_size, block_size as u64) as u32; + let sector_bitmap_block_count = if has_parent { + ceil_div(data_block_count as u64, chunk_ratio as u64) as u32 + } else { + 0 + }; + + let entry_count = if has_parent { + sector_bitmap_block_count as u64 * (chunk_ratio as u64 + 1) + } else { + data_block_count as u64 + + (data_block_count.saturating_sub(1) as u64 / chunk_ratio as u64) + }; + + let required_bytes = entry_count * size_of::() as u64; + if required_bytes > bat_length as u64 { + return Err((CorruptionType::BatTooSmall).into()); + } + + let payload_mappings = (0..data_block_count).map(|_| AtomicU32::new(0)).collect(); + let sector_bitmap_mappings = (0..sector_bitmap_block_count) + .map(|_| AtomicU32::new(0)) + .collect(); + let io_refcounts = (0..data_block_count).map(|_| AtomicU16::new(0)).collect(); + + Ok(Bat { + data_block_count, + sector_bitmap_block_count, + chunk_ratio, + block_size, + has_parent, + payload_mappings, + sector_bitmap_mappings, + io_refcounts, + refcount_event: event_listener::Event::new(), + }) + } + + /// Try to atomically increment the I/O refcount for a block. + /// + /// Returns `true` if the increment succeeded, `false` if new I/O is + /// blocked. New I/O is blocked when: + /// - The trim-pending bit is set (trim has writer priority). + /// - The I/O count is at `MAX_IO_REFCOUNT` (would overflow). + /// - The block is trim-claimed (`TRIM_CLAIMED`). + fn try_increment_io_refcount(&self, block_number: u32) -> bool { + let rc = &self.io_refcounts[block_number as usize]; + loop { + let old = IoBlockRef(rc.load(Ordering::Acquire)); + if old.blocks_new_io() { + return false; + } + let new = old.0 + 1; + match rc.compare_exchange_weak(old.0, new, Ordering::AcqRel, Ordering::Acquire) { + Ok(_) => return true, + Err(_) => continue, + } + } + } + + /// Atomically decrement the I/O refcount. + /// + /// The trim-pending bit is preserved — only the I/O count in + /// bits 0-14 is decremented. Panics on underflow or if the block + /// is trim-claimed. 
+ /// + /// Returns `true` if callers should notify `refcount_event`: + /// when the I/O count hits zero (trim may be waiting), or when + /// the count drops from the overflow threshold. + #[must_use] + fn decrement_io_refcount(&self, block_number: u32) -> bool { + let prev = + IoBlockRef(self.io_refcounts[block_number as usize].fetch_sub(1, Ordering::AcqRel)); + assert!( + prev.io_count() > 0 && prev != IoBlockRef::TRIM_CLAIMED, + "io_refcount underflow or trim claimed on block {block_number} (was {:#06x})", + prev.0, + ); + prev.io_count() == 1 || prev.io_count() == IoBlockRef::MAX_IO_REFCOUNT + } + + /// Claim a block for trim, with writer priority. + pub(crate) async fn claim_for_trim(&self, block_number: u32) -> TrimGuard<'_> { + let rc = &self.io_refcounts[block_number as usize]; + loop { + let listener = self.refcount_event.listen(); + let result = rc.fetch_update(Ordering::AcqRel, Ordering::Acquire, |raw| { + let old = IoBlockRef(raw); + match old { + // Idle — claim directly. + IoBlockRef::FREE => Some(IoBlockRef::TRIM_CLAIMED.0), + // Pending bit set, I/Os drained — finish claiming. + IoBlockRef::TRIM_PENDING => Some(IoBlockRef::TRIM_CLAIMED.0), + // Already claimed — wait for release. + IoBlockRef::TRIM_CLAIMED => None, + // I/Os active, no pending bit — set it. + _ if !old.trim_pending() => Some(old.0 | IoBlockRef::TRIM_PENDING_BIT), + // Pending bit set, I/Os still draining — wait. + _ => None, + } + }); + break match result.map(IoBlockRef) { + Ok(IoBlockRef::FREE | IoBlockRef::TRIM_PENDING) => TrimGuard { + bat: self, + block_number, + }, + _ => { + // Wait for the I/O count to reach 0 or for the trim claim to be released. + listener.await; + continue; + } + }; + } + } + + /// Release a trim claim on a block (store 0), waking blocked I/O paths. + fn release_trim_claim(&self, block_number: u32) { + let prev = IoBlockRef(self.io_refcounts[block_number as usize].swap(0, Ordering::Release)); + assert_eq!( + prev, + IoBlockRef::TRIM_CLAIMED, + "release_trim_claim on block {block_number} that wasn't claimed (was {:#06x})", + prev.0, + ); + self.refcount_event.notify(usize::MAX); + } + + /// Load the current raw I/O refcount for a block (for testing). + #[cfg(test)] + pub(crate) fn io_refcount(&self, block_number: u32) -> u16 { + self.io_refcounts[block_number as usize].load(Ordering::Acquire) + } + + /// Compute the BAT entry index for a given data block number. + /// + /// For every `chunk_ratio` payload entries, one sector bitmap entry is + /// interleaved. The entry index accounts for these interleaved entries. + pub fn payload_entry_index(&self, block_number: u32) -> u32 { + block_number + (block_number / self.chunk_ratio) + } + + /// Compute the BAT entry index for a given sector bitmap block (chunk number). + /// + /// The sector bitmap entry follows every `chunk_ratio` payload entries. + pub fn sector_bitmap_entry_index(&self, chunk_number: u32) -> u32 { + ((chunk_number + 1) * self.chunk_ratio) + chunk_number + } + + /// Reverse-map a flat BAT entry number to (block_type, block_number). + /// + /// Returns `None` if the entry is beyond the end of the disk. + fn entry_number_to_block_id(&self, entry_number: u32) -> Option<(BlockType, u32)> { + let group_size = self.chunk_ratio + 1; + let group = entry_number / group_size; + let position = entry_number % group_size; + + if position == self.chunk_ratio { + // This is a sector bitmap / padding entry. 
+ if self.has_parent && group < self.sector_bitmap_block_count { + Some((BlockType::SectorBitmap, group)) + } else { + None + } + } else { + // This is a payload entry. + let block_number = group * self.chunk_ratio + position; + if block_number < self.data_block_count { + Some((BlockType::Payload, block_number)) + } else { + None + } + } + } + + /// Convert a virtual disk byte offset to a block number. + pub fn offset_to_block(&self, offset: u64) -> u32 { + (offset / self.block_size as u64) as u32 + } + + /// Compute the byte offset within a block for a given virtual disk offset. + #[cfg(test)] + pub fn offset_within_block(&self, offset: u64) -> u32 { + (offset % self.block_size as u64) as u32 + } + + /// Iterate over the block spans touched by a virtual disk range. + /// + /// Given a guest `offset` and `len`, yields one [`BlockSpan`] per + /// block touched, with the block-relative offset and clamped length. + /// This is the single source of truth for the block-walk arithmetic + /// used by read, write, complete, and abort paths. + pub fn block_spans(&self, offset: u64, len: u32) -> BlockSpanIter { + BlockSpanIter { + block_size: self.block_size, + base_offset: offset, + total_len: len, + current_offset: 0, + } + } + + /// Serialize a BAT page from in-memory state. + /// + /// Produces all entries for the given page, with TFP blocks having + /// their `file_offset_mb` masked to zero (allocation not committed + /// yet). + fn produce_page(&self, page_index: usize, buf: &mut [u8; CACHE_PAGE_SIZE as usize]) { + let base_entry = page_index as u32 * ENTRIES_PER_BAT_PAGE as u32; + for i in 0..ENTRIES_PER_BAT_PAGE as u32 { + let entry_number = base_entry + i; + let bat_entry = match self.entry_number_to_block_id(entry_number) { + Some((BlockType::Payload, block_number)) => { + let mapping = self.get_block_mapping(block_number); + let file_mb = if mapping.transitioning_to_fully_present() { + 0 + } else { + mapping.file_megabyte() as u64 + }; + BatEntry::new() + .with_state(mapping.state()) + .with_file_offset_mb(file_mb) + } + Some((BlockType::SectorBitmap, chunk_number)) => { + let mapping = self.get_sector_bitmap_mapping(chunk_number); + BatEntry::new() + .with_state(mapping.state()) + .with_file_offset_mb(mapping.file_megabyte() as u64) + } + None => BatEntry::new(), + }; + let offset = i as usize * size_of::(); + buf[offset..offset + size_of::()].copy_from_slice(bat_entry.as_bytes()); + } + } + + /// Write a block mapping to the cache, converting from in-memory + /// representation to on-disk BAT entry format. + /// + /// Atomically updates the in-memory BAT and the cache page under + /// the page lock, ensuring no window where the in-memory state is + /// visible but the cache page hasn't been stamped with the FSN. + /// + /// Uses `Overwrite` mode to avoid unnecessary disk reads. If the + /// page is already cached, patches only the single entry. If not + /// cached, builds the full page from in-memory state (no disk read). 
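+    ///
+    /// Minimal call sketch (illustrative only; assumes `bat`, `cache`,
+    /// `block`, and a prepared `mapping` are in scope):
+    ///
+    /// ```ignore
+    /// bat.write_block_mapping(&cache, BlockType::Payload, block, mapping, None)
+    ///     .await?;
+    /// ```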
+ pub async fn write_block_mapping( + &self, + cache: &PageCache, + block_type: BlockType, + block_number: u32, + mapping: BlockMapping, + pre_log_fsn: Option, + ) -> Result<(), VhdxIoError> { + let entry_number = match block_type { + BlockType::Payload => self.payload_entry_index(block_number), + BlockType::SectorBitmap => self.sector_bitmap_entry_index(block_number), + }; + let page_number = entry_number as usize / ENTRIES_PER_BAT_PAGE as usize; + let page_offset = page_number as u64 * CACHE_PAGE_SIZE; + let entry_within_page = entry_number as usize % ENTRIES_PER_BAT_PAGE as usize; + + let mut guard = cache + .acquire_write( + PageKey { + tag: BAT_TAG, + offset: page_offset, + }, + WriteMode::Overwrite, + ) + .await + .map_err(VhdxIoErrorInner::BatCache)?; + + // Update in-memory BAT under the page lock. This ensures a + // concurrent trim on a block sharing the same page can't dirty + // the page (and get it flushed to WAL) between our in-memory + // update and the FSN stamp below. + match block_type { + BlockType::Payload => { + self.set_block_mapping(block_number, mapping); + } + BlockType::SectorBitmap => { + self.set_sector_bitmap_mapping(block_number, mapping); + } + } + + if guard.is_overwriting() { + // Slow path: page not cached — build from in-memory state. + self.produce_page(page_number, &mut guard); + } else { + // Fast path: page is cached — patch just the one entry. + let bat_entry = BatEntry::new() + .with_state(mapping.state()) + .with_file_offset_mb(mapping.file_megabyte() as u64); + let byte_offset = entry_within_page * size_of::(); + guard[byte_offset..byte_offset + size_of::()] + .copy_from_slice(bat_entry.as_bytes()); + } + + // Set pre-log FSN while the page lock is still held, so + // that the FSN is visible atomically with the dirty-mark. + if let Some(fsn) = pre_log_fsn { + guard.set_pre_log_fsn(fsn); + } + + // BAT pages are always rebuildable from in-memory BatState, + // so prefer evicting them over sector bitmap pages. + guard.demote(); + + Ok(()) + } + + /// Read chunk size for BAT loading (256 KiB = 32768 entries). + const BAT_READ_CHUNK: usize = 256 * 1024; + + /// Load the in-memory BAT state from disk. + /// + /// Reads the BAT region in fixed-size chunks and does a single + /// sequential pass over all entries, dispatching payload vs. SBM + /// entries via [`entry_number_to_block_id`]. This avoids both a + /// large peak allocation and redundant reads of the same region. + /// + /// During parse, marks allocated blocks in the FreeSpaceTracker + /// and records soft-anchored blocks. + pub(crate) async fn load_bat_state( + &mut self, + file: &F, + bat_offset: u64, + bat_length: u32, + free_space: &FreeSpaceTracker, + eof_state: &mut EofState, + ) -> Result<(), OpenError> { + let bat_len = bat_length as usize; + let total_entries = bat_len / size_of::(); + let chunk_size = std::cmp::min(bat_len, Self::BAT_READ_CHUNK); + let entries_per_chunk = chunk_size / size_of::(); + + let mut file_pos: usize = 0; + let mut entry_num = 0; + + // Allocate a single read buffer, reused across iterations. + // On the last iteration, we may read up to `chunk_size` bytes + // even if fewer remain — the excess is zero-initialized and + // ignored by the parser. + let mut buf = file.alloc_buffer(chunk_size); + + while entry_num < total_entries { + // Read the next chunk. 
+ buf = file + .read_into(bat_offset + file_pos as u64, buf) + .await + .map_err(OpenErrorInner::Io)?; + + let entries_in_chunk = std::cmp::min(entries_per_chunk, total_entries - entry_num); + for i in 0..entries_in_chunk { + let byte_offset = i * size_of::(); + let entry = BatEntry::read_from_bytes( + &buf.as_ref()[byte_offset..byte_offset + size_of::()], + ) + .map_err(|_| CorruptionType::InvalidBlockState)?; + + match self.entry_number_to_block_id((entry_num + i) as u32) { + Some((BlockType::Payload, block_number)) => { + let mapping = BlockMapping::from_bat_entry(entry, self.has_parent)?; + if mapping.bat_state().is_allocated() { + let file_offset = mapping.file_offset(); + if file_offset != 0 { + free_space.mark_range_in_use( + eof_state, + file_offset, + self.block_size, + )?; + } + } else if (mapping.bat_state() == BatEntryState::Unmapped + || mapping.bat_state() == BatEntryState::Undefined) + && mapping.file_megabyte() != 0 + { + let file_offset = mapping.file_offset(); + free_space.mark_range_in_use( + eof_state, + file_offset, + self.block_size, + )?; + free_space.mark_trimmed_block( + block_number, + file_offset, + self.block_size, + )?; + } + self.payload_mappings[block_number as usize] + .store(mapping.into(), Ordering::Relaxed); + } + Some((BlockType::SectorBitmap, chunk_number)) => { + let mapping = BlockMapping::from_sbm_bat_entry(entry)?; + if mapping.bat_state().is_allocated() { + let file_offset = mapping.file_offset(); + if file_offset != 0 { + free_space.mark_range_in_use( + eof_state, + file_offset, + SECTOR_BITMAP_BLOCK_SIZE, + )?; + } + } + self.sector_bitmap_mappings[chunk_number as usize] + .store(mapping.into(), Ordering::Relaxed); + } + None => { + // Entry beyond the disk — padding per the VHDX spec. + } + } + } + + entry_num += entries_in_chunk; + file_pos += chunk_size; + } + + // Cross-validate: every PartiallyPresent payload block must have + // a corresponding allocated (FullyPresent) SBM block. This is + // required by the VHDX spec and enforced at runtime by + // ensure_sbm_allocated, but a corrupt file could violate it. + if self.has_parent { + for block in 0..self.data_block_count { + let mapping = BlockMapping::from( + self.payload_mappings[block as usize].load(Ordering::Relaxed), + ); + if mapping.bat_state() == BatEntryState::PartiallyPresent { + let chunk = block / self.chunk_ratio; + let sbm = BlockMapping::from( + self.sector_bitmap_mappings[chunk as usize].load(Ordering::Relaxed), + ); + if sbm.bat_state() != BatEntryState::FullyPresent { + return Err(CorruptionType::PartiallyPresentWithoutSectorBitmap.into()); + } + } + } + } + + Ok(()) + } + + /// Atomically increment I/O refcounts for a contiguous range of + /// blocks, returning a [`BatGuard`] that releases them on drop. + /// + /// Blocks are acquired in ascending order. If a block is claimed + /// by trim, the caller holds previously-acquired blocks and waits + /// for the blocked block to become available. Deadlock-free because + /// both I/O and trim always acquire blocks in ascending order. 
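+    ///
+    /// Illustrative usage for a non-empty I/O of `len` bytes starting at
+    /// guest `offset` (names assumed in scope):
+    ///
+    /// ```ignore
+    /// let first = bat.offset_to_block(offset);
+    /// let last = bat.offset_to_block(offset + len as u64 - 1);
+    /// let _guard = bat.acquire_io_refcounts(first, last - first + 1).await;
+    /// // ... issue block I/O; refcounts are released when the guard drops.
+    /// ```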
+ pub async fn acquire_io_refcounts(&self, start_block: u32, block_count: u32) -> BatGuard<'_> { + let mut guard = BatGuard { + bat: Some(self), + start_block, + block_count: 0, + }; + for block in start_block..start_block + block_count { + while !self.try_increment_io_refcount(block) { + let listener = self.refcount_event.listen(); + if !self.try_increment_io_refcount(block) { + listener.await; + } + } + guard.block_count += 1; + } + guard + } + + /// Look up the payload block mapping for a given data block number. + /// + /// Returns a point-in-time snapshot. Callers that hold I/O + /// refcounts can rely on the following: + /// + /// - **`file_offset` is stable for allocated blocks.** If the + /// mapping shows `FullyPresent` or `PartiallyPresent`, the file + /// offset won't be reclaimed out from under you — trim must + /// drain I/O refcounts before it can claim the block. + /// - **State can only advance, not regress.** A block that is + /// `FullyPresent` won't revert to `NotPresent` while I/O + /// refcounts are held. (Trim sets the pending bit to block new + /// I/O, then waits for existing I/O to drain.) + /// - **TFP blocks are in flight.** If `transitioning_to_fully_present` + /// is set, another writer is mid-allocation. The write path waits + /// on `allocation_event` and retries; the read path ignores TFP + /// and uses the current state+offset directly (safe because the + /// file offset is valid and I/O refcounts prevent reclamation). + /// + /// Without I/O refcounts (or a trim claim), the mapping is purely + /// advisory — the block could be trimmed between the load and any + /// action on it. + pub(crate) fn get_block_mapping(&self, block_number: u32) -> BlockMapping { + BlockMapping::from(self.payload_mappings[block_number as usize].load(Ordering::Acquire)) + } + + /// Look up the sector bitmap block mapping for a given chunk number. + /// + /// SBM mappings are set once during allocation and never revert, + /// so any reader that sees `FullyPresent` can rely on the file + /// offset being stable indefinitely. A reader that sees + /// `NotPresent` must allocate the SBM block before proceeding + /// (see `ensure_sbm_allocated`). + pub(crate) fn get_sector_bitmap_mapping(&self, chunk_number: u32) -> BlockMapping { + BlockMapping::from( + self.sector_bitmap_mappings[chunk_number as usize].load(Ordering::Acquire), + ) + } + + /// Update the payload block mapping for a given data block number. + /// + /// In-memory only — does not persist to cache or disk. Use + /// [`write_block_mapping`](Self::write_block_mapping) to persist. + /// + /// Allowed transitions and their required guards: + /// + /// - Unallocated → same state + TFP + file offset: `allocation_lock`. + /// (Unallocated = NotPresent, Zero, Unmapped, or Undefined.) + /// - PartiallyPresent → same state + TFP: `allocation_lock`. + /// - Any + TFP → original mapping (revert): abort path — TFP + /// acts as an exclusive flag so no other guard is needed. + /// - Soft-anchored → same state + file_megabyte=0: `allocation_lock`. + /// + /// The TFP bit is the key invariant: once set on a block, no other + /// allocator will touch that block (they wait on `allocation_event`), + /// and trim cannot reach it because the allocator holds I/O + /// refcounts on TFP blocks. This makes the setter the exclusive + /// owner until TFP is cleared. 
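+    ///
+    /// Sketch of the allocation transition (illustrative only — the real
+    /// allocator holds `allocation_lock` and I/O refcounts as described
+    /// above; `new_offset_mb` is an assumed name):
+    ///
+    /// ```ignore
+    /// let old = bat.get_block_mapping(block);
+    /// bat.set_block_mapping(
+    ///     block,
+    ///     old.with_transitioning_to_fully_present(true)
+    ///         .with_file_megabyte(new_offset_mb),
+    /// );
+    /// ```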
+ pub(crate) fn set_block_mapping(&self, block_number: u32, mapping: BlockMapping) { + self.payload_mappings[block_number as usize].store(mapping.into(), Ordering::Release); + } + + /// Update the sector bitmap block mapping for a given chunk number. + /// + /// Only called from [`write_block_mapping`](Self::write_block_mapping) + /// under the page cache write lock. SBM mappings transition from + /// `NotPresent` to `FullyPresent` exactly once and never revert. + fn set_sector_bitmap_mapping(&self, chunk_number: u32, mapping: BlockMapping) { + self.sector_bitmap_mappings[chunk_number as usize].store(mapping.into(), Ordering::Release); + } + + /// Initialize payload mappings for testing. Replaces any existing + /// mappings with `data_block_count` entries set to `NotPresent`. + #[cfg(test)] + pub(crate) fn init_test_payload_mappings(&mut self) { + let not_present = BlockMapping::new().with_bat_state(BatEntryState::NotPresent); + for mapping in &self.payload_mappings { + mapping.store(not_present.into(), Ordering::Relaxed); + } + } +} + +#[must_use] +pub struct BatGuard<'a> { + bat: Option<&'a Bat>, + /// First payload block number with incremented refcount. + start_block: u32, + /// Number of consecutive payload blocks with incremented refcounts. + block_count: u32, +} + +impl<'a> BatGuard<'a> { + pub(crate) fn empty() -> Self { + Self { + bat: None, + start_block: 0, + block_count: 0, + } + } +} + +impl Drop for BatGuard<'_> { + fn drop(&mut self) { + let Some(bat) = self.bat else { return }; + let mut notify = false; + for block in self.start_block..self.start_block + self.block_count { + notify |= bat.decrement_io_refcount(block); + } + if notify { + bat.refcount_event.notify(usize::MAX); + } + } +} + +#[must_use] +pub struct TrimGuard<'a> { + bat: &'a Bat, + block_number: u32, +} + +impl Drop for TrimGuard<'_> { + fn drop(&mut self) { + self.bat.release_trim_claim(self.block_number); + } +} + +/// A single block's portion of a virtual disk I/O range. +/// +/// Produced by [`Bat::block_spans`]. Each span describes one block's +/// contribution to an `(offset, len)` range. +#[derive(Debug, Clone, Copy)] +pub struct BlockSpan { + /// Block number within the BAT. + pub block_number: u32, + /// Byte offset within the block where this span starts. + pub block_offset: u32, + /// Number of bytes this span covers within the block. + pub length: u32, + /// Absolute guest virtual disk byte offset for this span. + pub virtual_offset: u64, +} + +impl BlockSpan { + /// Whether this span covers the entire block. + pub fn is_full_block(&self, block_size: u32) -> bool { + self.block_offset == 0 && self.length >= block_size + } +} + +/// Iterator over [`BlockSpan`]s produced by [`Bat::block_spans`]. 
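+///
+/// Illustrative sketch: with 2 MiB blocks, a 4 KiB I/O straddling a block
+/// boundary yields two spans (assumes a suitable `bat` is in scope):
+///
+/// ```ignore
+/// let spans: Vec<_> = bat.block_spans(2 * 1024 * 1024 - 2048, 4096).collect();
+/// assert_eq!(spans.len(), 2);
+/// assert_eq!(spans[0].length, 2048);
+/// assert_eq!(spans[1].block_offset, 0);
+/// ```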
+pub struct BlockSpanIter { + block_size: u32, + base_offset: u64, + total_len: u32, + current_offset: u32, +} + +impl Iterator for BlockSpanIter { + type Item = BlockSpan; + + fn next(&mut self) -> Option { + if self.current_offset >= self.total_len { + return None; + } + let virtual_offset = self.base_offset + self.current_offset as u64; + let block_number = (virtual_offset / self.block_size as u64) as u32; + let block_offset = (virtual_offset % self.block_size as u64) as u32; + let length = std::cmp::min( + self.block_size - block_offset, + self.total_len - self.current_offset, + ); + self.current_offset += length; + Some(BlockSpan { + block_number, + block_offset, + length, + virtual_offset, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::format; + use pal_async::async_test; + use std::sync::Arc; + + #[test] + fn chunk_ratio_default_params() { + // 2 MiB blocks, 512-byte sectors → chunk_ratio = 2048 + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + assert_eq!(bat.chunk_ratio, 2048); + } + + #[test] + fn chunk_ratio_various_sizes() { + // 1 MiB blocks, 512 sectors + let bat = Bat::new(format::GB1, MB1 as u32, 512, false, MB1 as u32).unwrap(); + assert_eq!(bat.chunk_ratio, 4096); + + // 4 MiB blocks, 512 sectors + let bat = Bat::new(format::GB1, 4 * MB1 as u32, 512, false, MB1 as u32).unwrap(); + assert_eq!(bat.chunk_ratio, 1024); + + // 32 MiB blocks, 512 sectors + let bat = Bat::new(format::GB1, 32 * MB1 as u32, 512, false, MB1 as u32).unwrap(); + assert_eq!(bat.chunk_ratio, 128); + + // 256 MiB blocks, 512 sectors + let bat = Bat::new(format::GB1, 256 * MB1 as u32, 512, false, MB1 as u32).unwrap(); + assert_eq!(bat.chunk_ratio, 16); + + // 2 MiB blocks, 4096 sectors: sectors_per_block = 512, chunk_ratio = 8388608 / 512 = 16384 + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 4096, + false, + MB1 as u32, + ) + .unwrap(); + assert_eq!(bat.chunk_ratio, 16384); + + // 1 MiB blocks, 4096 sectors: sectors_per_block = 256, chunk_ratio = 8388608 / 256 = 32768 + let bat = Bat::new(format::GB1, MB1 as u32, 4096, false, MB1 as u32).unwrap(); + assert_eq!(bat.chunk_ratio, 32768); + } + + #[test] + fn payload_entry_index_calculations() { + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + // chunk_ratio = 2048 + assert_eq!(bat.payload_entry_index(0), 0); + assert_eq!(bat.payload_entry_index(1), 1); + assert_eq!( + bat.payload_entry_index(bat.chunk_ratio - 1), + bat.chunk_ratio - 1 + ); + // At chunk_ratio, we skip one SBM slot. + assert_eq!( + bat.payload_entry_index(bat.chunk_ratio), + bat.chunk_ratio + 1 + ); + assert_eq!( + bat.payload_entry_index(bat.chunk_ratio + 1), + bat.chunk_ratio + 2 + ); + // At 2 * chunk_ratio, skip another. + assert_eq!( + bat.payload_entry_index(2 * bat.chunk_ratio), + 2 * bat.chunk_ratio + 2 + ); + } + + #[test] + fn sector_bitmap_entry_index_calculations() { + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + true, + MB1 as u32, + ) + .unwrap(); + // SBM entry 0 is at position chunk_ratio. + assert_eq!(bat.sector_bitmap_entry_index(0), bat.chunk_ratio); + // SBM entry 1 is at position 2*chunk_ratio + 1. + assert_eq!(bat.sector_bitmap_entry_index(1), 2 * bat.chunk_ratio + 1); + } + + #[test] + fn validate_bat_size_ok() { + // For 1 GiB / 2 MiB = 512 data blocks, chunk_ratio = 2048. + // entries = 512 + ((512-1)/2048) = 512 + 0 = 512 + // 512 * 8 = 4096 bytes. 
Any bat_length >= 4096 is fine. + Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + } + + #[test] + fn validate_bat_size_too_small() { + // 512 entries * 8 bytes = 4096 bytes needed. + let result = Bat::new(format::GB1, format::DEFAULT_BLOCK_SIZE, 512, false, 4095); + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::BatTooSmall + ))) + )); + } + + #[test] + fn offset_to_block_calculations() { + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + assert_eq!(bat.offset_to_block(0), 0); + assert_eq!( + bat.offset_to_block(format::DEFAULT_BLOCK_SIZE as u64 - 1), + 0 + ); + assert_eq!(bat.offset_to_block(format::DEFAULT_BLOCK_SIZE as u64), 1); + assert_eq!( + bat.offset_to_block(format::DEFAULT_BLOCK_SIZE as u64 * 10 + 42), + 10 + ); + } + + #[test] + fn offset_within_block_calculations() { + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + assert_eq!(bat.offset_within_block(0), 0); + assert_eq!(bat.offset_within_block(512), 512); + assert_eq!( + bat.offset_within_block(format::DEFAULT_BLOCK_SIZE as u64), + 0 + ); + assert_eq!( + bat.offset_within_block(format::DEFAULT_BLOCK_SIZE as u64 + 1024), + 1024 + ); + } + + #[test] + fn mapping_max_file_megabyte() { + let max_mb: u32 = (1 << 28) - 1; // 268435455 + let mapping = BlockMapping::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_megabyte(max_mb); + assert_eq!(mapping.file_megabyte(), max_mb); + assert_eq!(mapping.file_offset(), max_mb as u64 * MB1); + } + + #[test] + fn mapping_tfp_flag() { + let with_tfp = BlockMapping::new() + .with_state(BatEntryState::NotPresent as u8) + .with_transitioning_to_fully_present(true); + assert!(with_tfp.transitioning_to_fully_present()); + + let without_tfp = BlockMapping::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_transitioning_to_fully_present(false); + assert!(!without_tfp.transitioning_to_fully_present()); + + // TFP is independent of state. + assert_eq!(with_tfp.state(), BatEntryState::NotPresent as u8); + assert_eq!(without_tfp.state(), BatEntryState::FullyPresent as u8); + } + + #[test] + fn mapping_from_bat_entry() { + let entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(100); + let mapping = BlockMapping::from_bat_entry(entry, false).unwrap(); + assert_eq!(mapping.state(), BatEntryState::FullyPresent as u8); + assert_eq!(mapping.file_megabyte(), 100); + assert!(!mapping.transitioning_to_fully_present()); + } + + #[test] + fn mapping_rejects_reserved_bat_entry_bits() { + let entry = BatEntry::from( + u64::from(BatEntry::new().with_state(BatEntryState::FullyPresent as u8)) | (1 << 3), + ); + let result = BlockMapping::from_bat_entry(entry, false); + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::ReservedBatEntryFieldNonzero + ))) + )); + } + + #[test] + fn entry_number_to_block_id_payload() { + // Non-differencing: all entries are payload. 
+ let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + // chunk_ratio = 2048, data_block_count = 512 + for i in 0..bat.data_block_count { + let entry_index = bat.payload_entry_index(i); + let result = bat.entry_number_to_block_id(entry_index); + assert_eq!(result, Some((BlockType::Payload, i)), "block {i}"); + } + } + + #[test] + fn entry_number_to_block_id_with_sbm() { + // Differencing disk with SBM entries. + // Use small chunk_ratio to exercise interleaving. + // 1 MiB blocks, 4096 sectors → chunk_ratio = 32768. + // Use 256 MiB blocks, 512 sectors → chunk_ratio = 16. + let bat = Bat::new(format::GB1, 256 * MB1 as u32, 512, true, MB1 as u32).unwrap(); + assert_eq!(bat.chunk_ratio, 16); + // data_block_count = 4, sector_bitmap_block_count = 1 + + // Payload entries for group 0: positions 0..15 → blocks 0..3 + for i in 0..bat.data_block_count { + let entry_index = bat.payload_entry_index(i); + let result = bat.entry_number_to_block_id(entry_index); + assert_eq!(result, Some((BlockType::Payload, i)), "payload block {i}"); + } + + // SBM entry for chunk 0 at position chunk_ratio = 16 + let sbm_index = bat.sector_bitmap_entry_index(0); + assert_eq!( + bat.entry_number_to_block_id(sbm_index), + Some((BlockType::SectorBitmap, 0)) + ); + } + + #[test] + fn entry_number_to_block_id_beyond_end() { + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + false, + MB1 as u32, + ) + .unwrap(); + // Entry beyond all data blocks should return None. + let beyond = bat.payload_entry_index(bat.data_block_count); + assert_eq!(bat.entry_number_to_block_id(beyond), None); + } + + /// Non-differencing disk with data_block_count > chunk_ratio. + /// + /// The BAT has padding entries at every chunk_ratio boundary. These + /// must NOT be misidentified as payload entries. + #[test] + fn entry_number_to_block_id_padding_not_payload() { + // Use 256 MiB blocks so chunk_ratio is small (16 with 512B sectors). + // 8 GiB disk → data_block_count = 32 (> chunk_ratio=16). + let bat = Bat::new( + 8 * format::GB1, + 256 * MB1 as u32, + 512, + false, + 4 * MB1 as u32, // BAT length large enough + ) + .unwrap(); + assert_eq!(bat.chunk_ratio, 16); + assert_eq!(bat.data_block_count, 32); + + // Entry 16 is the padding entry (position == chunk_ratio in group 0). + // It should NOT map to payload block 16. + let padding_entry = bat.chunk_ratio; // entry 16 + let result = bat.entry_number_to_block_id(padding_entry); + assert_eq!( + result, None, + "entry {} is a padding entry on non-diff disk and should return None, \ + but got {:?}", + padding_entry, result + ); + + // Payload block 16 should be at entry 17 (payload_entry_index(16) = 16 + 16/16 = 17). + let real_entry = bat.payload_entry_index(16); + assert_eq!(real_entry, 17); + let result = bat.entry_number_to_block_id(real_entry); + assert_eq!( + result, + Some((BlockType::Payload, 16)), + "entry {} should map to payload block 16", + real_entry + ); + } + + // ---- Refcount async behavior tests ---- + + fn make_test_bat() -> Bat { + Bat::new(4 * MB1, format::DEFAULT_BLOCK_SIZE, 512, false, MB1 as u32).unwrap() + } + + #[test] + fn decrement_preserves_trim_pending_bit() { + let bat = make_test_bat(); + // Simulate: trim-pending with 3 in-flight I/Os draining. + bat.io_refcounts[0].store(IoBlockRef::TRIM_PENDING_BIT | 3, Ordering::Release); + assert!(!bat.decrement_io_refcount(0), "3→2 should not need notify"); + // After decrement: pending bit preserved, count is 2. 
+ let cur = IoBlockRef(bat.io_refcount(0)); + assert!(cur.trim_pending()); + assert_eq!(cur.io_count(), 2); + } + + #[test] + #[should_panic(expected = "io_refcount underflow")] + fn decrement_panics_on_underflow() { + let _ = make_test_bat().decrement_io_refcount(0); + } + + #[test] + #[should_panic(expected = "trim claimed")] + fn decrement_panics_on_trim_claimed() { + let bat = make_test_bat(); + bat.io_refcounts[0].store(IoBlockRef::TRIM_CLAIMED.0, Ordering::Release); + let _ = bat.decrement_io_refcount(0); + } + + #[test] + #[should_panic(expected = "wasn't claimed")] + fn release_trim_claim_panics_if_not_claimed() { + make_test_bat().release_trim_claim(0); + } + + #[async_test] + async fn acquire_io_on_idle_block() { + let bat = make_test_bat(); + let guard = bat.acquire_io_refcounts(0, 1).await; + assert_eq!(bat.io_refcount(0), 1); + drop(guard); + assert_eq!(bat.io_refcount(0), 0); + } + + #[async_test] + async fn acquire_io_resumes_after_trim_releases() { + let bat = Arc::new(make_test_bat()); + bat.io_refcounts[0].store(IoBlockRef::TRIM_CLAIMED.0, Ordering::Release); + + let bat2 = bat.clone(); + let io_task = async move { + let guard = bat2.acquire_io_refcounts(0, 1).await; + assert_eq!(bat2.io_refcount(0), 1); + drop(guard); + }; + + let release_task = async { + bat.release_trim_claim(0); + }; + + futures::future::join(io_task, release_task).await; + assert_eq!(bat.io_refcount(0), 0); + } + + #[async_test] + async fn acquire_io_multi_block_rolls_back_on_partial_conflict() { + let bat = Arc::new(make_test_bat()); + bat.io_refcounts[1].store(IoBlockRef::TRIM_CLAIMED.0, Ordering::Release); + + let bat2 = bat.clone(); + let io_task = async move { + let guard = bat2.acquire_io_refcounts(0, 2).await; + assert_eq!(bat2.io_refcount(0), 1); + assert_eq!(bat2.io_refcount(1), 1); + drop(guard); + }; + + let release_task = async { + bat.release_trim_claim(1); + }; + + futures::future::join(io_task, release_task).await; + } + + #[async_test] + async fn claim_for_trim_on_idle_block() { + let bat = make_test_bat(); + let guard = bat.claim_for_trim(0).await; + assert_eq!(bat.io_refcount(0), IoBlockRef::TRIM_CLAIMED.0); + drop(guard); + assert_eq!(bat.io_refcount(0), 0); + } + + #[async_test] + async fn claim_for_trim_waits_for_io_drain() { + let bat = Arc::new(make_test_bat()); + + let io_guard = bat.acquire_io_refcounts(0, 1).await; + assert_eq!(bat.io_refcount(0), 1); + + let trim_task = async { + let guard = bat.claim_for_trim(0).await; + assert_eq!(bat.io_refcount(0), IoBlockRef::TRIM_CLAIMED.0); + guard + }; + + let drain_task = async { + // After trim_task's first poll, trim-pending is set. + assert!(IoBlockRef(bat.io_refcount(0)).trim_pending()); + assert!(!bat.try_increment_io_refcount(0)); + drop(io_guard); + }; + + let (trim_guard, ()) = futures::future::join(trim_task, drain_task).await; + drop(trim_guard); + assert_eq!(bat.io_refcount(0), 0); + } + + #[async_test] + async fn trim_has_writer_priority_over_new_io() { + let bat = Arc::new(make_test_bat()); + + // Block 0 has an in-flight I/O. + let io_guard = bat.acquire_io_refcounts(0, 1).await; + + // Trim claims — sets pending, waits for drain. + let trim_task = async { + let guard = bat.claim_for_trim(0).await; + assert_eq!(bat.io_refcount(0), IoBlockRef::TRIM_CLAIMED.0); + guard + }; + let drain_task = async { drop(io_guard) }; + + let (trim_guard, ()) = futures::future::join(trim_task, drain_task).await; + + // Trim owns the block. New I/O should be blocked. 
+ assert!(!bat.try_increment_io_refcount(0)); + + // Release trim, then new I/O should succeed. + drop(trim_guard); + let io_guard2 = bat.acquire_io_refcounts(0, 1).await; + assert_eq!(bat.io_refcount(0), 1); + drop(io_guard2); + assert_eq!(bat.io_refcount(0), 0); + } + + #[async_test] + async fn acquire_io_blocked_at_overflow_resumes() { + let bat = Arc::new(make_test_bat()); + bat.io_refcounts[0].store(IoBlockRef::MAX_IO_REFCOUNT, Ordering::Release); + + let bat2 = bat.clone(); + let io_task = async move { + let guard = bat2.acquire_io_refcounts(0, 1).await; + assert_eq!(bat2.io_refcount(0), IoBlockRef::MAX_IO_REFCOUNT); + drop(guard); + }; + + let unblock_task = async { + if bat.decrement_io_refcount(0) { + bat.refcount_event.notify(usize::MAX); + } + }; + + futures::future::join(io_task, unblock_task).await; + } +} diff --git a/vm/devices/storage/vhdx/src/cache.rs b/vm/devices/storage/vhdx/src/cache.rs new file mode 100644 index 0000000000..9aa2edb4fc --- /dev/null +++ b/vm/devices/storage/vhdx/src/cache.rs @@ -0,0 +1,1969 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Write-back page cache for VHDX metadata pages. +//! +//! Provides a hash-table-backed, page-granularity (4 KiB) caching layer over +//! an [`AsyncFile`](crate::AsyncFile). Pages are identified by a [`PageKey`] +//! consisting of a tag (u8) and an offset within a tagged region. Tags map +//! to base file offsets, allowing region relocation without invalidating +//! cached pages. +//! +//! Modified pages accumulate as **Dirty** in the cache. On [`commit()`](PageCache::commit), +//! dirty pages are sent to the [log task](crate::log_task) via a mesh channel +//! for WAL persistence. The log task applies them to their final file offsets +//! in the background. +//! +//! Page data is stored as `Arc<[u8; PAGE_SIZE]>` to enable zero-copy commit +//! (Arc::clone) and implicit COW (Arc::make_mut) when a page is modified while +//! the log task holds a reference. +//! +//! # Write Ordering +//! +//! The cache guarantees that writes are **ordered** through the log. If a +//! caller writes page A, then later writes page B, the only crash-recovery +//! outcomes are: {neither}, {A only}, or {both A and B}. It is never the case +//! that B is persisted without A. +//! +//! This ordering is maintained by **batch-full commit**: when the dirty page +//! count reaches [`MAX_COMMIT_PAGES`] and a new page is about to become dirty, +//! the cache automatically commits the current dirty set to the log before +//! allowing the new page to enter the dirty set. + +use crate::AsyncFile; +use crate::error::CacheError; +use crate::flush::Fsn; +use crate::log_permits::LogPermits; +use crate::log_task::LogClient; +use crate::log_task::LogData; +use crate::log_task::Lsn; +use crate::lsn_watermark::LsnWatermark; +use parking_lot::ArcMutexGuard; +use parking_lot::Mutex; +use std::collections::HashMap; +use std::collections::hash_map; +use std::sync::Arc; + +/// Page size used by the cache (4 KiB). +pub const PAGE_SIZE: usize = 4096; + +/// Maximum number of dirty pages per commit batch. +/// +/// Derived from 1/4 of the minimum 1 MiB VHDX log. With 0 zero ranges: +/// entry_length(N) = ceil((64 + 32*N) / 4096) * 4096 + N * 4096 +/// (N+1)*4096 + 4096 (guard) ≤ 262144 → N ≤ 62 +/// +/// Note: the permit count is a *multiple* of this value (see `open.rs`) +/// to allow pipelining — multiple batches can be in-flight in the +/// log/apply pipeline simultaneously. 
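+///
+/// Sanity check at N = 62: the header is 64 + 32*62 = 2048 bytes (one 4 KiB
+/// page after rounding), so the entry spans 63 pages = 258048 bytes; adding
+/// the 4 KiB guard gives exactly 262144 bytes, i.e. 1/4 of 1 MiB.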
+pub const MAX_COMMIT_PAGES: usize = 62; + +/// Key identifying a cached page. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct PageKey { + /// Tag selecting the region (e.g., 0 = BAT, 1 = metadata). + pub tag: u8, + /// Byte offset within the tagged region. Must be 4 KiB aligned. + pub offset: u64, +} + +/// Write mode for page acquisition. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum WriteMode { + /// Page is loaded from file if not cached. Caller will modify parts. + Modify, + /// Page is NOT loaded from file (caller will overwrite the entire page). + Overwrite, +} + +/// Per-page lifecycle state. +/// +/// Encodes the dirty flag, permit state, and data provenance as a single +/// enum to prevent invalid combinations. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum PageState { + /// Page is not dirty. Data may or may not be loaded (`data` can be + /// `None` for a freshly created entry that hasn't been loaded or + /// written yet). + Clean, + /// Page data is being loaded from disk by another task. + /// Other acquirers wait on `state_event`. + Loading, + /// A log permit is being acquired for this page. + /// Other acquirers wait on `state_event`. + AcquiringPermit, + /// Page has been modified (or a permit has been acquired for it). + /// A permit is consumed (transfers to the log task on commit). + Dirty, +} + +/// Internal per-page data, generic over the file buffer type. +struct PageData { + /// The page contents as `Arc` for zero-copy commit and COW. + /// `Some` when `state` is `Dirty`, and when `Clean` after a + /// successful load or write. + /// `None` when `Clean` (freshly created, not yet loaded), + /// `Loading`, or `AcquiringPermit`. + data: Option>, + /// Page lifecycle state. + state: PageState, + /// If set, the log task must wait for this FSN to complete before + /// including this page in a log entry. + pre_log_fsn: Option, + /// Index into `PageMap::lru`. Allocated on entry creation. + lru_index: usize, + /// Hint: this page is cheap to regenerate; evict before other pages. + demoted: bool, +} + +/// Node in the slab-indexed LRU doubly-linked list. +struct LruNode { + key: PageKey, + prev: usize, + next: usize, + linked: bool, +} + +/// Slab-indexed doubly-linked list for LRU eviction ordering. +/// +/// Index 0 is a sentinel node. `sentinel.next` is the MRU end, +/// `sentinel.prev` is the LRU end (eviction candidate). All +/// operations are O(1). +struct LruList { + nodes: Vec, + free: Vec, +} + +impl LruList { + /// Create a new list with only the sentinel node. + fn new() -> Self { + Self { + nodes: vec![LruNode { + key: PageKey { tag: 0, offset: 0 }, + prev: 0, + next: 0, + linked: true, // sentinel is always "linked" + }], + free: Vec::new(), + } + } + + /// Allocate a slot for a new page. The node is NOT linked. + fn alloc(&mut self, key: PageKey) -> usize { + if let Some(idx) = self.free.pop() { + self.nodes[idx] = LruNode { + key, + prev: 0, + next: 0, + linked: false, + }; + idx + } else { + let idx = self.nodes.len(); + self.nodes.push(LruNode { + key, + prev: 0, + next: 0, + linked: false, + }); + idx + } + } + + /// Returns true if the node is currently linked in the list. + fn is_linked(&self, idx: usize) -> bool { + debug_assert!(idx != 0, "cannot check sentinel linkage"); + self.nodes[idx].linked + } + + /// Remove a node from its current position. No-op if unlinked. 
+ fn unlink(&mut self, idx: usize) { + if idx == 0 || !self.is_linked(idx) { + return; + } + let prev = self.nodes[idx].prev; + let next = self.nodes[idx].next; + self.nodes[prev].next = next; + self.nodes[next].prev = prev; + self.nodes[idx].prev = 0; + self.nodes[idx].next = 0; + self.nodes[idx].linked = false; + } + + /// Move a node to the MRU end. Works whether linked or unlinked. + fn move_to_front(&mut self, idx: usize) { + if idx == 0 { + return; + } + // Already at front? + if self.nodes[0].next == idx { + return; + } + // Remove from current position if linked. + if self.nodes[idx].linked { + let prev = self.nodes[idx].prev; + let next = self.nodes[idx].next; + self.nodes[prev].next = next; + self.nodes[next].prev = prev; + } + // Insert after sentinel. + let old_front = self.nodes[0].next; + self.nodes[idx].prev = 0; + self.nodes[idx].next = old_front; + self.nodes[0].next = idx; + self.nodes[old_front].prev = idx; + self.nodes[idx].linked = true; + } + + /// Move a node to the LRU end. Works whether linked or unlinked. + fn move_to_back(&mut self, idx: usize) { + if idx == 0 { + return; + } + // Already at back? + if self.nodes[0].prev == idx { + return; + } + // Remove from current position if linked. + if self.nodes[idx].linked { + let prev = self.nodes[idx].prev; + let next = self.nodes[idx].next; + self.nodes[prev].next = next; + self.nodes[next].prev = prev; + } + // Insert before sentinel. + let old_back = self.nodes[0].prev; + self.nodes[idx].next = 0; + self.nodes[idx].prev = old_back; + self.nodes[0].prev = idx; + self.nodes[old_back].next = idx; + self.nodes[idx].linked = true; + } + + /// Return the index of the LRU tail node, or 0 if empty. + fn tail(&self) -> usize { + self.nodes[0].prev + } + + /// Unlink and recycle a node. + fn dealloc(&mut self, idx: usize) { + self.unlink(idx); + self.free.push(idx); + } +} + +/// Number of distinct cache tags (BAT=0, METADATA=1, SBM=2). +const TAG_COUNT: usize = 3; + +/// Entry in the page map. Wraps the page data mutex with metadata +/// that can be read under the map lock without taking the page lock. +struct CacheEntry { + page: Arc>>, + /// LSN of the most recent commit that included this page. + /// Set under the map lock in `commit_locked`, read under the map + /// lock in eviction — no page lock needed. 0 = never committed. + committed_lsn: Lsn, + /// True when the page is clean and no writer holds it. + /// Set under the map lock on all write-acquire and write-release + /// paths. Eviction checks this without taking the page lock. + idle: bool, +} + +/// Internal page map wrapping the `HashMap` and dirty page counter. +struct PageMap { + map: HashMap>, + /// Number of pages with a consumed permit (Dirty, or Clean with + /// an active `WritePageGuard` that hasn't called `DerefMut` yet). + /// Maintained under the map lock to prevent races. + dirty_count: usize, + /// Log client for sending transactions. `None` for read-only caches. + log_client: Option>, + /// Base file offset per tag, indexed by tag value. + tag_offsets: [u64; TAG_COUNT], + /// LRU eviction list. Protected by the same lock as `map`. + lru: LruList, +} + +impl PageMap { + /// Resolve a [`PageKey`] to an absolute file offset. + fn resolve_offset(&self, key: PageKey) -> u64 { + self.tag_offsets[key.tag as usize] + key.offset + } +} + +/// Action to perform when a page isn't ready (returned by sync helpers). +/// This enum is `Send` — it never contains `ArcMutexGuard`. +enum PendingAction { + /// Wait for another task to finish loading/acquiring. 
+ Wait(event_listener::EventListener), + /// Load page data from disk at this file offset. Carries the page + /// entry Arc so `complete_load` can skip the map re-lookup. + Load(u64, Arc>>), +} + +/// Action for acquire_write when the page isn't ready. +/// This enum is `Send` — it never contains `ArcMutexGuard`. +enum WritePendingAction { + /// Wait for another task to finish loading/acquiring. + Wait(event_listener::EventListener), + /// Load page data from disk at this file offset. Carries the page + /// entry Arc so `complete_load` can skip the map re-lookup. + Load(u64, Arc>>), + /// Acquire a log permit. Carries the page entry Arc so + /// `finalize_permit` can skip the map re-lookup. + AcquirePermit(Arc>>), +} + +/// Log pipeline state shared between the cache and the log/apply tasks. +/// +/// Present only when the file is opened writable with a log task. +pub(crate) struct CacheLogState { + /// Failable semaphore for log backpressure. + pub permits: Arc, + /// LSN watermark published by the apply task. + pub applied_lsn: Arc, +} + +/// Write-back page cache backed by an [`AsyncFile`]. +pub struct PageCache { + pub(crate) file: Arc, + pages: Mutex>, + log_state: Option, + /// Notified when a page transitions out of `Loading` or `AcquiringPermit`. + state_event: event_listener::Event, + /// Maximum number of pages to keep in the cache. 0 = unlimited. + quota: usize, +} + +impl PageCache { + /// Create a new cache backed by the given file. + pub fn new( + file: Arc, + log_client: Option>, + log_state: Option, + quota: usize, + ) -> Self { + Self { + file, + pages: Mutex::new(PageMap { + map: HashMap::new(), + dirty_count: 0, + log_client, + tag_offsets: [0; TAG_COUNT], + lru: LruList::new(), + }), + log_state, + state_event: event_listener::Event::new(), + quota, + } + } + + /// Take the log client out of the cache, returning it. + pub fn take_log_client(&mut self) -> Option> { + self.pages.lock().log_client.take() + } + + /// Set the log pipeline state (for late initialization after log task spawn). + pub fn set_log_state(&mut self, state: CacheLogState) { + self.log_state = Some(state); + } + + /// Register a tag with its base file offset. + pub fn register_tag(&mut self, tag: u8, base_offset: u64) { + self.pages.lock().tag_offsets[tag as usize] = base_offset; + } + + /// Evict clean, applied pages to bring the cache back under quota. + /// Must be called with the pages map lock held. + /// `skip_key` is the page being acquired — never evict it. + fn try_evict_under_lock(&self, pages: &mut PageMap, skip_key: Option) { + let applied = self + .log_state + .as_ref() + .map(|s| s.applied_lsn.get()) + .unwrap_or(Lsn::ZERO); + + // Walk backward from the LRU tail. Check `idle` and + // `committed_lsn` on the entry — both maintained under the + // map lock, so no page lock needed. + let mut idx = pages.lru.tail(); + while self.quota > 0 && pages.map.len() > self.quota { + if idx == 0 { + break; + } + let prev_idx = pages.lru.nodes[idx].prev; + let key = pages.lru.nodes[idx].key; + if skip_key == Some(key) { + idx = prev_idx; + continue; + } + let entry = pages.map.get(&key).expect("LRU key missing from map"); + if entry.idle && entry.committed_lsn <= applied { + pages.map.remove(&key); + pages.lru.dealloc(idx); + idx = pages.lru.tail(); + continue; + } + idx = prev_idx; + } + } + + /// Acquire read access to a page. 
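+    ///
+    /// Illustrative read sketch (assumes `cache`, `page_offset`, and
+    /// `byte_off` are in scope; `BAT_TAG` is the BAT region tag):
+    ///
+    /// ```ignore
+    /// let page = cache
+    ///     .acquire_read(PageKey { tag: BAT_TAG, offset: page_offset })
+    ///     .await?;
+    /// let entry_bytes = &page[byte_off..byte_off + 8];
+    /// ```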
+ pub async fn acquire_read(&self, key: PageKey) -> Result, CacheError> { + loop { + let action = match self.try_acquire_read(key) { + Ok(guard) => return Ok(guard), + Err(action) => action, + }; + match action { + PendingAction::Wait(listener) => listener.await, + PendingAction::Load(file_offset, entry) => { + let buf = self.file.alloc_buffer(PAGE_SIZE); + match self.file.read_into(file_offset, buf).await { + Ok(buf) => self.complete_load(entry, Some(Arc::new(buf))), + Err(e) => { + self.complete_load(entry, None); + return Err(CacheError::Read { + err: e, + file_offset, + }); + } + } + } + } + } + } + + /// Sync helper: try to acquire read access. + fn try_acquire_read( + &self, + key: PageKey, + ) -> Result, PendingAction> { + assert!( + key.offset.is_multiple_of(PAGE_SIZE as u64), + "page offset {:#x} is not {PAGE_SIZE}-byte aligned", + key.offset + ); + let mut pages = self.pages.lock(); + let file_offset = pages.resolve_offset(key); + + // Pre-allocate an LRU slot. Freed below if the entry already exists. + let lru_index = pages.lru.alloc(key); + let mut inserted = false; + let page = pages + .map + .entry(key) + .or_insert_with(|| { + inserted = true; + CacheEntry { + page: Arc::new(Mutex::new(PageData { + data: None, + state: PageState::Clean, + pre_log_fsn: None, + lru_index, + demoted: false, + })), + committed_lsn: Lsn::ZERO, + idle: true, + } + }) + .page + .clone(); + + if !inserted { + pages.lru.dealloc(lru_index); + } else if self.quota > 0 && pages.map.len() > self.quota { + self.try_evict_under_lock(&mut pages, Some(key)); + } + + let mut guard = Mutex::lock_arc(&page); + + match guard.state { + PageState::Loading | PageState::AcquiringPermit => { + let listener = self.state_event.listen(); + drop(guard); + drop(pages); + Err(PendingAction::Wait(listener)) + } + PageState::Clean if guard.data.is_none() => { + guard.state = PageState::Loading; + let entry_arc = ArcMutexGuard::into_arc(guard); + drop(pages); + Err(PendingAction::Load(file_offset, entry_arc)) + } + PageState::Clean | PageState::Dirty => { + assert!( + guard.data.is_some(), + "page in {:?} has no data", + guard.state + ); + // Promote to MRU. + let idx = guard.lru_index; + guard.demoted = false; + pages.lru.move_to_front(idx); + drop(pages); + Ok(ReadPageGuard { guard }) + } + } + } + + /// Complete a page load: store data and transition out of Loading. + /// + /// On success (`data` is `Some`): stores data, transitions `Loading → Clean`. + /// Uses the `entry` Arc directly — no map re-lookup needed. + /// + /// On failure (`data` is `None`): removes the entry from the cache so the + /// next acquirer creates a fresh entry and retries. + fn complete_load(&self, entry: Arc>>, data: Option>) { + let mut page = entry.lock(); + assert!( + page.state == PageState::Loading, + "complete_load called but page state is {:?}, expected Loading", + page.state + ); + assert!( + page.data.is_none(), + "complete_load called but page already has data" + ); + page.state = PageState::Clean; + page.data = data; + self.state_event.notify(usize::MAX); + } + + /// Acquire write access to a page. + /// + /// If a log is configured, acquires a permit (backpressure). If the + /// dirty batch is full, commits it first (batch-full commit). 
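+    ///
+    /// Illustrative write sketch (names assumed in scope; `entry.as_bytes()`
+    /// stands in for any 8-byte serialized value):
+    ///
+    /// ```ignore
+    /// let mut guard = cache.acquire_write(key, WriteMode::Modify).await?;
+    /// guard[off..off + 8].copy_from_slice(entry.as_bytes());
+    /// drop(guard);
+    /// let lsn = cache.commit()?; // persisted via the log task
+    /// ```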
+ pub async fn acquire_write( + &self, + key: PageKey, + mode: WriteMode, + ) -> Result, CacheError> { + let load = mode == WriteMode::Modify; + + loop { + let action = match self.try_acquire_write(key, load) { + Ok(guard) => return Ok(guard), + Err(action) => action, + }; + match action { + WritePendingAction::Wait(listener) => listener.await, + WritePendingAction::Load(file_offset, entry) => { + let buf = self.file.alloc_buffer(PAGE_SIZE); + match self.file.read_into(file_offset, buf).await { + Ok(buf) => self.complete_load(entry, Some(Arc::new(buf))), + Err(e) => { + self.complete_load(entry, None); + return Err(CacheError::Read { + err: e, + file_offset, + }); + } + } + } + WritePendingAction::AcquirePermit(entry) => { + let permits = &self.log_state.as_ref().unwrap().permits; + let result = permits.acquire(1).await; + match result { + Ok(()) => { + return self.finalize_permit(entry); + } + Err(e) => { + self.finalize_permit_failed(entry); + return Err(CacheError::PipelineFailed(e)); + } + } + } + } + } + } + + /// Sync helper: try to acquire write access. + /// + /// Returns the guard on success, or an action to perform before + /// retrying. Batch-full commit is handled in [`finalize_permit`]. + fn try_acquire_write( + &self, + key: PageKey, + load: bool, + ) -> Result, WritePendingAction> { + assert!( + self.log_state.is_some(), + "acquire_write requires a log (use VhdxFile::open().writable())" + ); + + assert!( + key.offset.is_multiple_of(PAGE_SIZE as u64), + "page offset {:#x} is not {PAGE_SIZE}-byte aligned", + key.offset + ); + + let file_offset; + let mut pages = self.pages.lock(); + let mut guard = { + let pages = &mut *pages; + file_offset = pages.resolve_offset(key); + + // Pre-allocate an LRU slot. Freed below if the entry already exists. + let lru_index = pages.lru.alloc(key); + match pages.map.entry(key) { + hash_map::Entry::Occupied(entry) => { + let entry = entry.into_mut(); + pages.lru.dealloc(lru_index); + entry.idle = false; + entry.page.lock_arc() + } + hash_map::Entry::Vacant(entry) => { + let entry = entry.insert(CacheEntry { + page: Arc::new(Mutex::new(PageData { + data: None, + state: PageState::Clean, + pre_log_fsn: None, + lru_index, + demoted: false, + })), + committed_lsn: Lsn::ZERO, + idle: false, + }); + let page = entry.page.clone(); + if self.quota > 0 && pages.map.len() > self.quota { + self.try_evict_under_lock(pages, Some(key)); + } + page.lock_arc() + } + } + }; + + match guard.state { + PageState::Loading | PageState::AcquiringPermit => { + Err(WritePendingAction::Wait(self.state_event.listen())) + } + PageState::Dirty => { + assert!( + guard.data.is_some(), + "page in {:?} has no data", + guard.state + ); + // Promote to MRU. + let idx = guard.lru_index; + guard.demoted = false; + pages.lru.move_to_front(idx); + drop(pages); + Ok(WritePageGuard { + cache: self, + guard: Some(guard), + overwriting: false, + }) + } + PageState::Clean if load && guard.data.is_none() => { + guard.state = PageState::Loading; + Err(WritePendingAction::Load( + file_offset, + ArcMutexGuard::into_arc(guard), + )) + } + PageState::Clean => { + // Promote to MRU. + let idx = guard.lru_index; + guard.demoted = false; + pages.lru.move_to_front(idx); + guard.state = PageState::AcquiringPermit; + Err(WritePendingAction::AcquirePermit(ArcMutexGuard::into_arc( + guard, + ))) + } + } + } + + /// Finalize a successful permit acquisition. + /// + /// Returns the page guard directly — the caller wraps it in a + /// `WritePageGuard` without re-entering `try_acquire_write`. 
+    /// This eliminates the window where the page is in HasPermit/Overwritten
+    /// state without an active writer.
+    ///
+    /// The dirty_count check, batch-full commit, and dirty_count increment
+    /// are all performed atomically under the map lock — no TOCTOU gap.
+    fn finalize_permit(
+        &self,
+        entry: Arc>>,
+    ) -> Result, CacheError> {
+        let mut pages = self.pages.lock();
+
+        // Batch-full commit: if the dirty batch has reached
+        // MAX_COMMIT_PAGES, commit before adding this page.
+        if pages.dirty_count >= MAX_COMMIT_PAGES {
+            if let Err(e) = self.commit_locked(&mut pages) {
+                self.revert_permit(&entry, &mut pages);
+                return Err(e);
+            }
+        }
+        // Note that this may actually put us over MAX_COMMIT_PAGES, but only due to
+        // transient dirty counts from pages that are clean and have not yet decremented
+        // the count in [`WritePageGuard::drop`]. So, it will still be impossible for a
+        // cache transaction to be larger than MAX_COMMIT_PAGES.
+        pages.dirty_count += 1;
+
+        let mut page = Mutex::lock_arc(&entry);
+        assert!(page.state == PageState::AcquiringPermit);
+        let overwriting = page.data.is_none();
+        if overwriting {
+            page.data = Some(Arc::new(self.file.alloc_buffer(PAGE_SIZE)));
+        }
+        // For overwriting pages (fresh zeros for Overwrite), set Dirty —
+        // the zeros are synthetic, not real disk data, so they must be
+        // committed even if the caller doesn't call DerefMut.
+        page.state = if overwriting {
+            PageState::Dirty
+        } else {
+            PageState::Clean
+        };
+
+        self.state_event.notify(usize::MAX);
+        Ok(WritePageGuard {
+            cache: self,
+            guard: Some(page),
+            overwriting,
+        })
+    }
+
+    /// Release a clean write permit: mark idle, release the permit,
+    /// decrement dirty_count, and notify waiters. The page must already
+    /// be in `Clean` state.
+    fn release_clean_permit(&self, lru_index: usize, pages: &mut PageMap) {
+        pages.dirty_count -= 1;
+        let key = pages.lru.nodes[lru_index].key;
+        if let Some(ce) = pages.map.get_mut(&key) {
+            ce.idle = true;
+        }
+        if let Some(ref state) = self.log_state {
+            state.permits.release(1);
+        }
+        self.state_event.notify(usize::MAX);
+    }
+
+    /// Revert a page from `AcquiringPermit` back to `Clean` and release
+    /// the permit. Used on permit acquisition failure and commit failure.
+    fn revert_permit(
+        &self,
+        entry: &Arc>>,
+        pages: &mut PageMap,
+    ) {
+        let mut page = entry.lock();
+        assert!(page.state == PageState::AcquiringPermit);
+        page.state = PageState::Clean;
+        let lru_index = page.lru_index;
+        drop(page);
+        self.release_clean_permit(lru_index, pages);
+    }
+
+    /// Finalize a failed permit acquisition: revert to Clean.
+    fn finalize_permit_failed(&self, entry: Arc>>) {
+        let mut pages = self.pages.lock();
+        self.revert_permit(&entry, &mut pages);
+    }
+
+    /// Get the pre-log FSN for a specific page, if set.
+    #[cfg(test)]
+    pub fn get_pre_log_fsn(&self, key: PageKey) -> Option<Fsn> {
+        let pages = self.pages.lock();
+        if let Some(entry) = pages.map.get(&key) {
+            let page = entry.page.lock();
+            page.pre_log_fsn
+        } else {
+            None
+        }
+    }
+
+    /// Commit all dirty pages to the log task (fire-and-forget).
+    ///
+    /// Returns the current LSN. If there were dirty pages, they are sent
+    /// to the log task and the returned LSN is the one assigned to that
+    /// batch. If there were no dirty pages, returns the most recently
+    /// assigned LSN (so that concurrent `flush()` callers still wait
+    /// for any in-flight WAL writes).
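+    ///
+    /// # Example
+    ///
+    /// Illustrative sketch only (not a doctest); assumes a writable cache
+    /// with at least one dirty page:
+    ///
+    /// ```ignore
+    /// let lsn1 = cache.commit()?;
+    /// // ... more writes via acquire_write() ...
+    /// let lsn2 = cache.commit()?;
+    /// assert!(lsn2 >= lsn1); // LSNs never decrease across commits
+    /// ```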
+ pub fn commit(&self) -> Result { + let mut pages = self.pages.lock(); + self.commit_locked(&mut pages) + } + + /// Send pre-built page-aligned data through the log, bypassing the + /// cache's dirty-page tracking. Used for non-cache metadata writes + /// (e.g., region table repair). + /// + /// Returns the assigned LSN. + pub fn commit_raw(&self, raw_pages: Vec>, pre_log_fsn: Option) -> Lsn { + let mut map = self.pages.lock(); + let client = map + .log_client + .as_mut() + .expect("commit_raw requires a log client (use VhdxFile::open().writable())"); + let txn = client.begin(); + txn.commit(raw_pages, pre_log_fsn) + } + + /// Inner commit implementation that takes an already-held map lock. + /// + /// This allows `finalize_permit` to check dirty_count and commit + /// atomically under the same lock — no TOCTOU gap. + fn commit_locked(&self, pages: &mut PageMap) -> Result { + let client = pages + .log_client + .as_mut() + .expect("commit requires a log client (use VhdxFile::open().writable())"); + + let mut log_data = Vec::new(); + let mut max_pre_log_fsn: Option = None; + + let txn = client.begin(); + let lsn = txn.lsn(); + + // Destructure to get separate borrows on map, lru, and tag_offsets. + let PageMap { + ref mut map, + ref mut lru, + ref tag_offsets, + .. + } = *pages; + + for (&key, entry) in map.iter_mut() { + let mut page = entry.page.lock(); + if matches!(page.state, PageState::Dirty) { + let file_offset = tag_offsets[key.tag as usize] + key.offset; + let data = page.data.as_ref().expect("dirty page has no data").clone(); + + if let Some(fsn) = page.pre_log_fsn.take() { + max_pre_log_fsn = Some(max_pre_log_fsn.map_or(fsn, |m| m.max(fsn))); + } + + page.state = PageState::Clean; + + if page.demoted { + page.demoted = false; + lru.move_to_back(page.lru_index); + } + + entry.committed_lsn = lsn; + entry.idle = true; + + log_data.push(LogData::new(file_offset, data)); + } + } + + if log_data.is_empty() { + return Ok(client.current_lsn()); + } + + let committed_count = log_data.len(); + pages.dirty_count -= committed_count; + + txn.commit(log_data, max_pre_log_fsn); + + // Do NOT release permits here. Permits stay consumed until the + // apply task writes pages to their final offsets and releases + // them. This bounds the total in-flight page data (Arc clones) + // in the log/apply pipeline, preventing unbounded memory growth. + + Ok(lsn) + } +} + +/// RAII guard providing read-only access to a cached page. +#[must_use = "page guard holds a lock; drop it when done reading"] +pub struct ReadPageGuard { + guard: ArcMutexGuard>, +} + +impl + Send + Sync + 'static> std::ops::Deref for ReadPageGuard { + type Target = [u8; PAGE_SIZE]; + + fn deref(&self) -> &[u8; PAGE_SIZE] { + self.guard + .data + .as_ref() + .expect("page data missing") + .as_ref() + .as_ref() + .try_into() + .expect("buffer is not PAGE_SIZE") + } +} + +/// RAII guard providing write access to a cached page. +/// +/// Mutating via `DerefMut` transitions the page to `Dirty`. Arc COW +/// ensures the writer gets a private copy if the log task holds a +/// reference. +pub struct WritePageGuard<'a, F: AsyncFile> { + cache: &'a PageCache, + guard: Option>>, + /// Data existed before this acquire (loaded or previously written). + /// False for first-touch Overwrite (zeroed data). + overwriting: bool, +} + +impl WritePageGuard<'_, F> { + /// Returns true if the page is being overwritten rather than modified. 
+ /// + /// If true, the page data is freshly zeroed and must be fully written by + /// the caller (unless the caller just wants to commit a zero page). + pub fn is_overwriting(&self) -> bool { + self.overwriting + } + + /// Set the pre-log flush sequence number on this page. + pub fn set_pre_log_fsn(&mut self, fsn: Fsn) { + let guard = self.guard.as_mut().expect("guard consumed"); + guard.pre_log_fsn = Some(match guard.pre_log_fsn { + Some(existing) => existing.max(fsn), + None => fsn, + }); + } + + /// Hint that this page is cheap to regenerate and should be evicted + /// before other pages (e.g., BAT pages that can be rebuilt from + /// in-memory state). + /// + /// If the page becomes dirty, the demotion is applied when the page + /// transitions back to clean in [`PageCache::commit`]. If the page + /// stays clean (guard dropped without mutation), the demotion is + /// applied immediately on drop. + pub fn demote(&mut self) { + self.guard.as_mut().expect("guard consumed").demoted = true; + } +} + +impl std::ops::Deref for WritePageGuard<'_, F> { + type Target = [u8; PAGE_SIZE]; + + fn deref(&self) -> &[u8; PAGE_SIZE] { + self.guard + .as_ref() + .expect("guard consumed") + .data + .as_ref() + .expect("page data missing") + .as_ref() + .as_ref() + .try_into() + .expect("buffer is not PAGE_SIZE") + } +} + +impl std::ops::DerefMut for WritePageGuard<'_, F> { + fn deref_mut(&mut self) -> &mut [u8; PAGE_SIZE] { + let guard = self.guard.as_mut().expect("guard consumed"); + guard.state = PageState::Dirty; + let buf = Arc::make_mut(guard.data.as_mut().expect("page data missing")); + buf.as_mut().try_into().expect("buffer is not PAGE_SIZE") + } +} + +impl Drop for WritePageGuard<'_, F> { + fn drop(&mut self) { + if let Some(guard) = self.guard.take() { + if guard.state != PageState::Dirty { + // Guard dropped without mutation. Page is clean — release + // the permit and mark idle. + let lru_index = guard.lru_index; + drop(guard); + let mut pages = self.cache.pages.lock(); + self.cache.release_clean_permit(lru_index, &mut pages); + } + // If Dirty: permit consumed, page stays not-idle. + // Guard drops, releasing page lock. + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::log_task::LogRequest; + use crate::tests::support::{FailingInterceptor, InMemoryFile}; + use pal_async::async_test; + use std::sync::Arc; + + /// Helper to create a writable cache with log sender + permits. 
+ fn writable_cache( + file: InMemoryFile, + ) -> (PageCache, mesh::Receiver>>) { + let (tx, rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + (cache, rx) + } + + #[async_test] + async fn acquire_read_loads_from_file() { + let file = InMemoryFile::new(PAGE_SIZE as u64); + let pattern: Vec = (0..PAGE_SIZE).map(|i| (i & 0xFF) as u8).collect(); + file.write_at(0, &pattern).await.unwrap(); + + let mut cache = PageCache::new(Arc::new(file), None, None, 0); + cache.register_tag(0, 0); + + let guard = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(&guard[..], &pattern[..]); + } + + #[async_test] + async fn acquire_modify_loads_and_writes_back() { + let file = InMemoryFile::new(PAGE_SIZE as u64); + let pattern: Vec = (0..PAGE_SIZE).map(|i| (i & 0xFF) as u8).collect(); + file.write_at(0, &pattern).await.unwrap(); + + let (_cache, _rx) = writable_cache(InMemoryFile::new(PAGE_SIZE as u64)); + // Re-create with the patterned file. + let file = InMemoryFile::new(PAGE_SIZE as u64); + file.write_at(0, &pattern).await.unwrap(); + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + { + let mut guard = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Modify) + .await + .unwrap(); + assert_eq!(guard[0], 0x00); + assert_eq!(guard[1], 0x01); + guard[0] = 0xAA; + guard[1] = 0xBB; + } + + let guard = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(guard[0], 0xAA); + assert_eq!(guard[1], 0xBB); + assert_eq!(guard[2], 0x02); + } + + #[async_test] + async fn acquire_overwrite_skips_read() { + let file = InMemoryFile::with_interceptor( + PAGE_SIZE as u64, + Arc::new(FailingInterceptor { + fail_reads: true, + fail_writes: false, + fail_flushes: false, + fail_set_file_size: false, + }), + ); + + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + { + let mut guard = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Overwrite) + .await + .unwrap(); + guard.fill(0xCC); + } + + let guard = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert!(guard.iter().all(|&b| b == 0xCC)); + } + + #[async_test] + async fn concurrent_reads_return_correct_data() { + let file = InMemoryFile::new(PAGE_SIZE as u64); + let pattern: Vec = (0..PAGE_SIZE).map(|i| ((i * 3) & 0xFF) as u8).collect(); + file.write_at(0, &pattern).await.unwrap(); + + let mut cache = PageCache::new(Arc::new(file), None, None, 0); + cache.register_tag(0, 0); + + let g1 = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(&g1[..], &pattern[..]); + drop(g1); + + let g2 = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(&g2[..], &pattern[..]); + } + + #[async_test] + async fn sequential_modify_acquires_work() { + let (tx, _rx) = mesh::channel::>>(); + let permits = 
Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + { + let mut guard = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Modify) + .await + .unwrap(); + guard[0] = 0x11; + } + + { + let mut guard = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Modify) + .await + .unwrap(); + assert_eq!(guard[0], 0x11); + guard[0] = 0x22; + } + + let guard = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(guard[0], 0x22); + } + + #[async_test] + async fn modify_then_modify_same_page() { + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + { + let mut g = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Modify) + .await + .unwrap(); + g[0] = 0xAA; + } + + { + let mut g = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Modify) + .await + .unwrap(); + assert_eq!(g[0], 0xAA); + g[1] = 0xBB; + } + + let guard = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(guard[0], 0xAA); + assert_eq!(guard[1], 0xBB); + } + + #[async_test] + async fn different_pages_independent() { + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64 * 4)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + { + let mut g = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Modify) + .await + .unwrap(); + g[0] = 0x11; + } + + { + let mut g = cache + .acquire_write( + PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }, + WriteMode::Modify, + ) + .await + .unwrap(); + g[0] = 0x22; + } + + let g1 = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(g1[0], 0x11); + drop(g1); + + let g2 = cache + .acquire_read(PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }) + .await + .unwrap(); + assert_eq!(g2[0], 0x22); + } + + #[async_test] + async fn tag_offset_resolution() { + let base: u64 = 0x10000; + let page_offset: u64 = 0x1000; + let file = InMemoryFile::new(base + page_offset + PAGE_SIZE as u64); + let pattern = [0xDE; PAGE_SIZE]; + file.write_at(base + page_offset, &pattern).await.unwrap(); + + let mut cache = PageCache::new(Arc::new(file), None, None, 0); + cache.register_tag(0, base); + + let guard = cache + .acquire_read(PageKey { + tag: 0, + offset: page_offset, + }) + .await + .unwrap(); + assert_eq!(&guard[..], &pattern[..]); + } + + #[async_test] + async fn commit_sends_transaction() { + let (tx, mut rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + let key = PageKey { tag: 0, offset: 0 }; + + { + let mut g = cache.acquire_write(key, WriteMode::Modify).await.unwrap(); + g.fill(0xAA); + 
} + let lsn = cache.commit().unwrap(); + assert!(lsn > Lsn::ZERO); + + match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => { + assert_eq!(txn.lsn, lsn); + assert_eq!(txn.data.len(), 1); + assert!(txn.data[0].data().iter().all(|&b| b == 0xAA)); + } + _ => panic!("expected Commit"), + } + } + + #[async_test] + async fn consecutive_commits_get_increasing_lsns() { + let (tx, mut rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + let key = PageKey { tag: 0, offset: 0 }; + + { + let mut g = cache.acquire_write(key, WriteMode::Modify).await.unwrap(); + g.fill(0xAA); + } + let lsn1 = cache.commit().unwrap(); + + { + let mut g = cache.acquire_write(key, WriteMode::Modify).await.unwrap(); + g.fill(0xBB); + } + let lsn2 = cache.commit().unwrap(); + + assert!(lsn2 > lsn1); + + match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => assert_eq!(txn.lsn, lsn1), + _ => panic!("expected Commit"), + } + match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => assert_eq!(txn.lsn, lsn2), + _ => panic!("expected Commit"), + } + } + + #[async_test] + async fn commit_sets_committed_lsn() { + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + let key = PageKey { tag: 0, offset: 0 }; + + { + let mut g = cache.acquire_write(key, WriteMode::Modify).await.unwrap(); + g.fill(0xAA); + } + let lsn = cache.commit().unwrap(); + + let pages = cache.pages.lock(); + let entry = pages.map.get(&key).unwrap(); + assert_eq!(entry.committed_lsn, lsn); + } + + async fn dirty_pages(cache: &PageCache, count: usize) { + for i in 0..count { + let key = PageKey { + tag: 0, + offset: (i * PAGE_SIZE) as u64, + }; + let mut g = cache + .acquire_write(key, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(i as u8); + } + } + + #[async_test] + async fn batch_full_commit_on_dirty_overflow() { + let (tx, mut rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64 * 200)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + dirty_pages(&cache, MAX_COMMIT_PAGES).await; + + let new_key = PageKey { + tag: 0, + offset: (MAX_COMMIT_PAGES * PAGE_SIZE) as u64, + }; + { + let mut guard = cache + .acquire_write(new_key, WriteMode::Overwrite) + .await + .unwrap(); + guard.fill(0xFF); + } + + match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => { + assert_eq!(txn.data.len(), MAX_COMMIT_PAGES); + } + _ => panic!("expected Commit from batch-full commit"), + } + + cache.commit().unwrap(); + match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => { + assert_eq!(txn.data.len(), 1); + } + _ => panic!("expected Commit from explicit commit"), + } + } + + #[async_test] + async fn redirty_does_not_trigger_batch_full_commit() { + let (tx, mut rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64 * 
200)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + dirty_pages(&cache, MAX_COMMIT_PAGES).await; + + let key = PageKey { tag: 0, offset: 0 }; + let mut g = cache.acquire_write(key, WriteMode::Modify).await.unwrap(); + g[0] = 0xDD; + + assert!( + rx.try_recv().is_err(), + "re-dirtying an already-dirty page must not trigger batch-full commit" + ); + + assert_eq!(cache.pages.lock().dirty_count, MAX_COMMIT_PAGES); + } + + #[async_test] + async fn write_ordering_across_batches() { + let (tx, mut rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64 * 200)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + dirty_pages(&cache, MAX_COMMIT_PAGES).await; + + let key_b = PageKey { + tag: 0, + offset: (MAX_COMMIT_PAGES * PAGE_SIZE) as u64, + }; + { + let mut g = cache + .acquire_write(key_b, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(0xBB); + } + + let batch1 = match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => txn, + _ => panic!("expected Commit"), + }; + assert_eq!(batch1.data.len(), MAX_COMMIT_PAGES); + + let key_c = PageKey { + tag: 0, + offset: ((MAX_COMMIT_PAGES + 1) * PAGE_SIZE) as u64, + }; + { + let mut g = cache + .acquire_write(key_c, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(0xCC); + } + + cache.commit().unwrap(); + let batch2 = match rx.recv().await.unwrap() { + LogRequest::Commit(txn) => txn, + _ => panic!("expected Commit"), + }; + assert_eq!(batch2.data.len(), 2); + assert!(batch1.lsn < batch2.lsn); + } + + // ---- Eviction tests ---- + + #[async_test] + async fn eviction_removes_clean_page() { + let file = InMemoryFile::new(PAGE_SIZE as u64 * 4); + let pattern_a = [0xAA; PAGE_SIZE]; + let pattern_b = [0xBB; PAGE_SIZE]; + file.write_at(0, &pattern_a).await.unwrap(); + file.write_at(PAGE_SIZE as u64, &pattern_b).await.unwrap(); + + // Quota of 1 page. + let mut cache = PageCache::new(Arc::new(file), None, None, 1); + cache.register_tag(0, 0); + + // Load page A. + let g = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(g[0], 0xAA); + drop(g); + + // Cache has 1 page (at quota). Loading page B should evict page A. + let g = cache + .acquire_read(PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }) + .await + .unwrap(); + assert_eq!(g[0], 0xBB); + drop(g); + + // Page A was evicted — cache should have 1 entry. + assert_eq!(cache.pages.lock().map.len(), 1); + } + + #[async_test] + async fn eviction_reloads_from_disk() { + let file = InMemoryFile::new(PAGE_SIZE as u64 * 4); + let pattern_a = [0xAA; PAGE_SIZE]; + let pattern_b = [0xBB; PAGE_SIZE]; + file.write_at(0, &pattern_a).await.unwrap(); + file.write_at(PAGE_SIZE as u64, &pattern_b).await.unwrap(); + + let mut cache = PageCache::new(Arc::new(file), None, None, 1); + cache.register_tag(0, 0); + + // Load page A. + let g = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(g[0], 0xAA); + drop(g); + + // Load page B (evicts A). + let g = cache + .acquire_read(PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }) + .await + .unwrap(); + assert_eq!(g[0], 0xBB); + drop(g); + + // Re-load page A (evicts B, reloads from disk). 
+ let g = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(g[0], 0xAA); + drop(g); + } + + #[async_test] + async fn eviction_skips_dirty_pages() { + let file = InMemoryFile::new(PAGE_SIZE as u64 * 4); + file.write_at(PAGE_SIZE as u64, &[0xBB; PAGE_SIZE]) + .await + .unwrap(); + + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + // Quota of 1, but page 0 will be dirty. + let mut cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 1, + ); + cache.register_tag(0, 0); + + // Write page A (makes it Dirty). + { + let mut g = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(0xAA); + } + + // Try to load page B. Eviction should skip dirty page A. + // Cache will have 2 entries (over quota but nothing evictable). + let g = cache + .acquire_read(PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }) + .await + .unwrap(); + assert_eq!(g[0], 0xBB); + drop(g); + + // Both pages present. + assert_eq!(cache.pages.lock().map.len(), 2); + + // Verify page A is still readable (not evicted). + let g = cache + .acquire_read(PageKey { tag: 0, offset: 0 }) + .await + .unwrap(); + assert_eq!(g[0], 0xAA); + } + + #[async_test] + async fn eviction_skips_uncommitted_page() { + let file = InMemoryFile::new(PAGE_SIZE as u64 * 4); + file.write_at(0, &[0xAA; PAGE_SIZE]).await.unwrap(); + file.write_at(PAGE_SIZE as u64, &[0xBB; PAGE_SIZE]) + .await + .unwrap(); + + let applied = Arc::new(LsnWatermark::new()); + // applied_lsn = 0, so committed pages with lsn > 0 are not evictable. + + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: applied.clone(), + }), + 1, + ); + cache.register_tag(0, 0); + + // Write and commit page A (committed_lsn = 1, applied_lsn = 0). + { + let mut g = cache + .acquire_write(PageKey { tag: 0, offset: 0 }, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(0xAA); + } + cache.commit().unwrap(); + + // Page A is Clean with committed_lsn=1. applied_lsn=0. + // Eviction should skip it (not yet applied). + let g = cache + .acquire_read(PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }) + .await + .unwrap(); + assert_eq!(g[0], 0xBB); + drop(g); + + // Both pages present (A is not evictable). + assert_eq!(cache.pages.lock().map.len(), 2); + + // Now advance applied_lsn past the committed_lsn. + applied.advance(Lsn::new(1), Fsn::ZERO); + + // Load another page — now A is evictable. + let _file_size = PAGE_SIZE as u64 * 4; + // Load page at offset 2*PAGE_SIZE (need data there). + cache + .file + .write_at(PAGE_SIZE as u64 * 2, &[0xCC; PAGE_SIZE]) + .await + .unwrap(); + let g = cache + .acquire_read(PageKey { + tag: 0, + offset: PAGE_SIZE as u64 * 2, + }) + .await + .unwrap(); + assert_eq!(g[0], 0xCC); + drop(g); + + // Should have evicted one of the old pages (A or B). + assert!(cache.pages.lock().map.len() <= 2); + } + + #[async_test] + async fn no_deadlock_with_quota() { + // Regression test: verify that acquiring pages with a small quota + // doesn't deadlock. The dual-lock pattern + // should prevent lock-order issues. 
+ let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64 * 10)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 2, + ); + cache.register_tag(0, 0); + + // Rapidly acquire and drop pages, cycling through more than the quota. + for i in 0..5u64 { + let mut g = cache + .acquire_write( + PageKey { + tag: 0, + offset: i * PAGE_SIZE as u64, + }, + WriteMode::Overwrite, + ) + .await + .unwrap(); + g.fill(i as u8); + } + // If we get here without hanging, no deadlock. + } + + #[async_test] + async fn overwrite_uncached_reports_not_cached() { + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + let key = PageKey { tag: 0, offset: 0 }; + let g = cache + .acquire_write(key, WriteMode::Overwrite) + .await + .unwrap(); + assert!( + g.is_overwriting(), + "first Overwrite acquire should report overwriting (not cached)" + ); + } + + #[async_test] + async fn overwrite_cached_reports_cached() { + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(InMemoryFile::new(PAGE_SIZE as u64)), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + let key = PageKey { tag: 0, offset: 0 }; + + // First write populates the cache. + { + let mut g = cache + .acquire_write(key, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(0xAA); + } + + // Second write should find it cached. + let g = cache + .acquire_write(key, WriteMode::Overwrite) + .await + .unwrap(); + assert!( + !g.is_overwriting(), + "second Overwrite acquire should report cached (not overwriting)" + ); + assert_eq!(g[0], 0xAA); + assert_eq!(g[PAGE_SIZE - 1], 0xAA); + } + + #[async_test] + async fn modify_always_reports_cached() { + // Modify loads from disk if not cached, so populated reflects + // map presence after load — always true since load populates it. + let file = InMemoryFile::new(PAGE_SIZE as u64); + file.write_at(0, &[0xBB; PAGE_SIZE]).await.unwrap(); + + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let mut cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: Arc::new(LsnWatermark::new()), + }), + 0, + ); + cache.register_tag(0, 0); + + let key = PageKey { tag: 0, offset: 0 }; + + // Modify loads from disk then retries — page is in map on retry. + let g = cache.acquire_write(key, WriteMode::Modify).await.unwrap(); + assert!( + !g.is_overwriting(), + "Modify always reports cached (not overwriting)" + ); + assert_eq!(g[0], 0xBB); + } + + #[async_test] + async fn lru_evicts_oldest_first() { + let file = InMemoryFile::new(PAGE_SIZE as u64 * 4); + for i in 0..4 { + file.write_at(i * PAGE_SIZE as u64, &[(i as u8) + 0xA0; PAGE_SIZE]) + .await + .unwrap(); + } + + // Quota of 2. 
+ let mut cache = PageCache::new(Arc::new(file), None, None, 2); + cache.register_tag(0, 0); + + let key_a = PageKey { tag: 0, offset: 0 }; + let key_b = PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }; + let key_c = PageKey { + tag: 0, + offset: 2 * PAGE_SIZE as u64, + }; + + // Load A then B (both in cache, at quota). + let g = cache.acquire_read(key_a).await.unwrap(); + assert_eq!(g[0], 0xA0); + drop(g); + + let g = cache.acquire_read(key_b).await.unwrap(); + assert_eq!(g[0], 0xA1); + drop(g); + + // LRU order: MRU=B, LRU=A. Loading C should evict A. + let g = cache.acquire_read(key_c).await.unwrap(); + assert_eq!(g[0], 0xA2); + drop(g); + + let pages = cache.pages.lock(); + assert!( + !pages.map.contains_key(&key_a), + "A should have been evicted" + ); + assert!(pages.map.contains_key(&key_b), "B should still be cached"); + assert!(pages.map.contains_key(&key_c), "C should be cached"); + } + + #[async_test] + async fn write_demote_defers_to_commit() { + let file = InMemoryFile::new(PAGE_SIZE as u64 * 4); + for i in 0..3 { + file.write_at(i * PAGE_SIZE as u64, &[(i as u8) + 0xC0; PAGE_SIZE]) + .await + .unwrap(); + } + + let (tx, _rx) = mesh::channel::>>(); + let permits = Arc::new(LogPermits::new(1000)); + let applied = Arc::new(LsnWatermark::new()); + let mut cache = PageCache::new( + Arc::new(file), + Some(LogClient::new(tx)), + Some(CacheLogState { + permits, + applied_lsn: applied.clone(), + }), + 2, + ); + cache.register_tag(0, 0); + + let key_a = PageKey { tag: 0, offset: 0 }; + let key_b = PageKey { + tag: 0, + offset: PAGE_SIZE as u64, + }; + let key_c = PageKey { + tag: 0, + offset: 2 * PAGE_SIZE as u64, + }; + + // Write A with demote. Page becomes dirty. + { + let mut g = cache + .acquire_write(key_a, WriteMode::Overwrite) + .await + .unwrap(); + g.fill(0xDD); + g.demote(); + } + + // Read B. + let g = cache.acquire_read(key_b).await.unwrap(); + drop(g); + + // Commit A (dirty→clean). Since demoted, it should go to LRU end. + let lsn = cache.commit().unwrap(); + applied.advance(lsn, Fsn::ZERO); + + // Now load C. Should evict A (demoted at LRU end) not B. + let g = cache.acquire_read(key_c).await.unwrap(); + assert_eq!(g[0], 0xC2); + drop(g); + + let pages = cache.pages.lock(); + assert!( + !pages.map.contains_key(&key_a), + "demoted A should be evicted after commit" + ); + assert!(pages.map.contains_key(&key_b), "B should still be cached"); + } +} diff --git a/vm/devices/storage/vhdx/src/create.rs b/vm/devices/storage/vhdx/src/create.rs new file mode 100644 index 0000000000..0fa3c52717 --- /dev/null +++ b/vm/devices/storage/vhdx/src/create.rs @@ -0,0 +1,936 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VHDX file creation. +//! +//! Writes a valid, empty VHDX file (file identifier, dual headers, dual +//! region tables, metadata table, and empty BAT) to an [`AsyncFile`]. + +use crate::AsyncFile; +use crate::error::CreateError; +use crate::error::InvalidFormatReason; +use crate::format; +use crate::format::FileIdentifier; +use crate::format::FileParameters; +use crate::format::FileParametersFlags; +use crate::format::Header; +use crate::format::MetadataTableEntry; +use crate::format::MetadataTableEntryFlags; +use crate::format::MetadataTableHeader; +use crate::format::RegionTableEntry; +use crate::format::RegionTableEntryFlags; +use crate::format::RegionTableHeader; +use guid::Guid; +use zerocopy::FromZeros; +use zerocopy::IntoBytes; + +/// Parameters for creating a new VHDX file. 
+pub struct CreateParams { + /// Virtual disk size in bytes. Must be a multiple of `logical_sector_size` + /// and at most 64 TiB. + pub disk_size: u64, + + /// Block size in bytes. Must be a multiple of 1 MiB and at most 256 MiB. + /// Default: 2 MiB. + pub block_size: u32, + + /// Logical sector size. Must be 512 or 4096. Default: 512. + pub logical_sector_size: u32, + + /// Physical sector size. Must be 512 or 4096. Default: 512. + pub physical_sector_size: u32, + + /// Whether this is a differencing disk (has a parent). + pub has_parent: bool, + + /// Block alignment for the data region. 0 means no special alignment. + /// If non-zero, must be a power of 2. + pub block_alignment: u32, + + /// If true, create the file in an incomplete state + /// (adds an "incomplete file" metadata item that prevents open). + pub create_incomplete: bool, + + /// If true, mark all blocks as allocated (fixed VHD). + pub is_fully_allocated: bool, + + /// Data write GUID. If zero GUID, a random one will be generated. + /// Callers can supply a specific GUID for re-parenting workflows. + pub data_write_guid: Guid, + + /// Page 83 SCSI identifier. If zero GUID, a random one will be generated. + pub page_83_data: Guid, +} + +impl Default for CreateParams { + fn default() -> Self { + Self { + disk_size: 0, + block_size: 0, + logical_sector_size: 0, + physical_sector_size: 0, + has_parent: false, + block_alignment: 0, + create_incomplete: false, + is_fully_allocated: false, + data_write_guid: Guid::ZERO, + page_83_data: Guid::ZERO, + } + } +} + +/// Integer ceiling division (a / b, rounded up). Panics if b == 0. +pub(crate) fn ceil_div(a: u64, b: u64) -> u64 { + a.div_ceil(b) +} + +/// Round `value` up to the next multiple of `alignment`. +/// `alignment` must be a power of 2. +pub(crate) fn round_up(value: u64, alignment: u64) -> u64 { + (value + alignment - 1) & !(alignment - 1) +} + +/// Compute the chunk ratio (number of data blocks per sector bitmap block). +pub(crate) fn chunk_block_count(block_size: u32, sector_size: u32) -> u32 { + let sectors_per_block = block_size / sector_size; + (format::SECTORS_PER_CHUNK / sectors_per_block as u64) as u32 +} + +/// Create a new, empty VHDX file. +/// +/// Writes file identifier, dual headers, dual region tables, metadata +/// table with standard metadata items, and an empty BAT to the provided +/// file. The file is truncated/extended to the required size. +/// +/// `params` is updated in place with defaults filled in (e.g. zero +/// `block_size` becomes 2 MiB, zero GUIDs become random). 
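+///
+/// # Example
+///
+/// Illustrative sketch only (not a doctest); `file` is assumed to be any
+/// [`AsyncFile`] implementation:
+///
+/// ```ignore
+/// let mut params = CreateParams {
+///     disk_size: 1024 * 1024 * 1024, // 1 GiB
+///     ..Default::default()
+/// };
+/// create(&file, &mut params).await?;
+/// // Unset fields are filled in with defaults, e.g. a 2 MiB block size.
+/// assert_eq!(params.block_size, format::DEFAULT_BLOCK_SIZE);
+/// ```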
+pub async fn create(file: &impl AsyncFile, params: &mut CreateParams) -> Result<(), CreateError> { + // --- Validate and default parameters --- + + if params.logical_sector_size == 0 { + params.logical_sector_size = format::DEFAULT_SECTOR_SIZE; + } + if params.logical_sector_size != 512 && params.logical_sector_size != 4096 { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::InvalidLogicalSectorSize, + )); + } + + if params.physical_sector_size == 0 { + params.physical_sector_size = format::DEFAULT_SECTOR_SIZE; + } + if params.physical_sector_size != 512 && params.physical_sector_size != 4096 { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::InvalidPhysicalSectorSize, + )); + } + + if params.disk_size == 0 { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::DiskSizeZero, + )); + } + if !params + .disk_size + .is_multiple_of(params.logical_sector_size as u64) + { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::DiskSizeNotAligned, + )); + } + if params.disk_size > format::MAXIMUM_DISK_SIZE { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::DiskSizeTooLarge, + )); + } + + if params.block_size == 0 { + params.block_size = format::DEFAULT_BLOCK_SIZE; + } + if !(params.block_size as u64).is_multiple_of(format::REGION_ALIGNMENT) { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::BlockSizeNotAligned, + )); + } + if params.block_size as u64 > format::MAXIMUM_BLOCK_SIZE { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::BlockSizeTooLarge, + )); + } + + if params.block_alignment != 0 && !params.block_alignment.is_power_of_two() { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::BlockAlignmentNotPowerOfTwo, + )); + } + + // Generate a random page 83 GUID if not provided. + if params.page_83_data == Guid::ZERO { + params.page_83_data = Guid::new_random(); + } + + // --- Compute BAT size --- + + let data_block_count = ceil_div(params.disk_size, params.block_size as u64); + let chunk_ratio = chunk_block_count(params.block_size, params.logical_sector_size); + + if chunk_ratio == 0 { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::InvalidChunkRatio, + )); + } + + let sector_bitmap_block_count = ceil_div(data_block_count, chunk_ratio as u64); + + let bat_entry_count = if params.has_parent { + sector_bitmap_block_count * (chunk_ratio as u64 + 1) + } else { + data_block_count + data_block_count.saturating_sub(1) / chunk_ratio as u64 + }; + + if bat_entry_count > format::ABSOLUTE_MAXIMUM_BAT_ENTRY_COUNT { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::BatEntryCountTooLarge, + )); + } + + let bat_sector_count = ceil_div(bat_entry_count, format::ENTRIES_PER_BAT_PAGE); + let bat_length = round_up( + bat_sector_count * format::CACHE_PAGE_SIZE, + format::REGION_ALIGNMENT, + ); + + if bat_length > format::MAXIMUM_BAT_SIZE { + return Err(CreateError::InvalidFormat( + InvalidFormatReason::BatSizeTooLarge, + )); + } + + // --- Region layout --- + + let log_offset = format::HEADER_AREA_SIZE; + let log_length = format::DEFAULT_LOG_SIZE as u64; + let metadata_offset = log_offset + log_length; + let metadata_length = format::DEFAULT_METADATA_REGION_SIZE as u64; + let bat_offset = metadata_offset + metadata_length; + + // --- Build the 1 MiB header area buffer --- + + let mut buf = file.alloc_buffer(format::HEADER_AREA_SIZE as usize); + let buf_bytes = buf.as_mut(); + + // File identifier at offset 0. 
+ let mut ident = FileIdentifier::new_zeroed(); + ident.signature = format::FILE_IDENTIFIER_SIGNATURE; + buf_bytes[..size_of::()].copy_from_slice(ident.as_bytes()); + + // Generate random GUIDs for the headers. + let file_write_guid = Guid::new_random(); + if params.data_write_guid == Guid::ZERO { + params.data_write_guid = Guid::new_random(); + } + let data_write_guid = params.data_write_guid; + + // Header 1 (sequence number 0). + let mut header = Header::new_zeroed(); + header.signature = format::HEADER_SIGNATURE; + header.sequence_number = 0; + header.file_write_guid = file_write_guid; + header.data_write_guid = data_write_guid; + header.log_guid = Guid::ZERO; + header.log_version = format::LOG_VERSION; + header.version = format::VERSION_1; + header.log_offset = log_offset; + header.log_length = log_length as u32; + header.checksum = 0; + + // Serialize header 1, compute checksum, update. + let h1_start = format::HEADER_OFFSET_1 as usize; + let h1_end = h1_start + size_of::
(); + buf_bytes[h1_start..h1_end].copy_from_slice(header.as_bytes()); + let crc = format::compute_checksum( + &buf_bytes[h1_start..h1_start + format::HEADER_SIZE as usize], + 4, // checksum field offset within Header + ); + buf_bytes[h1_start + 4..h1_start + 8].copy_from_slice(&crc.to_le_bytes()); + + // Header 2 (sequence number 1). + header.sequence_number = 1; + header.checksum = 0; + let h2_start = format::HEADER_OFFSET_2 as usize; + let h2_end = h2_start + size_of::
(); + buf_bytes[h2_start..h2_end].copy_from_slice(header.as_bytes()); + let crc = format::compute_checksum( + &buf_bytes[h2_start..h2_start + format::HEADER_SIZE as usize], + 4, + ); + buf_bytes[h2_start + 4..h2_start + 8].copy_from_slice(&crc.to_le_bytes()); + + // Region table 1. + let rt_start = format::REGION_TABLE_OFFSET as usize; + let mut rt_header = RegionTableHeader::new_zeroed(); + rt_header.signature = format::REGION_TABLE_SIGNATURE; + rt_header.entry_count = 2; + + let rt_header_bytes = rt_header.as_bytes(); + buf_bytes[rt_start..rt_start + rt_header_bytes.len()].copy_from_slice(rt_header_bytes); + + // BAT region entry. + let entry_offset = rt_start + size_of::(); + let bat_entry = RegionTableEntry { + guid: format::BAT_REGION_GUID, + file_offset: bat_offset, + length: bat_length as u32, + flags: RegionTableEntryFlags::new().with_required(true), + }; + let bat_entry_bytes = bat_entry.as_bytes(); + buf_bytes[entry_offset..entry_offset + bat_entry_bytes.len()].copy_from_slice(bat_entry_bytes); + + // Metadata region entry. + let entry_offset2 = entry_offset + size_of::(); + let meta_entry = RegionTableEntry { + guid: format::METADATA_REGION_GUID, + file_offset: metadata_offset, + length: metadata_length as u32, + flags: RegionTableEntryFlags::new().with_required(true), + }; + let meta_entry_bytes = meta_entry.as_bytes(); + buf_bytes[entry_offset2..entry_offset2 + meta_entry_bytes.len()] + .copy_from_slice(meta_entry_bytes); + + // Compute region table checksum over the full 64 KiB region. + let rt_end = rt_start + format::REGION_TABLE_SIZE as usize; + let crc = format::compute_checksum(&buf_bytes[rt_start..rt_end], 4); + buf_bytes[rt_start + 4..rt_start + 8].copy_from_slice(&crc.to_le_bytes()); + + // Copy region table 1 to region table 2. + let alt_start = format::ALT_REGION_TABLE_OFFSET as usize; + buf_bytes.copy_within(rt_start..rt_end, alt_start); + + // Write the header area. + file.write_from(0, buf).await.map_err(CreateError::Write)?; + + // --- Zero the log region --- + + file.zero_range(log_offset, log_length) + .await + .map_err(CreateError::Write)?; + + // --- Build and write the metadata table --- + + let mut meta_buf = file.alloc_buffer(metadata_length as usize); + let meta_bytes = meta_buf.as_mut(); + + let mut table_header = MetadataTableHeader::new_zeroed(); + table_header.signature = format::METADATA_TABLE_SIGNATURE; + + let mut entry_count: u16 = 0; + let entries_start = size_of::(); + let mut entry_write_offset = entries_start; + let mut item_data_offset = format::METADATA_TABLE_SIZE as u32; + + // Helper: write a metadata table entry. + let add_entry = |buf: &mut [u8], + entry_write_offset: &mut usize, + entry_count: &mut u16, + item_id: Guid, + offset: u32, + length: u32, + is_required: bool, + is_virtual_disk: bool| { + let entry = MetadataTableEntry { + item_id, + offset, + length, + flags: MetadataTableEntryFlags::new() + .with_is_required(is_required) + .with_is_virtual_disk(is_virtual_disk), + reserved2: 0, + }; + let bytes = entry.as_bytes(); + buf[*entry_write_offset..*entry_write_offset + bytes.len()].copy_from_slice(bytes); + *entry_write_offset += bytes.len(); + *entry_count += 1; + }; + + // 1. File parameters (IsRequired only). + let file_params_len = size_of::() as u32; + add_entry( + meta_bytes, + &mut entry_write_offset, + &mut entry_count, + format::FILE_PARAMETERS_ITEM_GUID, + item_data_offset, + file_params_len, + true, + false, + ); + let fp_data_offset = item_data_offset; + item_data_offset += file_params_len; + + // 2. 
Virtual disk size (IsRequired + IsVirtualDisk). + let disk_size_len = 8u32; // u64 + add_entry( + meta_bytes, + &mut entry_write_offset, + &mut entry_count, + format::VIRTUAL_DISK_SIZE_ITEM_GUID, + item_data_offset, + disk_size_len, + true, + true, + ); + let ds_data_offset = item_data_offset; + item_data_offset += disk_size_len; + + // 3. Logical sector size (IsRequired + IsVirtualDisk). + let sector_len = 4u32; // u32 + add_entry( + meta_bytes, + &mut entry_write_offset, + &mut entry_count, + format::LOGICAL_SECTOR_SIZE_ITEM_GUID, + item_data_offset, + sector_len, + true, + true, + ); + let lss_data_offset = item_data_offset; + item_data_offset += sector_len; + + // 4. Physical sector size (IsRequired + IsVirtualDisk). + add_entry( + meta_bytes, + &mut entry_write_offset, + &mut entry_count, + format::PHYSICAL_SECTOR_SIZE_ITEM_GUID, + item_data_offset, + sector_len, + true, + true, + ); + let pss_data_offset = item_data_offset; + item_data_offset += sector_len; + + // 5. Page 83 data (IsRequired + IsVirtualDisk). + let guid_len = 16u32; + add_entry( + meta_bytes, + &mut entry_write_offset, + &mut entry_count, + format::PAGE_83_ITEM_GUID, + item_data_offset, + guid_len, + true, + true, + ); + let p83_data_offset = item_data_offset; + item_data_offset += guid_len; + + // 6. Incomplete file (optional, IsRequired only). + if params.create_incomplete { + add_entry( + meta_bytes, + &mut entry_write_offset, + &mut entry_count, + format::INCOMPLETE_FILE_ITEM_GUID, + 0, + 0, + true, + false, + ); + } + + // Verify initial metadata items fit within a single hosting sector. + debug_assert!( + (item_data_offset as u64 - format::METADATA_TABLE_SIZE) <= format::MAX_HOSTING_SECTOR_SIZE + ); + + // Write the metadata table header. + table_header.entry_count = entry_count; + let th_bytes = table_header.as_bytes(); + meta_bytes[..th_bytes.len()].copy_from_slice(th_bytes); + + // Write the file parameters item data. + let fp = FileParameters { + block_size: params.block_size, + flags: FileParametersFlags::new() + .with_has_parent(params.has_parent) + .with_leave_blocks_allocated(params.is_fully_allocated), + }; + let fp_bytes = fp.as_bytes(); + let fp_off = fp_data_offset as usize; + meta_bytes[fp_off..fp_off + fp_bytes.len()].copy_from_slice(fp_bytes); + + // Write the virtual disk size item data. + let ds_off = ds_data_offset as usize; + meta_bytes[ds_off..ds_off + 8].copy_from_slice(¶ms.disk_size.to_le_bytes()); + + // Write the logical sector size item data. + let lss_off = lss_data_offset as usize; + meta_bytes[lss_off..lss_off + 4].copy_from_slice(¶ms.logical_sector_size.to_le_bytes()); + + // Write the physical sector size item data. + let pss_off = pss_data_offset as usize; + meta_bytes[pss_off..pss_off + 4].copy_from_slice(¶ms.physical_sector_size.to_le_bytes()); + + // Write the page 83 item data. + let p83_off = p83_data_offset as usize; + meta_bytes[p83_off..p83_off + 16].copy_from_slice(params.page_83_data.as_bytes()); + + // Write the metadata region. + file.write_from(metadata_offset, meta_buf) + .await + .map_err(CreateError::Write)?; + + // --- Zero the BAT region --- + + file.zero_range(bat_offset, bat_length) + .await + .map_err(CreateError::Write)?; + + // --- Set file size --- + + let mut file_size = bat_offset + bat_length; + + // Apply block alignment padding if requested. 
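+    // For example (illustrative numbers): with block_alignment = 2 MiB and a
+    // layout ending at 3 MiB + 64 KiB, round_up() pads the file size to 4 MiB.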
+ if params.block_alignment as u64 > format::REGION_ALIGNMENT + && params.block_alignment <= params.block_size + { + file_size = round_up(file_size, params.block_alignment as u64); + } + + file.set_file_size(file_size) + .await + .map_err(CreateError::Write)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::format; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + use zerocopy::FromBytes; + + /// Read a little-endian u64 from a byte slice at the given offset. + fn read_u64(data: &[u8], offset: usize) -> u64 { + u64::from_le_bytes(data[offset..offset + 8].try_into().unwrap()) + } + + /// Read a little-endian u32 from a byte slice at the given offset. + fn read_u32(data: &[u8], offset: usize) -> u32 { + u32::from_le_bytes(data[offset..offset + 4].try_into().unwrap()) + } + + /// Read a Header from the snapshot at the given offset. + fn read_header(snapshot: &[u8], offset: usize) -> Header { + Header::read_from_bytes(&snapshot[offset..offset + size_of::
()]) + .unwrap() + .clone() + } + + /// Read the region table header from the snapshot at the given offset. + fn read_region_table_header(snapshot: &[u8], offset: usize) -> RegionTableHeader { + RegionTableHeader::read_from_bytes( + &snapshot[offset..offset + size_of::()], + ) + .unwrap() + .clone() + } + + /// Read metadata table header from the metadata region. + fn read_metadata_table_header(snapshot: &[u8], meta_offset: usize) -> MetadataTableHeader { + MetadataTableHeader::read_from_bytes( + &snapshot[meta_offset..meta_offset + size_of::()], + ) + .unwrap() + .clone() + } + + /// Read a metadata table entry at the given index (0-based). + fn read_metadata_entry( + snapshot: &[u8], + meta_offset: usize, + index: usize, + ) -> MetadataTableEntry { + let entry_offset = meta_offset + + size_of::() + + index * size_of::(); + MetadataTableEntry::read_from_bytes( + &snapshot[entry_offset..entry_offset + size_of::()], + ) + .unwrap() + .clone() + } + + #[async_test] + async fn create_default_params() { + let disk_size = format::GB1; + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + + assert_eq!(params.disk_size, disk_size); + assert_eq!(params.block_size, format::DEFAULT_BLOCK_SIZE); + assert_eq!(params.logical_sector_size, 512); + assert_eq!(params.physical_sector_size, 512); + + let snapshot = file.snapshot(); + let file_size = file.file_size().await.unwrap(); + + // File identifier signature at offset 0. + let sig = read_u64(&snapshot, 0); + assert_eq!(sig, format::FILE_IDENTIFIER_SIGNATURE); + + // Header 1 at 64K. + let h1 = read_header(&snapshot, format::HEADER_OFFSET_1 as usize); + assert_eq!(h1.signature, format::HEADER_SIGNATURE); + assert!(format::validate_checksum( + &snapshot[format::HEADER_OFFSET_1 as usize + ..format::HEADER_OFFSET_1 as usize + format::HEADER_SIZE as usize], + 4 + )); + + // Header 2 at 128K. + let h2 = read_header(&snapshot, format::HEADER_OFFSET_2 as usize); + assert_eq!(h2.signature, format::HEADER_SIGNATURE); + assert!(format::validate_checksum( + &snapshot[format::HEADER_OFFSET_2 as usize + ..format::HEADER_OFFSET_2 as usize + format::HEADER_SIZE as usize], + 4 + )); + + // Region table 1 at 192K. + let rt = read_region_table_header(&snapshot, format::REGION_TABLE_OFFSET as usize); + assert_eq!(rt.signature, format::REGION_TABLE_SIGNATURE); + assert_eq!(rt.entry_count, 2); + assert!(format::validate_checksum( + &snapshot[format::REGION_TABLE_OFFSET as usize + ..format::REGION_TABLE_OFFSET as usize + format::REGION_TABLE_SIZE as usize], + 4 + )); + + // Region table 2 checksum. + assert!(format::validate_checksum( + &snapshot[format::ALT_REGION_TABLE_OFFSET as usize + ..format::ALT_REGION_TABLE_OFFSET as usize + format::REGION_TABLE_SIZE as usize], + 4 + )); + + // Metadata region starts at 2 MiB. + let meta_offset = 2 * format::MB1 as usize; + let mth = read_metadata_table_header(&snapshot, meta_offset); + assert_eq!(mth.signature, format::METADATA_TABLE_SIGNATURE); + assert_eq!(mth.entry_count, 5); + + // BAT region should be all zeros. + let bat_offset = 3 * format::MB1 as usize; + // Compute expected BAT length. 
+ let data_block_count = ceil_div(disk_size, format::DEFAULT_BLOCK_SIZE as u64); + let chunk_ratio = chunk_block_count(format::DEFAULT_BLOCK_SIZE, 512); + let bat_entry_count = data_block_count + data_block_count / chunk_ratio as u64; + let bat_sec_count = ceil_div(bat_entry_count, format::ENTRIES_PER_BAT_PAGE); + let bat_len = round_up( + bat_sec_count * format::CACHE_PAGE_SIZE, + format::REGION_ALIGNMENT, + ) as usize; + assert!( + snapshot[bat_offset..bat_offset + bat_len] + .iter() + .all(|&b| b == 0) + ); + + // File size should cover all regions. + assert_eq!(file_size, (bat_offset + bat_len) as u64); + } + + #[async_test] + async fn create_validates_disk_size_zero() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: 0, + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + } + + #[async_test] + async fn create_validates_disk_size_alignment() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: 1000, // not a multiple of 512 + logical_sector_size: 512, + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + } + + #[async_test] + async fn create_validates_sector_sizes() { + let file = InMemoryFile::new(0); + + // Invalid logical sector size. + let mut params = CreateParams { + disk_size: format::GB1, + logical_sector_size: 1024, + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + + // Invalid physical sector size. + let mut params = CreateParams { + disk_size: format::GB1, + physical_sector_size: 8192, + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + } + + #[async_test] + async fn create_validates_block_size() { + let file = InMemoryFile::new(0); + + // Not a multiple of 1 MiB. + let mut params = CreateParams { + disk_size: format::GB1, + block_size: 500_000, + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + + // Greater than maximum (256 MiB). + let mut params = CreateParams { + disk_size: format::GB1, + block_size: 512 * 1024 * 1024, + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + } + + #[async_test] + async fn create_validates_block_alignment() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + block_alignment: 3, // not a power of 2 + ..Default::default() + }; + assert!(create(&file, &mut params).await.is_err()); + } + + #[async_test] + async fn create_with_512_sectors() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + logical_sector_size: 512, + physical_sector_size: 512, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + assert_eq!(params.logical_sector_size, 512); + assert_eq!(params.physical_sector_size, 512); + + let snapshot = file.snapshot(); + let meta_offset = 2 * format::MB1 as usize; + + // Find the logical sector size entry (index 2) and physical (index 3). 
+ let lss_entry = read_metadata_entry(&snapshot, meta_offset, 2); + assert_eq!(lss_entry.item_id, format::LOGICAL_SECTOR_SIZE_ITEM_GUID); + let lss_val = read_u32(&snapshot, meta_offset + lss_entry.offset as usize); + assert_eq!(lss_val, 512); + + let pss_entry = read_metadata_entry(&snapshot, meta_offset, 3); + assert_eq!(pss_entry.item_id, format::PHYSICAL_SECTOR_SIZE_ITEM_GUID); + let pss_val = read_u32(&snapshot, meta_offset + pss_entry.offset as usize); + assert_eq!(pss_val, 512); + } + + #[async_test] + async fn create_with_4k_sectors() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + logical_sector_size: 4096, + physical_sector_size: 4096, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + assert_eq!(params.logical_sector_size, 4096); + assert_eq!(params.physical_sector_size, 4096); + + let snapshot = file.snapshot(); + let meta_offset = 2 * format::MB1 as usize; + + let lss_entry = read_metadata_entry(&snapshot, meta_offset, 2); + let lss_val = read_u32(&snapshot, meta_offset + lss_entry.offset as usize); + assert_eq!(lss_val, 4096); + + let pss_entry = read_metadata_entry(&snapshot, meta_offset, 3); + let pss_val = read_u32(&snapshot, meta_offset + pss_entry.offset as usize); + assert_eq!(pss_val, 4096); + } + + #[async_test] + async fn create_various_block_sizes() { + let block_sizes: Vec = vec![1, 2, 4, 8, 16, 32, 64, 128, 256] + .into_iter() + .map(|m| m * format::MB1 as u32) + .collect(); + + for &bs in &block_sizes { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + block_size: bs, + ..Default::default() + }; + let result = create(&file, &mut params).await; + assert!(result.is_ok(), "failed for block_size={bs}"); + + let snapshot = file.snapshot(); + let sig = read_u64(&snapshot, 0); + assert_eq!(sig, format::FILE_IDENTIFIER_SIGNATURE); + } + } + + #[async_test] + async fn create_block_alignment() { + // No alignment: file ends right after BAT. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + block_alignment: 0, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + let size_no_align = file.file_size().await.unwrap(); + + // With 2 MiB alignment. + let file2 = InMemoryFile::new(0); + let align = 2 * format::MB1 as u32; + let mut params2 = CreateParams { + disk_size: format::GB1, + block_alignment: align, + ..Default::default() + }; + create(&file2, &mut params2).await.unwrap(); + let size_aligned = file2.file_size().await.unwrap(); + + // Aligned size should be >= non-aligned and a multiple of alignment. + assert!(size_aligned >= size_no_align); + assert_eq!(size_aligned % align as u64, 0); + + // With alignment == block_size (should be honored since + // block_alignment <= block_size). + let file3 = InMemoryFile::new(0); + let mut params3 = CreateParams { + disk_size: format::GB1, + block_alignment: params.block_size, + ..Default::default() + }; + create(&file3, &mut params3).await.unwrap(); + let size3 = file3.file_size().await.unwrap(); + assert_eq!(size3 % params.block_size as u64, 0); + } + + #[async_test] + async fn create_differencing_disk() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + + let snapshot = file.snapshot(); + let meta_offset = 2 * format::MB1 as usize; + + // File parameters entry (index 0). 
+ let fp_entry = read_metadata_entry(&snapshot, meta_offset, 0); + assert_eq!(fp_entry.item_id, format::FILE_PARAMETERS_ITEM_GUID); + + // Read the FileParameters data. + let fp_off = meta_offset + fp_entry.offset as usize; + let fp = FileParameters::read_from_bytes( + &snapshot[fp_off..fp_off + size_of::()], + ) + .unwrap(); + assert!(fp.flags.has_parent()); + + // BAT entry count should include sector bitmap entries. + let data_block_count = ceil_div(format::GB1, format::DEFAULT_BLOCK_SIZE as u64); + let chunk_ratio = chunk_block_count(format::DEFAULT_BLOCK_SIZE, 512); + let sbm_count = ceil_div(data_block_count, chunk_ratio as u64); + let bat_entry_count_diff = sbm_count * (chunk_ratio as u64 + 1); + let bat_entry_count_nondiff = data_block_count + data_block_count / chunk_ratio as u64; + // Differencing should have more entries. + assert!(bat_entry_count_diff > bat_entry_count_nondiff); + } + + #[async_test] + async fn create_incomplete() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + create_incomplete: true, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + + let snapshot = file.snapshot(); + let meta_offset = 2 * format::MB1 as usize; + let mth = read_metadata_table_header(&snapshot, meta_offset); + assert_eq!(mth.entry_count, 6); + + // The 6th entry (index 5) should be the incomplete file item. + let entry = read_metadata_entry(&snapshot, meta_offset, 5); + assert_eq!(entry.item_id, format::INCOMPLETE_FILE_ITEM_GUID); + assert!(entry.flags.is_required()); + assert!(!entry.flags.is_virtual_disk()); + assert_eq!(entry.offset, 0); + assert_eq!(entry.length, 0); + } + + #[async_test] + async fn create_headers_have_different_sequence_numbers() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + + let snapshot = file.snapshot(); + let h1 = read_header(&snapshot, format::HEADER_OFFSET_1 as usize); + let h2 = read_header(&snapshot, format::HEADER_OFFSET_2 as usize); + + assert_eq!(h1.sequence_number, 0); + assert_eq!(h2.sequence_number, 1); + } + + #[async_test] + async fn create_region_tables_are_identical() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + ..Default::default() + }; + create(&file, &mut params).await.unwrap(); + + let snapshot = file.snapshot(); + let rt1_start = format::REGION_TABLE_OFFSET as usize; + let rt1_end = rt1_start + format::REGION_TABLE_SIZE as usize; + let rt2_start = format::ALT_REGION_TABLE_OFFSET as usize; + let rt2_end = rt2_start + format::REGION_TABLE_SIZE as usize; + + assert_eq!(&snapshot[rt1_start..rt1_end], &snapshot[rt2_start..rt2_end]); + } +} diff --git a/vm/devices/storage/vhdx/src/error.rs b/vm/devices/storage/vhdx/src/error.rs new file mode 100644 index 0000000000..b7722975f3 --- /dev/null +++ b/vm/devices/storage/vhdx/src/error.rs @@ -0,0 +1,349 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Error types for the VHDX parser. +//! +//! Separate error types are provided for each category of operation: +//! +//! - [`CreateError`] — file creation parameter validation +//! - [`OpenError`] — file open and format parsing +//! - [`VhdxIoError`] — runtime I/O (read, write, flush, trim, close) + +use thiserror::Error; + +use crate::log_task::LogTaskError; + +/// The VHDX write pipeline has been poisoned by a previous fatal error. 
+/// +/// Once set, all runtime I/O operations on the file fail permanently. +/// Produced by the log permits semaphore and LSN watermark when the +/// log or apply task encounters a fatal error. +#[derive(Debug, Clone, Error)] +#[error("VHDX pipeline failed: {0}")] +pub(crate) struct PipelineFailed(pub(crate) String); + +/// Errors returned by VHDX file creation ([`create::create`](crate::create::create)). +#[derive(Debug, Error)] +pub enum CreateError { + /// An I/O error occurred while writing the VHDX file. + #[error("write error")] + Write(#[source] std::io::Error), + + /// A parameter validation error. + #[error("invalid format parameters")] + InvalidFormat(#[source] InvalidFormatReason), +} + +/// Errors returned when opening or parsing a VHDX file. +/// +/// Covers file identifier validation, header parsing, region table +/// validation, metadata parsing, BAT loading, and parent locator +/// parsing. +#[derive(Debug, Error)] +pub(crate) enum OpenErrorInner { + /// An I/O error occurred while reading the VHDX file. + #[error("I/O error")] + Io(#[source] std::io::Error), + + /// The VHDX file is corrupt or has an invalid structure. + #[error("VHDX file is corrupt")] + Corrupt(#[from] CorruptionType), + + /// An open option (e.g. block alignment) is invalid. + #[error("invalid parameter")] + InvalidParameter(InvalidFormatReason), + + /// The write pipeline failed during writable open initialization. + #[error("pipeline failed during open")] + PipelineFailed(#[source] PipelineFailed), + + /// A metadata item could not be read through the page cache. + #[error("failed to access metadata page cache")] + MetadataCache(#[source] CacheError), +} + +/// Errors returned when opening or parsing a VHDX file. +/// +/// Covers file identifier validation, header parsing, region table +/// validation, metadata parsing, BAT loading, and parent locator +/// parsing. +#[derive(Debug, Error)] +#[error(transparent)] +pub struct OpenError(pub(crate) OpenErrorInner); + +impl> From for OpenError { + fn from(inner: T) -> Self { + OpenError(inner.into()) + } +} + +/// Errors returned by runtime VHDX I/O operations. +/// +/// Covers read, write, flush, trim, and close. Use [`kind()`](Self::kind) +/// to classify the error. +#[derive(Debug, Error)] +#[error(transparent)] +pub struct VhdxIoError(#[from] pub(crate) VhdxIoErrorInner); + +impl VhdxIoError { + /// Classify this error into a [`VhdxIoErrorKind`]. + pub fn kind(&self) -> VhdxIoErrorKind { + match self.0 { + VhdxIoErrorInner::ReadOnly => VhdxIoErrorKind::ReadOnly, + VhdxIoErrorInner::UnalignedIo => VhdxIoErrorKind::InvalidInput, + VhdxIoErrorInner::BeyondEndOfDisk => VhdxIoErrorKind::InvalidSector, + _ => VhdxIoErrorKind::Other, + } + } +} + +/// Classification of [`VhdxIoError`] for programmatic handling. +#[non_exhaustive] +pub enum VhdxIoErrorKind { + /// The file was opened read-only. + ReadOnly, + /// The I/O request had invalid parameters (e.g., unaligned). + InvalidInput, + /// The I/O request referenced a sector beyond the virtual disk. + InvalidSector, + /// Any other error (I/O failure, pipeline failure, etc.). + Other, +} + +/// Inner representation of [`VhdxIoError`]. 
+#[derive(Debug, Error)] +pub(crate) enum VhdxIoErrorInner { + #[error("failed to write header")] + WriteHeader(#[source] std::io::Error), + #[error("failed to flush")] + Flush(#[source] std::io::Error), + #[error("failed to commit cache")] + CommitCache(#[source] CacheError), + #[error("failed to read sector bitmap")] + ReadSectorBitmap(#[source] CacheError), + #[error("failed to zero block at file offset {file_offset:#x}")] + ZeroBlock { + #[source] + err: std::io::Error, + file_offset: u64, + }, + #[error("failed to extend file to {target_file_size:#x}")] + ExtendFile { + #[source] + err: std::io::Error, + target_file_size: u64, + }, + #[error("failed to truncate file to {target_file_size:#x}")] + TruncateFile { + #[source] + err: std::io::Error, + target_file_size: u64, + }, + #[error("failed to access BAT page cache")] + BatCache(#[source] CacheError), + #[error("failed to access sector bitmap page cache")] + SectorBitmapCache(#[source] CacheError), + #[error("VHDX file is opened read-only")] + ReadOnly, + #[error("VHDX file failed")] + Failed(#[source] PipelineFailed), + #[error("I/O request is not aligned to logical sector size")] + UnalignedIo, + #[error("I/O request extends beyond end of virtual disk")] + BeyondEndOfDisk, + #[error("failed to close log task")] + LogClose(#[source] LogTaskError), +} + +/// Errors from the page cache write path. +/// +/// Produced by [`PageCache::acquire_write`](crate::cache::PageCache::acquire_write) +/// when a page cannot be acquired for writing. +#[derive(Debug, Error)] +pub(crate) enum CacheError { + /// An I/O error occurred while loading the page from disk. + #[error("read error at file offset {file_offset:#x}")] + Read { + #[source] + err: std::io::Error, + file_offset: u64, + }, + + /// The write pipeline has been poisoned by a previous fatal error. + #[error("pipeline failed")] + PipelineFailed(#[source] PipelineFailed), +} + +/// Specific reasons a VHDX creation or parameter validation may fail. +/// +/// Each variant corresponds to a distinct validation error detected +/// when processing VHDX parameters (e.g. during file creation). +#[derive(Debug, Clone, Error)] +pub enum InvalidFormatReason { + /// The logical sector size is not 512 or 4096. + #[error("logical sector size must be 512 or 4096")] + InvalidLogicalSectorSize, + + /// The physical sector size is not 512 or 4096. + #[error("physical sector size must be 512 or 4096")] + InvalidPhysicalSectorSize, + + /// The disk size is zero. + #[error("disk size must be > 0")] + DiskSizeZero, + + /// The disk size is not a multiple of the logical sector size. + #[error("disk size must be a multiple of logical sector size")] + DiskSizeNotAligned, + + /// The disk size exceeds the maximum (64 TiB). + #[error("disk size exceeds maximum (64 TiB)")] + DiskSizeTooLarge, + + /// The block size is not a multiple of 1 MiB. + #[error("block size must be a multiple of 1 MiB")] + BlockSizeNotAligned, + + /// The block size exceeds the maximum (256 MiB). + #[error("block size exceeds maximum (256 MiB)")] + BlockSizeTooLarge, + + /// The block alignment is not a power of 2. + #[error("block alignment must be a power of 2")] + BlockAlignmentNotPowerOfTwo, + + /// The block size / logical sector size combination is invalid (chunk ratio is zero). + #[error("invalid block size / logical sector size combination")] + InvalidChunkRatio, + + /// The computed BAT entry count exceeds the absolute maximum. 
+ #[error("BAT entry count exceeds absolute maximum")] + BatEntryCountTooLarge, + + /// The computed BAT size exceeds the maximum. + #[error("BAT size exceeds maximum")] + BatSizeTooLarge, +} + +/// Specific reasons a VHDX file may be considered corrupt. +/// +/// Each variant corresponds to a distinct corruption condition detected +/// during parsing or validation. Covers all corruption types from the +/// VHDX implementation. +#[derive(Debug, Clone, Error)] +pub(crate) enum CorruptionType { + #[error("user metadata entry is marked as required")] + MetadataUserRequired, + #[error("BAT region is too small for the disk geometry")] + BatTooSmall, + #[error("no valid VHDX headers found")] + NoValidHeaders, + #[error("invalid log offset or length in header")] + InvalidLogOffsetOrLength, + #[error("log offset is not aligned")] + InvalidLogOffset, + #[error("log region extends beyond end of file")] + LogBeyondEndOfFile, + #[error("parent locator item is too small for its header")] + LocatorTooSmallForHeader, + #[error("parent locator item is too small for its entries")] + LocatorTooSmallForEntries, + #[error("parent locator entry key is invalid")] + InvalidLocatorEntryKey, + #[error("parent locator entry value is invalid")] + InvalidLocatorEntryValue, + #[error("metadata table has an invalid signature")] + InvalidMetadataTableSignature, + #[error("metadata table entry count too high")] + MetadataTableEntryCountTooHigh, + #[error("reserved metadata table field is nonzero")] + ReservedMetadataTableFieldNonzero, + #[error("duplicate metadata GUID")] + MetadataDuplicateGuid, + #[error("metadata entries have overlapping ranges")] + MetadataOverlapping, + #[error("user metadata entry count exceeded")] + MetadataUserCountExceeded, + #[error("file is empty")] + EmptyFile, + #[error("file parameters item has invalid size")] + InvalidFileParameterSize, + #[error("reserved file parameters field is nonzero")] + ReservedFileParametersFieldNonzero, + #[error("file parameters marked as virtual disk metadata")] + FileParametersMarkedVirtual, + #[error("invalid block size")] + InvalidBlockSize, + #[error("invalid logical sector size")] + InvalidLogicalSectorSize, + #[error("logical sector size marked as virtual disk metadata")] + LogicalSectorSizeMarkedVirtual, + #[error("invalid sector size")] + InvalidSectorSize, + #[error("logical sector size item has invalid size")] + InvalidLogicalSectorSizeSize, + #[error("disk size item marked as virtual disk metadata")] + DiskMarkedVirtual, + #[error("invalid virtual disk size")] + InvalidDiskSize, + #[error("both region tables are corrupt")] + RegionTablesBothCorrupt, + #[error("invalid entry count in region table")] + InvalidEntryCountInRegionTable, + #[error("reserved region table field is nonzero")] + ReservedRegionTableFieldNonzero, + #[error("duplicate region table entry")] + DuplicateRegionEntry, + #[error("invalid offset or length in region table entry")] + OffsetOrLengthInRegionTable, + #[error("unknown required region")] + UnknownRequiredRegion, + #[error("BAT or metadata region is missing")] + MissingBatOrMetadataRegion, + #[error("bad log entry encountered during replay")] + BadLogEntryOnReplay, + #[error("no valid log entries found")] + NoValidLogEntries, + #[error("BAT entry references range beyond end of file")] + RangeBeyondEof, + #[error("BAT entries reference overlapping file ranges")] + RangeCollision, + #[error("invalid block state in BAT entry")] + InvalidBlockState, + #[error("reserved BAT entry field is nonzero")] + ReservedBatEntryFieldNonzero, + 
#[error("partially present block has no sector bitmap")] + PartiallyPresentWithoutSectorBitmap, + #[error("trimmed range collides with allocated range")] + TrimmedRangeCollision, + #[error("unknown required metadata item")] + UnknownRequiredMetadata, + #[error("file is marked as incomplete")] + IncompleteFile, + #[error("required metadata item is missing")] + MissingRequiredMetadata, + #[error("header has log GUID but log is missing")] + MissingLogHasGuid, + #[error("invalid metadata entry offset")] + InvalidMetadataEntryOffset, + #[error("metadata region is too large")] + MetadataRegionTooLarge, + #[error("metadata item is too large")] + MetadataItemTooLarge, + #[error("total metadata size per category exceeded")] + TotalMetadataSizeExceeded, + #[error("metadata entry has zero item GUID")] + ZeroMetadataItemId, + #[error("invalid file identifier signature")] + InvalidFileIdentifier, + #[error("invalid parent locator key-value count")] + InvalidLocatorKeyValueCount, + #[error("log is full")] + LogFull, + #[error("log replay required (log GUID is non-zero)")] + LogReplayRequired, + #[error("unsupported VHDX version")] + UnsupportedVersion, + #[error("unsupported VHDX log version")] + UnsupportedLogVersion, +} diff --git a/vm/devices/storage/vhdx/src/flush.rs b/vm/devices/storage/vhdx/src/flush.rs new file mode 100644 index 0000000000..753d3af2f4 --- /dev/null +++ b/vm/devices/storage/vhdx/src/flush.rs @@ -0,0 +1,601 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Flush sequencer — FSN tracking and concurrent flush coalescing. +//! +//! The VHDX write path needs a way to order and coalesce file flush operations. +//! Multiple concurrent callers may request flushes simultaneously (e.g., several +//! I/O threads completing writes at the same time). Rather than issuing one file +//! flush per caller, the [`FlushSequencer`] coalesces them: if a flush is already +//! in progress that will satisfy a caller's flush sequence number (FSN), the +//! caller waits for that flush instead of issuing a new one. +//! +//! FSNs increase monotonically. Each `flush()` call is assigned the next FSN. +//! When the flush I/O completes, the completed FSN advances to match. Callers +//! can ensure all data through a specific FSN is flushed via +//! [`FlushSequencer::flush_through`]. +//! +//! Used by both the [log task](crate::log_task) (to make WAL entries durable +//! before advancing the log tail) and +//! [`VhdxFile::flush()`](crate::open::VhdxFile::flush) (for crash safety). + +use crate::AsyncFile; +use crate::open::FailureFlag; +use event_listener::Event; +use parking_lot::Mutex; +use std::sync::Arc; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering::Acquire; +use std::sync::atomic::Ordering::Release; + +/// Tracks flush sequence numbers and coalesces concurrent flush requests. +/// +/// Multiple callers can request flushes concurrently. The sequencer ensures +/// that at most one file flush is in progress at a time. If a flush is +/// in-flight that will satisfy a caller's FSN, the caller waits for that +/// flush instead of issuing a redundant one. +/// +/// FSNs increase monotonically. Each [`flush()`](FlushSequencer::flush) call +/// is assigned the next FSN. [`flush_through()`](FlushSequencer::flush_through) +/// ensures all data through a specific FSN is flushed (used by the log task +/// to enforce ordering constraints like "data must be flushed before BAT +/// is logged"). 
+pub(crate) struct FlushSequencer {
+    state: Mutex<FlushState>,
+    failure_flag: Option<Arc<FailureFlag>>,
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) struct Fsn(u64);
+
+impl Fsn {
+    pub const ZERO: Self = Fsn(0);
+
+    #[cfg(test)]
+    pub(crate) const fn new(value: u64) -> Self {
+        Self(value)
+    }
+}
+
+struct FlushState {
+    /// The most recently issued FSN that has been assigned. The next flush
+    /// will get `issued_fsn + 1`.
+    issued_fsn: Fsn,
+    /// The most recently completed FSN. All FSNs <= this value have been
+    /// durably flushed.
+    completed_fsn: Fsn,
+    /// The currently in-progress flush, if any. New callers whose target FSN
+    /// is covered by this flush wait on it rather than issuing a redundant
+    /// flush. If a caller needs a higher FSN, it replaces this with a new
+    /// flush (the old one continues running but is no longer advertised).
+    active_flush: Option<Arc<Flush>>,
+}
+
+/// A single in-progress flush operation. Waiters hold an `Arc` clone and
+/// poll [`wait_done`](Flush::wait_done) until the flusher signals completion.
+struct Flush {
+    /// The FSN that this flush will satisfy when it completes.
+    fsn: Fsn,
+    /// Set to `true` by the flusher after the I/O completes (success or failure).
+    done: AtomicBool,
+    /// Notified when `done` becomes `true`.
+    event: Event,
+}
+
+impl FlushSequencer {
+    /// Create a new flush sequencer with FSNs starting at 0.
+    pub fn new() -> Self {
+        Self {
+            state: Mutex::new(FlushState {
+                issued_fsn: Fsn::ZERO,
+                completed_fsn: Fsn::ZERO,
+                active_flush: None,
+            }),
+            failure_flag: None,
+        }
+    }
+
+    /// Set the failure flag for poisoning on I/O errors.
+    pub fn set_failure_flag(&mut self, flag: Arc<FailureFlag>) {
+        self.failure_flag = Some(flag);
+    }
+
+    /// Returns the next FSN that will be assigned to a flush request.
+    ///
+    /// This is `issued_fsn + 1`. Callers use this to capture the "current
+    /// point in time" after performing writes, so they can later
+    /// [`flush_through()`](Self::flush_through) to ensure those writes have
+    /// been flushed.
+    ///
+    /// # Why this is safe without "reserving" an FSN
+    ///
+    /// Because `completed_fsn ≤ issued_fsn` always holds, `current_fsn()`
+    /// always returns a value *strictly greater than* `completed_fsn`.
+    /// Therefore `flush_through(current_fsn())` will never short-circuit
+    /// — it always creates a new `Flush` (or waits for one with a
+    /// sufficiently high FSN) that calls `file.flush()` *after* the
+    /// caller's writes have been issued. No separate reserve/issue
+    /// step is needed.
+    pub fn current_fsn(&self) -> Fsn {
+        let state = self.state.lock();
+        Fsn(state.issued_fsn.0 + 1)
+    }
+
+    /// Request a file flush through the sequencer.
+    ///
+    /// Assigns the next FSN to this flush request and ensures that a file
+    /// flush completes that covers this FSN. Multiple concurrent `flush()`
+    /// calls are coalesced: if a flush is already in progress, the caller
+    /// waits for it to complete. If the completed FSN is still less than the
+    /// caller's FSN after the in-progress flush finishes, a new flush is
+    /// issued.
+    ///
+    /// Returns the completed FSN of the flush that satisfied this request.
+    /// Under concurrency this may be greater than the FSN originally assigned
+    /// to this caller, because a single flush can cover multiple requests.
+    pub async fn flush(&self, file: &impl AsyncFile) -> Result<Fsn, std::io::Error> {
+        self.flush_until(file, None).await
+    }
+
+    /// Ensure all data through the given FSN is durably flushed.
+    ///
+    /// If the FSN has already completed, returns immediately.
Otherwise, + /// bumps `issued_fsn` if needed and waits for a flush to complete that + /// covers the requested FSN. + /// + /// This is the safe replacement for the old `require_fsn` + `wait_for_fsn` + /// pattern — it both issues and waits in a single call. + pub async fn flush_through( + &self, + file: &impl AsyncFile, + fsn: Fsn, + ) -> Result<(), std::io::Error> { + let completed = self.flush_until(file, Some(fsn)).await?; + // Safety invariant: callers only pass FSNs from current_fsn(), which is + // issued_fsn + 1 at capture time. Since issued_fsn only grows, by the + // time we run, issued_fsn + 1 >= fsn, so the flush we create always + // covers the target. If this fires, a caller passed a bogus FSN. + assert!( + completed >= fsn, + "flush_through({fsn:?}) completed only through {completed:?}" + ); + Ok(()) + } + + /// Returns the most recently completed FSN. + pub fn completed_fsn(&self) -> Fsn { + self.state.lock().completed_fsn + } + + /// Inner workhorse: keep flushing until `completed_fsn >= target_fsn`. + /// + /// `target_fsn`: + /// - `None` — assign the next sequential FSN (used by `flush()`). + /// - `Some(fsn)` — ensure completion through that FSN (used by `flush_through()`). + /// + /// Returns the resolved FSN. + async fn flush_until( + &self, + file: &impl AsyncFile, + mut requested_fsn: Option, + ) -> Result { + // Step 1: find or create the Flush we'll execute. + // + // If there's an active flush covering our target FSN, wait for it. + // When it completes, loop back — if completed_fsn >= target we're + // done; otherwise we'll create a new flush ourselves. + // + // If no active flush covers our target, create one and `break` out + // of the loop to proceed to the I/O in step 2. + let my_flush = loop { + let active = { + let mut state = self.state.lock(); + let target_fsn = requested_fsn.unwrap_or(Fsn(state.issued_fsn.0 + 1)); + requested_fsn = Some(target_fsn); + + if target_fsn <= state.completed_fsn { + return Ok(state.completed_fsn); + } + + if let Some(active) = &state.active_flush + && active.fsn >= target_fsn + { + active.clone() + } else { + let fsn = Fsn(state.issued_fsn.0 + 1); + // flush_through() callers capture FSNs via current_fsn(), + // which returns issued_fsn + 1. Since issued_fsn only + // grows, the target can never exceed the next FSN we're + // about to issue. + assert!( + target_fsn <= fsn, + "flush_through target {target_fsn:?} exceeds next FSN {fsn:?}" + ); + let new_flush = Arc::new(Flush { + fsn, + done: false.into(), + event: Default::default(), + }); + state.active_flush = Some(new_flush.clone()); + state.issued_fsn = fsn; + break new_flush; + } + }; + active.wait_done().await; + }; + + // Step 2: perform the actual file flush and update state. + let r = file.flush().await; + let completed_fsn = { + let mut state = self.state.lock(); + if r.is_ok() { + state.completed_fsn = my_flush.fsn.max(state.completed_fsn); + } + if state + .active_flush + .as_ref() + .is_some_and(|p| Arc::ptr_eq(p, &my_flush)) + { + state.active_flush = None; + } + state.completed_fsn + }; + my_flush.done.store(true, Release); + my_flush.event.notify(usize::MAX); + r.inspect_err(|e| { + if let Some(flag) = &self.failure_flag { + flag.set(e); + } + })?; + Ok(completed_fsn) + } +} + +impl Flush { + /// Wait for this flush to complete (success or failure). 
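+    ///
+    /// The listener is registered before `done` is checked, so a notification
+    /// that races with the check cannot be missed.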
+ async fn wait_done(&self) { + loop { + let event = self.event.listen(); + if self.done.load(Acquire) { + break; + } + event.await; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + use std::borrow::Borrow; + use std::sync::Arc; + use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; + + // -- Helper wrappers -- + + /// File wrapper that counts how many times `flush()` is called. + struct CountingFile { + inner: InMemoryFile, + flush_count: AtomicU32, + } + + impl CountingFile { + fn new() -> Self { + Self { + inner: InMemoryFile::new(0), + flush_count: AtomicU32::new(0), + } + } + + fn flush_count(&self) -> u32 { + self.flush_count.load(Ordering::Relaxed) + } + } + + impl AsyncFile for CountingFile { + type Buffer = Vec; + + fn alloc_buffer(&self, len: usize) -> Vec { + self.inner.alloc_buffer(len) + } + + async fn read_into(&self, offset: u64, buf: Vec) -> Result, std::io::Error> { + self.inner.read_into(offset, buf).await + } + + async fn write_from( + &self, + offset: u64, + buf: impl Borrow> + Send + 'static, + ) -> Result<(), std::io::Error> { + self.inner.write_from(offset, buf).await + } + + async fn flush(&self) -> Result<(), std::io::Error> { + self.flush_count.fetch_add(1, Ordering::Relaxed); + self.inner.flush().await + } + + async fn file_size(&self) -> Result { + self.inner.file_size().await + } + + async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> { + self.inner.set_file_size(size).await + } + } + + /// File wrapper that can be configured to fail flushes. + struct FailingFile { + inner: InMemoryFile, + fail_flush: AtomicBool, + } + + impl FailingFile { + fn new(fail: bool) -> Self { + Self { + inner: InMemoryFile::new(0), + fail_flush: AtomicBool::new(fail), + } + } + + fn set_fail(&self, fail: bool) { + self.fail_flush.store(fail, Ordering::Relaxed); + } + } + + impl AsyncFile for FailingFile { + type Buffer = Vec; + + fn alloc_buffer(&self, len: usize) -> Vec { + self.inner.alloc_buffer(len) + } + + async fn read_into(&self, offset: u64, buf: Vec) -> Result, std::io::Error> { + self.inner.read_into(offset, buf).await + } + + async fn write_from( + &self, + offset: u64, + buf: impl Borrow> + Send + 'static, + ) -> Result<(), std::io::Error> { + self.inner.write_from(offset, buf).await + } + + async fn flush(&self) -> Result<(), std::io::Error> { + if self.fail_flush.load(Ordering::Relaxed) { + return Err(std::io::Error::other("flush failed")); + } + self.inner.flush().await + } + + async fn file_size(&self) -> Result { + self.inner.file_size().await + } + + async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> { + self.inner.set_file_size(size).await + } + } + + // -- Tests -- + + /// Single `flush()` call → FSN advances from 0 to 1. + #[async_test] + async fn test_basic_flush() { + let file = InMemoryFile::new(0); + let seq = FlushSequencer::new(); + let fsn = seq.flush(&file).await.unwrap(); + assert_eq!(fsn, Fsn::new(1)); + assert_eq!(seq.completed_fsn(), Fsn::new(1)); + } + + /// Three sequential `flush()` calls → FSNs are 1, 2, 3. 
+ #[async_test] + async fn test_fsn_monotonically_increasing() { + let file = InMemoryFile::new(0); + let seq = FlushSequencer::new(); + let fsn1 = seq.flush(&file).await.unwrap(); + let fsn2 = seq.flush(&file).await.unwrap(); + let fsn3 = seq.flush(&file).await.unwrap(); + assert_eq!(fsn1, Fsn::new(1)); + assert_eq!(fsn2, Fsn::new(2)); + assert_eq!(fsn3, Fsn::new(3)); + assert_eq!(seq.completed_fsn(), Fsn::new(3)); + } + + /// `current_fsn()` returns 1 initially, advances after each flush. + #[async_test] + async fn test_current_fsn() { + let file = InMemoryFile::new(0); + let seq = FlushSequencer::new(); + assert_eq!(seq.current_fsn(), Fsn::new(1)); + seq.flush(&file).await.unwrap(); + assert_eq!(seq.current_fsn(), Fsn::new(2)); + seq.flush(&file).await.unwrap(); + assert_eq!(seq.current_fsn(), Fsn::new(3)); + } + + /// Spawn two concurrent `flush()` tasks. Both should complete, and the + /// total number of actual file flushes should be ≤ 2 (possibly 1 if + /// coalesced). + #[async_test] + async fn test_concurrent_flush_coalescing() { + let file = Arc::new(CountingFile::new()); + let seq = Arc::new(FlushSequencer::new()); + + let file1 = file.clone(); + let seq1 = seq.clone(); + let t1 = + futures::FutureExt::boxed(async move { seq1.flush(file1.as_ref()).await.unwrap() }); + + let file2 = file.clone(); + let seq2 = seq.clone(); + let t2 = + futures::FutureExt::boxed(async move { seq2.flush(file2.as_ref()).await.unwrap() }); + + let (fsn1, fsn2) = futures::join!(t1, t2); + + // Both FSNs should be valid (1 or 2). + assert!((Fsn::new(1)..=Fsn::new(2)).contains(&fsn1)); + assert!((Fsn::new(1)..=Fsn::new(2)).contains(&fsn2)); + assert_ne!(fsn1, fsn2); + + // Completed FSN should be at least the max of both. + assert!(seq.completed_fsn() >= fsn1.max(fsn2)); + + // At most 2 actual file flushes should have occurred. + assert!(file.flush_count() <= 2); + } + + /// Call `flush()`, then `flush_through(fsn)` → returns immediately. + #[async_test] + async fn test_flush_through_already_completed() { + let file = CountingFile::new(); + let seq = FlushSequencer::new(); + let fsn = seq.flush(&file).await.unwrap(); + let count_before = file.flush_count(); + // Should return immediately since the FSN is already completed. + seq.flush_through(&file, fsn).await.unwrap(); + assert_eq!(seq.completed_fsn(), fsn); + // No additional flush should have been issued. + assert_eq!(file.flush_count(), count_before); + } + + /// Call `flush_through(fsn)` on an un-issued FSN → triggers a flush + /// and completes. + #[async_test] + async fn test_flush_through_triggers_flush() { + let file = CountingFile::new(); + let seq = FlushSequencer::new(); + // FSN 1 has not been issued yet. + seq.flush_through(&file, Fsn::new(1)).await.unwrap(); + assert!(seq.completed_fsn() >= Fsn::new(1)); + assert!(file.flush_count() >= 1); + } + + /// Spawn a concurrent `flush()` and `flush_through()` — both complete. 
+ #[async_test] + async fn test_flush_through_waits_for_in_progress() { + let file = Arc::new(CountingFile::new()); + let seq = Arc::new(FlushSequencer::new()); + + let file1 = file.clone(); + let seq1 = seq.clone(); + let flusher = futures::FutureExt::boxed(async move { + seq1.flush(file1.as_ref()).await.unwrap(); + }); + + let file2 = file.clone(); + let seq2 = seq.clone(); + let waiter = futures::FutureExt::boxed(async move { + seq2.flush_through(file2.as_ref(), Fsn::new(1)) + .await + .unwrap(); + }); + + futures::join!(flusher, waiter); + assert!(seq.completed_fsn() >= Fsn::new(1)); + } + + /// Use a file wrapper that fails on `flush()` → `flush()` returns error, + /// `completed_fsn` does NOT advance. + #[async_test] + async fn test_flush_error_propagated() { + let file = FailingFile::new(true); + let seq = FlushSequencer::new(); + let result = seq.flush(&file).await; + assert!(result.is_err()); + assert_eq!(seq.completed_fsn(), Fsn::ZERO); + } + + /// Use a file wrapper that fails on the first `flush()` but succeeds on + /// retry → first call fails, second `flush()` succeeds and FSN advances. + #[async_test] + async fn test_flush_error_recovery() { + let file = FailingFile::new(true); + let seq = FlushSequencer::new(); + + // First flush should fail. + let result = seq.flush(&file).await; + assert!(result.is_err()); + assert_eq!(seq.completed_fsn(), Fsn::ZERO); + + // Allow flushes to succeed now. + file.set_fail(false); + + // Second flush should succeed. + let fsn = seq.flush(&file).await.unwrap(); + assert!(fsn >= Fsn::new(1)); + assert!(seq.completed_fsn() >= fsn); + } + + /// `flush_through(0)` returns immediately — FSN 0 is always completed + /// since the sequencer starts with `completed_fsn = 0`. + #[async_test] + async fn test_flush_through_zero_is_noop() { + let file = CountingFile::new(); + let seq = FlushSequencer::new(); + seq.flush_through(&file, Fsn::ZERO).await.unwrap(); + assert_eq!(file.flush_count(), 0); + assert_eq!(seq.completed_fsn(), Fsn::ZERO); + } + + /// `flush_through` on a failing file propagates the error, and a + /// subsequent retry with a working file succeeds. + #[async_test] + async fn test_flush_through_error_recovery() { + let file = FailingFile::new(true); + let seq = FlushSequencer::new(); + + let result = seq.flush_through(&file, Fsn::new(1)).await; + assert!(result.is_err()); + assert_eq!(seq.completed_fsn(), Fsn::ZERO); + + file.set_fail(false); + seq.flush_through(&file, Fsn::new(1)).await.unwrap(); + assert!(seq.completed_fsn() >= Fsn::new(1)); + } + + /// Two concurrent `flush_through` calls for the same FSN — both + /// complete, and the total number of file flushes is reasonable. + #[async_test] + async fn test_concurrent_flush_through_same_fsn() { + let file = Arc::new(CountingFile::new()); + let seq = Arc::new(FlushSequencer::new()); + + let file1 = file.clone(); + let seq1 = seq.clone(); + let t1 = futures::FutureExt::boxed(async move { + seq1.flush_through(file1.as_ref(), Fsn::new(1)) + .await + .unwrap(); + }); + + let file2 = file.clone(); + let seq2 = seq.clone(); + let t2 = futures::FutureExt::boxed(async move { + seq2.flush_through(file2.as_ref(), Fsn::new(1)) + .await + .unwrap(); + }); + + futures::join!(t1, t2); + assert!(seq.completed_fsn() >= Fsn::new(1)); + } + + /// `flush()` returns `completed_fsn`, which may be higher than the + /// caller's own FSN if a concurrent flush for a higher FSN completed. 
+ #[async_test] + async fn test_flush_returns_completed_fsn() { + let file = InMemoryFile::new(0); + let seq = FlushSequencer::new(); + + // Sequential flushes: returned FSN equals completed_fsn. + let fsn = seq.flush(&file).await.unwrap(); + assert_eq!(fsn, seq.completed_fsn()); + } +} diff --git a/vm/devices/storage/vhdx/src/format.rs b/vm/devices/storage/vhdx/src/format.rs new file mode 100644 index 0000000000..473e642f20 --- /dev/null +++ b/vm/devices/storage/vhdx/src/format.rs @@ -0,0 +1,782 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! On-disk format types and constants for the VHDX file format. +//! +//! All structures use `#[repr(C)]` and derive zerocopy traits for safe +//! zero-copy parsing. + +#![allow(dead_code)] + +use bitfield_struct::bitfield; +use guid::Guid; +use guid::guid; +use zerocopy::FromBytes; +use zerocopy::Immutable; +use zerocopy::IntoBytes; +use zerocopy::KnownLayout; + +// --------------------------------------------------------------------------- +// Size constants +// --------------------------------------------------------------------------- + +/// 4 KiB. +pub const KB4: u64 = 4096; +/// 64 KiB. +pub const KB64: u64 = 65536; +/// 1 MiB. +pub const MB1: u64 = 1024 * 1024; +/// 1 GiB. +pub const GB1: u64 = 1024 * MB1; +/// 1 TiB. +pub const TB1: u64 = MB1 * MB1; + +/// Size of a log sector (4 KiB). +pub const LOG_SECTOR_SIZE: u64 = KB4; +/// Size of a large sector (64 KiB). +pub const LARGE_SECTOR_SIZE: u64 = KB64; +/// Alignment requirement for VHDX regions. +pub const REGION_ALIGNMENT: u64 = MB1; +/// Size of a sector bitmap block (1 MiB). +pub const SECTOR_BITMAP_BLOCK_SIZE: u64 = MB1; +/// Number of sectors described per chunk (sector bitmap block size * 8 bits). +pub const SECTORS_PER_CHUNK: u64 = SECTOR_BITMAP_BLOCK_SIZE * 8; +/// Minimum file offset that may be covered by log replay. +pub const LOGGABLE_OFFSET: u64 = REGION_TABLE_OFFSET; + +// --------------------------------------------------------------------------- +// Header area +// --------------------------------------------------------------------------- + +/// Total size of the header area (1 MiB). +pub const HEADER_AREA_SIZE: u64 = MB1; +/// On-disk size of a single header (4 KiB). +pub const HEADER_SIZE: u64 = KB4; +/// File offset of the first (primary) header. +pub const HEADER_OFFSET_1: u64 = LARGE_SECTOR_SIZE; +/// File offset of the second (alternate) header. +pub const HEADER_OFFSET_2: u64 = LARGE_SECTOR_SIZE * 2; + +/// Signature for [`Header`] (`'head'` as a little-endian u32). +pub const HEADER_SIGNATURE: u32 = u32::from_le_bytes(*b"head"); +/// Current VHDX format version. +pub const VERSION_1: u16 = 1; +/// Current log format version. +pub const LOG_VERSION: u16 = 0; + +// --------------------------------------------------------------------------- +// Region table +// --------------------------------------------------------------------------- + +/// Size of a region table (64 KiB). +pub const REGION_TABLE_SIZE: u64 = LARGE_SECTOR_SIZE; +/// File offset of the primary region table. +pub const REGION_TABLE_OFFSET: u64 = LARGE_SECTOR_SIZE * 3; +/// File offset of the alternate region table. +pub const ALT_REGION_TABLE_OFFSET: u64 = LARGE_SECTOR_SIZE * 4; + +/// Signature for [`RegionTableHeader`] (`'regi'` as a little-endian u32). +pub const REGION_TABLE_SIGNATURE: u32 = u32::from_le_bytes(*b"regi"); + +/// Maximum number of entries in a region table. 
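+/// (The 64 KiB table holds a 16-byte header followed by 32-byte entries, so
+/// this works out to 2047.)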
+pub const REGION_TABLE_MAX_ENTRY_COUNT: u64 = (REGION_TABLE_SIZE
+    - size_of::<RegionTableHeader>() as u64)
+    / size_of::<RegionTableEntry>() as u64;
+
+// ---------------------------------------------------------------------------
+// BAT
+// ---------------------------------------------------------------------------
+
+/// Well-known GUID identifying the BAT region.
+pub const BAT_REGION_GUID: Guid = guid!("2dc27766-f623-4200-9d64-115e9bfd4a08");
+
+/// Maximum BAT size in bytes (513 MiB).
+pub const MAXIMUM_BAT_SIZE: u64 = 513 * MB1;
+/// Maximum number of BAT entries.
+pub const MAXIMUM_BAT_ENTRY_COUNT: u64 = MAXIMUM_BAT_SIZE / size_of::<BatEntry>() as u64;
+/// Absolute maximum BAT entry count (2^30).
+pub const ABSOLUTE_MAXIMUM_BAT_ENTRY_COUNT: u64 = 1 << 30;
+/// Maximum block size (256 MiB).
+pub const MAXIMUM_BLOCK_SIZE: u64 = 256 * MB1;
+/// Maximum virtual disk size (64 TiB).
+pub const MAXIMUM_DISK_SIZE: u64 = 64 * TB1;
+
+// ---------------------------------------------------------------------------
+// Metadata
+// ---------------------------------------------------------------------------
+
+/// Well-known GUID identifying the metadata region.
+pub const METADATA_REGION_GUID: Guid = guid!("8b7ca206-4790-4b9a-b8fe-575f050f886e");
+
+/// Signature for [`MetadataTableHeader`] (`"metadata"` as a little-endian u64).
+pub const METADATA_TABLE_SIGNATURE: u64 = u64::from_le_bytes(*b"metadata");
+
+/// Size of the metadata table (64 KiB).
+pub const METADATA_TABLE_SIZE: u64 = LARGE_SECTOR_SIZE;
+
+/// Maximum number of metadata table entries.
+pub const METADATA_ENTRY_MAX_COUNT: u64 = (METADATA_TABLE_SIZE
+    - size_of::<MetadataTableHeader>() as u64)
+    / size_of::<MetadataTableEntry>() as u64;
+
+/// Maximum number of system (non-user) metadata entries.
+pub const METADATA_SYSTEM_ENTRY_MAX_COUNT: u64 = 1023;
+/// Maximum number of user metadata entries.
+pub const METADATA_USER_ENTRY_MAX_COUNT: u64 = 1024;
+
+/// Maximum size of the entire metadata region (128 MiB).
+pub const MAXIMUM_METADATA_REGION_SIZE: u64 = 128 * MB1;
+/// Maximum total metadata size per category (user or system) (40 MiB).
+pub const MAXIMUM_TOTAL_METADATA_SIZE_PER_CATEGORY: u64 = 40 * MB1;
+/// Maximum size of a single metadata item (1 MiB).
+pub const MAXIMUM_METADATA_ITEM_SIZE: u64 = MB1;
+
+// ---------------------------------------------------------------------------
+// Metadata item GUIDs
+// ---------------------------------------------------------------------------
+
+/// File parameters metadata item GUID.
+pub const FILE_PARAMETERS_ITEM_GUID: Guid = guid!("caa16737-fa36-4d43-b3b6-33f0aa44e76b");
+
+/// Virtual disk size metadata item GUID.
+pub const VIRTUAL_DISK_SIZE_ITEM_GUID: Guid = guid!("2fa54224-cd1b-4876-b211-5dbed83bf4b8");
+
+/// Page 83 data metadata item GUID.
+pub const PAGE_83_ITEM_GUID: Guid = guid!("beca12ab-b2e6-4523-93ef-c309e000c746");
+
+/// CHS (cylinder-head-sector) parameters metadata item GUID.
+pub const CHS_PARAMETERS_ITEM_GUID: Guid = guid!("da02d7bc-3d3a-423c-ac88-2a36ab21479b");
+
+/// Logical sector size metadata item GUID.
+pub const LOGICAL_SECTOR_SIZE_ITEM_GUID: Guid = guid!("8141bf1d-a96f-4709-ba47-f233a8faab5f");
+
+/// Physical sector size metadata item GUID.
+pub const PHYSICAL_SECTOR_SIZE_ITEM_GUID: Guid = guid!("cda348c7-445d-4471-9cc9-e9885251c556");
+
+/// Incomplete file metadata item GUID.
+///
+/// Present on VHDs that have been created but not yet fully initialized.
+/// Deleted when creation is complete.
+pub const INCOMPLETE_FILE_ITEM_GUID: Guid = guid!("71cc85f0-1b69-4e28-9558-c3bf83ae75d3");
+
+// ---------------------------------------------------------------------------
+// Parent locator GUIDs
+// ---------------------------------------------------------------------------
+
+/// Parent locator metadata item GUID.
+pub const PARENT_LOCATOR_ITEM_GUID: Guid = guid!("a8d35f2d-b30b-454d-abf7-d3d84834ab0c");
+
+/// Parent locator type GUID for VHDX parent references.
+pub const PARENT_LOCATOR_VHDX_TYPE_GUID: Guid = guid!("b04aefb7-d19e-4a81-b789-25b8e9445913");
+
+/// Maximum number of key-value pairs in a parent locator.
+pub const PARENT_LOCATOR_MAXIMUM_KEY_VALUE_COUNT: u16 = 256;
+
+// ---------------------------------------------------------------------------
+// PMEM label storage area
+// ---------------------------------------------------------------------------
+
+/// PMEM label storage area metadata item GUID.
+pub const PMEM_LABEL_STORAGE_AREA_ITEM_GUID: Guid = guid!("10e1ae8a-4b7e-4169-a40f-cd70de928393");
+
+/// Version 1 of the PMEM label storage area header.
+pub const PMEM_LABEL_STORAGE_AREA_VERSION_1: u16 = 1;
+
+// ---------------------------------------------------------------------------
+// Log signatures
+// ---------------------------------------------------------------------------
+
+/// Signature for [`LogEntryHeader`] (`'loge'` as a little-endian u32).
+pub const LOG_ENTRY_HEADER_SIGNATURE: u32 = u32::from_le_bytes(*b"loge");
+/// Signature for a data log descriptor (`'desc'` as a little-endian u32).
+pub const LOG_DESCRIPTOR_DATA_SIGNATURE: u32 = u32::from_le_bytes(*b"desc");
+/// Signature for a zero log descriptor (`'zero'` as a little-endian u32).
+pub const LOG_DESCRIPTOR_ZERO_SIGNATURE: u32 = u32::from_le_bytes(*b"zero");
+/// Signature for [`LogDataSector`] (`'data'` as a little-endian u32).
+pub const LOG_DATA_SECTOR_SIGNATURE: u32 = u32::from_le_bytes(*b"data");
+
+// ---------------------------------------------------------------------------
+// File identifier signature
+// ---------------------------------------------------------------------------
+
+/// Default block size (2 MiB).
+pub const DEFAULT_BLOCK_SIZE: u32 = 2 * MB1 as u32;
+
+/// Default logical/physical sector size (512 bytes).
+pub const DEFAULT_SECTOR_SIZE: u32 = 512;
+
+/// Default metadata region size (1 MiB).
+pub const DEFAULT_METADATA_REGION_SIZE: u32 = MB1 as u32;
+
+/// Default log region size (1 MiB).
+pub const DEFAULT_LOG_SIZE: u32 = MB1 as u32;
+
+/// Cache page size (4 KiB) — the granularity of BAT page I/O.
+pub const CACHE_PAGE_SIZE: u64 = KB4;
+
+/// Number of BAT entries per cache page (4096 / 8 = 512).
+pub const ENTRIES_PER_BAT_PAGE: u64 = CACHE_PAGE_SIZE / size_of::<BatEntry>() as u64;
+
+/// Maximum hosting sector size (64 KiB) — largest sector the metadata
+/// table items should fit in.
+pub const MAX_HOSTING_SECTOR_SIZE: u64 = KB64;
+
+/// Signature for [`FileIdentifier`] (`"vhdxfile"` as a little-endian u64).
+pub const FILE_IDENTIFIER_SIGNATURE: u64 = u64::from_le_bytes(*b"vhdxfile");
+
+// ===========================================================================
+// On-disk structures
+// ===========================================================================
+
+/// VHDX file identifier — the first structure at offset 0 in a VHDX file.
+///
+/// Contains the file signature and a UTF-16 creator string.
+#[repr(C)]
+#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)]
+pub struct FileIdentifier {
+    /// Must be [`FILE_IDENTIFIER_SIGNATURE`].
+ pub signature: u64, + /// UTF-16LE creator string (informational, not validated by parsers). + pub creator: [u16; 256], +} + +/// VHDX header — one of two dual headers located at [`HEADER_OFFSET_1`] +/// and [`HEADER_OFFSET_2`]. +/// +/// The header with the higher valid sequence number is the current header. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct Header { + /// Must be [`HEADER_SIGNATURE`]. + pub signature: u32, + /// CRC-32C checksum of the entire 4 KiB header (with this field zeroed). + pub checksum: u32, + /// Monotonically increasing sequence number. + pub sequence_number: u64, + /// GUID changed on every file-level write (metadata or data structure writes). + pub file_write_guid: Guid, + /// GUID changed on every virtual-disk data write. + pub data_write_guid: Guid, + /// GUID identifying the active log. Zero GUID means no active log. + pub log_guid: Guid, + /// Log format version (currently [`LOG_VERSION`]). + pub log_version: u16, + /// File format version (currently [`VERSION_1`]). + pub version: u16, + /// Length of the log region in bytes. + pub log_length: u32, + /// File offset of the log region. + pub log_offset: u64, +} + +/// Region table header — precedes an array of [`RegionTableEntry`] values. +/// +/// Two copies exist at [`REGION_TABLE_OFFSET`] and [`ALT_REGION_TABLE_OFFSET`]. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct RegionTableHeader { + /// Must be [`REGION_TABLE_SIGNATURE`]. + pub signature: u32, + /// CRC-32C checksum of the entire 64 KiB region table. + pub checksum: u32, + /// Number of valid entries following this header. + pub entry_count: u32, + /// Reserved, must be zero. + pub reserved: u32, +} + +/// A single entry in the region table. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct RegionTableEntry { + /// GUID identifying the region type (e.g. [`BAT_REGION_GUID`]). + pub guid: Guid, + /// File offset of the region. + pub file_offset: u64, + /// Length of the region in bytes. + pub length: u32, + /// Region table entry flags. + pub flags: RegionTableEntryFlags, +} + +/// Flags for a [`RegionTableEntry`]. +#[bitfield(u32)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes, PartialEq, Eq)] +pub struct RegionTableEntryFlags { + /// Whether this region is required for the file to be valid. + pub required: bool, + /// Reserved bits. + #[bits(31)] + _reserved: u32, +} + +/// BAT (Block Allocation Table) entry. +/// +/// Packs a 3-bit block state and a 44-bit file offset (in MiB units) +/// into a single `u64`. +#[bitfield(u64)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes, PartialEq, Eq)] +pub struct BatEntry { + /// Block state (see [`BatEntryState`]). + #[bits(3)] + pub state: u8, + /// Reserved bits. + #[bits(17)] + _reserved: u32, + /// File offset in MiB units (bits 20..63). + #[bits(44)] + pub file_offset_mb: u64, +} + +impl BatEntry { + /// Computes the full file offset in bytes. + pub fn file_offset(&self) -> u64 { + self.file_offset_mb() << 20 + } +} + +/// Block states stored in the low 3 bits of a [`BatEntry`]. +#[repr(u8)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum BatEntryState { + /// Block is not present. For data blocks: read from parent or return + /// zeros. For sector bitmap blocks: treat all sectors as not present. + NotPresent = 0, + /// Block has undefined content. Reading returns an error. + Undefined = 1, + /// Block is explicitly zero-filled. 
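+    /// Reads return zeros without consulting a parent, and the block is not
+    /// backed by file data.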
+ Zero = 2, + /// Block is unmapped (trimmed). Content is undefined. + Unmapped = 3, + // Values 4 and 5 are unused / reserved. + /// Block is fully present and backed by file data. + FullyPresent = 6, + /// Block is partially present. A sector bitmap describes which + /// sectors contain data. + PartiallyPresent = 7, +} + +impl BatEntryState { + /// Attempt to convert a raw `u8` state value to a [`BatEntryState`]. + pub fn from_raw(value: u8) -> Option { + match value { + 0 => Some(Self::NotPresent), + 1 => Some(Self::Undefined), + 2 => Some(Self::Zero), + 3 => Some(Self::Unmapped), + 6 => Some(Self::FullyPresent), + 7 => Some(Self::PartiallyPresent), + _ => None, + } + } + + /// Whether this state counts as "allocated" (backed by file space). + pub fn is_allocated(self) -> bool { + matches!(self, Self::FullyPresent | Self::PartiallyPresent) + } +} + +/// Metadata table header. +/// +/// Located at the start of the metadata region, followed by up to +/// [`METADATA_ENTRY_MAX_COUNT`] entries. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct MetadataTableHeader { + /// Must be [`METADATA_TABLE_SIGNATURE`]. + pub signature: u64, + /// Reserved, must be zero. + pub reserved: u16, + /// Number of valid entries following this header. + pub entry_count: u16, + /// Reserved, must be zero. + pub reserved2: [u32; 5], +} + +/// A single entry in the metadata table. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct MetadataTableEntry { + /// GUID identifying the metadata item. + pub item_id: Guid, + /// Offset of the item data relative to the start of the metadata region. + pub offset: u32, + /// Length of the item data in bytes. + pub length: u32, + /// Metadata entry flags. + pub flags: MetadataTableEntryFlags, + /// Reserved, must be zero. + pub reserved2: u32, +} + +/// Flags for a [`MetadataTableEntry`]. +#[bitfield(u32)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes, PartialEq, Eq)] +pub struct MetadataTableEntryFlags { + /// Whether this is a user metadata entry. + pub is_user: bool, + /// Whether this is a virtual disk metadata entry. + pub is_virtual_disk: bool, + /// Whether this metadata entry is required. + pub is_required: bool, + /// Reserved bits. + #[bits(29)] + _reserved: u32, +} + +/// File parameters metadata item. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct FileParameters { + /// Block size in bytes. + pub block_size: u32, + /// File parameters flags. + pub flags: FileParametersFlags, +} + +/// Flags for [`FileParameters`]. +#[bitfield(u32)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes, PartialEq, Eq)] +pub struct FileParametersFlags { + /// Whether blocks are left allocated (fixed VHD). + pub leave_blocks_allocated: bool, + /// Whether the disk has a parent (differencing disk). + pub has_parent: bool, + /// Reserved bits. + #[bits(30)] + _reserved: u32, +} + +/// CHS (cylinder-head-sector) parameters metadata item. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct ChsParameters { + /// Number of heads per cylinder. + pub heads_per_cylinder: u32, + /// Number of sectors per track. + pub sectors_per_track: u32, +} + +/// Parent locator header. +/// +/// Precedes an array of [`ParentLocatorEntry`] values within the parent +/// locator metadata item. 
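+///
+/// For VHDX parents the entries carry well-known keys such as
+/// `relative_path` and `absolute_win32_path`; see the
+/// `PARENT_LOCATOR_KEY_*` constants later in this module.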
+#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct ParentLocatorHeader { + /// GUID identifying the locator type (e.g. [`PARENT_LOCATOR_VHDX_TYPE_GUID`]). + pub locator_type: Guid, + /// Reserved, must be zero. + pub reserved: u16, + /// Number of key-value entries following this header. + pub key_value_count: u16, +} + +/// A single key-value entry in a parent locator. +/// +/// Keys and values are stored as UTF-16LE strings at the indicated offsets +/// within the parent locator metadata item. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct ParentLocatorEntry { + /// Byte offset of the key string (relative to the locator item start). + pub key_offset: u32, + /// Byte offset of the value string (relative to the locator item start). + pub value_offset: u32, + /// Length of the key string in bytes. + pub key_length: u16, + /// Length of the value string in bytes. + pub value_length: u16, +} + +/// Log entry header. +/// +/// Each log entry starts with this header, followed by an array of +/// log descriptors (data or zero). +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct LogEntryHeader { + /// Must be [`LOG_ENTRY_HEADER_SIGNATURE`]. + pub signature: u32, + /// CRC-32C checksum of the entire log entry (with this field zeroed). + pub checksum: u32, + /// Total length of this log entry in bytes (including header, descriptors, + /// and data sectors). + pub entry_length: u32, + /// Byte offset of the oldest active log entry (the "tail"). + pub tail: u32, + /// Sequence number of this log entry. + pub sequence_number: u64, + /// Number of descriptors in this entry. + pub descriptor_count: u32, + /// Reserved, must be zero. + pub reserved: u32, + /// Must match the log GUID in the active header. + pub log_guid: Guid, + /// File size after all entries up to and including this one are applied. + pub flushed_file_offset: u64, + /// File size required to write this entry's data. + pub last_file_offset: u64, +} + +/// Log data descriptor — describes a range of data to write from log +/// data sectors. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct LogDataDescriptor { + /// Must be [`LOG_DESCRIPTOR_DATA_SIGNATURE`]. + pub signature: u32, + /// Number of trailing bytes from the previous 4 KiB sector that begin + /// this data region. + pub trailing_bytes: u32, + /// Number of leading bytes from the next 4 KiB sector that end this + /// data region. + pub leading_bytes: u64, + /// File offset where this data should be written. + pub file_offset: u64, + /// Sequence number (must match the log entry's sequence number). + pub sequence_number: u64, +} + +/// Log zero descriptor — describes a range of the file that should be +/// zero-filled during replay. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct LogZeroDescriptor { + /// Must be [`LOG_DESCRIPTOR_ZERO_SIGNATURE`]. + pub signature: u32, + /// Reserved, must be zero. + pub reserved: u32, + /// Length of the zero-filled range in bytes. + pub length: u64, + /// File offset where zeroing should begin. + pub file_offset: u64, + /// Sequence number (must match the log entry's sequence number). + pub sequence_number: u64, +} + +/// A single 4 KiB data sector within a log entry (following the +/// descriptors). 
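+///
+/// The sequence number is split between the first and last four bytes of the
+/// sector so that a torn (partially written) sector can be detected during
+/// log replay.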
+#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct LogDataSector { + /// Must be [`LOG_DATA_SECTOR_SIGNATURE`]. + pub signature: u32, + /// High 32 bits of the sequence number. + pub sequence_high: u32, + /// Payload data (4084 bytes). + pub data: [u8; 4084], + /// Low 32 bits of the sequence number. + pub sequence_low: u32, +} + +/// PMEM label storage area header. +/// +/// Describes label storage for NVDIMM-backed VHDX files. +#[repr(C)] +#[derive(Debug, Clone, FromBytes, IntoBytes, Immutable, KnownLayout)] +pub struct PmemLabelStorageAreaHeader { + /// Version of this header (currently [`PMEM_LABEL_STORAGE_AREA_VERSION_1`]). + pub version: u16, + /// Reserved, must be zero. + pub reserved: u16, + /// GUID identifying the address abstraction type. + pub address_abstraction_type: Guid, + /// Byte offset of the label data (relative to this item). + pub data_offset: u32, + /// Length of the label data in bytes. + pub data_length: u32, +} + +// =========================================================================== +// Checksum helpers +// =========================================================================== + +/// Compute the CRC-32C checksum of `data`, treating the 4 bytes at +/// `checksum_offset` as zero during computation. +/// +/// This is used for headers and region tables where the checksum field +/// itself must be excluded from the CRC calculation. +pub fn compute_checksum(data: &[u8], checksum_offset: usize) -> u32 { + let mut crc = crc32c::crc32c(&data[..checksum_offset]); + crc = crc32c::crc32c_append(crc, &[0; 4]); + crc32c::crc32c_append(crc, &data[checksum_offset + 4..]) +} + +/// Validate that the CRC-32C checksum stored in `data` at `checksum_offset` +/// matches the computed value. +pub fn validate_checksum(data: &[u8], checksum_offset: usize) -> bool { + let stored = u32::from_le_bytes( + data[checksum_offset..checksum_offset + 4] + .try_into() + .unwrap(), + ); + let computed = compute_checksum(data, checksum_offset); + stored == computed +} + +// =========================================================================== +// Well-known parent locator key names +// =========================================================================== + +/// Parent linkage key name (UTF-16LE). +pub const PARENT_LOCATOR_KEY_PARENT_LINKAGE: &str = "parent_linkage"; +/// Alternative parent linkage key name (UTF-16LE). +pub const PARENT_LOCATOR_KEY_ALT_PARENT_LINKAGE: &str = "parent_linkage2"; +/// Relative path key name (UTF-16LE). +pub const PARENT_LOCATOR_KEY_RELATIVE_PATH: &str = "relative_path"; +/// Absolute Win32 path key name (UTF-16LE). +pub const PARENT_LOCATOR_KEY_ABSOLUTE_PATH: &str = "absolute_win32_path"; +/// Volume path key name (UTF-16LE). +pub const PARENT_LOCATOR_KEY_VOLUME_PATH: &str = "volume_path"; + +// =========================================================================== +// Tests +// =========================================================================== + +// Compile-time layout assertions. +const _: () = { + // FileIdentifier: 8 bytes signature + 256 * 2 bytes creator = 520 bytes + assert!(size_of::() == 8 + 256 * 2); + // Header: Signature(4) + Checksum(4) + SequenceNumber(8) + + // FileWriteGuid(16) + DataWriteGuid(16) + LogGuid(16) + + // LogVersion(2) + Version(2) + LogLength(4) + LogOffset(8) = 80 + assert!(size_of::
() == 80); + assert!(size_of::() == 16); + // RegionTableEntry: GUID(16) + FileOffset(8) + Length(4) + Flags(4) = 32 + assert!(size_of::() == 32); + assert!(size_of::() == 32); + assert!(size_of::() == 32); + assert!(size_of::() == 64); + assert!(size_of::() == 32); + assert!(size_of::() == 32); + assert!(size_of::() == KB4 as usize); + // System + user entries should equal the maximum entry count. + assert!( + METADATA_SYSTEM_ENTRY_MAX_COUNT + METADATA_USER_ENTRY_MAX_COUNT == METADATA_ENTRY_MAX_COUNT + ); +}; + +#[cfg(test)] +mod tests { + use super::*; + use zerocopy::FromZeros; + + #[test] + fn bat_entry_accessors() { + let entry = BatEntry::new().with_state(6).with_file_offset_mb(2); + assert_eq!(entry.state(), 6); + assert_eq!(entry.file_offset_mb(), 2); + assert_eq!(entry.file_offset(), 2 * MB1); + } + + #[test] + fn bat_entry_state_roundtrip() { + for &(raw, expected) in &[ + (0, BatEntryState::NotPresent), + (1, BatEntryState::Undefined), + (2, BatEntryState::Zero), + (3, BatEntryState::Unmapped), + (6, BatEntryState::FullyPresent), + (7, BatEntryState::PartiallyPresent), + ] { + assert_eq!(BatEntryState::from_raw(raw), Some(expected)); + } + // Values 4, 5 are undefined. + assert_eq!(BatEntryState::from_raw(4), None); + assert_eq!(BatEntryState::from_raw(5), None); + } + + #[test] + fn file_parameters_flags() { + let flags = FileParametersFlags::new() + .with_leave_blocks_allocated(true) + .with_has_parent(true); + let params = FileParameters { + block_size: 0, + flags, + }; + assert!(params.flags.leave_blocks_allocated()); + assert!(params.flags.has_parent()); + + let params2 = FileParameters { + block_size: 0, + flags: FileParametersFlags::new(), + }; + assert!(!params2.flags.leave_blocks_allocated()); + assert!(!params2.flags.has_parent()); + } + + #[test] + fn region_table_entry_flags() { + let entry = RegionTableEntry { + guid: Guid::ZERO, + file_offset: 0, + length: 0, + flags: RegionTableEntryFlags::new().with_required(true), + }; + assert!(entry.flags.required()); + + let entry2 = RegionTableEntry { + guid: Guid::ZERO, + file_offset: 0, + length: 0, + flags: RegionTableEntryFlags::new(), + }; + assert!(!entry2.flags.required()); + } + + #[test] + fn checksum_roundtrip() { + // Create a fake header-sized buffer and verify checksum round-trip. + let mut data = vec![0u8; HEADER_SIZE as usize]; + // Write the header signature. + data[0..4].copy_from_slice(&HEADER_SIGNATURE.to_le_bytes()); + // Checksum field is at offset 4. 
+ let checksum_offset = 4; + let crc = compute_checksum(&data, checksum_offset); + data[checksum_offset..checksum_offset + 4].copy_from_slice(&crc.to_le_bytes()); + assert!(validate_checksum(&data, checksum_offset)); + } + + #[test] + fn zero_copy_roundtrip_header() { + let mut header = Header::new_zeroed(); + header.signature = HEADER_SIGNATURE; + header.version = VERSION_1; + header.sequence_number = 42; + + let bytes = header.as_bytes(); + let parsed = Header::read_from_bytes(bytes).unwrap(); + assert_eq!(parsed.signature, HEADER_SIGNATURE); + assert_eq!(parsed.version, VERSION_1); + assert_eq!(parsed.sequence_number, 42); + } + + #[test] + fn zero_copy_roundtrip_bat_entry() { + let entry = BatEntry::new().with_state(6).with_file_offset_mb(100); + let bytes = entry.as_bytes(); + let parsed = BatEntry::read_from_bytes(bytes).unwrap(); + assert_eq!(parsed.state(), 6); + assert_eq!(parsed.file_offset_mb(), 100); + } + + #[test] + fn metadata_table_entry_flags() { + let flags = MetadataTableEntryFlags::new() + .with_is_user(true) + .with_is_virtual_disk(true) + .with_is_required(true); + let entry = MetadataTableEntry { + item_id: Guid::ZERO, + offset: 0, + length: 0, + flags, + reserved2: 0, + }; + assert!(entry.flags.is_user()); + assert!(entry.flags.is_virtual_disk()); + assert!(entry.flags.is_required()); + } +} diff --git a/vm/devices/storage/vhdx/src/header.rs b/vm/devices/storage/vhdx/src/header.rs new file mode 100644 index 0000000000..6c0daf309c --- /dev/null +++ b/vm/devices/storage/vhdx/src/header.rs @@ -0,0 +1,520 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Dual header parsing, validation, and write-mode management for VHDX files. +//! +//! Reads both VHDX headers, validates their signatures and CRC-32C checksums, +//! selects the active header (higher sequence number), and validates log +//! region parameters. +//! +//! Also provides [`HeaderState`], which serializes all header writes behind +//! a `futures::lock::Mutex` and exposes the current [`WriteMode`] via an +//! `AtomicU8` for lock-free hot-path checks. + +use crate::AsyncFile; +use crate::error::CorruptionType; +use crate::error::OpenError; +use crate::error::OpenErrorInner; +use crate::flush::FlushSequencer; +use crate::format; +use crate::format::Header; +use guid::Guid; +use std::sync::atomic::AtomicU8; +use std::sync::atomic::Ordering; +use zerocopy::FromBytes; +use zerocopy::FromZeros; +use zerocopy::IntoBytes; + +/// Parsed and validated header data extracted from a VHDX file. +pub(crate) struct ParsedHeader { + /// The active header's sequence number. + pub sequence_number: u64, + /// GUID changed on every file-level write. + pub file_write_guid: Guid, + /// GUID changed on every virtual-disk data write. + pub data_write_guid: Guid, + /// GUID identifying the active log. Zero means no active log. + pub log_guid: Guid, + /// File offset of the log region. + pub log_offset: u64, + /// Length of the log region in bytes. + pub log_length: u32, + /// True if header 1 was chosen as the active header. + pub first_header_current: bool, +} + +/// Read a single 4 KiB header from the file and validate its signature +/// and CRC-32C checksum. Returns `Some(header)` if valid, `None` otherwise. +async fn read_and_validate_header( + file: &impl AsyncFile, + offset: u64, +) -> Result, OpenError> { + let buf = file.alloc_buffer(format::HEADER_SIZE as usize); + let buf = file + .read_into(offset, buf) + .await + .map_err(OpenErrorInner::Io)?; + + // Check signature. 
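+    // A failed parse or wrong signature is not treated as an error by
+    // itself: this slot is simply reported as invalid (`None`) so the
+    // caller can fall back to the other header slot.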
+ let header = match Header::read_from_prefix(buf.as_ref()) { + Ok((h, _)) => h, + Err(_) => return Ok(None), + }; + if header.signature != format::HEADER_SIGNATURE { + return Ok(None); + } + + // Validate CRC-32C checksum (checksum field is at byte offset 4). + if !format::validate_checksum(buf.as_ref(), 4) { + return Ok(None); + } + + Ok(Some(header.clone())) +} + +/// Read both headers from the file, validate them, and return the active one. +/// +/// If both headers are valid, the one with the higher sequence number wins. +/// If only one is valid, it is used. If neither is valid, returns an error. +pub(crate) async fn parse_headers( + file: &impl AsyncFile, + file_length: u64, +) -> Result { + let header1 = read_and_validate_header(file, format::HEADER_OFFSET_1).await?; + let header2 = read_and_validate_header(file, format::HEADER_OFFSET_2).await?; + + // Choose the active header. + let (header, first_header_current) = match (&header1, &header2) { + (Some(h1), Some(h2)) => { + if h1.sequence_number >= h2.sequence_number { + (h1, true) + } else { + (h2, false) + } + } + (Some(h1), None) => (h1, true), + (None, Some(h2)) => (h2, false), + (None, None) => return Err((CorruptionType::NoValidHeaders).into()), + }; + + // Validate version. + if header.version != format::VERSION_1 { + return Err((CorruptionType::UnsupportedVersion).into()); + } + + // If log GUID is non-zero, validate log version. + if header.log_guid != Guid::ZERO && header.log_version != format::LOG_VERSION { + return Err((CorruptionType::UnsupportedLogVersion).into()); + } + + // Validate log offset and length alignment. + if !header.log_offset.is_multiple_of(format::REGION_ALIGNMENT) + || !(header.log_length as u64).is_multiple_of(format::REGION_ALIGNMENT) + { + return Err((CorruptionType::InvalidLogOffsetOrLength).into()); + } + + let (log_offset, log_length) = if header.log_length == 0 { + // Log is empty — log GUID must also be zero. + if header.log_guid != Guid::ZERO { + return Err((CorruptionType::MissingLogHasGuid).into()); + } + (0, 0) + } else { + // Log is present — validate offset and bounds. + if header.log_offset < format::HEADER_AREA_SIZE { + return Err((CorruptionType::InvalidLogOffset).into()); + } + if header.log_offset.saturating_add(header.log_length as u64) > file_length { + return Err((CorruptionType::LogBeyondEndOfFile).into()); + } + (header.log_offset, header.log_length) + }; + + Ok(ParsedHeader { + sequence_number: header.sequence_number, + file_write_guid: header.file_write_guid, + data_write_guid: header.data_write_guid, + log_guid: header.log_guid, + log_offset, + log_length, + first_header_current, + }) +} + +/// Serialize a VHDX header to a 4 KiB buffer with CRC and determine +/// the target offset (non-current header slot). +/// +/// Returns `(buffer, file_offset)` ready for `write_from`. 
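+///
+/// A minimal usage sketch (the variable names here are illustrative,
+/// chosen by the caller rather than defined in this module):
+///
+/// ```ignore
+/// let (buf, offset) = serialize_header(
+///     &file,
+///     sequence_number,
+///     file_write_guid,
+///     data_write_guid,
+///     Guid::ZERO, // no active log
+///     log_offset,
+///     log_length,
+///     first_header_current,
+/// );
+/// file.write_from(offset, buf).await?;
+/// ```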
+pub(crate) fn serialize_header( + file: &F, + sequence_number: u64, + file_write_guid: Guid, + data_write_guid: Guid, + log_guid: Guid, + log_offset: u64, + log_length: u32, + first_header_current: bool, +) -> (F::Buffer, u64) { + let mut header = Header::new_zeroed(); + header.signature = format::HEADER_SIGNATURE; + header.sequence_number = sequence_number; + header.file_write_guid = file_write_guid; + header.data_write_guid = data_write_guid; + header.log_guid = log_guid; + header.log_version = format::LOG_VERSION; + header.version = format::VERSION_1; + header.log_length = log_length; + header.log_offset = log_offset; + header.checksum = 0; + + let mut buf = file.alloc_buffer(format::HEADER_SIZE as usize); + let hdr_bytes = header.as_bytes(); + buf.as_mut()[..hdr_bytes.len()].copy_from_slice(hdr_bytes); + let crc = format::compute_checksum(buf.as_ref(), 4); + buf.as_mut()[4..8].copy_from_slice(&crc.to_le_bytes()); + + let offset = if first_header_current { + format::HEADER_OFFSET_2 + } else { + format::HEADER_OFFSET_1 + }; + + (buf, offset) +} + +/// The kind of modification being made to the VHDX file. Controls which +/// GUIDs are updated in the header before the first write. +/// +/// Values are ordered: `FileWritable < DataWritable`. Once `DataWritable` +/// is reached, `FileWritable` is a no-op. The `#[repr(u8)]` layout +/// matches the `AtomicU8` stored in [`HeaderState`]. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +#[repr(u8)] +pub(crate) enum WriteMode { + /// The file is being modified (metadata only, e.g. resize/compact). + /// Updates FileWriteGuid. + FileWritable = 1, + /// User-visible virtual disk data is being modified. + /// Updates both FileWriteGuid and DataWriteGuid. + DataWritable = 2, +} + +/// Value used in [`HeaderState::write_mode`] when no write has occurred yet. +const WRITE_MODE_NONE: u8 = 0; + +/// Mutable header state, serialized behind a `futures::lock::Mutex`. +/// +/// All header writes go through [`HeaderState::write()`], which holds the +/// async mutex across the serialize→write→flush→flip sequence, preventing +/// concurrent header writes from interleaving. +/// +/// The current [`WriteMode`] is also published to an `AtomicU8` so that +/// the hot path (`enable_write_mode`) can check it with a single atomic +/// load and avoid taking any lock. +pub(crate) struct HeaderState { + /// Current write mode, published atomically for lock-free fast-path + /// checks. Updated *after* the header is on stable storage. + write_mode: AtomicU8, + /// Data-write GUID, stored separately for the sync public accessor + /// `VhdxFile::data_write_guid()`. Updated under the async mutex, + /// read via `parking_lot::Mutex` (or `AtomicU64` pair if needed). + /// Here we use `parking_lot::Mutex` since it's a brief, non-contended + /// read. + data_write_guid: parking_lot::Mutex, + /// File offset of the log region (immutable after open). + log_offset: u64, + /// Length of the log region in bytes (immutable after open). + log_length: u32, + /// Async mutex serializing all header writes. + inner: futures::lock::Mutex, +} + +/// Fields protected by the async mutex inside [`HeaderState`]. +struct HeaderStateInner { + /// Current header sequence number (bumped on every write). + sequence_number: u64, + /// GUID changed on every file-level write. + file_write_guid: Guid, + /// GUID changed on every virtual-disk data write. + data_write_guid: Guid, + /// Active log GUID. Zero when no log task is running. 
+ log_guid: Guid, + /// True if header slot 1 (offset 64 KiB) is the current header. + first_header_current: bool, +} + +impl HeaderState { + /// Create a new `HeaderState` from a parsed header. + pub fn new(header: &ParsedHeader) -> Self { + Self { + write_mode: AtomicU8::new(WRITE_MODE_NONE), + data_write_guid: parking_lot::Mutex::new(header.data_write_guid), + log_offset: header.log_offset, + log_length: header.log_length, + inner: futures::lock::Mutex::new(HeaderStateInner { + sequence_number: header.sequence_number, + file_write_guid: header.file_write_guid, + data_write_guid: header.data_write_guid, + log_guid: header.log_guid, + first_header_current: header.first_header_current, + }), + } + } + + /// Lock-free check: is the current write mode ≥ `mode`? + pub fn is_mode_enabled(&self, mode: WriteMode) -> bool { + self.write_mode.load(Ordering::Acquire) >= mode as u8 + } + + /// Read the current data-write GUID (sync, brief lock). + pub fn data_write_guid(&self) -> Guid { + *self.data_write_guid.lock() + } + + /// Get the log region offset and length (immutable after open). + pub fn log_region(&self) -> (u64, u32) { + (self.log_offset, self.log_length) + } + + /// Read the current sequence number. Requires the async lock. + #[cfg(test)] + pub async fn sequence_number(&self) -> u64 { + self.inner.lock().await.sequence_number + } + + /// Read the current write mode (for test assertions). + #[cfg(test)] + pub fn write_mode(&self) -> Option { + match self.write_mode.load(Ordering::Acquire) { + 0 => None, + 1 => Some(WriteMode::FileWritable), + 2 => Some(WriteMode::DataWritable), + _ => unreachable!(), + } + } + + /// Ensure the file is in at least write mode `mode`. + /// + /// Hot path (mode already enabled): single atomic load, no lock. + /// + /// Cold path (mode transition): acquires the async mutex, generates + /// new GUIDs, writes the header to the non-current slot, flushes, + /// flips the active slot, then publishes the new mode atomically. + /// + /// Safe to call concurrently — the async mutex serializes transitions. + pub async fn enable_write_mode( + &self, + mode: WriteMode, + file: &impl AsyncFile, + flush_sequencer: Option<&FlushSequencer>, + ) -> Result<(), std::io::Error> { + // Hot path: single atomic load. + if self.is_mode_enabled(mode) { + return Ok(()); + } + + // Cold path: serialize under async mutex. + let mut inner = self.inner.lock().await; + + // Double-check under lock (another caller may have raced). + if self.write_mode.load(Ordering::Relaxed) >= mode as u8 { + return Ok(()); + } + + // Generate new GUIDs. + inner.file_write_guid = Guid::new_random(); + if mode >= WriteMode::DataWritable { + inner.data_write_guid = Guid::new_random(); + *self.data_write_guid.lock() = inner.data_write_guid; + } + + // Write header, flush, flip slot. + self.write_header(&mut inner, file, flush_sequencer).await?; + + // Publish the mode change — only after the header is on stable storage. + self.write_mode.store(mode as u8, Ordering::Release); + + Ok(()) + } + + /// Set the log GUID and write a header update. Used by `open_writable` + /// to mark the file as dirty before spawning the log task. + pub async fn set_log_guid( + &self, + log_guid: Guid, + file: &impl AsyncFile, + flush_sequencer: Option<&FlushSequencer>, + ) -> Result<(), std::io::Error> { + let mut inner = self.inner.lock().await; + inner.log_guid = log_guid; + self.write_header(&mut inner, file, flush_sequencer).await + } + + /// Clear the log GUID (set to ZERO) and write a clean header. 
+ /// Used by `close()` after the log is fully drained. + pub async fn clear_log_guid( + &self, + file: &impl AsyncFile, + flush_sequencer: Option<&FlushSequencer>, + ) -> Result<(), std::io::Error> { + let mut inner = self.inner.lock().await; + inner.log_guid = Guid::ZERO; + self.write_header(&mut inner, file, flush_sequencer).await + } + + /// Bump the sequence number, serialize the header, write to the + /// non-current slot, flush, and flip the active slot. + /// + /// Caller must hold the async mutex (`inner` is `&mut`). + async fn write_header( + &self, + inner: &mut HeaderStateInner, + file: &impl AsyncFile, + flush_sequencer: Option<&FlushSequencer>, + ) -> Result<(), std::io::Error> { + inner.sequence_number += 1; + let (buf, offset) = serialize_header( + file, + inner.sequence_number, + inner.file_write_guid, + inner.data_write_guid, + inner.log_guid, + self.log_offset, + self.log_length, + inner.first_header_current, + ); + + file.write_from(offset, buf).await?; + + if let Some(fs) = flush_sequencer { + fs.flush(file).await?; + } else { + file.flush().await?; + } + + inner.first_header_current = !inner.first_header_current; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + + #[async_test] + async fn parse_valid_dual_headers() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let file_length = file.file_size().await.unwrap(); + let parsed = parse_headers(&file, file_length).await.unwrap(); + + // Header 2 has sequence_number 1. Header 1 has 0. So header 2 wins. + assert_eq!(parsed.sequence_number, 1); + assert!(!parsed.first_header_current); + assert_eq!(parsed.log_guid, Guid::ZERO); + assert_ne!(parsed.file_write_guid, Guid::ZERO); + assert_ne!(parsed.data_write_guid, Guid::ZERO); + } + + #[async_test] + async fn parse_higher_sequence_wins() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let file_length = file.file_size().await.unwrap(); + + // Corrupt header 1's CRC by flipping a byte. + let mut buf = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_1, &mut buf) + .await + .unwrap(); + buf[10] ^= 0xFF; + file.write_at(format::HEADER_OFFSET_1, &buf).await.unwrap(); + + let parsed = parse_headers(&file, file_length).await.unwrap(); + // Header 1 is invalid, so header 2 is used. + assert!(!parsed.first_header_current); + assert_eq!(parsed.sequence_number, 1); + } + + #[async_test] + async fn parse_both_headers_corrupt() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let file_length = file.file_size().await.unwrap(); + + // Corrupt both headers. + let mut buf1 = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_1, &mut buf1) + .await + .unwrap(); + buf1[10] ^= 0xFF; + file.write_at(format::HEADER_OFFSET_1, &buf1).await.unwrap(); + + let mut buf2 = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_2, &mut buf2) + .await + .unwrap(); + buf2[10] ^= 0xFF; + file.write_at(format::HEADER_OFFSET_2, &buf2).await.unwrap(); + + let result = parse_headers(&file, file_length).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::NoValidHeaders + ))) + )); + } + + #[async_test] + async fn parse_one_valid_header() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let file_length = file.file_size().await.unwrap(); + + // Corrupt header 2's CRC. 
+ let mut buf = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_2, &mut buf) + .await + .unwrap(); + buf[10] ^= 0xFF; + file.write_at(format::HEADER_OFFSET_2, &buf).await.unwrap(); + + let parsed = parse_headers(&file, file_length).await.unwrap(); + assert!(parsed.first_header_current); + assert_eq!(parsed.sequence_number, 0); + } + + #[async_test] + async fn parse_log_validation() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let file_length = file.file_size().await.unwrap(); + + // Manually construct a header with valid signature but misaligned log. + let mut buf = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_1, &mut buf) + .await + .unwrap(); + + let mut header = Header::read_from_prefix(&buf).unwrap().0.clone(); + header.log_offset = 12345; // Not aligned to REGION_ALIGNMENT. + header.log_length = format::REGION_ALIGNMENT as u32; + header.sequence_number = 100; // Make this the winning header. + header.checksum = 0; + + // Write header bytes, recompute CRC. + let header_bytes = IntoBytes::as_bytes(&header); + buf[..header_bytes.len()].copy_from_slice(header_bytes); + let crc = format::compute_checksum(&buf, 4); + buf[4..8].copy_from_slice(&crc.to_le_bytes()); + file.write_at(format::HEADER_OFFSET_1, &buf).await.unwrap(); + + let result = parse_headers(&file, file_length).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::InvalidLogOffsetOrLength + ))) + )); + } +} diff --git a/vm/devices/storage/vhdx/src/io.rs b/vm/devices/storage/vhdx/src/io.rs new file mode 100644 index 0000000000..82da345e80 --- /dev/null +++ b/vm/devices/storage/vhdx/src/io.rs @@ -0,0 +1,979 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VHDX read/write I/O resolution and guards. +//! +//! Translates guest virtual disk offsets into file-level ranges via +//! [`VhdxFile::resolve_read`] and [`VhdxFile::resolve_write`], handling +//! block allocation, TFP lifecycle, sector bitmap updates, and +//! crash-consistent BAT commits. + +use crate::AsyncFile; +use crate::bat::BatGuard; +use crate::bat::BlockMapping; +use crate::bat::BlockSpan; +use crate::bat::BlockType; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::format::BatEntryState; +use crate::format::MB1; +use crate::header::WriteMode; +use crate::open::VhdxFile; +use crate::space::AllocateFlags; + +/// Record of a block that had Transitioning-to-Fully-Present (TFP) set +/// during the allocation phase of [`VhdxFile::resolve_write`]. +/// +/// Carried inside [`WriteIoGuard`] so that `complete()` can finalize +/// the BAT without re-walking the block range, and `abort()` can revert +/// without guessing which blocks were modified. +struct TfpRecord { + /// Block number in the BAT. + block_number: u32, + /// The block's mapping before TFP was set. Used by the abort path + /// to revert the in-memory BAT. + original_mapping: BlockMapping, + /// File offset of newly allocated space, if any. `None` when TFP + /// was set on an already-allocated block (e.g. PartiallyPresent → + /// FullyPresent promotion). The abort path releases this space back + /// to the free pool. + allocated_offset: Option, +} + +/// Resolved range from a read operation. +/// +/// Each range describes a contiguous portion of the read request and its +/// data source. The caller iterates these ranges to perform the actual I/O. 
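+///
+/// A sketch of how a caller might consume these ranges (`vhdx`, `offset`,
+/// `len`, and the buffer handling are assumed, not provided by this crate):
+///
+/// ```ignore
+/// let mut ranges = Vec::new();
+/// let _guard = vhdx.resolve_read(offset, len, &mut ranges).await?;
+/// for range in &ranges {
+///     match range {
+///         ReadRange::Data { guest_offset, length, file_offset } => {
+///             // Read `length` bytes from the VHDX file at `file_offset`.
+///         }
+///         ReadRange::Zero { guest_offset, length } => {
+///             // Fill the corresponding span of the caller's buffer with zeros.
+///         }
+///         ReadRange::Unmapped { guest_offset, length } => {
+///             // Differencing chain: satisfy this span from the parent disk.
+///         }
+///     }
+/// }
+/// ```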
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ReadRange { + /// Data present at this file offset. Caller should read from the VHDX file. + Data { + /// Byte offset within the virtual disk. + guest_offset: u64, + /// Length in bytes. + length: u32, + /// Byte offset within the VHDX file where the data lives. + file_offset: u64, + }, + /// Range is zero-filled. Caller should return zeros. + Zero { + /// Byte offset within the virtual disk. + guest_offset: u64, + /// Length in bytes. + length: u32, + }, + /// Range is unmapped (transparent to parent). Caller should read from + /// the parent disk in a differencing chain. + Unmapped { + /// Byte offset within the virtual disk. + guest_offset: u64, + /// Length in bytes. + length: u32, + }, +} + +/// Resolved range from a write operation. +/// +/// Each range describes a contiguous portion of the write target and +/// what the caller needs to do. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum WriteRange { + /// Write caller's data at this file offset. + Data { + /// Byte offset within the virtual disk. + guest_offset: u64, + /// Length in bytes. + length: u32, + /// Byte offset within the VHDX file where data should be written. + file_offset: u64, + }, + /// Zero-fill this file range (e.g. newly allocated block padding). + Zero { + /// Byte offset within the VHDX file to zero-fill. + file_offset: u64, + /// Length in bytes. + length: u32, + }, +} + +impl VhdxFile { + /// Validate an I/O request and acquire per-block refcounts. + /// + /// Checks the failure flag, alignment, and bounds. Then increments + /// per-block refcounts atomically, waiting if trim has claimed any + /// block. Returns the [`BatGuard`] that holds the refcounts. + /// + /// Callers must handle zero-length requests before calling this. + async fn validate_and_acquire( + &self, + offset: u64, + len: u32, + ) -> Result, VhdxIoError> { + self.failed.check()?; + + // Validate alignment to logical sector size. + if !offset.is_multiple_of(self.logical_sector_size as u64) + || !(len as u64).is_multiple_of(self.logical_sector_size as u64) + { + return Err(VhdxIoErrorInner::UnalignedIo.into()); + } + + // Validate bounds. + if offset + .checked_add(len as u64) + .is_none_or(|end| end > self.disk_size) + { + return Err(VhdxIoErrorInner::BeyondEndOfDisk.into()); + } + + // Increment per-block refcounts atomically. If trim has claimed + // any block (sentinel), wait and retry. Once the refcount is + // incremented, trim cannot modify that block's mapping, so the + // subsequent mapping reads are guaranteed to see stable state. + let start_block = self.bat.offset_to_block(offset); + let end_block = self.bat.offset_to_block(offset + len as u64 - 1); + let block_count = end_block - start_block + 1; + + let guard = self + .bat + .acquire_io_refcounts(start_block, block_count) + .await; + + Ok(guard) + } + + /// Resolve a read request into file-level ranges. + /// + /// Walks the read request block-by-block, looking up each block's state + /// in the BAT and appending one or more [`ReadRange`] entries to `ranges`. + /// The caller performs actual file I/O based on the returned ranges. + /// + /// # Errors + /// + /// Returns an error if the read extends beyond the virtual disk size, + /// if the offset or length is not aligned to the logical sector size, + /// or if a BAT entry is corrupt. + pub async fn resolve_read( + &self, + offset: u64, + len: u32, + ranges: &mut Vec, + ) -> Result, VhdxIoError> { + // Zero-length reads succeed immediately. 
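+        // (No per-block refcounts are needed for an empty request, so an
+        // empty guard is returned.)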
+ if len == 0 { + return Ok(ReadIoGuard::empty()); + } + + let guard = self.validate_and_acquire(offset, len).await?; + + for span in self.bat.block_spans(offset, len) { + let mapping = self.bat.get_block_mapping(span.block_number); + + match mapping.bat_state() { + BatEntryState::FullyPresent => { + let file_offset = mapping.file_offset() + span.block_offset as u64; + ranges.push(ReadRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset, + }); + } + BatEntryState::PartiallyPresent => { + self.resolve_partial_block_read( + mapping.file_offset(), + span.virtual_offset, + span.length, + ranges, + ) + .await?; + } + BatEntryState::NotPresent => { + if self.has_parent { + ranges.push(ReadRange::Unmapped { + guest_offset: span.virtual_offset, + length: span.length, + }); + } else { + ranges.push(ReadRange::Zero { + guest_offset: span.virtual_offset, + length: span.length, + }); + } + } + BatEntryState::Zero | BatEntryState::Unmapped | BatEntryState::Undefined => { + ranges.push(ReadRange::Zero { + guest_offset: span.virtual_offset, + length: span.length, + }); + } + } + } + + Ok(ReadIoGuard::new(guard)) + } + + /// Resolve a write request into file-level ranges. + /// + /// Walks the write request block-by-block, allocating blocks as needed. + /// For each block, emits [`WriteRange::Data`] entries describing where + /// the caller should write data, and [`WriteRange::Zero`] entries for + /// any newly allocated regions that must be zero-filled. + /// + /// Blocks that are fully-covering writes use TFP (Transitioning to Fully + /// Present) to defer BAT commit to [`WriteIoGuard::complete()`]. Partial writes + /// commit the BAT immediately via per-entry cache write. + /// + /// Before any ranges are returned, the header is updated with new GUIDs + /// and flushed to disk (first-write gate). + /// + /// After the caller writes data at the returned offsets, it **must** call + /// [`WriteIoGuard::complete()`] to finalize the BAT and sector bitmaps. + /// Dropping the guard without calling `complete()` aborts the write. + pub async fn resolve_write( + &self, + offset: u64, + len: u32, + ranges: &mut Vec, + ) -> Result, VhdxIoError> { + // Check read-only. + if self.read_only { + return Err(VhdxIoErrorInner::ReadOnly.into()); + } + + // Zero-length writes succeed immediately. + if len == 0 { + return Ok(WriteIoGuard::new_completed(self)); + } + + // First-write gate: update header with new GUIDs before any data. + self.enable_write_mode(WriteMode::DataWritable) + .await + .map_err(VhdxIoErrorInner::WriteHeader)?; + + let refcount_guard = self.validate_and_acquire(offset, len).await?; + + let mut blocks_needing_allocation: Vec = Vec::new(); + + // --- Read phase: check BAT state for each block --- + for span in self.bat.block_spans(offset, len) { + let is_full_block = span.is_full_block(self.block_size); + + // Read the in-memory BAT state. + loop { + let (state, file_offset, has_tfp) = { + let mapping = self.bat.get_block_mapping(span.block_number); + ( + mapping.bat_state(), + mapping.file_offset(), + mapping.transitioning_to_fully_present(), + ) + }; + + if has_tfp { + // Block is being allocated by another task — wait and retry. 
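+                    // Register the listener before re-checking the flag so
+                    // a wakeup between the check and `listen()` is not missed.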
+ let listener = self.allocation_event.listen(); + if self + .bat + .get_block_mapping(span.block_number) + .transitioning_to_fully_present() + { + listener.await; + } + continue; + } + + match state { + BatEntryState::FullyPresent => { + ranges.push(WriteRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset: file_offset + span.block_offset as u64, + }); + break; + } + BatEntryState::PartiallyPresent if !is_full_block => { + // Partial write to already-allocated block — write + // directly. complete_write() updates sector bitmaps. + ranges.push(WriteRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset: file_offset + span.block_offset as u64, + }); + break; + } + BatEntryState::PartiallyPresent => { + // Fully-covering write to PartiallyPresent block — + // needs TFP to promote to FullyPresent. Fall through + // to allocation phase. + blocks_needing_allocation.push(span); + break; + } + BatEntryState::NotPresent + | BatEntryState::Zero + | BatEntryState::Unmapped + | BatEntryState::Undefined => { + // Unallocated — needs allocation. + blocks_needing_allocation.push(span); + break; + } + } + } + } + + // If nothing needs allocation, we're done. Transfer refcount + // ownership from the ReadIoGuard to the WriteIoGuard. + if blocks_needing_allocation.is_empty() { + return Ok(WriteIoGuard::new_no_alloc( + self, + refcount_guard, + offset, + len, + )); + } + + // --- Allocation phase --- + // Keep refcount_guard alive — its Drop will release refcounts + // if any error path below returns early. On success, we forget + // it and let the WriteIoGuard take ownership. + + // --- Allocation phase: acquire BlockAllocationLock --- + // Wait until no blocks in our allocation set have TFP set by + // a concurrent allocator. This is the OverlappingAllocations + // serialization: if another writer is + // transitioning any of our blocks, we park and wait for that + // writer's post-allocate to clear TFP before proceeding. + // LOCK AUDIT: No synchronous locks held entering allocation loop. + // allocation_lock (futures::Mutex) is acquired via .await — fine. + let mut alloc_guard = loop { + let alloc_guard = self.allocation_lock.lock().await; + + // Check all blocks under BAT lock for TFP overlap. + // Register listener before dropping locks to avoid missed wakes. + let listener = self.allocation_event.listen(); + if !blocks_needing_allocation.iter().any(|span| { + self.bat + .get_block_mapping(span.block_number) + .transitioning_to_fully_present() + }) { + break alloc_guard; + } + drop(alloc_guard); + listener.await; + }; + + // Track blocks that got TFP set (for error cleanup and complete/abort). + let mut tfp_records: Vec = Vec::new(); + + // Track whether any TFP allocation used unsafe (non-safe-data) space. + // When true, complete_write_inner() captures the current FSN and + // attaches it to the BAT page(s) so the log task waits for the + // data flush before logging the BAT update. + let mut needs_flush_before_log = false; + + // Re-check and allocate under the lock. + // No block in our set should have TFP at this point — we waited + // for all concurrent allocators to finish above. + let eof = &mut *alloc_guard; + let allocation_result: Result<(), VhdxIoError> = async { + for span in &blocks_needing_allocation { + let is_full_block = span.is_full_block(self.block_size); + + // Re-read mapping (may have changed since read phase). 
+ let mapping = self.bat.get_block_mapping(span.block_number); + + // Assert no TFP — we serialized against concurrent + // allocators in the loop above. + assert!( + !mapping.transitioning_to_fully_present(), + "block {} has TFP after overlap wait", + span.block_number + ); + + match mapping.bat_state() { + BatEntryState::FullyPresent => { + // Already allocated by a concurrent writer — just emit range. + ranges.push(WriteRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset: mapping.file_offset() + span.block_offset as u64, + }); + } + BatEntryState::PartiallyPresent if is_full_block => { + // Fully-covering write to PartiallyPresent — set TFP + // on existing mapping, no new space. + // This is always safe (space already has this block's + // data), so no change to needs_flush_before_log. + let original = mapping; + let new_mapping = original.with_transitioning_to_fully_present(true); + + self.bat.set_block_mapping(span.block_number, new_mapping); + + tfp_records.push(TfpRecord { + block_number: span.block_number, + original_mapping: original, + allocated_offset: None, + }); + + ranges.push(WriteRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset: mapping.file_offset() + span.block_offset as u64, + }); + } + _ => { + // Unallocated block — allocate space. + let original = mapping; + let (new_offset, space_state) = self + .allocate_block_space(span.block_number, mapping, eof) + .await?; + + if is_full_block { + self.allocate_full_block( + span, + original, + new_offset, + space_state, + &mut tfp_records, + &mut needs_flush_before_log, + ranges, + ); + } else { + self.allocate_partial_block( + span, + mapping, + new_offset, + space_state, + eof, + ranges, + ) + .await?; + } + } + } + } + + Ok(()) + } + .await; + + // Error cleanup: revert TFP-marked blocks and release allocated space on failure. + if let Err(e) = allocation_result { + self.abort_write_sync(&tfp_records); + return Err(e); + } + + // Allocation lock is released when _alloc_guard drops (after + // returning ranges to caller). + + Ok(WriteIoGuard::new( + self, + refcount_guard, + offset, + len, + needs_flush_before_log, + tfp_records, + )) + } + + /// Allocate file space for a payload block. + /// + /// Tries three sources in priority order: + /// 1. Deferred releases (non-durable trim) — reclaim same-block space. + /// 2. Soft-anchored blocks (durable trim) — reclaim same-block space. + /// 3. Fresh allocation via [`allocate_space`]. + /// + /// Returns the file offset and [`SpaceState`] of the allocated region. + async fn allocate_block_space( + &self, + block_number: u32, + mapping: BlockMapping, + eof: &mut crate::space::EofState, + ) -> Result<(u64, crate::space::SpaceState), VhdxIoError> { + // 1. Check deferred releases (non-durable trim). + if let Some(deferred_offset) = self.deferred_releases.remove(block_number) { + return Ok((deferred_offset, crate::space::SpaceState::OwnStale)); + } + + // 2. Check soft-anchored blocks (durable trim). + if mapping.is_soft_anchored() { + let old_file_offset = mapping.file_offset(); + if self + .free_space + .unmark_trimmed_block(block_number, old_file_offset, self.block_size) + { + return Ok((old_file_offset, crate::space::SpaceState::OwnStale)); + } + // Unmark failed (race) — fall through to fresh allocation. + } + + // 3. Allocate fresh space. 
+ let r = self + .allocate_space(eof, self.block_size, AllocateFlags::new()) + .await?; + Ok((r.file_offset, r.state)) + } + + /// Handle a full-block allocation: set TFP, record for later completion, + /// and emit the data range. + fn allocate_full_block( + &self, + span: &BlockSpan, + original_mapping: BlockMapping, + new_offset: u64, + space_state: crate::space::SpaceState, + tfp_records: &mut Vec, + needs_flush_before_log: &mut bool, + ranges: &mut Vec, + ) { + let new_mapping = BlockMapping::new() + .with_bat_state(original_mapping.bat_state()) + .with_transitioning_to_fully_present(true) + .with_file_megabyte((new_offset / MB1) as u32); + + self.bat.set_block_mapping(span.block_number, new_mapping); + + tfp_records.push(TfpRecord { + block_number: span.block_number, + original_mapping, + allocated_offset: Some(new_offset), + }); + + if !space_state.is_safe() { + *needs_flush_before_log = true; + } + + ranges.push(WriteRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset: new_offset + span.block_offset as u64, + }); + } + + /// Handle a partial-block allocation: commit BAT immediately, allocate + /// SBM if needed (differencing disks), and emit data + zero-padding ranges. + async fn allocate_partial_block( + &self, + span: &BlockSpan, + mapping: BlockMapping, + new_offset: u64, + space_state: crate::space::SpaceState, + eof: &mut crate::space::EofState, + ranges: &mut Vec, + ) -> Result<(), VhdxIoError> { + // For differencing disks: if the block was NotPresent (transparent + // to parent), allocate as PartiallyPresent so that unwritten sectors + // remain transparent. The sector bitmap will be updated in + // complete_write_inner() to mark only the written sectors as present. + // + // For non-diff disks or blocks in other states (Zero, Unmapped, + // Undefined): allocate as FullyPresent with zero-padding. + let is_partial_present = + self.has_parent && mapping.bat_state() == BatEntryState::NotPresent; + + // Allocate SBM block if needed for PartiallyPresent. + if is_partial_present { + self.ensure_sbm_allocated(span.block_number, eof).await?; + } + + let new_state = if is_partial_present { + BatEntryState::PartiallyPresent + } else { + BatEntryState::FullyPresent + }; + + let new_mapping = BlockMapping::new() + .with_bat_state(new_state) + .with_transitioning_to_fully_present(false) + .with_file_megabyte((new_offset / MB1) as u32); + + // Capture per-page FSN when !is_safe. The FSN is captured now + // (before the caller writes data) to ensure the data flush + // completes before the BAT update is logged. + let pre_log_fsn = if !space_state.is_safe() { + self.log_state + .as_ref() + .map(|state| state.flush_sequencer.current_fsn()) + } else { + None + }; + + // Per-entry cache write (write-through to disk). + self.bat + .write_block_mapping( + &self.cache, + BlockType::Payload, + span.block_number, + new_mapping, + pre_log_fsn, + ) + .await?; + + // Emit zero + data + zero ranges. + // For PartiallyPresent blocks, skip zero-fill — unwritten sectors + // are transparent to parent (the sector bitmap tracks presence). + // For FullyPresent blocks, zero-fill surround unless the space + // is already safe (zeroed). 
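+        //
+        //   [ zeros: 0..block_offset ][ data: block_offset..end ][ zeros: end..block_size ]
+        //
+        // Both zero spans are omitted when the allocated space is already
+        // zeroed or the block is PartiallyPresent.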
+ if !is_partial_present && span.block_offset > 0 && !space_state.is_zero() { + ranges.push(WriteRange::Zero { + file_offset: new_offset, + length: span.block_offset, + }); + } + + ranges.push(WriteRange::Data { + guest_offset: span.virtual_offset, + length: span.length, + file_offset: new_offset + span.block_offset as u64, + }); + + let end_offset = span.block_offset + span.length; + if !is_partial_present && end_offset < self.block_size && !space_state.is_zero() { + ranges.push(WriteRange::Zero { + file_offset: new_offset + end_offset as u64, + length: self.block_size - end_offset, + }); + } + + Ok(()) + } + + /// Ensure the sector bitmap block for the given payload block's chunk + /// is allocated. No-op if the SBM is already present. + async fn ensure_sbm_allocated( + &self, + block_number: u32, + eof: &mut crate::space::EofState, + ) -> Result<(), VhdxIoError> { + let chunk_number = block_number / self.bat.chunk_ratio; + let sbm_mapping = self.bat.get_sector_bitmap_mapping(chunk_number); + + if sbm_mapping.bat_state() == BatEntryState::FullyPresent { + return Ok(()); + } + + let sbm_alloc = self + .allocate_space( + eof, + crate::bat::SECTOR_BITMAP_BLOCK_SIZE, + AllocateFlags::new().with_zero(true), + ) + .await?; + + let new_sbm = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte((sbm_alloc.file_offset / MB1) as u32); + + self.bat + .write_block_mapping( + &self.cache, + BlockType::SectorBitmap, + chunk_number, + new_sbm, + None, + ) + .await?; + + Ok(()) + } + + /// Finalize a write operation (internal implementation). + /// + /// Called by [`WriteIoGuard::complete()`] after the caller has written + /// data to the resolved ranges. + /// + /// Iterates the TFP records from `resolve_write`, clearing the TFP + /// flag and setting each block to `FullyPresent` via + /// [`write_block_mapping`] (which atomically updates the in-memory + /// BAT and dirties the cache page). Then updates sector bitmaps for + /// any `PartiallyPresent` blocks (differencing disk partial writes). + /// + /// If a cache write fails, the remaining TFP records are reverted + /// via [`abort_write_sync`] and the error is returned. The file will + /// be poisoned by the log pipeline shortly after. + async fn complete_write_inner( + &self, + offset: u64, + len: u32, + tfp_records: &[TfpRecord], + needs_flush_before_log: bool, + ) -> Result<(), VhdxIoError> { + let had_tfp = !tfp_records.is_empty(); + + // Capture FSN after the caller's data writes. + // Passed into write_block_mapping so it's stamped on the + // cache page atomically with the dirty-mark. + let pre_log_fsn = if needs_flush_before_log { + self.log_state + .as_ref() + .map(|state| state.flush_sequencer.current_fsn()) + } else { + None + }; + + for (i, record) in tfp_records.iter().enumerate() { + let mapping = self.bat.get_block_mapping(record.block_number); + let final_mapping = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(mapping.file_megabyte()); + + if let Err(e) = self + .bat + .write_block_mapping( + &self.cache, + BlockType::Payload, + record.block_number, + final_mapping, + pre_log_fsn, + ) + .await + { + self.abort_write_sync(&tfp_records[i..]); + return Err(e); + } + } + + // Notify waiters ALWAYS, even on failure or cache write error. + if had_tfp { + self.allocation_event.notify(usize::MAX); + } + + // Update sector bitmaps for partial writes to differencing disks. 
+ // Walk the block range to find PartiallyPresent blocks that need + // their sector bitmaps updated. This is cheap (BAT state is in memory) + // and avoids carrying SBM records in the guard. + if self.has_parent && len > 0 { + for span in self.bat.block_spans(offset, len) { + let mapping = self.bat.get_block_mapping(span.block_number); + if !mapping.transitioning_to_fully_present() + && mapping.bat_state() == BatEntryState::PartiallyPresent + { + self.set_sector_bitmap_bits(span.virtual_offset, span.length, true) + .await?; + } + } + } + + Ok(()) + } + + /// Revert TFP blocks to their original state. + /// + /// Called on two paths: + /// - `WriteIoGuard::drop()` without `complete()` (write aborted) + /// - `complete_write_inner()` when a cache write fails (partial completion) + /// + /// Restores each block's in-memory BAT to [`TfpRecord::original_mapping`], + /// releases any newly allocated space back to the free pool, and + /// notifies allocation waiters. Does not perform any file I/O. + fn abort_write_sync(&self, tfp_records: &[TfpRecord]) { + if tfp_records.is_empty() { + return; + } + + for record in tfp_records { + self.bat + .set_block_mapping(record.block_number, record.original_mapping); + // Release allocated space back to free pool. + if let Some(offset) = record.allocated_offset { + self.free_space.release(offset, self.block_size); + } + } + + self.allocation_event.notify(usize::MAX); + } + + /// Flush all writes to stable storage. + /// + /// Commits dirty cache pages to the log task, waits for the WAL + /// entry to be written, then flushes to make everything durable: + /// user data writes, WAL entries, and apply-task writes. + pub async fn flush(&self) -> Result<(), VhdxIoError> { + self.failed.check()?; + + if self.read_only { + return Err(VhdxIoErrorInner::ReadOnly.into()); + } + + // Stamp all uncommitted deferred entries with the current + // generation BEFORE commit(). This ensures that entries stamped + // here have their BAT pages swept into this commit's WAL entry. + // Any new trims that run concurrently will insert with + // committed_gen = None and wait for the next flush. + let flush_gen = self.deferred_releases.stamp_uncommitted(); + + let lsn = self.cache.commit().map_err(VhdxIoErrorInner::CommitCache)?; + + let state = self + .log_state + .as_ref() + .expect("writable file has log_state"); + + // Wait for the log task to write WAL entries through this LSN. + state + .logged_lsn + .wait_for(lsn) + .await + .map_err(VhdxIoErrorInner::Failed)?; + + // Flush everything: user data, WAL entries, applied pages. + state + .flush_sequencer + .flush(self.file.as_ref()) + .await + .map_err(VhdxIoErrorInner::Flush)?; + + // Now that the WAL is durable, promote entries committed at or + // before this generation. Their BAT changes are crash-safe. + for (block_number, file_offset, size, anchor) in + self.deferred_releases.drain_committed(flush_gen) + { + if anchor { + let _ = self + .free_space + .mark_trimmed_block(block_number, file_offset, size); + } else { + self.free_space.release(file_offset, size); + } + } + + Ok(()) + } +} + +// --------------------------------------------------------------------------- +// I/O guards +// --------------------------------------------------------------------------- + +/// Guard for read I/O. Drop after file reads are complete. +/// +/// Returned by [`VhdxFile::resolve_read`]. Dropping this guard decrements +/// per-block refcounts, allowing trim to proceed. +pub struct ReadIoGuard<'a, F: AsyncFile> { + // Significant drop. 
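+    // Holding the guard keeps the per-block I/O refcounts elevated;
+    // dropping it releases them, which lets a pending trim proceed.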
+ _bat_guard: BatGuard<'a>, + _phantom: std::marker::PhantomData<&'a VhdxFile>, +} + +impl<'a, F: AsyncFile> ReadIoGuard<'a, F> { + /// Create a new read guard with refcount tracking. + fn new(bat_guard: BatGuard<'a>) -> Self { + Self { + _bat_guard: bat_guard, + _phantom: std::marker::PhantomData, + } + } + + fn empty() -> Self { + Self { + _bat_guard: BatGuard::empty(), + _phantom: std::marker::PhantomData, + } + } +} + +/// Guard for write I/O. Call [`complete()`](Self::complete) to finalize, +/// or drop to abort. +/// +/// Returned by [`VhdxFile::resolve_write`]. Dropping without calling +/// `complete()` aborts the write, reverting TFP blocks and releasing +/// allocated space. In both cases, per-block refcounts are decremented +/// via the owned [`ReadIoGuard`]. +pub struct WriteIoGuard<'a, F: AsyncFile> { + vhdx: &'a VhdxFile, + // Significant drop. + _bat_guard: BatGuard<'a>, + /// The guest offset of the write (needed for SBM bitmap updates). + offset: u64, + /// The length of the write in bytes. + len: u32, + /// Whether `complete()` was called. If false on drop, the write is aborted. + completed: bool, + /// True when at least one TFP block was allocated from space that is + /// NOT safe (could contain stale data from another block). When true, + /// `complete_write_inner` must capture the current FSN and apply it + /// to the BAT pages so the log task waits for the data flush before + /// logging the BAT update. + needs_flush_before_log: bool, + /// TFP records collected during resolve_write, needed by complete/abort. + /// `None` after complete() or for zero-length writes. + tfp_records: Vec, +} + +impl<'a, F: AsyncFile> WriteIoGuard<'a, F> { + /// Create a new write guard that takes ownership of a [`ReadIoGuard`] + /// for refcount management. + fn new( + vhdx: &'a VhdxFile, + bat_guard: BatGuard<'a>, + offset: u64, + len: u32, + needs_flush_before_log: bool, + tfp_records: Vec, + ) -> Self { + Self { + vhdx, + _bat_guard: bat_guard, + offset, + len, + completed: false, + needs_flush_before_log, + tfp_records, + } + } + + /// Create a write guard that is already completed (for zero-length writes). + fn new_completed(vhdx: &'a VhdxFile) -> Self { + Self { + vhdx, + _bat_guard: BatGuard::empty(), + offset: 0, + len: 0, + completed: true, + needs_flush_before_log: false, + tfp_records: Vec::new(), + } + } + + /// Create a write guard with no completion records (no allocation was + /// needed — all blocks were already FullyPresent or PartiallyPresent + /// with a sub-block write). + fn new_no_alloc(vhdx: &'a VhdxFile, bat_guard: BatGuard<'a>, offset: u64, len: u32) -> Self { + Self { + vhdx, + _bat_guard: bat_guard, + offset, + len, + completed: false, + needs_flush_before_log: false, + tfp_records: Vec::new(), + } + } + + /// Finalize the write after data has been written to resolved ranges. + /// + /// Commits TFP -> FullyPresent, updates sector bitmaps. + /// Consumes the guard. Refcounts are decremented when `self` is dropped + /// after this method returns. + pub async fn complete(mut self) -> Result<(), VhdxIoError> { + self.completed = true; + self.vhdx + .complete_write_inner( + self.offset, + self.len, + &self.tfp_records, + self.needs_flush_before_log, + ) + .await + } +} + +impl Drop for WriteIoGuard<'_, F> { + fn drop(&mut self) { + // If complete() was not called, abort the write. + if !self.completed { + self.vhdx.abort_write_sync(&self.tfp_records); + } + // Refcounts are decremented when self.bat_guard drops. 
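+        // The abort path is purely in-memory (BAT revert plus free-space
+        // release); no file I/O happens in drop.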
+ } +} diff --git a/vm/devices/storage/vhdx/src/known_meta.rs b/vm/devices/storage/vhdx/src/known_meta.rs new file mode 100644 index 0000000000..ca90450151 --- /dev/null +++ b/vm/devices/storage/vhdx/src/known_meta.rs @@ -0,0 +1,473 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Known metadata parsing for VHDX files. +//! +//! Verifies that all required system metadata items are recognized, then +//! reads and parses the well-known items (file parameters, disk size, +//! sector sizes, page 83 data) into typed Rust values. + +use crate::AsyncFile; +use crate::cache::PageCache; +use crate::error::CorruptionType; +use crate::error::OpenError; +use crate::format; +use crate::format::FileParameters; +use crate::format::FileParametersFlags; +use crate::metadata::MetadataTable; +use guid::Guid; +use zerocopy::FromBytes; + +/// Parsed metadata from a VHDX file's metadata region. +pub(crate) struct KnownMetadata { + /// Block size in bytes. + pub block_size: u32, + /// Whether this is a differencing disk. + pub has_parent: bool, + /// Whether blocks should remain allocated (fixed VHD). + pub leave_blocks_allocated: bool, + /// Virtual disk size in bytes. + pub disk_size: u64, + /// Logical sector size (512 or 4096). + pub logical_sector_size: u32, + /// Physical sector size (512 or 4096). + pub physical_sector_size: u32, + /// Page 83 data GUID. + pub page_83_data: Guid, +} + +/// Known system metadata item GUIDs that this parser understands. +const KNOWN_ITEM_IDS: &[Guid] = &[ + format::FILE_PARAMETERS_ITEM_GUID, + format::VIRTUAL_DISK_SIZE_ITEM_GUID, + format::PAGE_83_ITEM_GUID, + format::CHS_PARAMETERS_ITEM_GUID, + format::LOGICAL_SECTOR_SIZE_ITEM_GUID, + format::PHYSICAL_SECTOR_SIZE_ITEM_GUID, + format::PARENT_LOCATOR_ITEM_GUID, + format::PMEM_LABEL_STORAGE_AREA_ITEM_GUID, +]; + +/// Verify that all required system metadata items in the table are known to +/// this parser. Unknown required items cause an error (except the incomplete +/// file marker, which has special handling). +pub(crate) fn verify_known_metadata( + table: &MetadataTable, + allow_incomplete: bool, +) -> Result<(), OpenError> { + for entry in table.entries() { + // Only check system (non-user) entries that are required. + if entry.flags.is_user() || !entry.flags.is_required() { + continue; + } + + if KNOWN_ITEM_IDS.contains(&entry.item_id) { + continue; + } + + if entry.item_id == format::INCOMPLETE_FILE_ITEM_GUID { + if allow_incomplete { + continue; + } + return Err(CorruptionType::IncompleteFile.into()); + } + + return Err(CorruptionType::UnknownRequiredMetadata.into()); + } + Ok(()) +} + +/// Read and parse all known metadata items from the file. 
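+///
+/// Items are read in dependency order: the logical sector size comes
+/// first (the virtual disk size must be a multiple of it), followed by
+/// the file parameters, virtual disk size, physical sector size, and
+/// page 83 data. Each item's length and virtual-disk flag are validated
+/// before its value is accepted.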
+pub(crate) async fn read_known_metadata( + cache: &PageCache, + table: &MetadataTable, +) -> Result { + // --- Logical sector size (read first, needed for disk size validation) --- + let logical_sector_size = { + let entry = table + .find_entry(false, &format::LOGICAL_SECTOR_SIZE_ITEM_GUID) + .ok_or(CorruptionType::MissingRequiredMetadata)?; + + if entry.length != 4 { + return Err(CorruptionType::InvalidLogicalSectorSizeSize.into()); + } + if !entry.flags.is_virtual_disk() { + return Err(CorruptionType::LogicalSectorSizeMarkedVirtual.into()); + } + + let data = table + .read_item(cache, false, &format::LOGICAL_SECTOR_SIZE_ITEM_GUID) + .await?; + let value = u32::from_le_bytes(data.try_into().unwrap()); + if value != 512 && value != 4096 { + return Err(CorruptionType::InvalidLogicalSectorSize.into()); + } + value + }; + + // --- File parameters --- + let (block_size, has_parent, leave_blocks_allocated) = { + let entry = table + .find_entry(false, &format::FILE_PARAMETERS_ITEM_GUID) + .ok_or(CorruptionType::MissingRequiredMetadata)?; + + if entry.length as usize != size_of::() { + return Err(CorruptionType::InvalidFileParameterSize.into()); + } + if entry.flags.is_virtual_disk() { + return Err(CorruptionType::FileParametersMarkedVirtual.into()); + } + + let data = table + .read_item(cache, false, &format::FILE_PARAMETERS_ITEM_GUID) + .await?; + let params = FileParameters::read_from_bytes(&data) + .map_err(|_| CorruptionType::InvalidFileParameterSize)?; + let supported = u32::from( + FileParametersFlags::new() + .with_leave_blocks_allocated(true) + .with_has_parent(true), + ); + if u32::from(params.flags) & !supported != 0 { + return Err(CorruptionType::ReservedFileParametersFieldNonzero.into()); + } + + let bs = params.block_size; + if !bs.is_power_of_two() + || (bs as u64) < format::MB1 + || bs as u64 > format::MAXIMUM_BLOCK_SIZE + { + return Err(CorruptionType::InvalidBlockSize.into()); + } + + ( + bs, + params.flags.has_parent(), + params.flags.leave_blocks_allocated(), + ) + }; + + // --- Virtual disk size --- + let disk_size = { + let entry = table + .find_entry(false, &format::VIRTUAL_DISK_SIZE_ITEM_GUID) + .ok_or(CorruptionType::MissingRequiredMetadata)?; + + if entry.length != 8 { + return Err(CorruptionType::InvalidDiskSize.into()); + } + if !entry.flags.is_virtual_disk() { + return Err(CorruptionType::DiskMarkedVirtual.into()); + } + + let data = table + .read_item(cache, false, &format::VIRTUAL_DISK_SIZE_ITEM_GUID) + .await?; + let value = u64::from_le_bytes(data.try_into().unwrap()); + if value == 0 + || value > format::MAXIMUM_DISK_SIZE + || !value.is_multiple_of(logical_sector_size as u64) + { + return Err(CorruptionType::InvalidDiskSize.into()); + } + value + }; + + // --- Physical sector size --- + let physical_sector_size = { + let entry = table + .find_entry(false, &format::PHYSICAL_SECTOR_SIZE_ITEM_GUID) + .ok_or(CorruptionType::MissingRequiredMetadata)?; + + if entry.length != 4 { + return Err(CorruptionType::InvalidSectorSize.into()); + } + if !entry.flags.is_virtual_disk() { + return Err(CorruptionType::InvalidSectorSize.into()); + } + + let data = table + .read_item(cache, false, &format::PHYSICAL_SECTOR_SIZE_ITEM_GUID) + .await?; + let value = u32::from_le_bytes(data.try_into().unwrap()); + if value != 512 && value != 4096 { + return Err(CorruptionType::InvalidSectorSize.into()); + } + value + }; + + // --- Page 83 data --- + let page_83_data = { + let entry = table + .find_entry(false, &format::PAGE_83_ITEM_GUID) + .ok_or(CorruptionType::MissingRequiredMetadata)?; 
+ + if entry.length != 16 { + return Err(CorruptionType::MissingRequiredMetadata.into()); + } + if !entry.flags.is_virtual_disk() { + return Err(CorruptionType::MissingRequiredMetadata.into()); + } + + let data = table + .read_item(cache, false, &format::PAGE_83_ITEM_GUID) + .await?; + Guid::read_from_bytes(&data).map_err(|_| CorruptionType::MissingRequiredMetadata)? + }; + + Ok(KnownMetadata { + block_size, + has_parent, + leave_blocks_allocated, + disk_size, + logical_sector_size, + physical_sector_size, + page_83_data, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::cache::PageCache; + use crate::error::OpenErrorInner; + use crate::metadata::METADATA_TAG; + use crate::region; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + use std::sync::Arc; + use zerocopy::IntoBytes; + + fn metadata_cache(file: InMemoryFile, metadata_offset: u64) -> PageCache { + let mut cache = PageCache::new(Arc::new(file), None, None, 0); + cache.register_tag(METADATA_TAG, metadata_offset); + cache + } + + #[async_test] + async fn read_known_metadata_from_created_file() { + let (file, params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + let cache = metadata_cache(file, regions.metadata_offset); + + let meta = read_known_metadata(&cache, &table).await.unwrap(); + + assert_eq!(meta.disk_size, format::GB1); + assert_eq!(meta.block_size, params.block_size); + assert_eq!(meta.logical_sector_size, params.logical_sector_size); + assert_eq!(meta.physical_sector_size, params.physical_sector_size); + assert!(!meta.has_parent); + assert!(!meta.leave_blocks_allocated); + assert_ne!(meta.page_83_data, Guid::ZERO); + } + + #[async_test] + async fn verify_known_metadata_all_known() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + + // All standard entries should be recognized. + verify_known_metadata(&table, false).unwrap(); + } + + #[async_test] + async fn verify_unknown_required_item() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Add a fake required system metadata entry to the table. + let mut buf = vec![0u8; format::METADATA_TABLE_SIZE as usize]; + file.read_at(regions.metadata_offset, &mut buf) + .await + .unwrap(); + + let mut header = format::MetadataTableHeader::read_from_prefix(&buf) + .unwrap() + .0 + .clone(); + let old_count = header.entry_count; + header.entry_count = old_count + 1; + let h_bytes = header.as_bytes(); + buf[..h_bytes.len()].copy_from_slice(h_bytes); + + // Add a fake entry with unknown GUID. 
+        let fake_guid = guid::guid!("deadbeef-dead-beef-dead-beefdeadbeef");
+        let fake_entry = format::MetadataTableEntry {
+            item_id: fake_guid,
+            offset: 0,
+            length: 0,
+            flags: format::MetadataTableEntryFlags::new().with_is_required(true),
+            reserved2: 0,
+        };
+        let entry_start = size_of::<format::MetadataTableHeader>();
+        let entry_size = size_of::<format::MetadataTableEntry>();
+        let off = entry_start + old_count as usize * entry_size;
+        let e_bytes = fake_entry.as_bytes();
+        buf[off..off + e_bytes.len()].copy_from_slice(e_bytes);
+
+        file.write_at(regions.metadata_offset, &buf).await.unwrap();
+
+        let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length)
+            .await
+            .unwrap();
+
+        let result = verify_known_metadata(&table, false);
+        assert!(matches!(
+            result,
+            Err(OpenError(OpenErrorInner::Corrupt(
+                CorruptionType::UnknownRequiredMetadata
+            )))
+        ));
+    }
+
+    #[async_test]
+    async fn verify_incomplete_file() {
+        let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+        let regions = region::parse_region_tables(&file).await.unwrap();
+
+        // Add the incomplete file marker.
+        let mut buf = vec![0u8; format::METADATA_TABLE_SIZE as usize];
+        file.read_at(regions.metadata_offset, &mut buf)
+            .await
+            .unwrap();
+
+        let mut header = format::MetadataTableHeader::read_from_prefix(&buf)
+            .unwrap()
+            .0
+            .clone();
+        let old_count = header.entry_count;
+        header.entry_count = old_count + 1;
+        let h_bytes = header.as_bytes();
+        buf[..h_bytes.len()].copy_from_slice(h_bytes);
+
+        let incomplete_entry = format::MetadataTableEntry {
+            item_id: format::INCOMPLETE_FILE_ITEM_GUID,
+            offset: 0,
+            length: 0,
+            flags: format::MetadataTableEntryFlags::new().with_is_required(true),
+            reserved2: 0,
+        };
+        let entry_start = size_of::<format::MetadataTableHeader>();
+        let entry_size = size_of::<format::MetadataTableEntry>();
+        let off = entry_start + old_count as usize * entry_size;
+        let e_bytes = incomplete_entry.as_bytes();
+        buf[off..off + e_bytes.len()].copy_from_slice(e_bytes);
+
+        file.write_at(regions.metadata_offset, &buf).await.unwrap();
+
+        let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length)
+            .await
+            .unwrap();
+
+        // With allow_incomplete=false, should fail.
+        let result = verify_known_metadata(&table, false);
+        assert!(matches!(
+            result,
+            Err(OpenError(OpenErrorInner::Corrupt(
+                CorruptionType::IncompleteFile
+            )))
+        ));
+
+        // With allow_incomplete=true, should pass.
+        verify_known_metadata(&table, true).unwrap();
+    }
+
+    #[async_test]
+    async fn validate_block_size_power_of_two() {
+        let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+        let regions = region::parse_region_tables(&file).await.unwrap();
+        let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length)
+            .await
+            .unwrap();
+
+        // Overwrite file parameters with a non-power-of-2 block size.
+ let entry = table + .find_entry(false, &format::FILE_PARAMETERS_ITEM_GUID) + .unwrap(); + let item_offset = regions.metadata_offset + entry.offset as u64; + + let bad_params = FileParameters { + block_size: 3 * format::MB1 as u32, // not power of 2 + flags: FileParametersFlags::new(), + }; + file.write_at(item_offset, bad_params.as_bytes()) + .await + .unwrap(); + + let cache = metadata_cache(file, regions.metadata_offset); + let result = read_known_metadata(&cache, &table).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::InvalidBlockSize + ))) + )); + } + + #[async_test] + async fn validate_file_parameters_reserved_flags() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + + let entry = table + .find_entry(false, &format::FILE_PARAMETERS_ITEM_GUID) + .unwrap(); + let item_offset = regions.metadata_offset + entry.offset as u64; + + let params = FileParameters { + block_size: format::DEFAULT_BLOCK_SIZE, + flags: FileParametersFlags::new(), + }; + let mut bytes = params.as_bytes().to_vec(); + bytes[4..8].copy_from_slice(&4_u32.to_le_bytes()); + file.write_at(item_offset, &bytes).await.unwrap(); + + let cache = metadata_cache(file, regions.metadata_offset); + let result = read_known_metadata(&cache, &table).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::ReservedFileParametersFieldNonzero + ))) + )); + } + + #[async_test] + async fn validate_sector_sizes() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + + // Overwrite logical sector size with an invalid value. + let entry = table + .find_entry(false, &format::LOGICAL_SECTOR_SIZE_ITEM_GUID) + .unwrap(); + let item_offset = regions.metadata_offset + entry.offset as u64; + + let bad_value: u32 = 1024; // not 512 or 4096 + file.write_at(item_offset, &bad_value.to_le_bytes()) + .await + .unwrap(); + + let cache = metadata_cache(file, regions.metadata_offset); + let result = read_known_metadata(&cache, &table).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::InvalidLogicalSectorSize + ))) + )); + } +} diff --git a/vm/devices/storage/vhdx/src/lib.rs b/vm/devices/storage/vhdx/src/lib.rs new file mode 100644 index 0000000000..96e03a8e76 --- /dev/null +++ b/vm/devices/storage/vhdx/src/lib.rs @@ -0,0 +1,264 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Pure-Rust VHDX file format parser and writer. +//! +//! This crate implements the +//! [VHDX format specification](https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-vhdx/) +//! with no platform-specific dependencies, enabling cross-platform support +//! for dynamic, fixed, and differencing VHDX virtual hard disk files. +//! +//! # Overview +//! +//! A VHDX file stores a virtual disk as a collection of fixed-size data +//! blocks tracked by a Block Allocation Table (BAT). Crash consistency is +//! maintained through a write-ahead log (WAL) that journals metadata +//! changes before they reach their final file locations. +//! +//! ## Lifecycle +//! +//! ```text +//! 
create() ──► VhdxFile::open(file) ──► VhdxBuilder +//! ├── .read_only() ──► VhdxFile (read) +//! └── .writable() ──► VhdxFile (read/write) +//! │ +//! resolve_read / resolve_write / flush / trim +//! │ +//! close() +//! ``` +//! +//! 1. **Create** — [`create::create()`] writes a valid, empty VHDX file. +//! 2. **Open** — [`VhdxFile::open()`] returns a [`VhdxBuilder`] for +//! configuring options (block alignment, log replay policy) before +//! calling [`read_only()`](VhdxBuilder::read_only) or +//! [`writable()`](VhdxBuilder::writable). +//! 3. **I/O** — [`VhdxFile::resolve_read()`](open::VhdxFile::resolve_read) +//! and [`VhdxFile::resolve_write()`](open::VhdxFile::resolve_write) +//! translate virtual disk offsets into file-level ranges. The caller +//! performs actual data I/O at the returned offsets, then calls +//! [`WriteIoGuard::complete()`] to finalize metadata. +//! 4. **Flush** — [`VhdxFile::flush()`](open::VhdxFile::flush) commits +//! dirty pages through the WAL and flushes to stable storage. +//! 5. **Close** — [`VhdxFile::close()`](open::VhdxFile::close) drains the +//! pipeline and clears the log GUID, leaving the file clean. +//! +//! ## Write pipeline (cache → log → apply) +//! +//! Writable opens spawn two background tasks that form a three-stage +//! pipeline for crash-consistent metadata persistence: +//! +//! ```text +//! ┌───────────┐ commit() ┌──────────┐ apply ┌────────────┐ +//! │ Cache │ ──────────────►│ Log Task │ ───────────►│ Apply Task │ +//! │ (dirty │ dirty pages │ (WAL │ logged │ (final │ +//! │ pages) │ │ writer) │ pages │ offsets) │ +//! └───────────┘ └──────────┘ └────────────┘ +//! ▲ │ │ +//! LogPermits logged_lsn applied_lsn +//! (backpressure) (LsnWatermark) (LsnWatermark) +//! ``` +//! +//! - The **cache** accumulates dirty 4 KiB metadata pages (BAT entries, +//! sector bitmap bits). On commit, pages are sent to the log task. +//! - The **log task** writes WAL entries to the circular log region and +//! publishes `logged_lsn`. +//! - The **apply task** writes logged pages to their final file offsets +//! and publishes `applied_lsn`. +//! - A permit semaphore limits in-flight pages for backpressure. A flush +//! sequencer coalesces concurrent flush requests. +//! +//! # I/O model +//! +//! The crate separates **metadata I/O** from **payload I/O**. +//! +//! Metadata I/O (headers, BAT pages, sector bitmaps, WAL entries) is +//! handled internally through [`AsyncFile`] — the caller provides an +//! implementation at open time and never thinks about metadata again. +//! +//! Payload I/O (guest data reads and writes) is the caller's +//! responsibility. [`resolve_read()`](open::VhdxFile::resolve_read) and +//! [`resolve_write()`](open::VhdxFile::resolve_write) translate virtual +//! disk offsets into file-level byte ranges ([`ReadRange`] / +//! [`WriteRange`]). The caller performs its own data I/O at those +//! offsets using whatever mechanism it prefers (io_uring, standard file +//! I/O, etc.), then finalizes metadata via the returned I/O guard. +//! This separation lets the caller use a different, potentially more +//! performant I/O path for bulk data without the crate imposing any +//! particular strategy. 
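+//!
+//! The sketch below walks the write path end to end. It is pseudocode:
+//! variable names and argument shapes are illustrative assumptions, not the
+//! exact API; see the items linked above for the real signatures.
+//!
+//! ```text
+//! let vhdx = VhdxFile::open(file).writable().await?;       // open read/write
+//! let write = vhdx.resolve_write(disk_offset, len).await?; // WriteRange + guard
+//! /* caller performs its own payload I/O at the returned file offsets */
+//! write.complete().await?;                                 // finalize metadata
+//! vhdx.flush().await?;                                     // WAL commit + flush
+//! vhdx.close().await?;                                     // drain, clear log GUID
+//! ```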
+
+#![forbid(unsafe_code)]
+#![allow(async_fn_in_trait)]
+
+use std::borrow::Borrow;
+use std::future::Future;
+
+pub(crate) mod apply_task;
+pub(crate) mod bat;
+pub(crate) mod cache;
+pub(crate) mod create;
+pub(crate) mod error;
+pub(crate) mod flush;
+pub(crate) mod format;
+pub(crate) mod header;
+pub(crate) mod io;
+pub(crate) mod known_meta;
+pub(crate) mod locator;
+pub(crate) mod log;
+pub(crate) mod log_permits;
+pub(crate) mod log_task;
+pub(crate) mod lsn_watermark;
+pub(crate) mod metadata;
+pub(crate) mod open;
+pub(crate) mod region;
+pub(crate) mod sector_bitmap;
+pub(crate) mod space;
+pub(crate) mod trim;
+
+pub use create::CreateParams;
+pub use create::create;
+pub use error::CreateError;
+pub use error::InvalidFormatReason;
+pub use error::OpenError;
+pub use error::VhdxIoError;
+pub use error::VhdxIoErrorKind;
+pub use io::ReadIoGuard;
+pub use io::ReadRange;
+pub use io::WriteIoGuard;
+pub use io::WriteRange;
+pub use locator::LocatorKeyValue;
+pub use locator::ParentLocator;
+pub use locator::ParentPaths;
+pub use open::VhdxBuilder;
+pub use open::VhdxFile;
+pub use trim::TrimMode;
+pub use trim::TrimRequest;
+
+#[cfg(test)]
+mod tests;
+
+/// Trait abstracting metadata file I/O for the VHDX parser.
+///
+/// The crate uses this trait for all internal metadata access (headers,
+/// BAT pages, sector bitmaps, WAL entries, log replay). Payload data
+/// I/O is **not** routed through this trait — the caller handles it
+/// directly at the file offsets returned by
+/// [`VhdxFile::resolve_read()`](open::VhdxFile::resolve_read) and
+/// [`VhdxFile::resolve_write()`](open::VhdxFile::resolve_write).
+///
+/// All async methods return `Send` futures so that the log task (spawned
+/// on a multi-threaded executor) can call them.
+///
+/// This trait is **not** dyn-compatible due to `impl Future` return types.
+/// When dynamic dispatch is needed (e.g. `disk_backend` integration),
+/// create a separate dyn-compatible wrapper trait with a blanket impl.
+pub trait AsyncFile: Send + Sync {
+    /// Buffer type for owned I/O operations.
+    ///
+    /// Implementations control allocation strategy (e.g., alignment for
+    /// O_DIRECT). Buffers are owned and `'static`, so they can be safely
+    /// moved into `blocking::unblock`, io_uring submissions, etc.
+    type Buffer: AsRef<[u8]> + AsMut<[u8]> + Clone + Send + Sync + 'static;
+
+    /// Allocate a zero-initialized buffer of the given length.
+    fn alloc_buffer(&self, len: usize) -> Self::Buffer;
+
+    /// Read from the file into an owned buffer. Returns the filled buffer.
+    ///
+    /// The read starts at `offset` and fills `buf.as_mut().len()` bytes.
+    fn read_into(
+        &self,
+        offset: u64,
+        buf: Self::Buffer,
+    ) -> impl Future<Output = Result<Self::Buffer, std::io::Error>> + Send;
+
+    /// Write a buffer to the file at the given offset.
+    ///
+    /// Accepts any type that borrows as `Self::Buffer`, enabling zero-copy
+    /// writes from `Arc<Self::Buffer>` (the `Arc` moves into the I/O
+    /// closure; data is borrowed in place without copying).
+    fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Self::Buffer> + Send + 'static,
+    ) -> impl Future<Output = Result<(), std::io::Error>> + Send;
+
+    /// Flush all buffered writes to stable storage.
+    fn flush(&self) -> impl Future<Output = Result<(), std::io::Error>> + Send;
+
+    /// Return the current size of the file in bytes.
+    fn file_size(&self) -> impl Future<Output = Result<u64, std::io::Error>> + Send;
+
+    /// Set (truncate or extend) the file to the given size in bytes.
+    fn set_file_size(&self, size: u64) -> impl Future<Output = Result<(), std::io::Error>> + Send;
+
+    /// Zero a byte range of the file.
+    ///
+    /// Implementations may use platform-specific APIs (e.g., `fallocate`
+    /// with `FALLOC_FL_ZERO_RANGE` on Linux, or `FSCTL_SET_ZERO_DATA` on
+    /// Windows) for efficiency. The default implementation writes zeros
+    /// in fixed-size chunks via [`write_from`](Self::write_from).
+    fn zero_range(
+        &self,
+        offset: u64,
+        len: u64,
+    ) -> impl Future<Output = Result<(), std::io::Error>> + Send {
+        async move {
+            const CHUNK: usize = 64 * 1024;
+            let zeros = self.alloc_buffer(CHUNK);
+            let mut remaining = len;
+            let mut pos = offset;
+            while remaining > 0 {
+                let n = (remaining as usize).min(CHUNK);
+                if n < CHUNK {
+                    let small = self.alloc_buffer(n);
+                    self.write_from(pos, small).await?;
+                } else {
+                    self.write_from(pos, zeros.clone()).await?;
+                }
+                pos += n as u64;
+                remaining -= n as u64;
+            }
+            Ok(())
+        }
+    }
+}
+
+/// Extension trait providing slice-based `read_at`/`write_at` convenience
+/// methods. These allocate a temporary buffer internally, so they involve
+/// an extra copy compared to `read_into`/`write_from`.
+///
+/// Automatically implemented for all [`AsyncFile`] types.
+///
+/// Only used by tests within this crate; not exposed publicly.
+#[cfg(test)]
+pub trait AsyncFileExt: AsyncFile {
+    /// Read exactly `buf.len()` bytes from the file at the given byte offset.
+    fn read_at(
+        &self,
+        offset: u64,
+        buf: &mut [u8],
+    ) -> impl Future<Output = Result<(), std::io::Error>> + Send {
+        async move {
+            let owned = self.alloc_buffer(buf.len());
+            let owned = self.read_into(offset, owned).await?;
+            buf.copy_from_slice(owned.as_ref());
+            Ok(())
+        }
+    }
+
+    /// Write exactly `buf.len()` bytes to the file at the given byte offset.
+    fn write_at(
+        &self,
+        offset: u64,
+        buf: &[u8],
+    ) -> impl Future<Output = Result<(), std::io::Error>> + Send {
+        async move {
+            let mut owned = self.alloc_buffer(buf.len());
+            owned.as_mut().copy_from_slice(buf);
+            self.write_from(offset, owned).await
+        }
+    }
+}
+
+#[cfg(test)]
+impl<T: AsyncFile> AsyncFileExt for T {}
diff --git a/vm/devices/storage/vhdx/src/locator.rs b/vm/devices/storage/vhdx/src/locator.rs
new file mode 100644
index 0000000000..612b96972e
--- /dev/null
+++ b/vm/devices/storage/vhdx/src/locator.rs
@@ -0,0 +1,393 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! Parent locator parsing for VHDX files.
+//!
+//! Parses the parent locator metadata item (a key-value table of UTF-16LE
+//! strings) into a structured Rust type.
+
+use crate::error::CorruptionType;
+use crate::error::OpenError;
+use crate::format;
+use crate::format::ParentLocatorEntry;
+use crate::format::ParentLocatorHeader;
+use guid::Guid;
+use zerocopy::FromBytes;
+
+/// A parsed key-value pair from a parent locator.
+#[derive(Debug, Clone)]
+pub struct LocatorKeyValue {
+    /// The key string.
+    pub key: String,
+    /// The value string.
+    pub value: String,
+}
+
+/// A parsed parent locator.
+#[derive(Debug, Clone)]
+pub struct ParentLocator {
+    /// The locator type GUID.
+    pub locator_type: Guid,
+    /// The key-value entries.
+    pub entries: Vec<LocatorKeyValue>,
+}
+
+/// Decode a UTF-16LE string from `data` at the given byte offset and length.
+///
+/// Returns `None` if the string is empty, misaligned, out of bounds,
+/// contains embedded nulls, or is not valid UTF-16.
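+///
+/// For example, the bytes `[0x61, 0x00, 0x62, 0x00]` at offset 0 with length 4
+/// decode to `"ab"`, while an odd `length`, a range past the end of `data`, or
+/// an embedded `0x0000` code unit all yield `None`.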
+fn decode_utf16le(data: &[u8], offset: usize, length: usize) -> Option<String> {
+    if length == 0 || !length.is_multiple_of(2) || !offset.is_multiple_of(2) {
+        return None;
+    }
+    let end = offset.checked_add(length)?;
+    if end > data.len() {
+        return None;
+    }
+    let slice = &data[offset..end];
+    if has_embedded_null(slice) {
+        return None;
+    }
+    let u16s: Vec<u16> = slice
+        .chunks_exact(2)
+        .map(|c| u16::from_le_bytes([c[0], c[1]]))
+        .collect();
+    String::from_utf16(&u16s).ok()
+}
+
+/// Check that a UTF-16LE byte slice does not contain embedded null characters.
+fn has_embedded_null(data: &[u8]) -> bool {
+    data.chunks_exact(2)
+        .any(|c| u16::from_le_bytes([c[0], c[1]]) == 0)
+}
+
+impl ParentLocator {
+    /// Parse a parent locator from its raw metadata item bytes.
+    pub fn parse(data: &[u8]) -> Result<Self, OpenError> {
+        let header_size = size_of::<ParentLocatorHeader>();
+
+        // Check minimum size for the header.
+        if data.len() < header_size {
+            return Err((CorruptionType::LocatorTooSmallForHeader).into());
+        }
+
+        let header = ParentLocatorHeader::read_from_prefix(data)
+            .map_err(|_| CorruptionType::LocatorTooSmallForHeader)?
+            .0
+            .clone();
+
+        // Validate key-value count.
+        if header.key_value_count == 0
+            || header.key_value_count > format::PARENT_LOCATOR_MAXIMUM_KEY_VALUE_COUNT
+        {
+            return Err((CorruptionType::InvalidLocatorKeyValueCount).into());
+        }
+
+        // Check that the buffer is large enough for header + all entries.
+        let entry_size = size_of::<ParentLocatorEntry>();
+        let entries_end = header_size + header.key_value_count as usize * entry_size;
+        if data.len() < entries_end {
+            return Err((CorruptionType::LocatorTooSmallForEntries).into());
+        }
+
+        // Parse each entry.
+        let mut entries = Vec::with_capacity(header.key_value_count as usize);
+        for i in 0..header.key_value_count as usize {
+            let off = header_size + i * entry_size;
+            let entry = ParentLocatorEntry::read_from_prefix(&data[off..])
+                .unwrap()
+                .0
+                .clone();
+
+            // Validate and decode key.
+            let key_offset = entry.key_offset as usize;
+            let key_length = entry.key_length as usize;
+            let key = decode_utf16le(data, key_offset, key_length)
+                .ok_or(CorruptionType::InvalidLocatorEntryKey)?;
+
+            // Validate and decode value.
+            let value_offset = entry.value_offset as usize;
+            let value_length = entry.value_length as usize;
+            let value = decode_utf16le(data, value_offset, value_length)
+                .ok_or(CorruptionType::InvalidLocatorEntryValue)?;
+
+            entries.push(LocatorKeyValue { key, value });
+        }
+
+        Ok(ParentLocator {
+            locator_type: header.locator_type,
+            entries,
+        })
+    }
+
+    /// Find a value by key name (case-sensitive match).
+    pub fn find(&self, key: &str) -> Option<&str> {
+        self.entries
+            .iter()
+            .find(|e| e.key == key)
+            .map(|e| e.value.as_str())
+    }
+
+    /// Extract well-known parent paths from the locator.
+    ///
+    /// This looks up the standard VHDX parent locator keys and returns
+    /// them in a structured form. Returns `None` for any key not present.
+    pub fn parent_paths(&self) -> ParentPaths {
+        ParentPaths {
+            parent_linkage: self
+                .find(format::PARENT_LOCATOR_KEY_PARENT_LINKAGE)
+                .map(String::from),
+            relative_path: self
+                .find(format::PARENT_LOCATOR_KEY_RELATIVE_PATH)
+                .map(String::from),
+            absolute_win32_path: self
+                .find(format::PARENT_LOCATOR_KEY_ABSOLUTE_PATH)
+                .map(String::from),
+            volume_path: self
+                .find(format::PARENT_LOCATOR_KEY_VOLUME_PATH)
+                .map(String::from),
+        }
+    }
+}
+
+/// Paths extracted from a VHDX parent locator.
+///
+/// Contains the well-known path entries from the standard VHDX parent
+/// locator type. The caller should try paths in order of preference:
+/// relative, then absolute, then volume path.
+#[derive(Debug, Clone)]
+#[non_exhaustive]
+pub struct ParentPaths {
+    /// Parent's data write GUID at child creation time.
+    pub parent_linkage: Option<String>,
+    /// Relative path from child to parent.
+    pub relative_path: Option<String>,
+    /// Absolute Win32 path to parent.
+    pub absolute_win32_path: Option<String>,
+    /// Volume GUID path to parent.
+    pub volume_path: Option<String>,
+}
+
+/// Helper to encode a Rust string into a UTF-16LE byte vector.
+#[cfg(test)]
+fn encode_utf16le(s: &str) -> Vec<u8> {
+    s.encode_utf16().flat_map(|c| c.to_le_bytes()).collect()
+}
+
+/// Build a valid parent locator binary blob from parts.
+#[cfg(test)]
+pub(crate) fn build_locator(locator_type: Guid, kvs: &[(&str, &str)]) -> Vec<u8> {
+    use zerocopy::IntoBytes;
+
+    let header_size = size_of::<ParentLocatorHeader>();
+    let entry_size = size_of::<ParentLocatorEntry>();
+    let entries_end = header_size + kvs.len() * entry_size;
+
+    // Encode all key/value strings.
+    let encoded: Vec<(Vec<u8>, Vec<u8>)> = kvs
+        .iter()
+        .map(|(k, v)| (encode_utf16le(k), encode_utf16le(v)))
+        .collect();
+
+    // Compute total size.
+    let strings_size: usize = encoded.iter().map(|(k, v)| k.len() + v.len()).sum();
+    let total = entries_end + strings_size;
+    let mut buf = vec![0u8; total];
+
+    // Write header.
+    let header = ParentLocatorHeader {
+        locator_type,
+        reserved: 0,
+        key_value_count: kvs.len() as u16,
+    };
+    let h_bytes = header.as_bytes();
+    buf[..h_bytes.len()].copy_from_slice(h_bytes);
+
+    // Write entries and string data.
+    let mut string_offset = entries_end;
+    for (i, (key_bytes, val_bytes)) in encoded.iter().enumerate() {
+        let entry = ParentLocatorEntry {
+            key_offset: string_offset as u32,
+            value_offset: (string_offset + key_bytes.len()) as u32,
+            key_length: key_bytes.len() as u16,
+            value_length: val_bytes.len() as u16,
+        };
+        let e_bytes = entry.as_bytes();
+        let off = header_size + i * entry_size;
+        buf[off..off + e_bytes.len()].copy_from_slice(e_bytes);
+
+        buf[string_offset..string_offset + key_bytes.len()].copy_from_slice(key_bytes);
+        string_offset += key_bytes.len();
+        buf[string_offset..string_offset + val_bytes.len()].copy_from_slice(val_bytes);
+        string_offset += val_bytes.len();
+    }
+
+    buf
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::error::OpenErrorInner;
+
+    use super::*;
+
+    #[test]
+    fn parse_valid_locator() {
+        let data = build_locator(
+            format::PARENT_LOCATOR_VHDX_TYPE_GUID,
+            &[
+                ("parent_linkage", "guid-value-here"),
+                ("relative_path", "..\\parent.vhdx"),
+                ("absolute_win32_path", "C:\\vms\\parent.vhdx"),
+            ],
+        );
+
+        let locator = ParentLocator::parse(&data).unwrap();
+        assert_eq!(locator.locator_type, format::PARENT_LOCATOR_VHDX_TYPE_GUID);
+        assert_eq!(locator.entries.len(), 3);
+        assert_eq!(locator.entries[0].key, "parent_linkage");
+        assert_eq!(locator.entries[0].value, "guid-value-here");
+        assert_eq!(locator.entries[1].key, "relative_path");
+        assert_eq!(locator.entries[1].value, "..\\parent.vhdx");
+        assert_eq!(locator.entries[2].key, "absolute_win32_path");
+        assert_eq!(locator.entries[2].value, "C:\\vms\\parent.vhdx");
+    }
+
+    #[test]
+    fn find_by_key() {
+        let data = build_locator(
+            format::PARENT_LOCATOR_VHDX_TYPE_GUID,
+            &[("parent_linkage", "link-val"), ("relative_path", "rel-val")],
+        );
+
+        let locator = ParentLocator::parse(&data).unwrap();
+        assert_eq!(locator.find("parent_linkage"), Some("link-val"));
+        assert_eq!(locator.find("relative_path"), Some("rel-val"));
+        assert_eq!(locator.find("nonexistent"), None);
+    }
+
+    #[test]
+    fn parse_empty_locator() {
+        // Build a header with 0 entries.
+        use zerocopy::IntoBytes;
+        let header = ParentLocatorHeader {
+            locator_type: format::PARENT_LOCATOR_VHDX_TYPE_GUID,
+            reserved: 0,
+            key_value_count: 0,
+        };
+        let data = header.as_bytes().to_vec();
+        let result = ParentLocator::parse(&data);
+        assert!(matches!(
+            result,
+            Err(OpenError(OpenErrorInner::Corrupt(
+                CorruptionType::InvalidLocatorKeyValueCount
+            )))
+        ));
+    }
+
+    #[test]
+    fn parse_invalid_utf16() {
+        // Build a locator where key has odd byte length.
+        use zerocopy::IntoBytes;
+
+        let header_size = size_of::<ParentLocatorHeader>();
+        let entry_size = size_of::<ParentLocatorEntry>();
+
+        // Total buffer: header + 1 entry + key(3 bytes, odd) + value(2 bytes)
+        let total = header_size + entry_size + 3 + 2;
+        let mut buf = vec![0u8; total];
+
+        let header = ParentLocatorHeader {
+            locator_type: format::PARENT_LOCATOR_VHDX_TYPE_GUID,
+            reserved: 0,
+            key_value_count: 1,
+        };
+        let h_bytes = header.as_bytes();
+        buf[..h_bytes.len()].copy_from_slice(h_bytes);
+
+        let string_start = header_size + entry_size;
+        let entry = ParentLocatorEntry {
+            key_offset: string_start as u32,
+            value_offset: (string_start + 3) as u32,
+            key_length: 3, // odd = invalid
+            value_length: 2,
+        };
+        let e_bytes = entry.as_bytes();
+        buf[header_size..header_size + e_bytes.len()].copy_from_slice(e_bytes);
+
+        let result = ParentLocator::parse(&buf);
+        assert!(matches!(
+            result,
+            Err(OpenError(OpenErrorInner::Corrupt(
+                CorruptionType::InvalidLocatorEntryKey
+            )))
+        ));
+    }
+
+    #[test]
+    fn parse_embedded_null() {
+        // Build a locator where key contains an embedded null.
+        use zerocopy::IntoBytes;
+
+        let header_size = size_of::<ParentLocatorHeader>();
+        let entry_size = size_of::<ParentLocatorEntry>();
+
+        // Key: "a\0b" in UTF-16LE = [0x61, 0x00, 0x00, 0x00, 0x62, 0x00] (6 bytes)
+        let key_data: Vec<u8> = vec![0x61, 0x00, 0x00, 0x00, 0x62, 0x00];
+        let value_data = encode_utf16le("val");
+
+        let total = header_size + entry_size + key_data.len() + value_data.len();
+        let mut buf = vec![0u8; total];
+
+        let header = ParentLocatorHeader {
+            locator_type: format::PARENT_LOCATOR_VHDX_TYPE_GUID,
+            reserved: 0,
+            key_value_count: 1,
+        };
+        let h_bytes = header.as_bytes();
+        buf[..h_bytes.len()].copy_from_slice(h_bytes);
+
+        let string_start = header_size + entry_size;
+        let entry = ParentLocatorEntry {
+            key_offset: string_start as u32,
+            value_offset: (string_start + key_data.len()) as u32,
+            key_length: key_data.len() as u16,
+            value_length: value_data.len() as u16,
+        };
+        let e_bytes = entry.as_bytes();
+        buf[header_size..header_size + e_bytes.len()].copy_from_slice(e_bytes);
+
+        buf[string_start..string_start + key_data.len()].copy_from_slice(&key_data);
+        let vs = string_start + key_data.len();
+        buf[vs..vs + value_data.len()].copy_from_slice(&value_data);
+
+        let result = ParentLocator::parse(&buf);
+        assert!(matches!(
+            result,
+            Err(OpenError(OpenErrorInner::Corrupt(
+                CorruptionType::InvalidLocatorEntryKey
+            )))
+        ));
+    }
+
+    #[test]
+    fn parse_truncated_locator() {
+        // Header claims 5 entries but buffer only holds header.
+        use zerocopy::IntoBytes;
+
+        let header = ParentLocatorHeader {
+            locator_type: format::PARENT_LOCATOR_VHDX_TYPE_GUID,
+            reserved: 0,
+            key_value_count: 5,
+        };
+        let data = header.as_bytes().to_vec();
+
+        let result = ParentLocator::parse(&data);
+        assert!(matches!(
+            result,
+            Err(OpenError(OpenErrorInner::Corrupt(
+                CorruptionType::LocatorTooSmallForEntries
+            )))
+        ));
+    }
+}
diff --git a/vm/devices/storage/vhdx/src/log.rs b/vm/devices/storage/vhdx/src/log.rs
new file mode 100644
index 0000000000..34360773d8
--- /dev/null
+++ b/vm/devices/storage/vhdx/src/log.rs
@@ -0,0 +1,2134 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! VHDX write-ahead log (WAL) — replay and entry construction.
+//!
+//! The VHDX format uses a write-ahead log stored in a circular "log region"
+//! to ensure crash consistency of metadata (BAT, sector bitmaps). This
+//! module provides:
+//!
+//! - [`replay_log`] — scans the log region for valid entries and applies
+//!   them to the file.
+//! - [`LogWriter`] — constructs and writes new log entries.
+//!
+//! This module is self-contained and depends only on [`crate::format`],
+//! [`crate::error`], and external crates.
+
+use crate::AsyncFile;
+use crate::error::CorruptionType;
+use crate::error::OpenError;
+use crate::error::OpenErrorInner;
+use crate::format::LOG_DATA_SECTOR_SIGNATURE;
+use crate::format::LOG_DESCRIPTOR_DATA_SIGNATURE;
+use crate::format::LOG_DESCRIPTOR_ZERO_SIGNATURE;
+use crate::format::LOG_ENTRY_HEADER_SIGNATURE;
+use crate::format::LOG_SECTOR_SIZE;
+use crate::format::LOGGABLE_OFFSET;
+use crate::format::LogDataDescriptor;
+use crate::format::LogDataSector;
+use crate::format::LogEntryHeader;
+use crate::format::LogZeroDescriptor;
+use crate::format::compute_checksum;
+use guid::Guid;
+use zerocopy::FromBytes;
+use zerocopy::IntoBytes;
+
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+const SECTOR: u32 = LOG_SECTOR_SIZE as u32;
+const HEADER_SIZE: u32 = size_of::<LogEntryHeader>() as u32; // 64
+const DESCRIPTOR_SIZE: u32 = size_of::<LogDataDescriptor>() as u32; // 32
+
+// ---------------------------------------------------------------------------
+// LogRegion — circular buffer helpers
+// ---------------------------------------------------------------------------
+
+/// Describes the circular log region in the VHDX file.
+#[derive(Debug, Clone)]
+pub struct LogRegion {
+    /// File offset where the log region starts.
+    pub file_offset: u64,
+    /// Length of the log region in bytes (always a multiple of `LOG_SECTOR_SIZE`).
+    pub length: u32,
+}
+
+impl LogRegion {
+    /// Modular add within the region: `(offset + length) % self.length`.
+    fn log_add(&self, offset: u32, len: u32) -> u32 {
+        let sum = offset + len;
+        if sum >= self.length {
+            sum - self.length
+        } else {
+            sum
+        }
+    }
+
+    /// Length of the circular sequence `[tail, head)`.
+    ///
+    /// If `head == tail`, the sequence spans the entire log (full).
+    fn sequence_length(&self, tail: u32, head: u32) -> u32 {
+        if head > tail {
+            head - tail
+        } else if head < tail {
+            head + self.length - tail
+        } else {
+            // head == tail → full
+            self.length
+        }
+    }
+
+    /// Free (unused) space in the log given the current `[tail, head)`.
+    fn free_space(&self, tail: u32, head: u32) -> u32 {
+        self.length - self.sequence_length(tail, head)
+    }
+
+    /// Returns `true` if `offset` lies within the circular range `[tail, head)`.
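+    ///
+    /// For example, with `length = 1000`: in the range `[100, 500)`, offsets
+    /// `100` and `300` are inside while `500` and `50` are not; in the wrapped
+    /// range `[800, 200)`, offsets `900` and `100` are inside while `500` is
+    /// not (see `is_within_sequence_cases` in the tests below).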
+    fn is_within_sequence(&self, tail: u32, head: u32, offset: u32) -> bool {
+        self.sequence_length(offset, head) <= self.sequence_length(tail, head)
+    }
+
+    /// Read a single sector from the log at `base + offset` (modular).
+    async fn read_sector<F: AsyncFile>(
+        &self,
+        file: &F,
+        base: u32,
+        offset: u32,
+        buf: F::Buffer,
+    ) -> Result<F::Buffer, OpenError> {
+        let pos = self.log_add(base, offset);
+        let buf = file
+            .read_into(self.file_offset + pos as u64, buf)
+            .await
+            .map_err(OpenErrorInner::Io)?;
+        Ok(buf)
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Entry size helpers
+// ---------------------------------------------------------------------------
+
+/// Length of the descriptor area (header + descriptor sectors), rounded up
+/// to a multiple of `LOG_SECTOR_SIZE`.
+fn descriptor_area_length(descriptor_count: u32) -> u32 {
+    let raw = HEADER_SIZE + descriptor_count * DESCRIPTOR_SIZE;
+    raw.div_ceil(SECTOR) * SECTOR
+}
+
+/// Total entry length: descriptor area + one data sector per data descriptor.
+fn entry_length(data_count: u32, zero_count: u32) -> u32 {
+    descriptor_area_length(data_count + zero_count) + data_count * SECTOR
+}
+
+// ---------------------------------------------------------------------------
+// Replay
+// ---------------------------------------------------------------------------
+
+/// Result of log replay.
+#[derive(Debug, Clone)]
+pub struct ReplayResult {
+    /// Whether any entries were replayed.
+    pub replayed: bool,
+}
+
+/// A validated log sequence found during scanning.
+#[derive(Debug, Clone)]
+struct LogSequence {
+    tail: u32,
+    head: u32,
+    last_lsn: u64,
+}
+
+/// Replay the VHDX log.
+///
+/// Scans the log region for valid entries matching `log_guid`, applies
+/// them to the file, and flushes. Returns information about what was done.
+///
+/// This function is self-contained: it takes a file handle and log region
+/// parameters, not a `VhdxFile`. It can be called before the file is
+/// fully parsed.
+pub async fn replay_log<F: AsyncFile>(
+    file: &F,
+    log_region: &LogRegion,
+    log_guid: Guid,
+) -> Result<ReplayResult, OpenError> {
+    // Step 1: find the best valid sequence.
+    let sequence = find_log_sequence(file, log_region, &log_guid).await?;
+
+    // Step 2: apply the sequence.
+    apply_sequence(file, log_region, &log_guid, &sequence).await
+}
+
+/// Scan the entire log for the sequence with the highest LSN.
+async fn find_log_sequence<F: AsyncFile>(
+    file: &F,
+    region: &LogRegion,
+    log_guid: &Guid,
+) -> Result<LogSequence, OpenError> {
+    let mut best: Option<LogSequence> = None;
+    let mut tail: u32 = 0;
+
+    loop {
+        match find_sequence_from_tail(file, region, log_guid, tail).await {
+            Ok(seq) => {
+                let dominated = best.as_ref().is_some_and(|b| b.last_lsn >= seq.last_lsn);
+                if !dominated {
+                    best = Some(seq.clone());
+                }
+                // Advance past this sequence.
+                if seq.head <= tail {
+                    break; // wrapped
+                }
+                tail = seq.head;
+            }
+            Err(FindError::NoSequence) => {
+                tail += SECTOR;
+                if tail >= region.length {
+                    break;
+                }
+            }
+            Err(FindError::Vhdx(e)) => return Err(e),
+        }
+    }
+
+    Ok(best.ok_or(CorruptionType::NoValidLogEntries)?)
+}
+
+enum FindError {
+    NoSequence,
+    Vhdx(OpenError),
+}
+
+/// Try to build a sequence starting at `original_tail`.
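+///
+/// Starting from an empty candidate whose head is `original_tail`, this
+/// repeatedly appends the next valid entry via `expand_sequence` until no
+/// further entry validates, then confirms that the tail recorded by the last
+/// entry lies within the validated range `[original_tail, head)`.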
+async fn find_sequence_from_tail<F: AsyncFile>(
+    file: &F,
+    region: &LogRegion,
+    log_guid: &Guid,
+    original_tail: u32,
+) -> Result<LogSequence, FindError> {
+    let mut seq = LogSequence {
+        tail: 0,
+        head: original_tail,
+        last_lsn: 0,
+    };
+    let mut first = true;
+
+    loop {
+        match expand_sequence(file, region, log_guid, &mut seq, first).await {
+            Ok(()) => {
+                first = false;
+            }
+            Err(FindError::NoSequence) if !first => break,
+            Err(e) => return Err(e),
+        }
+    }
+
+    // Verify that the sequence's tail lies within the validated range.
+    if !region.is_within_sequence(original_tail, seq.head, seq.tail) {
+        return Err(FindError::NoSequence);
+    }
+
+    Ok(seq)
+}
+
+/// Try to grow the sequence by one entry at `seq.head`.
+async fn expand_sequence<F: AsyncFile>(
+    file: &F,
+    region: &LogRegion,
+    log_guid: &Guid,
+    seq: &mut LogSequence,
+    first: bool,
+) -> Result<(), FindError> {
+    let mut sector_buf = file.alloc_buffer(SECTOR as usize);
+
+    // Read the first sector at the candidate position.
+    sector_buf = region
+        .read_sector(file, seq.head, 0, sector_buf)
+        .await
+        .map_err(FindError::Vhdx)?;
+
+    let header = LogEntryHeader::read_from_bytes(&sector_buf.as_ref()[..HEADER_SIZE as usize])
+        .map_err(|_| FindError::NoSequence)?
+        .clone();
+
+    // Validate header fields.
+    if header.signature != LOG_ENTRY_HEADER_SIGNATURE {
+        return Err(FindError::NoSequence);
+    }
+    if header.log_guid != *log_guid {
+        return Err(FindError::NoSequence);
+    }
+    if !validate_entry_header_fields(&header, region) {
+        return Err(FindError::NoSequence);
+    }
+
+    let new_head = region.log_add(seq.head, header.entry_length);
+
+    if !first {
+        // Check sequence continuity.
+        if header.sequence_number != seq.last_lsn + 1 {
+            return Err(FindError::NoSequence);
+        }
+        // Entry must fit in the free space.
+        if header.entry_length > region.free_space(seq.tail, seq.head) {
+            return Err(FindError::NoSequence);
+        }
+        // New tail must be within the growing sequence.
+        if !region.is_within_sequence(seq.tail, new_head, header.tail) {
+            return Err(FindError::NoSequence);
+        }
+    }
+
+    // Compute CRC-32C over the entire entry.
+    let buf_ref = sector_buf.as_ref();
+    let mut crc = compute_checksum(buf_ref, 4);
+
+    // Read and checksum additional descriptor sectors.
+    let desc_area_len = descriptor_area_length(header.descriptor_count);
+    let mut data_descriptor_count: u32 = 0;
+
+    // Validate descriptors.
+    for i in 0..header.descriptor_count {
+        let byte_offset = HEADER_SIZE + i * DESCRIPTOR_SIZE;
+        if byte_offset.is_multiple_of(SECTOR) {
+            // Need to read a new sector.
+            sector_buf = region
+                .read_sector(file, seq.head, byte_offset, sector_buf)
+                .await
+                .map_err(FindError::Vhdx)?;
+            crc = crc32c::crc32c_append(crc, sector_buf.as_ref());
+        }
+
+        let local_off = (byte_offset % SECTOR) as usize;
+        let desc_bytes = &sector_buf.as_ref()[local_off..local_off + DESCRIPTOR_SIZE as usize];
+
+        // Check descriptor signature.
+ let sig = u32::from_le_bytes(desc_bytes[0..4].try_into().unwrap()); + if sig == LOG_DESCRIPTOR_DATA_SIGNATURE { + let desc = LogDataDescriptor::read_from_bytes(desc_bytes) + .map_err(|_| FindError::NoSequence)?; + if !validate_data_descriptor(&desc, &header, region) { + return Err(FindError::NoSequence); + } + data_descriptor_count += 1; + } else if sig == LOG_DESCRIPTOR_ZERO_SIGNATURE { + let desc = LogZeroDescriptor::read_from_bytes(desc_bytes) + .map_err(|_| FindError::NoSequence)?; + if !validate_zero_descriptor(&desc, &header, region) { + return Err(FindError::NoSequence); + } + } else { + return Err(FindError::NoSequence); + } + } + + // Validate that the entry length matches. + let expected_len = entry_length( + data_descriptor_count, + header.descriptor_count - data_descriptor_count, + ); + if header.entry_length != expected_len { + return Err(FindError::NoSequence); + } + + // Read and validate data sectors. + for i in 0..data_descriptor_count { + let offset = desc_area_len + i * SECTOR; + sector_buf = region + .read_sector(file, seq.head, offset, sector_buf) + .await + .map_err(FindError::Vhdx)?; + crc = crc32c::crc32c_append(crc, sector_buf.as_ref()); + + let data_sector = LogDataSector::read_from_bytes(sector_buf.as_ref()) + .map_err(|_| FindError::NoSequence)?; + if data_sector.signature != LOG_DATA_SECTOR_SIGNATURE { + return Err(FindError::NoSequence); + } + if data_sector.sequence_low != header.sequence_number as u32 + || data_sector.sequence_high != (header.sequence_number >> 32) as u32 + { + return Err(FindError::NoSequence); + } + } + + // Verify CRC. + if crc != header.checksum { + return Err(FindError::NoSequence); + } + + // Entry is valid — update the sequence. + seq.last_lsn = header.sequence_number; + seq.tail = header.tail; + seq.head = new_head; + + Ok(()) +} + +fn validate_entry_header_fields(header: &LogEntryHeader, region: &LogRegion) -> bool { + if header.tail >= region.length { + return false; + } + if !header.entry_length.is_multiple_of(SECTOR) + || header.entry_length < SECTOR + || header.entry_length >= region.length + { + return false; + } + // Descriptor count must fit within the entry. + let max_desc = (header.entry_length - HEADER_SIZE) / DESCRIPTOR_SIZE; + if header.descriptor_count > max_desc { + return false; + } + true +} + +fn validate_data_descriptor( + desc: &LogDataDescriptor, + header: &LogEntryHeader, + region: &LogRegion, +) -> bool { + if !desc.file_offset.is_multiple_of(LOG_SECTOR_SIZE) { + return false; + } + if desc.file_offset < LOGGABLE_OFFSET { + return false; + } + if desc.sequence_number != header.sequence_number { + return false; + } + // Must not overlap the log region. + let write_end = desc.file_offset.checked_add(LOG_SECTOR_SIZE); + if let Some(end) = write_end { + if desc.file_offset < region.file_offset + region.length as u64 && end > region.file_offset + { + return false; + } + } else { + return false; + } + true +} + +fn validate_zero_descriptor( + desc: &LogZeroDescriptor, + header: &LogEntryHeader, + region: &LogRegion, +) -> bool { + if !desc.file_offset.is_multiple_of(LOG_SECTOR_SIZE) { + return false; + } + if desc.file_offset < LOGGABLE_OFFSET { + return false; + } + if !desc.length.is_multiple_of(LOG_SECTOR_SIZE) { + return false; + } + if desc.sequence_number != header.sequence_number { + return false; + } + // Must not overlap the log region. 
+    let write_end = desc.file_offset.checked_add(desc.length);
+    if let Some(end) = write_end {
+        if desc.file_offset < region.file_offset + region.length as u64 && end > region.file_offset
+        {
+            return false;
+        }
+    } else {
+        return false;
+    }
+    true
+}
+
+/// Apply a validated sequence to the file.
+async fn apply_sequence<F: AsyncFile>(
+    file: &F,
+    region: &LogRegion,
+    log_guid: &Guid,
+    sequence: &LogSequence,
+) -> Result<ReplayResult, OpenError> {
+    let mut tail = sequence.tail;
+    let head = sequence.head;
+    let mut last_file_offset: u64 = 0;
+    let mut replayed = false;
+
+    let mut sector_buf = file.alloc_buffer(SECTOR as usize);
+
+    while tail != head {
+        // Read header.
+        sector_buf = region.read_sector(file, tail, 0, sector_buf).await?;
+        let header = LogEntryHeader::read_from_bytes(&sector_buf.as_ref()[..HEADER_SIZE as usize])
+            .map_err(|_| CorruptionType::BadLogEntryOnReplay)?
+            .clone();
+
+        if header.signature != LOG_ENTRY_HEADER_SIGNATURE || header.log_guid != *log_guid {
+            return Err(CorruptionType::BadLogEntryOnReplay.into());
+        }
+
+        last_file_offset = header.last_file_offset;
+
+        let desc_area_len = descriptor_area_length(header.descriptor_count);
+        let mut data_sector_index: u32 = 0;
+
+        for i in 0..header.descriptor_count {
+            let byte_offset = HEADER_SIZE + i * DESCRIPTOR_SIZE;
+            if byte_offset.is_multiple_of(SECTOR) || i == 0 {
+                // (Re-)read the descriptor sector. For the first descriptor
+                // the sector_buf already contains the header sector.
+                if byte_offset >= SECTOR {
+                    sector_buf = region
+                        .read_sector(file, tail, byte_offset - (byte_offset % SECTOR), sector_buf)
+                        .await?;
+                }
+            }
+
+            let local_off = (byte_offset % SECTOR) as usize;
+            let desc_bytes = &sector_buf.as_ref()[local_off..local_off + DESCRIPTOR_SIZE as usize];
+            let sig = u32::from_le_bytes(desc_bytes[0..4].try_into().unwrap());
+
+            if sig == LOG_DESCRIPTOR_ZERO_SIGNATURE {
+                let desc = LogZeroDescriptor::read_from_bytes(desc_bytes)
+                    .map_err(|_| CorruptionType::BadLogEntryOnReplay)?;
+
+                // Write zeros.
+                file.zero_range(desc.file_offset, desc.length)
+                    .await
+                    .map_err(OpenErrorInner::Io)?;
+            } else if sig == LOG_DESCRIPTOR_DATA_SIGNATURE {
+                let desc = LogDataDescriptor::read_from_bytes(desc_bytes)
+                    .map_err(|_| CorruptionType::BadLogEntryOnReplay)?
+                    .clone();
+
+                // Read the data sector from the log.
+                let data_offset = desc_area_len + data_sector_index * SECTOR;
+                let data_buf = file.alloc_buffer(SECTOR as usize);
+                let mut data_buf = region
+                    .read_sector(file, tail, data_offset, data_buf)
+                    .await?;
+
+                // Reconstruct the original 4096-byte sector.
+                // Replace first 8 bytes (signature + sequence_high) with leading_bytes.
+                let leading = desc.leading_bytes.to_le_bytes();
+                data_buf.as_mut()[0..8].copy_from_slice(&leading);
+                // Replace last 4 bytes (sequence_low) with trailing_bytes.
+                let trailing = desc.trailing_bytes.to_le_bytes();
+                data_buf.as_mut()[SECTOR as usize - 4..].copy_from_slice(&trailing);
+
+                file.write_from(desc.file_offset, data_buf)
+                    .await
+                    .map_err(OpenErrorInner::Io)?;
+                data_sector_index += 1;
+            } else {
+                return Err(CorruptionType::BadLogEntryOnReplay.into());
+            }
+        }
+
+        replayed = true;
+        tail = region.log_add(tail, header.entry_length);
+    }
+
+    // Extend file if needed.
+    if replayed {
+        let file_sz = file.file_size().await.map_err(OpenErrorInner::Io)?;
+        if file_sz < last_file_offset {
+            file.set_file_size(last_file_offset)
+                .await
+                .map_err(OpenErrorInner::Io)?;
+        }
+        file.flush().await.map_err(OpenErrorInner::Io)?;
+    }
+
+    Ok(ReplayResult { replayed })
+}
+
+// ---------------------------------------------------------------------------
+// LogWriter
+// ---------------------------------------------------------------------------
+
+/// A page to be logged: 4096 bytes of data at a file offset.
+pub struct DataPage<'a> {
+    /// Target file offset (must be aligned to LOG_SECTOR_SIZE).
+    pub file_offset: u64,
+    /// The 4096-byte data payload.
+    pub payload: &'a [u8; SECTOR as usize],
+}
+
+/// A range to be zeroed during replay.
+pub struct ZeroRange {
+    /// Target file offset (must be aligned to LOG_SECTOR_SIZE).
+    pub file_offset: u64,
+    /// Length in bytes (must be a multiple of LOG_SECTOR_SIZE).
+    pub length: u64,
+}
+
+/// Active log state for writing new entries.
+pub struct LogWriter {
+    region: LogRegion,
+    tail: u32,
+    head: u32,
+    sequence_number: u64,
+    log_guid: Guid,
+    flushed_file_offset: u64,
+    last_file_offset: u64,
+}
+
+impl LogWriter {
+    /// Create a new `LogWriter` for an empty log.
+    ///
+    /// Writes an initial empty entry (zero data descriptors, zero zero-ranges)
+    /// and flushes. Returns the writer ready for subsequent entries.
+    pub async fn initialize<F: AsyncFile>(
+        file: &F,
+        region: LogRegion,
+        log_guid: Guid,
+        last_file_offset: u64,
+    ) -> Result<Self, OpenError> {
+        let mut writer = LogWriter {
+            region,
+            tail: 0,
+            head: 0,
+            sequence_number: 0,
+            log_guid,
+            flushed_file_offset: last_file_offset,
+            last_file_offset,
+        };
+
+        // Write an initial empty entry with sequence number 1.
+        writer
+            .write_entry(file, &[], &[])
+            .await
+            .map_err(OpenErrorInner::Io)?
+            .ok_or(CorruptionType::LogFull)?;
+        file.flush().await.map_err(OpenErrorInner::Io)?;
+        Ok(writer)
+    }
+
+    /// Returns the amount of free space remaining in the log.
+    pub fn free_space(&self) -> u32 {
+        self.region.free_space(self.tail, self.head)
+    }
+
+    /// Advance the log tail to `new_tail`, reclaiming the space before it.
+    ///
+    /// The caller must ensure that all entries in the range `[old_tail, new_tail)`
+    /// have been fully applied and their pages are durable at final file offsets.
+    pub fn advance_tail(&mut self, new_tail: u32) {
+        self.tail = new_tail;
+    }
+
+    /// Returns the current head offset within the log region.
+    pub fn head(&self) -> u32 {
+        self.head
+    }
+
+    /// Write a log entry containing the given data pages and zero ranges.
+    ///
+    /// Returns `Some(sequence_number)` on success, or `None` if the log
+    /// doesn't have enough free space (caller should drain and retry).
+    pub async fn write_entry<F: AsyncFile>(
+        &mut self,
+        file: &F,
+        data_pages: &[DataPage<'_>],
+        zero_ranges: &[ZeroRange],
+    ) -> Result<Option<u64>, std::io::Error> {
+        let data_count = data_pages.len() as u32;
+        let zero_count = zero_ranges.len() as u32;
+        let total_desc = data_count + zero_count;
+        let elen = entry_length(data_count, zero_count);
+
+        // We always leave room for at least one more sector to avoid completely
+        // filling the log.
+        let needed = elen + SECTOR;
+        if self.tail == self.head {
+            // Empty log — the full region is free.
+ if needed > self.region.length { + return Ok(None); + } + } else if needed > self.free_space() { + return Ok(None); + } + + self.sequence_number += 1; + let seq = self.sequence_number; + + // --- Allocate entry buffers --- + // The entry may wrap around the circular log boundary. Since head + // and elen are both sector-aligned, the split always falls on a + // sector boundary. Allocate one or two buffers accordingly. + let remaining = (self.region.length - self.head) as usize; + let wraps = (elen as usize) > remaining; + let mut buf1 = file.alloc_buffer(if wraps { remaining } else { elen as usize }); + let b1 = buf1.as_mut(); + let mut buf2 = if wraps { + Some(file.alloc_buffer(elen as usize - remaining)) + } else { + None + }; + let b2 = buf2.as_mut().map_or(&mut [][..], |b| b.as_mut()); + let split = remaining; // byte offset within the entry where the split occurs + + fn entry_slice<'a>( + b1: &'a mut [u8], + b2: &'a mut [u8], + split: usize, + offset: usize, + len: usize, + ) -> &'a mut [u8] { + if offset < split { + debug_assert!(offset + len <= split, "access straddles split boundary"); + &mut b1[offset..offset + len] + } else { + let off2 = offset - split; + &mut b2[off2..off2 + len] + } + } + + // --- Build the entry --- + + // Header (first 64 bytes of first sector). + let header = LogEntryHeader { + signature: LOG_ENTRY_HEADER_SIGNATURE, + checksum: 0, + entry_length: elen, + tail: self.tail, + sequence_number: seq, + descriptor_count: total_desc, + reserved: 0, + log_guid: self.log_guid, + flushed_file_offset: self.flushed_file_offset, + last_file_offset: self.last_file_offset, + }; + entry_slice(b1, b2, split, 0, HEADER_SIZE as usize).copy_from_slice(header.as_bytes()); + + // Descriptors. + let desc_area_len = descriptor_area_length(total_desc); + let mut desc_offset = HEADER_SIZE as usize; + let mut data_sector_offset = desc_area_len as usize; + + for dp in data_pages { + let leading = u64::from_le_bytes(dp.payload[0..8].try_into().unwrap()); + let trailing = u32::from_le_bytes( + dp.payload[SECTOR as usize - 4..SECTOR as usize] + .try_into() + .unwrap(), + ); + let desc = LogDataDescriptor { + signature: LOG_DESCRIPTOR_DATA_SIGNATURE, + trailing_bytes: trailing, + leading_bytes: leading, + file_offset: dp.file_offset, + sequence_number: seq, + }; + entry_slice(b1, b2, split, desc_offset, DESCRIPTOR_SIZE as usize) + .copy_from_slice(desc.as_bytes()); + desc_offset += DESCRIPTOR_SIZE as usize; + + let ds = build_data_sector(dp.payload, seq); + entry_slice(b1, b2, split, data_sector_offset, SECTOR as usize) + .copy_from_slice(ds.as_bytes()); + data_sector_offset += SECTOR as usize; + } + + for zr in zero_ranges { + let desc = LogZeroDescriptor { + signature: LOG_DESCRIPTOR_ZERO_SIGNATURE, + reserved: 0, + length: zr.length, + file_offset: zr.file_offset, + sequence_number: seq, + }; + entry_slice(b1, b2, split, desc_offset, DESCRIPTOR_SIZE as usize) + .copy_from_slice(desc.as_bytes()); + desc_offset += DESCRIPTOR_SIZE as usize; + } + + // Compute CRC-32C across both buffers (checksum field is already zero). + let mut crc = crc32c::crc32c(buf1.as_ref()); + if let Some(ref b2) = buf2 { + crc = crc32c::crc32c_append(crc, b2.as_ref()); + } + // Write checksum into the header (always in buf1, bytes 4..8). + buf1.as_mut()[4..8].copy_from_slice(&crc.to_le_bytes()); + + // --- Write to file --- + file.write_from(self.region.file_offset + self.head as u64, buf1) + .await?; + if let Some(buf2) = buf2 { + file.write_from(self.region.file_offset, buf2).await?; + } + + // Advance head. 
+ self.head = self.region.log_add(self.head, elen); + + Ok(Some(seq)) + } +} + +/// Build a `LogDataSector` from a page of original data and a sequence number. +fn build_data_sector(source: &[u8; SECTOR as usize], sequence_number: u64) -> LogDataSector { + let mut data = [0u8; 4084]; + data.copy_from_slice(&source[8..SECTOR as usize - 4]); + LogDataSector { + signature: LOG_DATA_SECTOR_SIGNATURE, + sequence_high: (sequence_number >> 32) as u32, + data, + sequence_low: sequence_number as u32, + } +} + +// =========================================================================== +// Tests +// =========================================================================== + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + + /// Helpers for tests. + const TEST_LOG_SIZE: u32 = 64 * SECTOR; // 256 KiB + const TEST_LOG_OFFSET: u64 = 1024 * 1024; // 1 MiB into the file + + fn test_region() -> LogRegion { + LogRegion { + file_offset: TEST_LOG_OFFSET, + length: TEST_LOG_SIZE, + } + } + + fn test_guid() -> Guid { + guid::guid!("12345678-1234-1234-1234-123456789abc") + } + + /// Create a file large enough for the log region and a target area. + fn test_file() -> InMemoryFile { + // 4 MiB file: enough for the log at 1 MiB and target writes at 192 KiB+. + InMemoryFile::new(4 * 1024 * 1024) + } + + // ----------------------------------------------------------------------- + // Circular buffer helper tests + // ----------------------------------------------------------------------- + + #[test] + fn log_add_no_wrap() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + assert_eq!(r.log_add(100, 200), 300); + } + + #[test] + fn log_add_with_wrap() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + assert_eq!(r.log_add(800, 300), 100); + } + + #[test] + fn sequence_length_head_gt_tail() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + assert_eq!(r.sequence_length(100, 500), 400); + } + + #[test] + fn sequence_length_head_lt_tail() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + assert_eq!(r.sequence_length(800, 200), 400); + } + + #[test] + fn sequence_length_head_eq_tail() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + // Full log. 
+ assert_eq!(r.sequence_length(500, 500), 1000); + } + + #[test] + fn free_space_computation() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + assert_eq!(r.free_space(100, 500), 600); + assert_eq!(r.free_space(800, 200), 600); + } + + #[test] + fn is_within_sequence_cases() { + let r = LogRegion { + file_offset: 0, + length: 1000, + }; + // Normal range [100, 500) + assert!(r.is_within_sequence(100, 500, 100)); // at tail + assert!(r.is_within_sequence(100, 500, 300)); // in middle + assert!(!r.is_within_sequence(100, 500, 500)); // at head (not within) + assert!(!r.is_within_sequence(100, 500, 50)); // before tail + + // Wrapped range [800, 200) + assert!(r.is_within_sequence(800, 200, 900)); // in first part + assert!(r.is_within_sequence(800, 200, 100)); // in second part + assert!(!r.is_within_sequence(800, 200, 500)); // outside + } + + // ----------------------------------------------------------------------- + // Log Writer tests + // ----------------------------------------------------------------------- + + #[async_test] + async fn writer_initialize_creates_empty_entry() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + assert_eq!(writer.sequence_number, 1); + assert_eq!(writer.tail, 0); + // The empty entry is 1 sector (header only, 0 descriptors). + assert_eq!(writer.head, SECTOR); + + // Read back and validate. + let mut buf = [0u8; SECTOR as usize]; + file.read_at(TEST_LOG_OFFSET, &mut buf).await.unwrap(); + let header = LogEntryHeader::read_from_bytes(&buf[..HEADER_SIZE as usize]).unwrap(); + assert_eq!(header.signature, LOG_ENTRY_HEADER_SIGNATURE); + assert_eq!(header.sequence_number, 1); + assert_eq!(header.descriptor_count, 0); + assert_eq!(header.entry_length, SECTOR); + assert_eq!(header.log_guid, guid); + + // Validate CRC. + let stored_crc = header.checksum; + let computed_crc = compute_checksum(&buf, 4); + assert_eq!(stored_crc, computed_crc); + } + + #[async_test] + async fn writer_one_data_page() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let mut page_data = [0u8; SECTOR as usize]; + for (i, b) in page_data.iter_mut().enumerate() { + *b = (i % 256) as u8; + } + let target_offset = LOGGABLE_OFFSET + 4096; + + let lsn = writer + .write_entry( + &file, + &[DataPage { + file_offset: target_offset, + payload: &page_data, + }], + &[], + ) + .await + .unwrap() + .unwrap(); + + assert_eq!(lsn, 2); + + // Read back the entry header and verify. 
+ let entry_start = TEST_LOG_OFFSET + SECTOR as u64; // after the init entry + let mut buf = [0u8; SECTOR as usize]; + file.read_at(entry_start, &mut buf).await.unwrap(); + let header = LogEntryHeader::read_from_bytes(&buf[..HEADER_SIZE as usize]).unwrap(); + assert_eq!(header.signature, LOG_ENTRY_HEADER_SIGNATURE); + assert_eq!(header.sequence_number, 2); + assert_eq!(header.descriptor_count, 1); + assert_eq!(header.entry_length, entry_length(1, 0)); + } + + #[async_test] + async fn writer_one_zero_range() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let target_offset = LOGGABLE_OFFSET; + let lsn = writer + .write_entry( + &file, + &[], + &[ZeroRange { + file_offset: target_offset, + length: 8192, + }], + ) + .await + .unwrap() + .unwrap(); + + assert_eq!(lsn, 2); + + // Read back and verify descriptor. + let entry_start = TEST_LOG_OFFSET + SECTOR as u64; + let mut buf = [0u8; SECTOR as usize]; + file.read_at(entry_start, &mut buf).await.unwrap(); + let header = LogEntryHeader::read_from_bytes(&buf[..HEADER_SIZE as usize]).unwrap(); + assert_eq!(header.descriptor_count, 1); + // Zero descriptor: entry is just 1 sector (header + descriptor fits) + assert_eq!(header.entry_length, entry_length(0, 1)); + } + + #[async_test] + async fn writer_multiple_entries_advance() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let page = [0xAAu8; SECTOR as usize]; + for i in 0..3 { + let lsn = writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + (i as u64) * 4096, + payload: &page, + }], + &[], + ) + .await + .unwrap() + .unwrap(); + assert_eq!(lsn, i as u64 + 2); + } + + // Head should have advanced past the initial empty entry + 3 data entries. + let one_data_entry_len = entry_length(1, 0); + let expected_head = SECTOR + 3 * one_data_entry_len; + assert_eq!(writer.head, expected_head); + assert_eq!(writer.sequence_number, 4); + } + + #[async_test] + async fn writer_wrap_around() { + let file = test_file(); + // Use a small log that will force wrap-around. + let small_log_size = 16 * SECTOR; // 64 KiB + let region = LogRegion { + file_offset: TEST_LOG_OFFSET, + length: small_log_size, + }; + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + // Each data entry = entry_length(1, 0) = 2 sectors. + // After init (1 sector), we have 15 sectors free minus 1 reserved = 14 usable. + // Each entry = 2 sectors, so we can fit 7 entries before needing to advance tail. + // But the writer doesn't advance tail on its own — we just write until full. + let page = [0xBBu8; SECTOR as usize]; + let entry_len = entry_length(1, 0); + + let mut entries_written = 0u32; + loop { + // Check if we have space for entry + 1 sector. + let needed = entry_len + SECTOR; + if writer.tail == writer.head { + // Empty — full space available. 
+ if needed > writer.region.length { + break; + } + } else if needed > writer.free_space() { + break; + } + + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + (entries_written as u64) * 4096, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + entries_written += 1; + } + + assert!(entries_written > 0); + // Verify that the head has wrapped around or is near the end. + // The exact value depends on the arithmetic, but the write should have succeeded. + } + + #[async_test] + async fn writer_log_full_error() { + let file = test_file(); + // Tiny log: 4 sectors. + let region = LogRegion { + file_offset: TEST_LOG_OFFSET, + length: 4 * SECTOR, + }; + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + // After init entry (1 sector), 3 sectors free. A data entry needs 2 sectors + 1 reserved = 3. Fits. + let page = [0xCCu8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + + // Now head - tail = 3 sectors used, 1 free. Next entry needs 2 + 1 = 3. Won't fit. + let result = writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + 4096, + payload: &page, + }], + &[], + ) + .await; + + // write_entry returns None when the log is full. + assert_eq!(result.unwrap(), None); + } + + // ----------------------------------------------------------------------- + // Log Replay tests + // ----------------------------------------------------------------------- + + #[async_test] + async fn replay_single_data_entry() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + // Write a single data entry. + let mut page_data = [0u8; SECTOR as usize]; + for (i, b) in page_data.iter_mut().enumerate() { + *b = ((i + 1) % 256) as u8; + } + let target_offset = LOGGABLE_OFFSET + 4096; + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + writer + .write_entry( + &file, + &[DataPage { + file_offset: target_offset, + payload: &page_data, + }], + &[], + ) + .await + .unwrap(); + + // Now replay. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // Verify that the data was written to the target offset. + let mut read_buf = [0u8; SECTOR as usize]; + file.read_at(target_offset, &mut read_buf).await.unwrap(); + assert_eq!(read_buf, page_data); + } + + #[async_test] + async fn replay_data_and_zero_descriptors() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + // Write some non-zero data to the zero target first. + let zero_target = LOGGABLE_OFFSET + 8192; + let garbage = [0xFFu8; 8192]; + file.write_at(zero_target, &garbage).await.unwrap(); + + let mut page_data = [0x42u8; SECTOR as usize]; + page_data[0] = 0xDE; + page_data[1] = 0xAD; + let data_target = LOGGABLE_OFFSET + 4096; + + writer + .write_entry( + &file, + &[DataPage { + file_offset: data_target, + payload: &page_data, + }], + &[ZeroRange { + file_offset: zero_target, + length: 8192, + }], + ) + .await + .unwrap(); + + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // Verify data was applied. 
+ let mut read_buf = [0u8; SECTOR as usize]; + file.read_at(data_target, &mut read_buf).await.unwrap(); + assert_eq!(read_buf, page_data); + + // Verify zeros were applied. + let mut zero_buf = vec![0u8; 8192]; + file.read_at(zero_target, &mut zero_buf).await.unwrap(); + assert!(zero_buf.iter().all(|&b| b == 0)); + } + + #[async_test] + async fn replay_multiple_sequential_entries() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let mut pages = Vec::new(); + for n in 0..3 { + let mut page = [0u8; SECTOR as usize]; + page.fill(n as u8 + 1); + pages.push(page); + } + + for (n, page) in pages.iter().enumerate() { + let offset = LOGGABLE_OFFSET + (n as u64) * 4096; + writer + .write_entry( + &file, + &[DataPage { + file_offset: offset, + payload: page, + }], + &[], + ) + .await + .unwrap(); + } + + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + for (n, page) in pages.iter().enumerate() { + let offset = LOGGABLE_OFFSET + (n as u64) * 4096; + let mut buf = [0u8; SECTOR as usize]; + file.read_at(offset, &mut buf).await.unwrap(); + assert_eq!(buf, *page, "mismatch at entry {n}"); + } + } + + #[async_test] + async fn replay_invalid_crc_skipped() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + // Write a self-contained entry: start the writer at a non-zero position + // so the entry's tail == its own start. The first (corrupt) entry + // comes before it and is skipped. + // + // Strategy: write two independent sequences. The first is the init + // entry whose CRC we corrupt. The second is an independent entry + // that references itself as the tail. + let mut writer = LogWriter { + region: region.clone(), + tail: 4 * SECTOR, + head: 4 * SECTOR, + sequence_number: 10, + log_guid: guid, + flushed_file_offset: 4 * 1024 * 1024, + last_file_offset: 4 * 1024 * 1024, + }; + + let page = [0xAAu8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + + // Also write a garbage entry at offset 0 that looks like a valid header + // but has a corrupt CRC. + let _bad_writer = LogWriter { + region: region.clone(), + tail: 0, + head: 0, + sequence_number: 0, + log_guid: guid, + flushed_file_offset: 4 * 1024 * 1024, + last_file_offset: 4 * 1024 * 1024, + }; + // We just need a valid-looking header at offset 0 with bad CRC. + // Write an init-like entry, then corrupt its CRC. + let header = LogEntryHeader { + signature: LOG_ENTRY_HEADER_SIGNATURE, + checksum: 0xDEADBEEF, // intentionally wrong + entry_length: SECTOR, + tail: 0, + sequence_number: 1, + descriptor_count: 0, + reserved: 0, + log_guid: guid, + flushed_file_offset: 4 * 1024 * 1024, + last_file_offset: 4 * 1024 * 1024, + }; + let mut buf = [0u8; SECTOR as usize]; + buf[..HEADER_SIZE as usize].copy_from_slice(header.as_bytes()); + file.write_at(TEST_LOG_OFFSET, &buf).await.unwrap(); + + // Replay should skip the bad entry at offset 0 and find the good entry + // at offset 4*SECTOR. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // The data should have been applied. 
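Aside: the `replay_*_skipped` tests in this part of the file all hinge on the same acceptance checks for a scanned entry. As a hedged summary (reusing `LogEntryHeader`, `HEADER_SIZE`, `LOG_ENTRY_HEADER_SIGNATURE`, and `compute_checksum` exactly as the tests use them, and assuming `entry` spans the full entry), the predicate looks roughly like:

```rust
// Illustrative only; a compact restatement of what the skip tests rely on,
// not the crate's actual scanner code.
fn entry_acceptable(entry: &[u8], expected_log_guid: guid::Guid) -> bool {
    let Ok(header) = LogEntryHeader::read_from_bytes(&entry[..HEADER_SIZE as usize]) else {
        return false;
    };
    // Bad signature (e.g. the "XXXX" header below): skip.
    header.signature == LOG_ENTRY_HEADER_SIGNATURE
        // Entry written for a different log GUID: skip.
        && header.log_guid == expected_log_guid
        // CRC mismatch (corrupt or torn entry): skip. As in the test above,
        // the checksum is computed with the 4-byte checksum field excluded.
        && header.checksum == compute_checksum(entry, 4)
}
```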
+ let mut read_buf = [0u8; SECTOR as usize]; + file.read_at(LOGGABLE_OFFSET, &mut read_buf).await.unwrap(); + assert_eq!(read_buf, page); + } + + #[async_test] + async fn replay_wrong_guid_skipped() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + let wrong_guid = guid::guid!("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let page = [0x55u8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + + // Try to replay with the wrong GUID — should find nothing. + let result = replay_log(&file, ®ion, wrong_guid).await; + assert!(result.is_err()); + match result.unwrap_err() { + OpenError(OpenErrorInner::Corrupt(CorruptionType::NoValidLogEntries)) => {} + other => panic!("expected NoValidLogEntries, got {:?}", other), + } + } + + #[async_test] + async fn replay_bad_signature_skipped() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + // Place a valid self-contained entry at sector 4. + let mut writer = LogWriter { + region: region.clone(), + tail: 4 * SECTOR, + head: 4 * SECTOR, + sequence_number: 20, + log_guid: guid, + flushed_file_offset: 4 * 1024 * 1024, + last_file_offset: 4 * 1024 * 1024, + }; + + let page = [0x33u8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + + // Write a bad-signature entry at offset 0 so the scanner has something + // to skip. + let mut bad_header_buf = [0u8; SECTOR as usize]; + bad_header_buf[0..4].copy_from_slice(b"XXXX"); + file.write_at(TEST_LOG_OFFSET, &bad_header_buf) + .await + .unwrap(); + + // The entry at sector 4 should still be found. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + } + + #[async_test] + async fn replay_empty_log_errors() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let result = replay_log(&file, ®ion, guid).await; + assert!(result.is_err()); + match result.unwrap_err() { + OpenError(OpenErrorInner::Corrupt(CorruptionType::NoValidLogEntries)) => {} + other => panic!("expected NoValidLogEntries, got {:?}", other), + } + } + + #[async_test] + async fn replay_torn_write_last_entry() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + // Write two entries. + let page1 = [0x11u8; SECTOR as usize]; + let page2 = [0x22u8; SECTOR as usize]; + + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page1, + }], + &[], + ) + .await + .unwrap(); + + let entry3_start = writer.head; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + 4096, + payload: &page2, + }], + &[], + ) + .await + .unwrap(); + + // Simulate torn write: zero out part of the last entry (its second sector). + let torn_offset = TEST_LOG_OFFSET + region.log_add(entry3_start, SECTOR) as u64; + let zeros = [0u8; SECTOR as usize]; + file.write_at(torn_offset, &zeros).await.unwrap(); + + // Replay should apply only entries 1 and 2 (the init entry + first data entry). + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // page1 should be applied. 
+        let mut buf = [0u8; SECTOR as usize];
+        file.read_at(LOGGABLE_OFFSET, &mut buf).await.unwrap();
+        assert_eq!(buf, page1);
+
+        // page2 should NOT be applied (it was in the torn entry).
+        // The file might have whatever garbage was at that location.
+    }
+
+    #[async_test]
+    async fn writer_then_replay_roundtrip() {
+        let file = test_file();
+        let region = test_region();
+        let guid = test_guid();
+
+        let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024)
+            .await
+            .unwrap();
+
+        // Write several entries.
+        let mut pages = Vec::new();
+        for n in 0..5 {
+            let mut page = [0u8; SECTOR as usize];
+            for (i, b) in page.iter_mut().enumerate() {
+                *b = ((n * 37 + i) % 256) as u8;
+            }
+            pages.push((LOGGABLE_OFFSET + (n as u64) * 4096, page));
+        }
+
+        for (offset, page) in &pages {
+            writer
+                .write_entry(
+                    &file,
+                    &[DataPage {
+                        file_offset: *offset,
+                        payload: page,
+                    }],
+                    &[],
+                )
+                .await
+                .unwrap();
+        }
+
+        // Replay.
+        let result = replay_log(&file, &region, guid).await.unwrap();
+        assert!(result.replayed);
+
+        // Verify all pages.
+        for (offset, expected) in &pages {
+            let mut buf = [0u8; SECTOR as usize];
+            file.read_at(*offset, &mut buf).await.unwrap();
+            assert_eq!(&buf, expected, "mismatch at offset {offset:#x}");
+        }
+    }
+
+    #[async_test]
+    async fn replay_entry_wrapping_circular_buffer() {
+        // Use a log small enough that an entry wraps around.
+        let file = test_file();
+        let small_log = 8 * SECTOR;
+        let region = LogRegion {
+            file_offset: TEST_LOG_OFFSET,
+            length: small_log,
+        };
+        let guid = test_guid();
+
+        let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024)
+            .await
+            .unwrap();
+
+        // Fill the log partially to get head near the end.
+        // Init entry = 1 sector, data entry = 2 sectors.
+        // After init: head=1. Write 2 data entries: head=1+2+2=5.
+        let page_a = [0xAAu8; SECTOR as usize];
+        let page_b = [0xBBu8; SECTOR as usize];
+        writer
+            .write_entry(
+                &file,
+                &[DataPage {
+                    file_offset: LOGGABLE_OFFSET,
+                    payload: &page_a,
+                }],
+                &[],
+            )
+            .await
+            .unwrap();
+        writer
+            .write_entry(
+                &file,
+                &[DataPage {
+                    file_offset: LOGGABLE_OFFSET + 4096,
+                    payload: &page_b,
+                }],
+                &[],
+            )
+            .await
+            .unwrap();
+
+        // head is now at 5 sectors. Advance the tail past the init entry and
+        // the first data entry (to sector 3) so there is room to keep writing;
+        // the test simulates tail advancement by setting the field directly.
+        writer.tail = 3 * SECTOR;
+        // Free space = 8S - sequence_length(3S, 5S) = 8S - 2S = 6S. The next
+        // entry (2S) plus the reserved sector (1S) needs 3S, so it fits.
+
+        let page_wrap = [0xCCu8; SECTOR as usize];
+        writer
+            .write_entry(
+                &file,
+                &[DataPage {
+                    file_offset: LOGGABLE_OFFSET + 8192,
+                    payload: &page_wrap,
+                }],
+                &[],
+            )
+            .await
+            .unwrap();
+
+        // head is now at 7 sectors. Free = 8S - seq_len(3S, 7S) = 8S - 4S = 4S.
Write one more. + let page_wrap2 = [0xDDu8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + 12288, + payload: &page_wrap2, + }], + &[], + ) + .await + .unwrap(); + + // head should now be at (7+2) % 8 = 1 sector. This entry wrapped! + assert_eq!(writer.head, SECTOR); + + // Replay: the scanner needs to find the sequence starting at tail=3S up to head=1S. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // Verify the wrapped entry was applied. + let mut buf = [0u8; SECTOR as usize]; + file.read_at(LOGGABLE_OFFSET + 8192, &mut buf) + .await + .unwrap(); + assert_eq!(buf, page_wrap); + file.read_at(LOGGABLE_OFFSET + 12288, &mut buf) + .await + .unwrap(); + assert_eq!(buf, page_wrap2); + } + + #[async_test] + async fn replay_highest_lsn_sequence_chosen() { + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + // Write a sequence with lower LSNs. + let mut writer1 = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let page_old = [0x11u8; SECTOR as usize]; + writer1 + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page_old, + }], + &[], + ) + .await + .unwrap(); + + // Now write a second sequence with higher LSNs starting at a different + // position in the log. We'll manually create a writer at a different head. + let new_head = writer1.head + 4 * SECTOR; // skip a gap + let mut writer2 = LogWriter { + region: region.clone(), + tail: new_head, + head: new_head, + sequence_number: 100, // much higher + log_guid: guid, + flushed_file_offset: 4 * 1024 * 1024, + last_file_offset: 4 * 1024 * 1024, + }; + + let page_new = [0x99u8; SECTOR as usize]; + writer2 + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page_new, + }], + &[], + ) + .await + .unwrap(); + + // Replay should pick the sequence with LSN 101 over LSN 1-2. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // The data at LOGGABLE_OFFSET should be from the newer sequence. + let mut buf = [0u8; SECTOR as usize]; + file.read_at(LOGGABLE_OFFSET, &mut buf).await.unwrap(); + assert_eq!(buf, page_new); + } + + #[async_test] + async fn roundtrip_crash_replay() { + // Write entries, "crash" (no cleanup), then replay. + let file = test_file(); + let region = test_region(); + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let page = [0x77u8; SECTOR as usize]; + let target = LOGGABLE_OFFSET + 4096; + writer + .write_entry( + &file, + &[DataPage { + file_offset: target, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + + // Simulating crash — no cleanup. + let _ = writer; + + // Replay should recover the data. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + let mut buf = [0u8; SECTOR as usize]; + file.read_at(target, &mut buf).await.unwrap(); + assert_eq!(buf, page); + } + + #[async_test] + async fn file_extension_on_replay() { + // Start with a small file, write entries referencing a large last_file_offset. 
+        let file = InMemoryFile::new(2 * 1024 * 1024); // 2 MiB
+        let region = test_region();
+        let guid = test_guid();
+
+        let desired_size = 4 * 1024 * 1024u64;
+        let mut writer = LogWriter::initialize(&file, region.clone(), guid, desired_size)
+            .await
+            .unwrap();
+
+        let page = [0xABu8; SECTOR as usize];
+        writer
+            .write_entry(
+                &file,
+                &[DataPage {
+                    file_offset: LOGGABLE_OFFSET,
+                    payload: &page,
+                }],
+                &[],
+            )
+            .await
+            .unwrap();
+
+        // The file might still be 2 MiB.
+        let result = replay_log(&file, &region, guid).await.unwrap();
+        assert!(result.replayed);
+
+        // After replay, the file should be at least desired_size.
+        let sz = file.file_size().await.unwrap();
+        assert!(sz >= desired_size, "expected >= {desired_size}, got {sz}");
+    }
+
+    // -----------------------------------------------------------------------
+    // Tail advancement tests
+    // -----------------------------------------------------------------------
+
+    /// advance_tail reclaims space visible to free_space().
+    #[async_test]
+    async fn advance_tail_reclaims_free_space() {
+        let file = test_file();
+        let region = test_region();
+        let guid = test_guid();
+
+        let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024)
+            .await
+            .unwrap();
+
+        let initial_free = writer.free_space();
+        let page = [0xAAu8; SECTOR as usize];
+
+        // Write an entry — free space decreases.
+        writer
+            .write_entry(
+                &file,
+                &[DataPage {
+                    file_offset: LOGGABLE_OFFSET,
+                    payload: &page,
+                }],
+                &[],
+            )
+            .await
+            .unwrap();
+
+        let after_write = writer.free_space();
+        assert!(after_write < initial_free, "writing should consume space");
+
+        // Advance tail to head — reclaims all space.
+        writer.advance_tail(writer.head);
+
+        // When tail == head, free_space() reports 0 (sequence_length treats
+        // that case as a full log), even though everything was just reclaimed.
+        // write_entry() special-cases tail == head and uses the full region,
+        // so instead of asserting on free_space() here, verify that another
+        // entry can be written.
+        writer
+            .write_entry(
+                &file,
+                &[DataPage {
+                    file_offset: LOGGABLE_OFFSET + 4096,
+                    payload: &page,
+                }],
+                &[],
+            )
+            .await
+            .unwrap();
+
+        // After advancing tail and writing one more, free space should be
+        // close to what it was after just the init entry + one data entry.
+        assert!(
+            writer.free_space() >= after_write,
+            "after advancing tail and writing, free space should be >= previous"
+        );
+    }
+
+    /// Write entries until the log is full, advance tail, write more.
+    ///
+    /// This is the core scenario: without advance_tail, the log fills up
+    /// and write_entry starts returning `None`. With it, space is reclaimed.
+    #[async_test]
+    async fn write_advance_write_more() {
+        let file = test_file();
+        // Use a small log (16 sectors = 64 KiB) to hit the limit quickly.
+        let small_log = 16 * SECTOR;
+        let region = LogRegion {
+            file_offset: TEST_LOG_OFFSET,
+            length: small_log,
+        };
+        let guid = test_guid();
+
+        let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024)
+            .await
+            .unwrap();
+
+        let page = [0xBBu8; SECTOR as usize];
+        let elen = entry_length(1, 0); // 2 sectors per entry
+
+        // Fill the log until we can't write anymore.
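Aside: the fill loop that follows (and the similar loops in `writer_wrap_around` and `incremental_advance_exceeds_log_capacity`) all repeat the same "does the next entry fit?" check. Factored out as a sketch, with a hypothetical helper name and the writer fields used exactly as these tests use them:

```rust
// Illustrative helper only; the tests below inline this logic.
fn can_append(writer: &LogWriter, entry_len: u32) -> bool {
    // The entry itself plus one sector that always stays reserved, so the
    // head can never catch up to the tail from behind.
    let needed = entry_len + SECTOR;
    if writer.tail == writer.head {
        // Everything has been reclaimed; free_space() reports 0 by
        // convention, but write_entry treats the whole region as usable.
        needed <= writer.region.length
    } else {
        needed <= writer.free_space()
    }
}
```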
+ let mut entries_written = 0u32; + loop { + let needed = elen + SECTOR; // entry + 1 reserved + if writer.tail == writer.head { + if needed > writer.region.length { + break; + } + } else if needed > writer.free_space() { + break; + } + + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + (entries_written as u64) * 4096, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + entries_written += 1; + } + + assert!( + entries_written > 0, + "should have written at least one entry" + ); + + // Confirm the log is now full. + let result = writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page, + }], + &[], + ) + .await; + assert!(matches!(result, Ok(None)), "log should be full"); + + // Advance tail past all entries — reclaim everything. + writer.advance_tail(writer.head); + + // Now we should be able to write again. + let mut more_written = 0u32; + loop { + let needed = elen + SECTOR; + if writer.tail == writer.head { + if needed > writer.region.length { + break; + } + } else if needed > writer.free_space() { + break; + } + + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + + ((entries_written + more_written) as u64) * 4096, + payload: &page, + }], + &[], + ) + .await + .unwrap(); + more_written += 1; + } + + assert!( + more_written > 0, + "should write more entries after advancing tail" + ); + } + + /// Incremental tail advancement: advance after each entry, write many + /// more entries than the log can hold without reclamation. + #[async_test] + async fn incremental_advance_exceeds_log_capacity() { + let file = test_file(); + // Tiny log: 8 sectors = 32 KiB. + let small_log = 8 * SECTOR; + let region = LogRegion { + file_offset: TEST_LOG_OFFSET, + length: small_log, + }; + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let page = [0xCCu8; SECTOR as usize]; + + // The log has 8 sectors. Init takes 1. Each data entry takes 2. + // Without advancement, we can fit ~3 entries before full. + // With incremental advancement, we can write indefinitely. + // Write 50 entries — well beyond the log's raw capacity. + for i in 0..50u32 { + let head_before = writer.head; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + (i as u64) * 4096, + payload: &page, + }], + &[], + ) + .await + .unwrap_or_else(|e| panic!("entry {i} failed: {e}")); + // Advance tail to where head was before this entry. + // This simulates "apply completed for the previous entry." + writer.advance_tail(head_before); + } + } + + /// Replay after tail advancement: entries before the advanced tail are + /// not part of the valid sequence, so replay only applies entries from + /// the new tail onward. + #[async_test] + async fn replay_after_tail_advance() { + let file = test_file(); + let small_log = 16 * SECTOR; + let region = LogRegion { + file_offset: TEST_LOG_OFFSET, + length: small_log, + }; + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + // Write entry A at LOGGABLE_OFFSET. + let page_a = [0xAAu8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET, + payload: &page_a, + }], + &[], + ) + .await + .unwrap(); + + let head_after_a = writer.head; + + // Write entry B at LOGGABLE_OFFSET + 4096. 
+ let page_b = [0xBBu8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + 4096, + payload: &page_b, + }], + &[], + ) + .await + .unwrap(); + + // Advance tail past the init entry and entry A. + // The next write_entry will embed this new tail in its header. + writer.advance_tail(head_after_a); + + // Write entry C to embed the new tail. + let page_c = [0xCCu8; SECTOR as usize]; + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + 8192, + payload: &page_c, + }], + &[], + ) + .await + .unwrap(); + + // Zero out the target areas to prove replay writes them. + let zeros = [0u8; SECTOR as usize]; + file.write_at(LOGGABLE_OFFSET, &zeros).await.unwrap(); + file.write_at(LOGGABLE_OFFSET + 4096, &zeros).await.unwrap(); + file.write_at(LOGGABLE_OFFSET + 8192, &zeros).await.unwrap(); + + // Replay. The scanner should find the sequence starting at the + // new tail (head_after_a), which includes entries B and C. + // Entry A is before tail — it may or may not be replayed depending + // on scanner behavior (it's idempotent either way). + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // Entries B and C must be replayed. + let mut buf = [0u8; SECTOR as usize]; + file.read_at(LOGGABLE_OFFSET + 4096, &mut buf) + .await + .unwrap(); + assert_eq!(buf, page_b, "entry B should be replayed"); + file.read_at(LOGGABLE_OFFSET + 8192, &mut buf) + .await + .unwrap(); + assert_eq!(buf, page_c, "entry C should be replayed"); + } + + /// Wrap-around with incremental tail advancement: write enough entries + /// with per-entry advancement to force both head and tail past the + /// circular boundary. + #[async_test] + async fn wrap_around_with_incremental_advance() { + let file = test_file(); + // 8-sector log. Each data entry = 2 sectors. After init (1 sector), + // without advancement we'd fit ~3 entries. With advancement we wrap. + let small_log = 8 * SECTOR; + let region = LogRegion { + file_offset: TEST_LOG_OFFSET, + length: small_log, + }; + let guid = test_guid(); + + let mut writer = LogWriter::initialize(&file, region.clone(), guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let page = [0xDDu8; SECTOR as usize]; + + // Write 20 entries, advancing tail before each write to keep + // only the last entry valid. This forces both head and tail + // to wrap multiple times. + let mut last_head = writer.head; + for i in 0..20u32 { + writer.advance_tail(last_head); + last_head = writer.head; + + writer + .write_entry( + &file, + &[DataPage { + file_offset: LOGGABLE_OFFSET + (i as u64) * 4096, + payload: &page, + }], + &[], + ) + .await + .unwrap_or_else(|e| panic!("entry {i} failed during wrap-around: {e}")); + } + + // Head and tail should both have wrapped past the log boundary. + // With 20 entries of 2 sectors each in an 8-sector log, we've + // gone around 5+ times. + // Verify replay works with the final state. + let result = replay_log(&file, ®ion, guid).await.unwrap(); + assert!(result.replayed); + + // The last entry wrote to LOGGABLE_OFFSET + 19*4096. 
+ let mut buf = [0u8; SECTOR as usize]; + file.read_at(LOGGABLE_OFFSET + 19 * 4096, &mut buf) + .await + .unwrap(); + assert_eq!(buf, page, "last entry should be replayed correctly"); + } +} diff --git a/vm/devices/storage/vhdx/src/log_permits.rs b/vm/devices/storage/vhdx/src/log_permits.rs new file mode 100644 index 0000000000..4bb1c1465d --- /dev/null +++ b/vm/devices/storage/vhdx/src/log_permits.rs @@ -0,0 +1,211 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Failable semaphore for log pipeline backpressure. +//! +//! [`LogPermits`] limits how many pages can be in-flight in the +//! cache → log → apply pipeline at once. This bounds memory +//! consumption: each in-flight page holds an `Arc<[u8; 4096]>` +//! that cannot be freed until the apply task writes it to its +//! final file offset. +//! +//! **Lifecycle of a permit:** +//! 1. Cache acquires a permit before transitioning a page to +//! `HasPermit` / `Dirty`. +//! 2. The permit stays consumed through commit → log → apply. +//! 3. The apply task releases the permit after writing the page +//! to its final offset and flushing. +//! +//! If the log task fails, the semaphore is **poisoned** — all +//! pending and future acquires return an error. + +use crate::error::PipelineFailed; +use event_listener::Event; +use parking_lot::Mutex; + +/// Failable semaphore shared between the cache and the apply task. +/// +/// The cache acquires permits before dirtying pages. The **apply task** +/// releases permits after writing pages to their final file offsets. +/// Do NOT release permits at commit time — that defeats backpressure +/// and allows unbounded in-flight allocations. +/// +/// If the log task fails, it poisons the semaphore — all waiters and +/// future callers get errors. +pub(crate) struct LogPermits { + state: Mutex, + event: Event, + max_permits: usize, +} + +struct PermitState { + available: usize, + failed: Option, +} + +impl LogPermits { + /// Create a new semaphore with `max_in_flight` permits. + pub fn new(max_in_flight: usize) -> Self { + Self { + state: Mutex::new(PermitState { + available: max_in_flight, + failed: None, + }), + event: Event::new(), + max_permits: max_in_flight, + } + } + + /// Acquire `count` permits. + /// + /// Blocks if insufficient permits are available. Returns an error + /// if the semaphore has been poisoned. + pub async fn acquire(&self, count: usize) -> Result<(), PipelineFailed> { + loop { + let listener = self.event.listen(); + { + let mut state = self.state.lock(); + if let Some(ref err) = state.failed { + return Err(PipelineFailed(err.clone())); + } + if state.available >= count { + state.available -= count; + return Ok(()); + } + } + listener.await; + } + } + + /// Release `count` permits back to the pool. + /// + /// Called by the apply task after writing pages to their final offsets. + pub fn release(&self, count: usize) { + { + let mut state = self.state.lock(); + state.available += count; + assert!( + state.available <= self.max_permits, + "released more permits than were acquired: available {} > max {}", + state.available, + self.max_permits, + ); + } + self.event.notify(usize::MAX); + } + + /// Poison the semaphore. All pending and future acquires will fail. + /// + /// Called by the log task on error. + pub fn fail(&self, error: String) { + { + let mut state = self.state.lock(); + state.failed = Some(error); + } + self.event.notify(usize::MAX); + } + + /// Returns the number of currently available permits. 
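Aside: a minimal sketch of the permit lifecycle described in the module docs above. The three pipeline stages are collapsed into one function purely for illustration; in the real pipeline the cache, log task, and apply task share the semaphore via `Arc` and run on separate tasks.

```rust
// Illustrative only; not part of the module above.
async fn one_batch(permits: &LogPermits, pages: usize) -> Result<(), PipelineFailed> {
    // 1. Cache: reserve capacity before transitioning pages to Dirty.
    permits.acquire(pages).await?;

    // 2. The permits stay consumed through commit, log, and apply, which is
    //    what bounds the number of in-flight Arc'd pages.

    // 3. Apply task: release only after the pages reach their final file
    //    offsets (never at commit time, or backpressure is lost).
    permits.release(pages);
    Ok(())
}

// On a fatal error the log task poisons the semaphore instead:
//     permits.fail("log write failed".into());
// after which every pending and future acquire() returns an error.
```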
+ #[cfg(test)] + pub fn available(&self) -> usize { + self.state.lock().available + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pal_async::async_test; + + #[async_test] + async fn acquire_and_release() { + let permits = LogPermits::new(10); + permits.acquire(3).await.unwrap(); + assert_eq!(permits.available(), 7); + permits.release(3); + assert_eq!(permits.available(), 10); + } + + #[async_test] + async fn acquire_exact_capacity() { + let permits = LogPermits::new(5); + permits.acquire(5).await.unwrap(); + assert_eq!(permits.available(), 0); + permits.release(5); + assert_eq!(permits.available(), 5); + } + + #[async_test] + async fn acquire_blocks_then_unblocks() { + let permits = std::sync::Arc::new(LogPermits::new(2)); + permits.acquire(2).await.unwrap(); + assert_eq!(permits.available(), 0); + + let p = permits.clone(); + let (acquired_tx, acquired_rx) = mesh::oneshot(); + let handle = std::thread::spawn(move || { + futures::executor::block_on(async { + p.acquire(1).await.unwrap(); + acquired_tx.send(()); + }); + }); + + // Give the thread time to block on acquire. + std::thread::sleep(std::time::Duration::from_millis(50)); + // Should still be blocked (0 available). + assert_eq!(permits.available(), 0); + + // Release one permit — unblocks the waiter. + permits.release(1); + acquired_rx.await.unwrap(); + handle.join().unwrap(); + } + + #[async_test] + async fn poison_fails_pending_acquire() { + let permits = std::sync::Arc::new(LogPermits::new(0)); + + let p = permits.clone(); + let handle = std::thread::spawn(move || { + futures::executor::block_on(async { + let result = p.acquire(1).await; + assert!(result.is_err()); + }); + }); + + // Give the thread time to block. + std::thread::sleep(std::time::Duration::from_millis(50)); + permits.fail("log write failed".into()); + handle.join().unwrap(); + } + + #[async_test] + async fn poison_fails_future_acquire() { + let permits = LogPermits::new(10); + permits.fail("log write failed".into()); + let result = permits.acquire(1).await; + assert!(result.is_err()); + } + + #[async_test] + async fn release_after_poison_is_harmless() { + let permits = LogPermits::new(5); + permits.acquire(3).await.unwrap(); + permits.fail("oops".into()); + // Release after poison doesn't panic. + permits.release(3); + // But acquire still fails. + assert!(permits.acquire(1).await.is_err()); + } + + #[async_test] + async fn multiple_acquires_serialize() { + let permits = LogPermits::new(3); + permits.acquire(2).await.unwrap(); + permits.acquire(1).await.unwrap(); + assert_eq!(permits.available(), 0); + permits.release(1); + permits.acquire(1).await.unwrap(); + assert_eq!(permits.available(), 0); + } +} diff --git a/vm/devices/storage/vhdx/src/log_task.rs b/vm/devices/storage/vhdx/src/log_task.rs new file mode 100644 index 0000000000..8e109dcb6f --- /dev/null +++ b/vm/devices/storage/vhdx/src/log_task.rs @@ -0,0 +1,727 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Log task — a single async task that owns all log state and provides +//! crash-consistent metadata persistence. +//! +//! The log task receives [`LogRequest`] messages via a `mesh` channel. +//! [`LogRequest::Commit`] is fire-and-forget: the cache sends a batch +//! of dirty pages and moves on. The log task writes WAL entries, +//! releases permits, and publishes `logged_through_lsn`. +//! +//! After logging a batch, the log task sends it to the +//! [apply task](crate::apply_task) for writing to final file offsets. +//! 
The apply task publishes `applied_through_lsn`, which the log task +//! reads to advance its tail. +//! +//! # Crash Consistency +//! +//! Metadata changes (BAT entries, sector bitmap bits) are journaled before +//! being committed to their final locations. On crash, +//! [`replay_log()`](crate::log::replay_log) restores them. + +use crate::AsyncFile; +use crate::apply_task::ApplyBatch; +use crate::error::PipelineFailed; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::flush::FlushSequencer; +use crate::flush::Fsn; +use crate::format::LOG_SECTOR_SIZE; +use crate::log::DataPage; +use crate::log::LogWriter; +use crate::log_permits::LogPermits; +use crate::lsn_watermark::LsnWatermark; +use crate::open::FailureFlag; +use mesh::rpc::Rpc; +use std::collections::VecDeque; +use std::sync::Arc; +use thiserror::Error; + +const LOG_DATA_PAGE_SIZE: usize = LOG_SECTOR_SIZE as usize; + +/// Internal error type for the log task. +/// +/// Captures the three failure modes of the log pipeline without +/// pulling in the public [`VhdxIoError`]. Converted to [`VhdxIoError`] +/// only at the [`LogClient::close`] boundary. +#[derive(Debug, Error)] +pub(crate) enum LogTaskError { + /// An I/O error from WAL writes or flushes. + #[error("flush error")] + Flush(#[source] std::io::Error), + /// The apply task (or another pipeline stage) has failed. + #[error("pipeline failed")] + PipelineFailed(#[source] PipelineFailed), + #[error("failed to write log entry")] + Write(#[source] std::io::Error), + #[error("log transaction too big ({0} pages)")] + TransactionTooBig(usize), +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub(crate) struct Lsn(u64); + +impl Lsn { + pub const ZERO: Lsn = Lsn(0); + + #[cfg(test)] + pub(crate) const fn new(value: u64) -> Self { + Self(value) + } +} + +/// A request to the log task. +pub(crate) enum LogRequest { + /// Log a batch of dirty pages (fire-and-forget). + Commit(Transaction), + + /// Graceful shutdown: log all pending, wait for apply, clear log GUID. + Close(Rpc<(), Result<(), LogTaskError>>), +} + +/// Committed data at a log-data-page-aligned file offset. +pub(crate) struct LogData { + /// File offset where this data should ultimately be written. + file_offset: u64, + /// Log-data-page-aligned data (shared with the cache via Arc COW). + data: Arc, +} + +impl> LogData { + pub(crate) fn new(file_offset: u64, data: Arc) -> Self { + let len = data.as_ref().as_ref().len(); + assert_ne!(len, 0); + assert!( + file_offset.is_multiple_of(LOG_DATA_PAGE_SIZE as u64), + "committed data offset {file_offset:#x} is not {LOG_DATA_PAGE_SIZE}-byte aligned", + ); + assert!( + len.is_multiple_of(LOG_DATA_PAGE_SIZE), + "committed data length {len} is not {LOG_DATA_PAGE_SIZE}-byte aligned" + ); + Self { file_offset, data } + } + + pub(crate) fn page_count(&self) -> usize { + self.data.as_ref().as_ref().len() / LOG_DATA_PAGE_SIZE + } + + #[cfg(test)] + pub(crate) fn data(&self) -> &[u8] { + self.data.as_ref().as_ref() + } + + pub(crate) fn into_parts(self) -> (u64, Arc) { + (self.file_offset, self.data) + } +} + +/// A batch of dirty pages to be logged atomically. +pub(crate) struct Transaction { + /// The LSN assigned by the cache at commit time. + pub lsn: Lsn, + /// The data in this batch. + pub data: Vec>, + /// If set, the log task must wait for this FSN to complete before + /// writing the WAL entry. + pub pre_log_fsn: Option, +} + +/// Client-side handle for sending transactions to the log task. 
+/// +/// Couples the `Sender` with the LSN counter so that +/// LSN assignment and channel send are always atomic. All methods +/// take `&mut self` — the caller (cache's `PageMap` lock) provides +/// exclusivity. +pub(crate) struct LogClient { + sender: mesh::Sender>, + current_lsn: Lsn, +} + +impl LogClient { + /// Create a new log client wrapping the given sender. + pub fn new(sender: mesh::Sender>) -> Self { + Self { + sender, + current_lsn: Lsn::ZERO, + } + } + + /// Returns the most recently committed LSN (0 if none). + pub fn current_lsn(&self) -> Lsn { + self.current_lsn + } + + /// Begin a new transaction. The returned [`LogTransaction`] borrows + /// `self` mutably, preventing interleaved transactions. + /// + /// The LSN is not assigned until [`LogTransaction::commit()`] is + /// called. Dropping the transaction without committing is a no-op. + pub fn begin(&mut self) -> LogTransaction<'_, B> { + LogTransaction { client: self } + } + + /// Send a graceful close request to the log task and wait for + /// it to finish processing all pending batches. + /// + /// Consumes the client (drops the sender after the RPC completes). + pub async fn close(self) -> Result<(), VhdxIoError> { + use mesh::rpc::RpcSend; + self.sender + .call(LogRequest::Close, ()) + .await + .map_err(|_| VhdxIoErrorInner::Failed(PipelineFailed("log task closed".into())))? + .map_err(VhdxIoErrorInner::LogClose)?; + Ok(()) + } +} + +/// An in-progress log transaction. Borrows the [`LogClient`] mutably +/// to prevent interleaved sends. +/// +/// Call [`commit()`](Self::commit) to assign an LSN and send the +/// transaction to the log task. Dropping without committing is safe +/// and does not advance the LSN. +pub(crate) struct LogTransaction<'a, B> { + client: &'a mut LogClient, +} + +impl LogTransaction<'_, B> { + /// The LSN that will be assigned if this transaction is committed. + pub fn lsn(&self) -> Lsn { + Lsn(self.client.current_lsn.0 + 1) + } + + /// Commit the transaction: assign the next LSN and send it to the + /// log task. Consumes the transaction. + pub fn commit(self, log_data: Vec>, pre_log_fsn: Option) -> Lsn { + self.client.current_lsn.0 += 1; + let lsn = self.client.current_lsn; + self.client.sender.send(LogRequest::Commit(Transaction { + lsn, + data: log_data, + pre_log_fsn, + })); + lsn + } +} +/// Tracks a batch that has been sent to the applier but whose tail +/// hasn't been advanced yet. +struct PendingTail { + /// The LSN of the batch. Once `applied_lsn >= lsn`, the tail + /// can advance to `new_tail`. + lsn: Lsn, + /// The log-region offset to advance the tail to. + new_tail: u32, +} + +/// All mutable state owned by the log task. +pub(crate) struct LogTask { + file: Arc, + log_writer: LogWriter, + flush_sequencer: Arc, + log_permits: Arc, + logged_lsn: Arc, + applied_lsn: Arc, + apply_tx: mesh::Sender>, + pending_tails: VecDeque, + failure_flag: Arc, +} + +impl LogTask { + /// Create a new log task with the given dependencies. + pub(crate) fn new( + file: Arc, + log_writer: LogWriter, + flush_sequencer: Arc, + log_permits: Arc, + logged_lsn: Arc, + applied_lsn: Arc, + apply_tx: mesh::Sender>, + failure_flag: Arc, + ) -> Self { + Self { + file, + log_writer, + flush_sequencer, + log_permits, + logged_lsn, + applied_lsn, + apply_tx, + pending_tails: VecDeque::new(), + failure_flag, + } + } + + /// Run the log task main loop. + /// + /// Consumes requests from `rx` until a `Close` request is received + /// or the channel is dropped. 
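Aside: putting `LogClient`, `LogTransaction`, and `LogData` together, a cache-side commit against this API looks roughly like the following. The `Vec<u8>` page buffer type and the `dirty` input are assumptions for illustration (the tests further down use the same buffer type); the mesh channel and the running `LogTask` are assumed to already exist.

```rust
// Illustrative only; a sketch of the caller side, not code from this change.
use std::sync::Arc;

fn commit_dirty_pages(
    client: &mut LogClient<Vec<u8>>,
    dirty: Vec<(u64, Arc<Vec<u8>>)>, // (final file offset, 4 KiB-aligned payload)
) -> Lsn {
    // begin() borrows the client mutably, so LSN assignment and the channel
    // send cannot interleave with another transaction.
    let txn = client.begin();
    let batch: Vec<_> = dirty
        .into_iter()
        .map(|(offset, data)| LogData::new(offset, data))
        .collect();
    // No pre-log flush constraint (pre_log_fsn = None) in this sketch.
    txn.commit(batch, None)
}
```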
+ pub async fn run(mut self, mut rx: mesh::Receiver>) { + loop { + self.advance_tails(); + + let request = match rx.recv().await { + Ok(req) => req, + Err(_) => { + tracing::warn!("VHDX log task: channel closed without close() — file is dirty"); + break; + } + }; + + match request { + LogRequest::::Commit(txn) => { + if let Err(e) = self.handle_commit(txn).await { + tracing::error!("VHDX log task fatal error: {e}"); + self.log_permits.fail(e.to_string()); + self.logged_lsn.fail(e.to_string()); + self.failure_flag.set(&e); + break; + } + } + LogRequest::::Close(rpc) => { + rpc.handle(async |()| self.graceful_close().await).await; + break; + } + } + } + } + + /// Advance the log tail for all batches whose applied data has + /// been flushed (i.e., `applied_fsn <= completed_fsn`). + fn advance_tails(&mut self) { + let flushed_fsn = self.flush_sequencer.completed_fsn(); + let (applied, applied_fsn) = self.applied_lsn.get_with_fsn(); + while let Some(front) = self.pending_tails.front() { + if front.lsn <= applied && applied_fsn <= flushed_fsn { + self.log_writer.advance_tail(front.new_tail); + self.pending_tails.pop_front(); + } else { + break; + } + } + } + + /// Flush applied data and advance tails. Used when the log is full + /// and we need to reclaim space. + async fn flush_and_advance_tails(&mut self) -> Result<(), LogTaskError> { + if let Some(front) = self.pending_tails.front() { + let target = front.lsn; + let applied_fsn = self + .applied_lsn + .wait_for(target) + .await + .map_err(LogTaskError::PipelineFailed)?; + self.flush_sequencer + .flush_through(self.file.as_ref(), applied_fsn) + .await + .map_err(LogTaskError::Flush)?; + self.advance_tails(); + } + Ok(()) + } + + /// Write a WAL entry for the given pages (no flush). + /// + /// Returns `Ok(true)` if the entry was written, `Ok(false)` if the + /// log is full (caller should drain and retry), or `Err` on I/O error. + async fn write_log_entry( + &mut self, + pages: &[LogData], + ) -> Result { + let page_count = pages.iter().map(LogData::page_count).sum(); + let mut data_pages = Vec::with_capacity(page_count); + for p in pages { + for (i, payload) in p.data.as_ref().as_ref().as_chunks().0.iter().enumerate() { + data_pages.push(DataPage { + file_offset: p.file_offset + (i * LOG_DATA_PAGE_SIZE) as u64, + payload, + }); + } + } + + Ok(self + .log_writer + .write_entry(self.file.as_ref(), &data_pages, &[]) + .await + .map_err(LogTaskError::Write)? + .is_some()) + } + + /// Handle a Commit request: write WAL entry, publish LSN, send batch + /// to applier. If the log is full, flushes applied data and retries. + /// + /// Returns `Err` on any fatal error. The caller (`run`) poisons + /// the permits and watermarks — individual methods don't. + async fn handle_commit(&mut self, txn: Transaction) -> Result<(), LogTaskError> { + let lsn = txn.lsn; + + // Ensure pre_log_fsn constraint is met before logging. + if let Some(fsn) = txn.pre_log_fsn { + self.flush_sequencer + .flush_through(self.file.as_ref(), fsn) + .await + .map_err(LogTaskError::Flush)?; + } + + // Write WAL entry, retrying if the log is full. + while !self.write_log_entry(&txn.data).await? { + if self.pending_tails.is_empty() { + return Err(LogTaskError::TransactionTooBig( + txn.data.iter().map(LogData::page_count).sum(), + )); + } + self.flush_and_advance_tails().await?; + } + + // Capture FSN after the WAL write. Flushing through this FSN + // makes the WAL entry durable. 
We don't flush here — + // VhdxFile::flush() will do it, or the LogFull path will if + // space is needed. + let wal_fsn = self.flush_sequencer.current_fsn(); + self.logged_lsn.advance(lsn, wal_fsn); + + let new_tail = self.log_writer.head(); + + // Send to applier for background apply. + self.apply_tx.send(ApplyBatch { + data: txn.data, + lsn, + }); + + self.pending_tails.push_back(PendingTail { lsn, new_tail }); + Ok(()) + } + + /// Graceful close: wait for all applies, flush, advance tails. + /// + /// After this returns, the log region is fully drained. The caller + /// is responsible for clearing the log GUID in the header. + async fn graceful_close(&mut self) -> Result<(), LogTaskError> { + // Wait for all pending applies and flush. + if let Some(last) = self.pending_tails.back() { + let target_lsn = last.lsn; + let applied_fsn = self + .applied_lsn + .wait_for(target_lsn) + .await + .map_err(LogTaskError::PipelineFailed)?; + self.flush_sequencer + .flush_through(self.file.as_ref(), applied_fsn) + .await + .map_err(LogTaskError::Flush)?; + } + + // Advance all tails — data is durable at final offsets. + for pt in self.pending_tails.drain(..) { + self.log_writer.advance_tail(pt.new_tail); + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::apply_task; + use crate::log::LogRegion; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + use pal_async::task::Spawn; + + const LOG_SIZE: u32 = 64 * 4096; // 256 KiB — deliberately small + const LOG_OFFSET: u64 = 1024 * 1024; // 1 MiB into the file + + /// Set up a log task + apply task connected via channels. + /// Returns (log_tx, file, permits, logged_lsn, applied_lsn, + /// log_task_handle, apply_task_handle). + async fn setup_pipeline( + driver: &pal_async::DefaultDriver, + log_size: u32, + permit_count: usize, + ) -> ( + mesh::Sender>>, + Arc, + Arc, + Arc, + Arc, + pal_async::task::Task<()>, + pal_async::task::Task<()>, + ) { + let file = Arc::new(InMemoryFile::new(4 * 1024 * 1024)); + setup_pipeline_with_file(driver, file, log_size, permit_count).await + } + + /// Like `setup_pipeline`, but with a caller-provided file. + async fn setup_pipeline_with_file( + driver: &pal_async::DefaultDriver, + file: Arc, + log_size: u32, + permit_count: usize, + ) -> ( + mesh::Sender>>, + Arc, + Arc, + Arc, + Arc, + pal_async::task::Task<()>, + pal_async::task::Task<()>, + ) { + let region = LogRegion { + file_offset: LOG_OFFSET, + length: log_size, + }; + let guid = guid::Guid::new_random(); + let log_writer = LogWriter::initialize(file.as_ref(), region, guid, 4 * 1024 * 1024) + .await + .unwrap(); + + let flush_sequencer = Arc::new(FlushSequencer::new()); + let log_permits = Arc::new(LogPermits::new(permit_count)); + let logged_lsn = Arc::new(LsnWatermark::new()); + let applied_lsn = Arc::new(LsnWatermark::new()); + + let failure_flag = Arc::new(FailureFlag::new()); + + let (apply_tx, apply_rx) = mesh::channel::>>(); + let (log_tx, log_rx) = mesh::channel::>>(); + + // Spawn apply task. + let apply_task = driver.spawn( + "test-apply", + apply_task::run_apply_task( + apply_rx, + file.clone(), + flush_sequencer.clone(), + applied_lsn.clone(), + log_permits.clone(), + failure_flag.clone(), + ), + ); + + // Spawn log task. 
+ let log_task = driver.spawn( + "test-log", + LogTask::new( + file.clone(), + log_writer, + flush_sequencer, + log_permits.clone(), + logged_lsn.clone(), + applied_lsn.clone(), + apply_tx, + failure_flag, + ) + .run(log_rx), + ); + + ( + log_tx, + file, + log_permits, + logged_lsn, + applied_lsn, + log_task, + apply_task, + ) + } + + /// Build a Transaction with `n` fake pages. + fn make_txn(lsn: Lsn, n: usize) -> Transaction> { + let pages = (0..n) + .map(|i| { + LogData::new( + (2 * 1024 * 1024 + i * LOG_DATA_PAGE_SIZE) as u64, + Arc::new(vec![lsn.0 as u8; LOG_DATA_PAGE_SIZE]), + ) + }) + .collect(); + Transaction { + lsn, + data: pages, + pre_log_fsn: None, + } + } + + /// Acquire permits and send a commit. Mirrors what the cache does: + /// acquire permits for each page, then commit (which sends the + /// transaction to the log task). + async fn send_commit( + tx: &mesh::Sender>>, + permits: &LogPermits, + lsn: Lsn, + page_count: usize, + ) { + permits.acquire(page_count).await.unwrap(); + tx.send(LogRequest::Commit(make_txn(lsn, page_count))); + } + + #[async_test] + async fn single_commit_publishes_lsn(driver: pal_async::DefaultDriver) { + let (tx, _file, permits, logged_lsn, _applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, 100).await; + + send_commit(&tx, &permits, Lsn(1), 1).await; + logged_lsn.wait_for(Lsn(1)).await.unwrap(); + } + + #[async_test] + async fn permits_return_after_apply(driver: pal_async::DefaultDriver) { + let permit_count = 10; + let (tx, _file, permits, logged_lsn, applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, permit_count).await; + + // Send a commit of 5 pages (acquires 5 permits). + send_commit(&tx, &permits, Lsn(1), 5).await; + + // Wait for the apply task to finish. + logged_lsn.wait_for(Lsn(1)).await.unwrap(); + applied_lsn.wait_for(Lsn(1)).await.unwrap(); + + // The apply task should have released 5 permits. + // All 10 should be available again. + assert_eq!(permits.available(), permit_count); + } + + #[async_test] + async fn multiple_commits_sequential(driver: pal_async::DefaultDriver) { + let (tx, _file, permits, logged_lsn, _applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, 100).await; + + for lsn in 1..=10u64 { + send_commit(&tx, &permits, Lsn(lsn), 1).await; + } + + // All 10 should be logged. + logged_lsn.wait_for(Lsn(10)).await.unwrap(); + } + + #[async_test] + async fn log_full_retry_makes_progress(driver: pal_async::DefaultDriver) { + // Use a small log (256 KiB). Each page + entry overhead ~ 8 KiB. + // With ~30 entries the log will fill up, forcing the retry path. + let (tx, _file, permits, logged_lsn, _applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, 500).await; + + // Send 50 single-page commits. This will exceed the 256 KiB log + // and force LogFull → wait for apply → advance tail → retry. + for lsn in 1..=50u64 { + send_commit(&tx, &permits, Lsn(lsn), 1).await; + } + + // If LogFull retry works, all 50 will eventually be logged. + logged_lsn.wait_for(Lsn(50)).await.unwrap(); + } + + #[async_test] + async fn large_batches_through_small_log(driver: pal_async::DefaultDriver) { + // Each batch has 5 pages (~24 KiB with overhead). 256 KiB log + // fits maybe 10 batches. Send 30 — forces multiple cycles of + // LogFull → drain → retry. 
+ let (tx, _file, permits, logged_lsn, _applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, 500).await; + + for lsn in 1..=30u64 { + send_commit(&tx, &permits, Lsn(lsn), 5).await; + } + + logged_lsn.wait_for(Lsn(30)).await.unwrap(); + } + + #[async_test] + async fn close_after_commits(driver: pal_async::DefaultDriver) { + use mesh::rpc::RpcSend; + + let (tx, _file, permits, logged_lsn, applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, 100).await; + + for lsn in 1..=5u64 { + send_commit(&tx, &permits, Lsn(lsn), 1).await; + } + logged_lsn.wait_for(Lsn(5)).await.unwrap(); + + // Graceful close should wait for all applies and succeed. + let result = tx.call(LogRequest::>::Close, ()).await.unwrap(); + result.unwrap(); + + // All commits should be applied. + assert!(applied_lsn.get() >= Lsn(5)); + } + + #[async_test] + async fn applied_data_is_at_final_offset(driver: pal_async::DefaultDriver) { + let (tx, file, permits, logged_lsn, applied_lsn, _log_task, _apply_task) = + setup_pipeline(&driver, LOG_SIZE, 100).await; + + let target_offset: u64 = 2 * 1024 * 1024; // 2 MiB + let data = Arc::new(vec![0xAB_u8; LOG_DATA_PAGE_SIZE]); + permits.acquire(1).await.unwrap(); + tx.send(LogRequest::Commit(Transaction { + lsn: Lsn(1), + data: vec![LogData::new(target_offset, data.clone())], + pre_log_fsn: None, + })); + + logged_lsn.wait_for(Lsn(1)).await.unwrap(); + applied_lsn.wait_for(Lsn(1)).await.unwrap(); + + // Read back from the final offset — should match. + let mut buf = [0u8; LOG_DATA_PAGE_SIZE]; + file.read_at(target_offset, &mut buf).await.unwrap(); + assert!(buf.iter().all(|&b| b == 0xAB)); + } + + #[async_test] + async fn apply_write_failure_poisons_pipeline(driver: pal_async::DefaultDriver) { + use crate::tests::support::IoInterceptor; + + // Interceptor that fails writes only outside the log region + // (i.e., apply writes to final offsets), not WAL writes. + struct FailApplyInterceptor { + fail: std::sync::atomic::AtomicBool, + } + impl IoInterceptor for FailApplyInterceptor { + fn before_write(&self, offset: u64, _data: &[u8]) -> Result<(), std::io::Error> { + // Log region is at LOG_OFFSET (1 MiB). Apply writes go + // to 2 MiB+. Only fail writes outside the log region. + if self.fail.load(std::sync::atomic::Ordering::Relaxed) && offset >= 2 * 1024 * 1024 + { + return Err(std::io::Error::other("injected apply write failure")); + } + Ok(()) + } + } + + let interceptor = Arc::new(FailApplyInterceptor { + fail: std::sync::atomic::AtomicBool::new(false), + }); + let file = Arc::new(InMemoryFile::with_interceptor( + 4 * 1024 * 1024, + interceptor.clone() as Arc, + )); + + let (tx, _file, permits, logged_lsn, _applied_lsn, _log_task, _apply_task) = + setup_pipeline_with_file(&driver, file, LOG_SIZE, 100).await; + + // First commit succeeds end-to-end. + send_commit(&tx, &permits, Lsn(1), 1).await; + logged_lsn.wait_for(Lsn(1)).await.unwrap(); + + // Now fail apply writes (but not WAL writes). + interceptor + .fail + .store(true, std::sync::atomic::Ordering::Relaxed); + + // Second commit: WAL write succeeds, but apply write will fail. + send_commit(&tx, &permits, Lsn(2), 1).await; + logged_lsn.wait_for(Lsn(2)).await.unwrap(); + + // The apply task should have poisoned permits after the write failure. + // Future permit acquires must fail. 
+ let result = permits.acquire(1).await; + assert!(result.is_err(), "acquire should fail after apply error"); + } +} diff --git a/vm/devices/storage/vhdx/src/lsn_watermark.rs b/vm/devices/storage/vhdx/src/lsn_watermark.rs new file mode 100644 index 0000000000..8d15f43e39 --- /dev/null +++ b/vm/devices/storage/vhdx/src/lsn_watermark.rs @@ -0,0 +1,224 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! LSN watermark — a shared monotonic counter with async waiters. +//! +//! Used to publish progress through the log/apply pipeline: +//! +//! - `logged_lsn`: the log task updates this after writing each WAL +//! entry. The paired FSN lets callers flush through the sequencer +//! to make the WAL entry durable. +//! - `applied_lsn`: the apply task updates this after writing pages +//! to their final offsets. The paired FSN lets the log task flush +//! to make the applied data durable (needed for tail advancement). +//! +//! Both watermarks carry an `(lsn, fsn)` pair. The LSN tracks progress; +//! the FSN tells consumers which flush sequence number to +//! [`flush_through()`](crate::flush::FlushSequencer::flush_through) +//! to make that progress durable on disk. + +use crate::{error::PipelineFailed, flush::Fsn, log_task::Lsn}; +use event_listener::Event; +use parking_lot::Mutex; + +/// A shared monotonic `(lsn, fsn)` counter with async waiting and poisoning. +/// +/// Writers publish new values via [`advance()`](Self::advance). +/// Readers wait for the LSN to reach a target via +/// [`wait_for()`](Self::wait_for), which returns the associated FSN. +/// +/// If the producer fails, it calls [`fail()`](Self::fail) to poison the +/// watermark — all pending and future [`wait_for()`](Self::wait_for) calls +/// return an error. +pub(crate) struct LsnWatermark { + state: Mutex, + event: Event, +} + +struct WatermarkState { + lsn: Lsn, + fsn: Fsn, + failed: Option, +} + +impl LsnWatermark { + /// Create a new watermark starting at LSN 0, FSN 0. + pub fn new() -> Self { + Self { + state: Mutex::new(WatermarkState { + lsn: Lsn::ZERO, + fsn: Fsn::ZERO, + failed: None, + }), + event: Event::new(), + } + } + + /// Read the current LSN value. + pub fn get(&self) -> Lsn { + self.state.lock().lsn + } + + /// Read the current `(lsn, fsn)` pair atomically. + pub fn get_with_fsn(&self) -> (Lsn, Fsn) { + let s = self.state.lock(); + (s.lsn, s.fsn) + } + + /// Advance the watermark to `(new_lsn, new_fsn)`. + /// + /// Both values are advanced independently via `max()`. Callers are + /// sequential task loops, so in practice LSN and FSN always advance + /// together. + pub fn advance(&self, new_lsn: Lsn, new_fsn: Fsn) { + { + let mut s = self.state.lock(); + s.lsn = s.lsn.max(new_lsn); + s.fsn = s.fsn.max(new_fsn); + } + self.event.notify(usize::MAX); + } + + /// Wait until the LSN reaches at least `target`. + /// + /// Returns the FSN associated with the reached LSN. Callers should + /// [`flush_through()`](crate::flush::FlushSequencer::flush_through) + /// the returned FSN to ensure durability. + /// + /// Returns an error if the watermark has been poisoned. + pub async fn wait_for(&self, target: Lsn) -> Result { + loop { + let listener = self.event.listen(); + { + let s = self.state.lock(); + if let Some(ref err) = s.failed { + return Err(PipelineFailed(err.clone())); + } + if s.lsn >= target { + return Ok(s.fsn); + } + } + listener.await; + } + } + + /// Poison the watermark. All pending and future `wait_for()` calls + /// will return an error. 
+ pub fn fail(&self, error: String) { + self.state.lock().failed = Some(error); + self.event.notify(usize::MAX); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pal_async::async_test; + + #[async_test] + async fn starts_at_zero() { + let wm = LsnWatermark::new(); + assert_eq!(wm.get(), Lsn::ZERO); + assert_eq!(wm.get_with_fsn(), (Lsn::ZERO, Fsn::ZERO)); + } + + #[async_test] + async fn advance_and_read() { + let wm = LsnWatermark::new(); + wm.advance(Lsn::new(5), Fsn::new(100)); + assert_eq!(wm.get(), Lsn::new(5)); + assert_eq!(wm.get_with_fsn(), (Lsn::new(5), Fsn::new(100))); + wm.advance(Lsn::new(10), Fsn::new(200)); + assert_eq!(wm.get(), Lsn::new(10)); + assert_eq!(wm.get_with_fsn(), (Lsn::new(10), Fsn::new(200))); + } + + #[async_test] + async fn advance_is_monotonic() { + let wm = LsnWatermark::new(); + wm.advance(Lsn::new(10), Fsn::new(200)); + wm.advance(Lsn::new(5), Fsn::new(100)); // no-op (both LSN and FSN stay at max) + assert_eq!(wm.get(), Lsn::new(10)); + assert_eq!(wm.get_with_fsn(), (Lsn::new(10), Fsn::new(200))); + } + + #[async_test] + async fn wait_for_already_reached() { + let wm = LsnWatermark::new(); + wm.advance(Lsn::new(10), Fsn::new(100)); + let fsn = wm.wait_for(Lsn::new(5)).await.unwrap(); + assert_eq!(fsn, Fsn::new(100)); + let fsn = wm.wait_for(Lsn::new(10)).await.unwrap(); + assert_eq!(fsn, Fsn::new(100)); + } + + #[async_test] + async fn wait_for_returns_fsn() { + let wm = LsnWatermark::new(); + wm.advance(Lsn::new(5), Fsn::new(42)); + let fsn = wm.wait_for(Lsn::new(5)).await.unwrap(); + assert_eq!(fsn, Fsn::new(42)); + } + + #[async_test] + async fn wait_for_blocks_then_completes() { + let wm = std::sync::Arc::new(LsnWatermark::new()); + + let w = wm.clone(); + let (done_tx, done_rx) = mesh::oneshot(); + let handle = std::thread::spawn(move || { + futures::executor::block_on(async { + let fsn = w.wait_for(Lsn::new(5)).await.unwrap(); + done_tx.send(fsn); + }); + }); + + std::thread::sleep(std::time::Duration::from_millis(50)); + wm.advance(Lsn::new(5), Fsn::new(77)); + let fsn = done_rx.await.unwrap(); + assert_eq!(fsn, Fsn::new(77)); + handle.join().unwrap(); + } + + #[async_test] + async fn wait_for_zero_returns_immediately() { + let wm = LsnWatermark::new(); + let fsn = wm.wait_for(Lsn::ZERO).await.unwrap(); + assert_eq!(fsn, Fsn::ZERO); + } + + #[async_test] + async fn poison_fails_future_wait() { + let wm = LsnWatermark::new(); + wm.fail("broken".into()); + assert!(wm.wait_for(Lsn::new(1)).await.is_err()); + } + + #[async_test] + async fn poison_fails_pending_wait() { + let wm = std::sync::Arc::new(LsnWatermark::new()); + + let w = wm.clone(); + let (done_tx, done_rx) = mesh::oneshot(); + let handle = std::thread::spawn(move || { + futures::executor::block_on(async { + let result = w.wait_for(Lsn::new(5)).await; + assert!(result.is_err()); + done_tx.send(()); + }); + }); + + std::thread::sleep(std::time::Duration::from_millis(50)); + wm.fail("task died".into()); + done_rx.await.unwrap(); + handle.join().unwrap(); + } + + #[async_test] + async fn poison_fails_even_for_already_reached() { + let wm = LsnWatermark::new(); + wm.advance(Lsn::new(10), Fsn::new(100)); + wm.fail("broken".into()); + assert!(wm.wait_for(Lsn::new(5)).await.is_err()); + } +} diff --git a/vm/devices/storage/vhdx/src/metadata.rs b/vm/devices/storage/vhdx/src/metadata.rs new file mode 100644 index 0000000000..b7e58d8755 --- /dev/null +++ b/vm/devices/storage/vhdx/src/metadata.rs @@ -0,0 +1,477 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
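A minimal usage sketch of the `LsnWatermark` added above in `lsn_watermark.rs`: the producer advances the watermark, the consumer waits for a target LSN and receives the FSN it would flush through. This is illustrative only; it assumes the crate-internal types (`LsnWatermark`, `Lsn`, `Fsn`, `PipelineFailed`) are in scope and uses only methods exercised by the module's own tests.

```rust
async fn watermark_sketch() -> Result<(), PipelineFailed> {
    // In the real pipeline this lives in an Arc shared between tasks.
    let logged = LsnWatermark::new();

    // Producer side (the log task): publish progress after writing the WAL
    // entry for LSN 7, whose durability is gated on flush sequence number 42.
    logged.advance(Lsn::new(7), Fsn::new(42));

    // Consumer side (e.g. flush()): wait for the LSN, then use the returned
    // FSN to flush through the sequencer and make the entry durable.
    let fsn = logged.wait_for(Lsn::new(7)).await?;
    assert_eq!(fsn, Fsn::new(42));

    // On a fatal producer error the watermark is poisoned, and every pending
    // or future wait_for() returns PipelineFailed.
    logged.fail("log task died".into());
    assert!(logged.wait_for(Lsn::new(8)).await.is_err());
    Ok(())
}
```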
+
+//! Metadata table parsing and item reading for VHDX files.
+//!
+//! Reads the metadata table from the metadata region, validates entries,
+//! and provides lookup and raw-read access for individual metadata items.
+
+use crate::AsyncFile;
+use crate::cache::PAGE_SIZE;
+use crate::cache::PageCache;
+use crate::cache::PageKey;
+use crate::error::CorruptionType;
+use crate::error::OpenError;
+use crate::error::OpenErrorInner;
+use crate::format;
+use crate::format::MetadataTableEntry;
+use crate::format::MetadataTableHeader;
+use guid::Guid;
+use zerocopy::FromBytes;
+
+/// Cache tag for metadata region pages.
+pub(crate) const METADATA_TAG: u8 = 1;
+
+/// In-memory representation of the validated metadata table.
+pub(crate) struct MetadataTable {
+    /// The validated entries (sorted by offset for free-space scanning).
+    entries: Vec<MetadataTableEntry>,
+}
+
+impl MetadataTable {
+    /// Read and validate the metadata table from the file.
+    pub async fn read(
+        file: &impl AsyncFile,
+        metadata_offset: u64,
+        metadata_length: u32,
+    ) -> Result<Self, OpenError> {
+        // Read the metadata table (first 64 KiB of the metadata region).
+        let buf = file.alloc_buffer(format::METADATA_TABLE_SIZE as usize);
+        let buf = file
+            .read_into(metadata_offset, buf)
+            .await
+            .map_err(OpenErrorInner::Io)?;
+        let buf = buf.as_ref();
+
+        // Validate signature.
+        let header = MetadataTableHeader::read_from_prefix(buf)
+            .map_err(|_| CorruptionType::InvalidMetadataTableSignature)?
+            .0
+            .clone();
+        if header.signature != format::METADATA_TABLE_SIGNATURE {
+            return Err(CorruptionType::InvalidMetadataTableSignature.into());
+        }
+
+        // Validate entry count.
+        if header.entry_count as u64 > format::METADATA_ENTRY_MAX_COUNT {
+            return Err(CorruptionType::MetadataTableEntryCountTooHigh.into());
+        }
+        if header.reserved != 0 || header.reserved2.iter().any(|&value| value != 0) {
+            return Err(CorruptionType::ReservedMetadataTableFieldNonzero.into());
+        }
+
+        // Validate metadata region size.
+        if metadata_length as u64 > format::MAXIMUM_METADATA_REGION_SIZE {
+            return Err(CorruptionType::MetadataRegionTooLarge.into());
+        }
+
+        // Parse entries.
+        let header_size = size_of::<MetadataTableHeader>();
+        let entry_size = size_of::<MetadataTableEntry>();
+        let mut entries = Vec::with_capacity(header.entry_count as usize);
+        for i in 0..header.entry_count as usize {
+            let off = header_size + i * entry_size;
+            let entry = MetadataTableEntry::read_from_prefix(&buf[off..])
+                .unwrap()
+                .0
+                .clone();
+            let supported = u32::from(
+                format::MetadataTableEntryFlags::new()
+                    .with_is_user(true)
+                    .with_is_virtual_disk(true)
+                    .with_is_required(true),
+            );
+            if entry.reserved2 != 0 || u32::from(entry.flags) & !supported != 0 {
+                return Err(CorruptionType::ReservedMetadataTableFieldNonzero.into());
+            }
+            entries.push(entry);
+        }
+
+        // Sort by (is_user, item_id) for duplicate detection.
+        entries.sort_by(|a, b| {
+            a.flags
+                .is_user()
+                .cmp(&b.flags.is_user())
+                .then_with(|| a.item_id.cmp(&b.item_id))
+        });
+
+        // Check for duplicates.
+        for i in 1..entries.len() {
+            if entries[i].flags.is_user() == entries[i - 1].flags.is_user()
+                && entries[i].item_id == entries[i - 1].item_id
+            {
+                return Err(CorruptionType::MetadataDuplicateGuid.into());
+            }
+        }
+
+        // Re-sort by offset for overlap checking.
+        entries.sort_by_key(|e| e.offset);
+
+        // Validate each entry and check for overlaps.
+ let mut user_item_count: u16 = 0; + let mut system_item_count: u16 = 0; + let mut system_metadata_size: u64 = 0; + let mut user_metadata_size: u64 = 0; + let mut last_end: u32 = 0; + + for entry in &entries { + // User + required is invalid. + if entry.flags.is_user() && entry.flags.is_required() { + return Err(CorruptionType::MetadataUserRequired.into()); + } + + // Item size limit. + if entry.length as u64 > format::MAXIMUM_METADATA_ITEM_SIZE { + return Err(CorruptionType::MetadataItemTooLarge.into()); + } + + // Zero GUID is invalid. + if entry.item_id == Guid::ZERO { + return Err(CorruptionType::ZeroMetadataItemId.into()); + } + + if entry.length == 0 { + // Zero-length entries must have zero offset. + if entry.offset != 0 { + return Err(CorruptionType::InvalidMetadataEntryOffset.into()); + } + } else { + // Non-zero entries: offset must be >= table size and fit in region. + if entry.offset < format::METADATA_TABLE_SIZE as u32 { + return Err(CorruptionType::MetadataOverlapping.into()); + } + let end = entry + .offset + .checked_add(entry.length) + .ok_or(CorruptionType::MetadataOverlapping)?; + if end > metadata_length { + return Err(CorruptionType::MetadataOverlapping.into()); + } + // Check overlap with previous entry. + if entry.offset < last_end { + return Err(CorruptionType::MetadataOverlapping.into()); + } + last_end = end; + } + + // Track sizes per category. + if entry.flags.is_user() { + user_item_count += 1; + user_metadata_size += entry.length as u64; + } else { + system_item_count += 1; + system_metadata_size += entry.length as u64; + } + } + + // Validate entry counts. + if user_item_count as u64 > format::METADATA_USER_ENTRY_MAX_COUNT { + return Err(CorruptionType::MetadataUserCountExceeded.into()); + } + if system_item_count as u64 > format::METADATA_SYSTEM_ENTRY_MAX_COUNT { + return Err(CorruptionType::MetadataTableEntryCountTooHigh.into()); + } + + // Validate total sizes per category. + if system_metadata_size > format::MAXIMUM_TOTAL_METADATA_SIZE_PER_CATEGORY + || user_metadata_size > format::MAXIMUM_TOTAL_METADATA_SIZE_PER_CATEGORY + { + return Err(CorruptionType::TotalMetadataSizeExceeded.into()); + } + + Ok(MetadataTable { entries }) + } + + /// Find an entry by GUID and user/system flag. + pub fn find_entry(&self, is_user: bool, item_id: &Guid) -> Option<&MetadataTableEntry> { + self.entries + .iter() + .find(|e| e.flags.is_user() == is_user && &e.item_id == item_id) + } + + /// Read the raw bytes of a metadata item through the page cache. + pub async fn read_item( + &self, + cache: &PageCache, + is_user: bool, + item_id: &Guid, + ) -> Result, OpenError> { + let entry = self + .find_entry(is_user, item_id) + .ok_or(CorruptionType::MissingRequiredMetadata)?; + + let mut data = vec![0; entry.length as usize]; + let mut data_offset = 0; + let mut item_offset = entry.offset as u64; + + while data_offset < data.len() { + let page_offset = item_offset & !(PAGE_SIZE as u64 - 1); + let page_delta = (item_offset - page_offset) as usize; + let len = (data.len() - data_offset).min(PAGE_SIZE - page_delta); + + let page = cache + .acquire_read(PageKey { + tag: METADATA_TAG, + offset: page_offset, + }) + .await + .map_err(OpenErrorInner::MetadataCache)?; + data[data_offset..data_offset + len] + .copy_from_slice(&page[page_delta..page_delta + len]); + + data_offset += len; + item_offset += len as u64; + } + + Ok(data) + } + + /// Returns an iterator over all entries. 
+ pub fn entries(&self) -> &[MetadataTableEntry] { + &self.entries + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::cache::PageCache; + use crate::region; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + use std::sync::Arc; + use zerocopy::IntoBytes; + + fn metadata_cache(file: InMemoryFile, metadata_offset: u64) -> PageCache { + let mut cache = PageCache::new(Arc::new(file), None, None, 0); + cache.register_tag(METADATA_TAG, metadata_offset); + cache + } + + #[async_test] + async fn parse_valid_metadata_table() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + + // Should have 5 system entries: file params, disk size, logical sector, + // physical sector, page 83. + assert_eq!(table.entries.len(), 5); + } + + #[async_test] + async fn find_entry_by_guid() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + + // Known GUIDs should be found. + assert!( + table + .find_entry(false, &format::FILE_PARAMETERS_ITEM_GUID) + .is_some() + ); + assert!( + table + .find_entry(false, &format::VIRTUAL_DISK_SIZE_ITEM_GUID) + .is_some() + ); + assert!( + table + .find_entry(false, &format::LOGICAL_SECTOR_SIZE_ITEM_GUID) + .is_some() + ); + assert!( + table + .find_entry(false, &format::PHYSICAL_SECTOR_SIZE_ITEM_GUID) + .is_some() + ); + assert!( + table + .find_entry(false, &format::PAGE_83_ITEM_GUID) + .is_some() + ); + + // Unknown GUID should not be found. + assert!( + table + .find_entry(false, &format::PARENT_LOCATOR_ITEM_GUID) + .is_none() + ); + } + + #[async_test] + async fn read_item_bytes() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + let cache = metadata_cache(file, regions.metadata_offset); + + // Read disk size — should be 1 GiB. + let data = table + .read_item(&cache, false, &format::VIRTUAL_DISK_SIZE_ITEM_GUID) + .await + .unwrap(); + assert_eq!(data.len(), 8); + let disk_size = u64::from_le_bytes(data.try_into().unwrap()); + assert_eq!(disk_size, format::GB1); + } + + #[async_test] + async fn invalid_signature() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Corrupt the metadata table signature. + let mut buf = vec![0u8; format::METADATA_TABLE_SIZE as usize]; + file.read_at(regions.metadata_offset, &mut buf) + .await + .unwrap(); + buf[0] ^= 0xFF; + file.write_at(regions.metadata_offset, &buf).await.unwrap(); + + let result = + MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::InvalidMetadataTableSignature + ))) + )); + } + + #[async_test] + async fn duplicate_guid() { + let file = InMemoryFile::new(4 * format::MB1); + + // Build a metadata table with two entries sharing the same GUID. 
+ let mut buf = vec![0u8; format::METADATA_TABLE_SIZE as usize]; + let header = MetadataTableHeader { + signature: format::METADATA_TABLE_SIGNATURE, + reserved: 0, + entry_count: 2, + reserved2: [0; 5], + }; + let h_bytes = header.as_bytes(); + buf[..h_bytes.len()].copy_from_slice(h_bytes); + + let entry = MetadataTableEntry { + item_id: format::FILE_PARAMETERS_ITEM_GUID, + offset: format::METADATA_TABLE_SIZE as u32, + length: 8, + flags: format::MetadataTableEntryFlags::new().with_is_required(true), + reserved2: 0, + }; + let e_bytes = entry.as_bytes(); + let entry_start = size_of::(); + buf[entry_start..entry_start + e_bytes.len()].copy_from_slice(e_bytes); + + let entry2 = MetadataTableEntry { + item_id: format::FILE_PARAMETERS_ITEM_GUID, + offset: format::METADATA_TABLE_SIZE as u32 + 8, + length: 8, + flags: format::MetadataTableEntryFlags::new().with_is_required(true), + reserved2: 0, + }; + let e2_bytes = entry2.as_bytes(); + let off2 = entry_start + e_bytes.len(); + buf[off2..off2 + e2_bytes.len()].copy_from_slice(e2_bytes); + + let metadata_offset = 2 * format::MB1; + file.write_at(metadata_offset, &buf).await.unwrap(); + + let result = MetadataTable::read(&file, metadata_offset, format::MB1 as u32).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::MetadataDuplicateGuid + ))) + )); + } + + #[async_test] + async fn user_required_invalid() { + let file = InMemoryFile::new(4 * format::MB1); + + let mut buf = vec![0u8; format::METADATA_TABLE_SIZE as usize]; + let header = MetadataTableHeader { + signature: format::METADATA_TABLE_SIGNATURE, + reserved: 0, + entry_count: 1, + reserved2: [0; 5], + }; + let h_bytes = header.as_bytes(); + buf[..h_bytes.len()].copy_from_slice(h_bytes); + + let entry = MetadataTableEntry { + item_id: format::FILE_PARAMETERS_ITEM_GUID, + offset: format::METADATA_TABLE_SIZE as u32, + length: 8, + flags: format::MetadataTableEntryFlags::new() + .with_is_user(true) + .with_is_required(true), + reserved2: 0, + }; + let e_bytes = entry.as_bytes(); + let entry_start = size_of::(); + buf[entry_start..entry_start + e_bytes.len()].copy_from_slice(e_bytes); + + let metadata_offset = 2 * format::MB1; + file.write_at(metadata_offset, &buf).await.unwrap(); + + let result = MetadataTable::read(&file, metadata_offset, format::MB1 as u32).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::MetadataUserRequired + ))) + )); + } + + #[async_test] + async fn reserved_metadata_entry_flags_invalid() { + let file = InMemoryFile::new(4 * format::MB1); + + let mut buf = vec![0u8; format::METADATA_TABLE_SIZE as usize]; + let header = MetadataTableHeader { + signature: format::METADATA_TABLE_SIGNATURE, + reserved: 0, + entry_count: 1, + reserved2: [0; 5], + }; + let h_bytes = header.as_bytes(); + buf[..h_bytes.len()].copy_from_slice(h_bytes); + + let entry = MetadataTableEntry { + item_id: format::FILE_PARAMETERS_ITEM_GUID, + offset: format::METADATA_TABLE_SIZE as u32, + length: 8, + flags: format::MetadataTableEntryFlags::new().with_is_required(true), + reserved2: 0, + }; + let e_bytes = entry.as_bytes(); + let entry_start = size_of::(); + buf[entry_start..entry_start + e_bytes.len()].copy_from_slice(e_bytes); + let flags_offset = entry_start + 24; + buf[flags_offset..flags_offset + 4].copy_from_slice(&8_u32.to_le_bytes()); + + let metadata_offset = 2 * format::MB1; + file.write_at(metadata_offset, &buf).await.unwrap(); + + let result = MetadataTable::read(&file, metadata_offset, format::MB1 as 
u32).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::ReservedMetadataTableFieldNonzero + ))) + )); + } +} diff --git a/vm/devices/storage/vhdx/src/open.rs b/vm/devices/storage/vhdx/src/open.rs new file mode 100644 index 0000000000..f1b7dd0dc6 --- /dev/null +++ b/vm/devices/storage/vhdx/src/open.rs @@ -0,0 +1,1490 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VHDX file open orchestration. +//! +//! Ties together header, region, metadata, and BAT parsing into +//! [`VhdxFile::open()`], which returns a [`VhdxBuilder`] for +//! configuring options before finalizing as read-only or writable. + +use crate::AsyncFile; +use crate::bat::BAT_TAG; +use crate::bat::Bat; +use crate::cache::PageCache; +use crate::error::CorruptionType; +use crate::error::OpenError; +use crate::error::OpenErrorInner; +use crate::error::PipelineFailed; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::flush::FlushSequencer; +use crate::format; +use crate::format::FileIdentifier; +use crate::header::HeaderState; +use crate::header::WriteMode; +use crate::header::parse_headers; +use crate::header::serialize_header; +use crate::known_meta::read_known_metadata; +use crate::known_meta::verify_known_metadata; +use crate::log; +use crate::log::LogRegion; +use crate::log_task::LogRequest; +use crate::metadata::METADATA_TAG; +use crate::metadata::MetadataTable; +use crate::region::parse_region_tables; +use crate::sector_bitmap::SBM_TAG; +use crate::space::DeferredReleases; +use crate::space::EofState; +use crate::space::FreeSpaceTracker; +use guid::Guid; +use parking_lot::Mutex; +use std::sync::Arc; +use zerocopy::FromBytes; + +/// Builder for opening a VHDX file. +/// +/// Created via [`VhdxFile::open()`], then configured with builder methods +/// before calling [`read_only()`](Self::read_only) or +/// [`writable()`](Self::writable) to produce a [`VhdxFile`]. +/// +/// # Examples +/// +/// ```ignore +/// // Default options: +/// let vhdx = VhdxFile::open(file).read_only().await?; +/// +/// // With block alignment (for NTFS-DAX / PMEM volumes): +/// let vhdx = VhdxFile::open(file) +/// .block_alignment(2 * 1024 * 1024) +/// .writable(&spawner) +/// .await?; +/// ``` +pub struct VhdxBuilder { + file: F, + options: OpenOptions, +} + +/// Internal options collected by [`VhdxBuilder`]. +#[derive(Debug, Clone)] +struct OpenOptions { + /// Block data alignment in bytes. Must be 0 or a power of 2. + /// + /// When non-zero and ≤ the VHDX block size, new data block allocations + /// from the end of the file are rounded up to this alignment. This + /// matches the host filesystem's cluster size (e.g. 2 MiB on NTFS-DAX + /// volumes) so that data blocks land on cluster boundaries. + /// + /// Default: 0 (no alignment — blocks use the natural 1 MiB granularity). + block_alignment: u32, + /// Whether to allow log replay on a read-only open. + /// + /// When true, a dirty log is replayed (the file handle must support + /// writes for the replay I/O) but the resulting `VhdxFile` is still + /// read-only. When false, a dirty log returns + /// [`CorruptionType::LogReplayRequired`]. + /// + /// Ignored for writable opens (log replay always happens). + /// + /// Default: false. + allow_replay: bool, +} + +impl OpenOptions { + fn new() -> Self { + Self { + block_alignment: 0, + allow_replay: false, + } + } +} + +impl VhdxBuilder { + /// Set the block data alignment in bytes. + /// + /// Must be 0 or a power of 2. 
If larger than the VHDX block size, + /// it is silently ignored at open time. + /// + /// This should be set to the host filesystem's cluster size when the + /// VHDX file lives on a volume with clusters larger than 1 MiB (e.g. + /// NTFS-DAX with 2 MiB clusters). + pub fn block_alignment(mut self, alignment: u32) -> Self { + self.options.block_alignment = alignment; + self + } + + /// Allow log replay when opening read-only. + /// + /// When true, a dirty log is replayed (the file handle must support + /// writes for the replay I/O) but the resulting [`VhdxFile`] is still + /// read-only. When false, a dirty log returns an error. + /// + /// Has no effect on [`writable()`](Self::writable) opens, which always + /// replay. + pub fn allow_replay(mut self, allow: bool) -> Self { + self.options.allow_replay = allow; + self + } + + /// Open the VHDX file in read-only mode. + pub async fn read_only(self) -> Result, OpenError> + where + F: AsyncFile, + { + VhdxFile::open_read_only(self.file, &self.options).await + } + + /// Open the VHDX file in writable mode with a log task. + /// + /// Replays a dirty log if needed, then spawns a log task for + /// crash-consistent metadata writes. + /// + /// Call [`VhdxFile::close()`] for a clean shutdown. + pub async fn writable( + self, + spawner: &impl pal_async::task::Spawn, + ) -> Result, OpenError> + where + F: AsyncFile, + { + VhdxFile::open_writable(self.file, spawner, &self.options).await + } +} + +/// An open VHDX file handle. +/// +/// Created via [`VhdxFile::open()`], which returns a [`VhdxBuilder`] +/// for configuring options before calling +/// [`read_only()`](VhdxBuilder::read_only) or +/// [`writable()`](VhdxBuilder::writable). +// +// Lock ordering (must acquire in this order, never reverse): +// 1. header_state.inner (futures::lock::Mutex — async, may be held across .await) +// 2. allocation_lock (futures::lock::Mutex — async, may be held across .await) +// 3. bat_state (parking_lot::RwLock — synchronous, NEVER across .await) +// 4. free_space.inner (parking_lot::Mutex — synchronous, NEVER across .await) +// 5. cache.pages/tags (parking_lot::Mutex — brief, NEVER across .await) +// +// header_state.inner serializes all header writes (enable_write_mode, set_log_guid, +// clear_log_guid). Its write_mode AtomicU8 provides a lock-free fast path for +// enable_write_mode, which is called on every write. +// The allocation_lock serializes the entire allocation decision (check BAT, allocate +// space, mark TFP). It is released AFTER TFP is set but BEFORE data I/O begins. +// The bat_state RwLock is held for < 1μs per access (reading/writing in-memory entries). +pub struct VhdxFile { + pub(crate) file: Arc, + pub(crate) cache: PageCache, + pub(crate) bat: Bat, + + // Parsed metadata + pub(crate) disk_size: u64, + pub(crate) block_size: u32, + pub(crate) logical_sector_size: u32, + physical_sector_size: u32, + pub(crate) has_parent: bool, + is_fully_allocated: bool, + page_83_data: Guid, + + // Metadata table (kept for on-demand metadata reads). + metadata_table: MetadataTable, + + // Header and write-mode state (async mutex for serialization, + // AtomicU8 for lock-free hot-path write-mode checks). + pub(crate) header_state: HeaderState, + + /// Serializes block allocation decisions and protects EOF geometry + /// state. Only one allocation sequence runs at a time. + /// Uses futures::lock::Mutex because it may be held across .await points. 
+ pub(crate) allocation_lock: futures::lock::Mutex, + + /// Broadcast event notified when a TFP block completes post-allocation. + /// Writers that encounter a TFP block listen on this event and retry. + pub(crate) allocation_event: event_listener::Event, + + /// Free space tracker. Manages all space allocation within the file, + /// replacing the simple EOF-bump allocator. + pub(crate) free_space: FreeSpaceTracker, + + /// Space releases deferred until their BAT changes are durable. + /// Uses generation-based stamping to coordinate with flush(). + pub(crate) deferred_releases: DeferredReleases, + + // Mode + pub(crate) read_only: bool, + + /// Region table bytes to rewrite (set when the two on-disk copies + /// don't match). Consumed by [`VhdxBuilder::writable`]. + region_rewrite_data: Option, + + /// Error state: once set, all I/O operations fail. + /// Shared with log and apply tasks so they + /// can poison the file directly on fatal error. + pub(crate) failed: Arc, + + // Log task state (set when opened writable via VhdxBuilder::writable). + pub(crate) log_state: Option, +} + +/// Log pipeline state for a writable VHDX file. +/// +/// Created during [`VhdxBuilder::writable`] and consumed by +/// [`VhdxFile::close`] / [`VhdxFile::abort`]. All fields are set +/// together when the log task is spawned. +pub(crate) struct LogTaskState { + /// Handle to the spawned log task. + log_task: pal_async::task::Task<()>, + /// Handle to the spawned apply task. + apply_task: pal_async::task::Task<()>, + /// Flush sequencer for FSN-gated ordering. + pub flush_sequencer: Arc, + /// Failable semaphore for log backpressure. + pub log_permits: Arc, + /// LSN watermark published by the log task. `flush()` waits on this. + pub logged_lsn: Arc, +} + +impl VhdxFile { + /// Begin opening a VHDX file, returning a [`VhdxBuilder`] to configure + /// options before finalizing with [`read_only()`](VhdxBuilder::read_only) + /// or [`writable()`](VhdxBuilder::writable). + pub fn open(file: F) -> VhdxBuilder { + VhdxBuilder { + file, + options: OpenOptions::new(), + } + } + + /// Internal open logic shared by [`VhdxBuilder::read_only`] and + /// [`VhdxBuilder::writable`]. + /// + /// Validates the file identifier, headers, region tables, and metadata. + /// If the log GUID is non-zero (indicating a dirty log), replays the + /// log to recover the file. Read-only opens with a dirty log return + /// [`CorruptionType::LogReplayRequired`]. + async fn open_inner( + file: F, + read_only: bool, + log_sender: Option>>, + options: &OpenOptions, + ) -> Result { + // 1. Validate minimum file size. + let file_length = file.file_size().await.map_err(OpenErrorInner::Io)?; + if file_length < format::HEADER_AREA_SIZE { + return Err(CorruptionType::EmptyFile.into()); + } + + // 2. Validate the file identifier signature. + validate_file_identifier(&file).await?; + + // 3. Parse dual headers. + let mut header = parse_headers(&file, file_length).await?; + + // 4. If log_guid is non-zero, replay the log. + if header.log_guid != Guid::ZERO { + // A dirty log requires writing to the file to replay. If the caller + // opened read-only, we cannot proceed — the metadata may be + // inconsistent and we're not allowed to fix it. + if read_only { + return Err((CorruptionType::LogReplayRequired).into()); + } + + // The file handle hasn't been Arc-wrapped yet — pass &file directly. 
+ let log_region = LogRegion { + file_offset: header.log_offset, + length: header.log_length, + }; + + let replay_result = log::replay_log(&file, &log_region, header.log_guid).await?; + + if replay_result.replayed { + // Write a clean header: clear log_guid, bump sequence number. + let new_seq = header.sequence_number + 1; + let (buf, write_offset) = serialize_header( + &file, + new_seq, + header.file_write_guid, + header.data_write_guid, + Guid::ZERO, + header.log_offset, + header.log_length, + header.first_header_current, + ); + file.write_from(write_offset, buf) + .await + .map_err(OpenErrorInner::Io)?; + file.flush().await.map_err(OpenErrorInner::Io)?; + + // Update the in-flight header state for the rest of the open path. + header.sequence_number = new_seq; + header.log_guid = Guid::ZERO; + header.first_header_current = !header.first_header_current; + } + } + + // 5. Parse region tables. + let regions = parse_region_tables(&file).await?; + + // 6. Read metadata table. + let metadata_table = + MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length).await?; + + // 7. Verify known metadata (all required system items are recognized). + verify_known_metadata(&metadata_table, false)?; + + // 8. Wrap file in Arc for shared access. + let file = Arc::new(file); + + // 9. Create PageCache and register tags before reading metadata items. + let mut cache = PageCache::new( + file.clone(), + log_sender.map(crate::log_task::LogClient::new), + None, + 0, + ); + cache.register_tag(BAT_TAG, regions.bat_offset); + cache.register_tag(METADATA_TAG, regions.metadata_offset); + cache.register_tag(SBM_TAG, 0); + + // 10. Read known metadata values. + let known = read_known_metadata(&cache, &metadata_table).await?; + + // 11. Create BAT manager. + let mut bat = Bat::new( + known.disk_size, + known.block_size, + known.logical_sector_size, + known.has_parent, + regions.bat_length, + )?; + + // 12. Create FreeSpaceTracker. + let (free_space, mut eof_state) = FreeSpaceTracker::new( + file_length, + known.block_size, + options.block_alignment, + format::HEADER_AREA_SIZE, + header.log_offset, + header.log_length, + regions.bat_offset, + regions.bat_length, + regions.metadata_offset, + regions.metadata_length, + bat.data_block_count, + )?; + + // 13. Load in-memory BAT from disk. + bat.load_bat_state( + &*file, + regions.bat_offset, + regions.bat_length, + &free_space, + &mut eof_state, + ) + .await?; + + // 14. Finalize free space initialization after BAT parse. + free_space.complete_initialization(&eof_state); + + // 15. Construct VhdxFile. + Ok(VhdxFile { + file, + cache, + bat, + disk_size: known.disk_size, + block_size: known.block_size, + logical_sector_size: known.logical_sector_size, + physical_sector_size: known.physical_sector_size, + has_parent: known.has_parent, + is_fully_allocated: known.leave_blocks_allocated, + page_83_data: known.page_83_data, + metadata_table, + header_state: HeaderState::new(&header), + allocation_lock: futures::lock::Mutex::new(eof_state), + allocation_event: event_listener::Event::new(), + free_space, + deferred_releases: DeferredReleases::new(), + + read_only, + region_rewrite_data: regions.rewrite_data, + failed: Arc::new(FailureFlag::new()), + + log_state: None, + }) + } + + /// Open an existing VHDX file in read-only mode. + /// + /// If [`OpenOptions::allow_replay`] is true, a dirty log is replayed + /// (requires the file handle to support writes for the replay I/O), + /// but the resulting `VhdxFile` is still read-only. 
If false, a dirty + /// log returns [`CorruptionType::LogReplayRequired`]. + async fn open_read_only(file: F, options: &OpenOptions) -> Result { + if options.allow_replay { + let mut vhdx = Self::open_inner(file, false, None, options).await?; + vhdx.read_only = true; + Ok(vhdx) + } else { + Self::open_inner(file, true, None, options).await + } + } + + /// Open an existing VHDX file in writable mode with a log task. + /// + /// Replays a dirty log if needed, then spawns a log task for + /// crash-consistent metadata writes. The log task receives dirty pages + /// on `flush()` and writes them as WAL entries. + /// + /// The spawner must implement [`pal_async::task::Spawn`] to spawn the + /// background log task. + /// + /// Call [`close()`](Self::close) for a clean shutdown. Dropping without + /// close leaves the VHDX file dirty (log will be replayed on next open). + async fn open_writable( + file: F, + spawner: &impl pal_async::task::Spawn, + options: &OpenOptions, + ) -> Result { + // Create mesh channel before open_inner so the cache gets the + // sender at construction time. + let (tx, rx) = mesh::channel::>(); + let mut vhdx = Self::open_inner(file, false, Some(tx.clone()), options).await?; + + // Create shared state for log task communication. + let flush_sequencer = { + let mut fs = FlushSequencer::new(); + fs.set_failure_flag(vhdx.failed.clone()); + Arc::new(fs) + }; + let log_permits = Arc::new(crate::log_permits::LogPermits::new( + // Permit count is a multiple of MAX_COMMIT_PAGES to allow + // pipelining: multiple batches can be in-flight (committed + // but not yet applied) simultaneously. Permits are released + // by the apply task, not at commit time. + crate::cache::MAX_COMMIT_PAGES * 4, + )); + let logged_lsn = Arc::new(crate::lsn_watermark::LsnWatermark::new()); + + // Initialize the log writer. + let log_guid = Guid::new_random(); + let (log_offset, log_length) = vhdx.header_state.log_region(); + let log_region = LogRegion { + file_offset: log_offset, + length: log_length, + }; + let file_length = vhdx.file.file_size().await.map_err(OpenErrorInner::Io)?; + let log_writer = + log::LogWriter::initialize(vhdx.file.as_ref(), log_region, log_guid, file_length) + .await?; + + // Write header with log_guid set (marks file as dirty). + // This is done BEFORE spawning the log task so the file is marked + // dirty before any log entries are written. + vhdx.header_state + .set_log_guid(log_guid, vhdx.file.as_ref(), None) + .await + .map_err(OpenErrorInner::Io)?; + + // Spawn the apply task. + let applied_lsn = Arc::new(crate::lsn_watermark::LsnWatermark::new()); + let (apply_tx, apply_rx) = mesh::channel::>(); + let apply_task = spawner.spawn( + "vhdx-apply-task", + crate::apply_task::run_apply_task( + apply_rx, + vhdx.file.clone(), + flush_sequencer.clone(), + applied_lsn.clone(), + log_permits.clone(), + vhdx.failed.clone(), + ), + ); + + // Spawn the log task. + let task = spawner.spawn( + "vhdx-log-task", + crate::log_task::LogTask::new( + vhdx.file.clone(), + log_writer, + flush_sequencer.clone(), + log_permits.clone(), + logged_lsn.clone(), + applied_lsn.clone(), + apply_tx, + vhdx.failed.clone(), + ) + .run(rx), + ); + + // Set log state on the cache. + vhdx.cache.set_log_state(crate::cache::CacheLogState { + permits: log_permits.clone(), + applied_lsn: applied_lsn.clone(), + }); + + vhdx.log_state = Some(LogTaskState { + log_task: task, + apply_task, + flush_sequencer, + log_permits, + logged_lsn, + }); + + // Repair mismatched region tables through the write-ahead log. 
+ // The pages enter the log pipeline and will be applied in due + // course; the next caller-initiated flush() covers them via LSN + // ordering. If we crash before that, either log replay applies + // the entry or the mismatch is re-detected on reopen. + if let Some(table_data) = vhdx.region_rewrite_data.take() { + crate::region::rewrite_region_tables( + &vhdx.cache, + &vhdx + .log_state + .as_ref() + .expect("writable file has log_state") + .log_permits, + table_data, + ) + .await + .map_err(OpenErrorInner::PipelineFailed)?; + } + + Ok(vhdx) + } + + /// Gracefully close the VHDX file. + /// + /// Flushes all dirty pages through the log, applies all logged entries, + /// clears the log GUID in the header, and waits for the log task to exit. + /// + /// After this returns, the file is in a clean state (no log replay needed + /// on next open). + /// + /// If no log task is running (read-only or opened without log), this is + /// a no-op. + pub async fn close(mut self) -> Result<(), VhdxIoError> { + if let Some(state) = self.log_state.take() { + // Ship any remaining dirty pages to the log task. + // This is fire-and-forget — the Close RPC below will + // process after this batch due to channel ordering. + self.cache.commit().map_err(VhdxIoErrorInner::CommitCache)?; + + // Take the log client out of the cache to get the sender. + let client = self + .cache + .take_log_client() + .expect("log client disappeared"); + + // Send Close RPC — the log task will log+apply all pending + // batches, then respond. + client.close().await?; + + state.log_task.await; + // The log task dropping its apply_tx closes the apply channel, + // causing the apply task to exit. + state.apply_task.await; + + // Clear log GUID in the header now that the log is fully drained. + // Done BEFORE truncation so that a crash during truncation + // doesn't leave a non-zero log GUID pointing at a file that + // may have been partially shrunk. With the GUID cleared first, + // a crash at any later point just leaves a larger-than-necessary + // file — no replay is attempted. + self.header_state + .clear_log_guid(self.file.as_ref(), Some(state.flush_sequencer.as_ref())) + .await + .map_err(VhdxIoErrorInner::WriteHeader)?; + + // Truncate the file to reclaim unused trailing space. + // Best-effort: if this fails, the file is still correct, + // just not compacted. + if let Err(e) = self.truncate_file().await { + tracing::warn!( + error = &e as &dyn std::error::Error, + "failed to truncate VHDX file on close" + ); + } + } + Ok(()) + } + + /// Abort the VHDX file without graceful close. + /// + /// Drops the log channel (causing the log task to exit on its next + /// recv) and waits for the log task to finish. No pending batches are + /// applied and the log GUID is NOT cleared — the file remains dirty, + /// requiring log replay on the next open. + /// + /// This is the test-friendly equivalent of a crash: all state held by + /// the log task (including its `Arc`) is released, but no new I/O + /// is issued. + pub async fn abort(mut self) { + // Drop the log client so the log task's recv() returns Err. + self.cache.take_log_client(); + + // Wait for the log task to notice the closed channel and exit. + // The log task dropping its apply_tx closes the apply channel too. + if let Some(state) = self.log_state.take() { + state.log_task.await; + state.apply_task.await; + } + } +} + +impl VhdxFile { + /// Virtual disk size in bytes. + pub fn disk_size(&self) -> u64 { + self.disk_size + } + + /// Block size in bytes. 
+ pub fn block_size(&self) -> u32 { + self.block_size + } + + /// Logical sector size (512 or 4096). + pub fn logical_sector_size(&self) -> u32 { + self.logical_sector_size + } + + /// Physical sector size (512 or 4096). + pub fn physical_sector_size(&self) -> u32 { + self.physical_sector_size + } + + /// Whether this is a differencing disk (has a parent). + pub fn has_parent(&self) -> bool { + self.has_parent + } + + /// Read and parse the parent locator from the metadata region. + /// + /// Returns `Ok(None)` for base (non-differencing) disks. + /// Returns an error if the locator item is missing or corrupt. + pub async fn parent_locator(&self) -> Result, OpenError> { + if !self.has_parent { + return Ok(None); + } + let locator_data = self + .metadata_table + .read_item(&self.cache, false, &format::PARENT_LOCATOR_ITEM_GUID) + .await?; + Ok(Some(crate::locator::ParentLocator::parse(&locator_data)?)) + } + + /// Whether the disk was created with all blocks pre-allocated (fixed VHD). + pub fn is_fully_allocated(&self) -> bool { + self.is_fully_allocated + } + + /// SCSI VPD Page 83 identifier (stable disk identity). + pub fn page_83_data(&self) -> Guid { + self.page_83_data + } + + /// GUID changed on every virtual-disk data write. + pub fn data_write_guid(&self) -> Guid { + self.header_state.data_write_guid() + } + + /// Whether the file was opened in read-only mode. + pub fn is_read_only(&self) -> bool { + self.read_only + } + + /// Ensures the requested write mode is enabled, updating the header + /// and flushing if needed. If the current mode already satisfies the + /// request, this is a no-op. + /// + /// Hot path (mode already enabled): single atomic load, no lock. + /// Cold path (mode transition): acquires the header async mutex, + /// generates new GUIDs, writes the header, flushes, then publishes. + pub(crate) async fn enable_write_mode(&self, mode: WriteMode) -> Result<(), std::io::Error> { + let flush_sequencer = self.log_state.as_ref().map(|s| s.flush_sequencer.as_ref()); + self.header_state + .enable_write_mode(mode, self.file.as_ref(), flush_sequencer) + .await + } +} + +/// Validate the file identifier signature at offset 0. +async fn validate_file_identifier(file: &impl AsyncFile) -> Result<(), OpenError> { + // Read a full sector (not just the identifier struct) so that O_DIRECT + // backends with sector-alignment requirements work correctly. + let buf = file.alloc_buffer(4096); + let buf = file.read_into(0, buf).await.map_err(OpenErrorInner::Io)?; + + let ident = FileIdentifier::read_from_prefix(buf.as_ref()) + .map_err(|_| CorruptionType::InvalidFileIdentifier)? + .0; + + if ident.signature != format::FILE_IDENTIFIER_SIGNATURE { + return Err(CorruptionType::InvalidFileIdentifier.into()); + } + + Ok(()) +} + +/// Shared failure flag for poisoning the VHDX file from any task. +/// +/// Uses an `AtomicBool` for the fast path (`check`) and a mutex for +/// the error message. Once set, the flag is never cleared. +pub(crate) struct FailureFlag { + flag: std::sync::atomic::AtomicBool, + message: Mutex>, +} + +impl FailureFlag { + pub fn new() -> Self { + Self { + flag: std::sync::atomic::AtomicBool::new(false), + message: Mutex::new(None), + } + } + + /// Check whether the flag is set. Fast path: single atomic load. + pub fn check(&self) -> Result<(), VhdxIoError> { + if self.flag.load(std::sync::atomic::Ordering::Relaxed) + && let Some(msg) = self.message.lock().clone() + { + return Err(VhdxIoErrorInner::Failed(msg).into()); + } + Ok(()) + } + + /// Set the failure flag. 
First caller's message wins. + pub fn set(&self, error: &dyn std::error::Error) { + let mut msg = self.message.lock(); + if msg.is_none() { + *msg = Some(PipelineFailed(error.to_string())); + } + self.flag.store(true, std::sync::atomic::Ordering::Relaxed); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::create::{self, CreateParams}; + use crate::error::OpenError; + use crate::format::BatEntry; + use crate::format::BatEntryState; + use crate::format::Header; + use crate::format::MB1; + use crate::space::AllocateFlags; + use crate::tests::support::InMemoryFile; + use pal_async::async_test; + use zerocopy::IntoBytes; + + #[async_test] + async fn open_default_vhdx() { + let (file, params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + assert_eq!(vhdx.disk_size(), format::GB1); + assert_eq!(vhdx.block_size(), format::DEFAULT_BLOCK_SIZE); + assert_eq!(vhdx.logical_sector_size(), 512); + assert_eq!(vhdx.physical_sector_size(), 512); + assert!(!vhdx.has_parent()); + assert!(!vhdx.is_fully_allocated()); + assert!(vhdx.is_read_only()); + assert_ne!(vhdx.data_write_guid(), Guid::ZERO); + assert_eq!(vhdx.data_write_guid(), params.data_write_guid); + } + + #[async_test] + async fn open_4k_sector_vhdx() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + logical_sector_size: 4096, + physical_sector_size: 4096, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert_eq!(vhdx.logical_sector_size(), 4096); + assert_eq!(vhdx.physical_sector_size(), 4096); + } + + #[async_test] + async fn open_512_sector_vhdx() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + logical_sector_size: 512, + physical_sector_size: 512, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert_eq!(vhdx.logical_sector_size(), 512); + assert_eq!(vhdx.physical_sector_size(), 512); + } + + #[async_test] + async fn open_various_block_sizes() { + for &block_size in &[ + MB1 as u32, + 2 * MB1 as u32, + 32 * MB1 as u32, + 256 * MB1 as u32, + ] { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + block_size, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert_eq!(vhdx.block_size(), block_size); + } + } + + #[async_test] + async fn open_differencing_disk() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(vhdx.has_parent()); + } + + #[async_test] + async fn open_fully_allocated() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + is_fully_allocated: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(vhdx.is_fully_allocated()); + } + + #[async_test] + async fn open_dirty_log_no_valid_entries() { + // Setting log_guid to a random GUID without writing matching log + // entries causes replay_log to return NoValidLogEntries. 
+ let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Overwrite header 2's log_guid with a non-zero GUID, then fix the CRC. + let mut buf = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_2, &mut buf) + .await + .unwrap(); + + let mut header = Header::read_from_prefix(&buf).unwrap().0.clone(); + header.log_guid = Guid::new_random(); + header.checksum = 0; + + let header_bytes = header.as_bytes(); + buf[..header_bytes.len()].copy_from_slice(header_bytes); + let crc = format::compute_checksum(&buf, 4); + buf[4..8].copy_from_slice(&crc.to_le_bytes()); + file.write_at(format::HEADER_OFFSET_2, &buf).await.unwrap(); + + let result = VhdxFile::open(file).allow_replay(true).read_only().await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::NoValidLogEntries + ))) + )); + } + + #[async_test] + async fn open_invalid_file_identifier() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Corrupt the file identifier signature. + file.write_at(0, b"BADMAGIC").await.unwrap(); + + let result = VhdxFile::open(file).read_only().await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::InvalidFileIdentifier + ))) + )); + } + + #[async_test] + async fn open_empty_file() { + // File smaller than HEADER_AREA_SIZE (1 MiB). + let file = InMemoryFile::new(512); + let result = VhdxFile::open(file).read_only().await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::EmptyFile + ))) + )); + } + + #[async_test] + async fn open_bat_block_lookup() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + // A newly created dynamic disk has all blocks as NotPresent. + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::NotPresent); + assert_eq!(mapping.file_offset(), 0); + } + + #[async_test] + async fn open_bat_all_blocks_default() { + let disk_size = 4 * MB1; // Small disk → 2 blocks. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let block_count = (disk_size / vhdx.block_size() as u64) as u32; + + for block in 0..block_count { + let mapping = vhdx.bat.get_block_mapping(block); + assert_eq!(mapping.bat_state(), BatEntryState::NotPresent); + assert_eq!(mapping.file_offset(), 0); + } + } + + #[async_test] + async fn open_read_only_flag() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(vhdx.is_read_only()); + } + + #[async_test] + async fn open_populates_in_memory_bat() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + // All payload entries should be NotPresent. 
+ for i in 0..vhdx.bat.data_block_count { + assert_eq!( + vhdx.bat.get_block_mapping(i).bat_state(), + BatEntryState::NotPresent, + "block {i} should be NotPresent" + ); + } + } + + #[async_test] + async fn open_with_allocated_blocks() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = parse_region_tables(&file).await.unwrap(); + + // Manually write a FullyPresent BAT entry for block 0 at offset 4 MB + // (just after the metadata region, within the file). + // First extend the file to cover the block (4 MB offset + 2 MB block = 6 MB). + file.set_file_size(6 * MB1).await.unwrap(); + + let entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(4); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent,); + assert_eq!(mapping.file_megabyte(), 4); + } + + #[async_test] + async fn bat_lookup_is_synchronous() { + // Compile-time verification: get_block_mapping() is a regular fn, + // not an async fn. We call it without .await. + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::NotPresent); + } + + #[async_test] + async fn eof_counter_no_overlap() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open_inner(file, false, None, &OpenOptions::new()) + .await + .unwrap(); + let mut eof = vhdx.allocation_lock.lock().await; + let a = vhdx + .allocate_space(&mut eof, MB1 as u32, AllocateFlags::new()) + .await + .unwrap(); + let b = vhdx + .allocate_space(&mut eof, MB1 as u32, AllocateFlags::new()) + .await + .unwrap(); + // Two allocations must not overlap. + assert_ne!(a.file_offset, b.file_offset); + assert!(b.file_offset >= a.file_offset + MB1); + } + + #[async_test] + async fn eof_counter_mb_aligned() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open_inner(file, false, None, &OpenOptions::new()) + .await + .unwrap(); + let mut eof = vhdx.allocation_lock.lock().await; + let result = vhdx + .allocate_space(&mut eof, MB1 as u32, AllocateFlags::new()) + .await + .unwrap(); + assert_eq!(result.file_offset % MB1, 0, "offset must be MB1-aligned"); + } + + #[async_test] + async fn open_with_allocated_blocks_inits_space() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = parse_region_tables(&file).await.unwrap(); + + // Extend file to 8 MB then write a FullyPresent BAT entry at offset 4 MB. + file.set_file_size(8 * MB1).await.unwrap(); + + let entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(4); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + // The free space tracker should have offset 4*MB marked as in-use. 
+ let eof = vhdx.allocation_lock.lock().await; + assert!( + vhdx.free_space + .is_range_in_use(&eof, 4 * MB1, vhdx.block_size()) + ); + } + + #[async_test] + async fn non_differencing_no_locator() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(!vhdx.has_parent()); + assert!(vhdx.parent_locator().await.unwrap().is_none()); + } + + /// Helper: inject a parent locator metadata entry and blob into a diff disk. + /// + /// Reads the existing metadata table, appends a new entry for the parent + /// locator GUID, writes the locator blob at the entry's data offset, and + /// updates the metadata table header's entry count. + async fn inject_parent_locator(file: &InMemoryFile, locator_blob: &[u8]) { + use crate::format::{MetadataTableEntry, MetadataTableEntryFlags, MetadataTableHeader}; + use zerocopy::{FromBytes, IntoBytes}; + + let regions = parse_region_tables(file).await.unwrap(); + + // Read the full metadata table (first 64 KiB of metadata region). + let mut table_buf = vec![0u8; format::METADATA_TABLE_SIZE as usize]; + file.read_at(regions.metadata_offset, &mut table_buf) + .await + .unwrap(); + + // Parse header to get current entry count. + let mut header = MetadataTableHeader::read_from_prefix(&table_buf) + .unwrap() + .0 + .clone(); + let old_count = header.entry_count as usize; + let entry_size = size_of::(); + let header_size = size_of::(); + + // Find the max data offset used by existing entries to place our blob after them. + let mut max_data_end: u32 = format::METADATA_TABLE_SIZE as u32; + for i in 0..old_count { + let off = header_size + i * entry_size; + let entry = MetadataTableEntry::read_from_prefix(&table_buf[off..]) + .unwrap() + .0 + .clone(); + if entry.length > 0 { + let end = entry.offset + entry.length; + if end > max_data_end { + max_data_end = end; + } + } + } + + // Place the parent locator blob right after existing data. + let locator_offset = max_data_end; + + // Write the new entry. + let new_entry = MetadataTableEntry { + item_id: format::PARENT_LOCATOR_ITEM_GUID, + offset: locator_offset, + length: locator_blob.len() as u32, + flags: MetadataTableEntryFlags::new().with_is_required(true), + reserved2: 0, + }; + let new_entry_file_offset = header_size + old_count * entry_size; + let e_bytes = new_entry.as_bytes(); + table_buf[new_entry_file_offset..new_entry_file_offset + e_bytes.len()] + .copy_from_slice(e_bytes); + + // Update header entry count. + header.entry_count = (old_count + 1) as u16; + let h_bytes = header.as_bytes(); + table_buf[..h_bytes.len()].copy_from_slice(h_bytes); + + // Write back the metadata table. + file.write_at(regions.metadata_offset, &table_buf) + .await + .unwrap(); + + // Write the locator blob into the metadata region data area. + file.write_at( + regions.metadata_offset + locator_offset as u64, + locator_blob, + ) + .await + .unwrap(); + } + + #[async_test] + async fn differencing_has_locator() { + use crate::locator; + + // Create a differencing disk. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + // Build a parent locator blob and inject it into the metadata region. 
+ let locator_blob = locator::build_locator( + format::PARENT_LOCATOR_VHDX_TYPE_GUID, + &[ + ("parent_linkage", "{some-guid}"), + ("relative_path", ".\\parent.vhdx"), + ("absolute_win32_path", "C:\\VMs\\parent.vhdx"), + ], + ); + inject_parent_locator(&file, &locator_blob).await; + + // Open and verify. + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(vhdx.has_parent()); + + let loc = vhdx + .parent_locator() + .await + .unwrap() + .expect("should have locator"); + assert_eq!(loc.locator_type, format::PARENT_LOCATOR_VHDX_TYPE_GUID); + assert_eq!(loc.find("parent_linkage"), Some("{some-guid}")); + assert_eq!(loc.find("relative_path"), Some(".\\parent.vhdx")); + assert_eq!( + loc.find("absolute_win32_path"), + Some("C:\\VMs\\parent.vhdx") + ); + } + + #[async_test] + async fn parent_paths_extraction() { + use crate::locator; + + // Create a differencing disk with a parent locator. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let locator_blob = locator::build_locator( + format::PARENT_LOCATOR_VHDX_TYPE_GUID, + &[ + ("parent_linkage", "{some-guid}"), + ("relative_path", ".\\parent.vhdx"), + ("absolute_win32_path", "C:\\VMs\\parent.vhdx"), + ], + ); + inject_parent_locator(&file, &locator_blob).await; + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let loc = vhdx + .parent_locator() + .await + .unwrap() + .expect("should have locator"); + let paths = loc.parent_paths(); + assert_eq!(paths.parent_linkage.as_deref(), Some("{some-guid}")); + assert_eq!(paths.relative_path.as_deref(), Some(".\\parent.vhdx")); + assert_eq!( + paths.absolute_win32_path.as_deref(), + Some("C:\\VMs\\parent.vhdx") + ); + assert!(paths.volume_path.is_none()); + } + + #[async_test] + async fn differencing_missing_locator_errors() { + // Create a diff disk but don't write any locator data. + // create() doesn't add a parent locator entry, so read_item() will + // return MissingRequiredMetadata. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(vhdx.has_parent()); + let result = vhdx.parent_locator().await; + assert!(result.is_err()); + } + + // ----------------------------------------------------------------------- + // Log replay integration tests + // ----------------------------------------------------------------------- + + /// Inject a dirty log into a VHDX file: + /// 1. Write log entries using LogWriter + /// 2. Set the header's log_guid to match + /// 3. Update header CRC + /// + /// Returns the log_guid used. + async fn inject_dirty_log( + file: &InMemoryFile, + data_pages: &[log::DataPage<'_>], + zero_ranges: &[log::ZeroRange], + ) -> Guid { + // Read the active header (header 2, sequence_number=1 after create). + let mut hdr_buf = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_2, &mut hdr_buf) + .await + .unwrap(); + let header = Header::read_from_prefix(&hdr_buf).unwrap().0.clone(); + + let log_guid = Guid::new_random(); + let log_region = LogRegion { + file_offset: header.log_offset, + length: header.log_length, + }; + + // Initialize a LogWriter and write the entry. 
+ let file_size = file.file_size().await.unwrap(); + let mut writer = log::LogWriter::initialize(file, log_region, log_guid, file_size) + .await + .unwrap(); + + if !data_pages.is_empty() || !zero_ranges.is_empty() { + writer + .write_entry(file, data_pages, zero_ranges) + .await + .unwrap(); + } + + // Set log_guid in a new header with bumped sequence number. + // Write to header 1 (the non-current slot) with a higher sequence + // number so it becomes the active header. + let mut header_copy = header; + header_copy.log_guid = log_guid; + header_copy.sequence_number += 1; + header_copy.checksum = 0; + + let mut buf = vec![0u8; format::HEADER_SIZE as usize]; + let hdr_bytes = header_copy.as_bytes(); + buf[..hdr_bytes.len()].copy_from_slice(hdr_bytes); + let crc = format::compute_checksum(&buf, 4); + buf[4..8].copy_from_slice(&crc.to_le_bytes()); + + // Write to header 1 (which now has a higher seq, becoming active). + file.write_at(format::HEADER_OFFSET_1, &buf).await.unwrap(); + + log_guid + } + + #[async_test] + async fn open_replays_dirty_log_data() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Pick a target offset >= LOGABLE_OFFSET (192 KiB = region table offset). + // Use 320 KiB (= 5 * 64 KiB) to be past both region tables. + let target_offset: u64 = 5 * format::KB64; + + // Build a recognizable data pattern. + let pattern = [0xABu8; 4096]; + let data_page = log::DataPage { + file_offset: target_offset, + payload: &pattern, + }; + + inject_dirty_log(&file, &[data_page], &[]).await; + + // Open should replay the log and succeed. + let vhdx = VhdxFile::open(file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + assert_eq!(vhdx.disk_size(), format::GB1); + + // Verify the data pattern was written at the target offset via the + // Arc inside the VhdxFile. + let mut readback = [0u8; 4096]; + vhdx.file + .read_at(target_offset, &mut readback) + .await + .unwrap(); + assert_eq!(readback, pattern); + } + + #[async_test] + async fn open_replays_dirty_log_zeros() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Write non-zero data at a target offset first. + let target_offset: u64 = 5 * format::KB64; + let non_zero = [0xFFu8; 4096]; + file.write_at(target_offset, &non_zero).await.unwrap(); + + // Inject a dirty log with a zero descriptor targeting that offset. + let zero_range = log::ZeroRange { + file_offset: target_offset, + length: 4096, + }; + + inject_dirty_log(&file, &[], &[zero_range]).await; + + // Open should replay the log and succeed. + let vhdx = VhdxFile::open(file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + assert_eq!(vhdx.disk_size(), format::GB1); + + // Verify the range is now zeroed. + let mut readback = [0u8; 4096]; + vhdx.file + .read_at(target_offset, &mut readback) + .await + .unwrap(); + assert_eq!(readback, [0u8; 4096]); + } + + #[async_test] + async fn open_replay_then_reopen_clean() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + let target_offset: u64 = 5 * format::KB64; + let pattern = [0xCDu8; 4096]; + let data_page = log::DataPage { + file_offset: target_offset, + payload: &pattern, + }; + + inject_dirty_log(&file, &[data_page], &[]).await; + + // First open triggers replay. + let vhdx = VhdxFile::open(file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + // The clean header was written to the file inside vhdx. + // Make a snapshot of the replayed file for the second open. 
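+        // (`snapshot`/`from_snapshot` are assumed test-support helpers on
+        // InMemoryFile that copy the backing bytes, so the second open below
+        // observes exactly the post-replay on-disk state, including the
+        // header the replay path rewrote.)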
+ let snapshot = vhdx.file.snapshot(); + drop(vhdx); + + // Create a new InMemoryFile from the snapshot for the second open. + let file3 = InMemoryFile::from_snapshot(snapshot); + + // Second open should succeed without replay (log_guid is now ZERO). + let vhdx2 = VhdxFile::open(file3).read_only().await.unwrap(); + assert_eq!(vhdx2.disk_size(), format::GB1); + } + + #[async_test] + async fn open_replay_corrupt_log_entry() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + let target_offset: u64 = 5 * format::KB64; + let pattern = [0xEEu8; 4096]; + let data_page = log::DataPage { + file_offset: target_offset, + payload: &pattern, + }; + + let _log_guid = inject_dirty_log(&file, &[data_page], &[]).await; + + // Read the active header to find the log region offset. + let mut hdr_buf = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_1, &mut hdr_buf) + .await + .unwrap(); + let header = Header::read_from_prefix(&hdr_buf).unwrap().0.clone(); + + // Corrupt the first byte of the log region (flip a byte in the CRC + // of the log entry). + let mut corrupt_buf = [0u8; 1]; + file.read_at(header.log_offset + 4, &mut corrupt_buf) + .await + .unwrap(); + corrupt_buf[0] ^= 0xFF; + file.write_at(header.log_offset + 4, &corrupt_buf) + .await + .unwrap(); + + // Open should fail because there are no valid log entries for this GUID. + let result = VhdxFile::open(file).allow_replay(true).read_only().await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::NoValidLogEntries + ))) + )); + } + + #[async_test] + async fn open_read_only_dirty_log_rejected() { + let (file, _params) = InMemoryFile::create_test_vhdx(format::GB1).await; + + let target_offset: u64 = 5 * format::KB64; + let pattern = [0xBBu8; 4096]; + let data_page = log::DataPage { + file_offset: target_offset, + payload: &pattern, + }; + + inject_dirty_log(&file, &[data_page], &[]).await; + + // Read-only open with a dirty log should return LogReplayRequired. + let result = VhdxFile::open(file).read_only().await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::LogReplayRequired + ))) + )); + } +} diff --git a/vm/devices/storage/vhdx/src/region.rs b/vm/devices/storage/vhdx/src/region.rs new file mode 100644 index 0000000000..69e13f775f --- /dev/null +++ b/vm/devices/storage/vhdx/src/region.rs @@ -0,0 +1,527 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Region table parsing and validation for VHDX files. +//! +//! Reads both region tables, validates their signatures and CRC-32C checksums, +//! identifies BAT and metadata regions, and checks for overlaps and duplicates. + +use crate::AsyncFile; +use crate::cache::PageCache; +use crate::error::CorruptionType; +use crate::error::OpenError; +use crate::error::OpenErrorInner; +use crate::format; +use crate::format::RegionTableEntry; +use crate::format::RegionTableHeader; +use crate::log_task::LogData; +use crate::log_task::Lsn; +use std::sync::Arc; +use zerocopy::FromBytes; + +/// Parsed region table data. +pub(crate) struct ParsedRegions { + /// File offset of the BAT region. + pub bat_offset: u64, + /// Length of the BAT region in bytes. + pub bat_length: u32, + /// File offset of the metadata region. + pub metadata_offset: u64, + /// Length of the metadata region in bytes. + pub metadata_length: u32, + /// The validated region table bytes. Present only when the two on-disk + /// copies don't match and need rewriting. 
+ pub rewrite_data: Option, +} + +/// Read and validate a single 64 KiB region table from the file. +/// Returns the raw bytes if valid, or `None` if the table is corrupt. +async fn read_and_validate_region_table( + file: &F, + offset: u64, +) -> Result, OpenError> { + let buf = file.alloc_buffer(format::REGION_TABLE_SIZE as usize); + let buf = file + .read_into(offset, buf) + .await + .map_err(OpenErrorInner::Io)?; + let buf_ref = buf.as_ref(); + + // Check signature. + let header = match RegionTableHeader::read_from_prefix(buf_ref) { + Ok((h, _)) => h, + Err(_) => return Ok(None), + }; + if header.signature != format::REGION_TABLE_SIGNATURE { + return Ok(None); + } + + // Validate CRC-32C checksum (checksum field is at byte offset 4). + if !format::validate_checksum(buf_ref, 4) { + return Ok(None); + } + + Ok(Some(buf)) +} + +/// Read both region tables from the file, validate, and extract BAT/metadata +/// region locations. +pub(crate) async fn parse_region_tables( + file: &F, +) -> Result, OpenError> { + let table1 = read_and_validate_region_table(file, format::REGION_TABLE_OFFSET).await?; + let table2 = read_and_validate_region_table(file, format::ALT_REGION_TABLE_OFFSET).await?; + + let (table, needs_rewrite) = match (table1, table2) { + (Some(t1), Some(t2)) => { + let needs_rewrite = t1.as_ref() != t2.as_ref(); + (t1, needs_rewrite) + } + (Some(t1), None) => (t1, true), + (None, Some(t2)) => (t2, true), + (None, None) => return Err(CorruptionType::RegionTablesBothCorrupt.into()), + }; + + // Parse the header to get entry count. + let header = RegionTableHeader::read_from_prefix(table.as_ref()) + .unwrap() + .0 + .clone(); + + if header.entry_count as u64 > format::REGION_TABLE_MAX_ENTRY_COUNT { + return Err(CorruptionType::InvalidEntryCountInRegionTable.into()); + } + if header.reserved != 0 { + return Err(CorruptionType::ReservedRegionTableFieldNonzero.into()); + } + + // Parse all entries. + let entry_size = size_of::(); + let header_size = size_of::(); + let mut entries = Vec::with_capacity(header.entry_count as usize); + for i in 0..header.entry_count as usize { + let offset = header_size + i * entry_size; + let entry = RegionTableEntry::read_from_prefix(&table.as_ref()[offset..]) + .unwrap() + .0 + .clone(); + + let supported = u32::from(format::RegionTableEntryFlags::new().with_required(true)); + if u32::from(entry.flags) & !supported != 0 { + return Err(CorruptionType::ReservedRegionTableFieldNonzero.into()); + } + entries.push(entry); + } + + // Sort by GUID for duplicate detection. + entries.sort_by_key(|a| a.guid); + + // Check for duplicate GUIDs. + for i in 1..entries.len() { + if entries[i].guid == entries[i - 1].guid { + return Err(CorruptionType::DuplicateRegionEntry.into()); + } + } + + // Validate each entry's offset and length. + for entry in &entries { + if entry.length == 0 { + return Err(CorruptionType::OffsetOrLengthInRegionTable.into()); + } + if !entry.file_offset.is_multiple_of(format::REGION_ALIGNMENT) + || !(entry.length as u64).is_multiple_of(format::REGION_ALIGNMENT) + { + return Err(CorruptionType::OffsetOrLengthInRegionTable.into()); + } + } + + // Check for overlapping regions by sorting by offset. + let mut by_offset: Vec<(u64, u64)> = entries + .iter() + .map(|e| (e.file_offset, e.length as u64)) + .collect(); + // Also include the 1 MiB header area as a reserved region. 
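+    // (The fixed header area occupies file offsets [0, 1 MiB); adding it as
+    // an implicit entry lets the same sorted-overlap scan below also reject a
+    // region entry whose offset falls inside the headers, e.g. one claiming
+    // to start at offset 0.)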
+ by_offset.push((0, format::HEADER_AREA_SIZE)); + by_offset.sort_by_key(|&(offset, _)| offset); + + for i in 1..by_offset.len() { + let prev_end = by_offset[i - 1] + .0 + .checked_add(by_offset[i - 1].1) + .ok_or(CorruptionType::OffsetOrLengthInRegionTable)?; + if prev_end > by_offset[i].0 { + return Err(CorruptionType::OffsetOrLengthInRegionTable.into()); + } + } + + // Identify known regions. + let mut bat_offset = None; + let mut bat_length = None; + let mut metadata_offset = None; + let mut metadata_length = None; + + for entry in &entries { + if entry.guid == format::BAT_REGION_GUID { + bat_offset = Some(entry.file_offset); + bat_length = Some(entry.length); + } else if entry.guid == format::METADATA_REGION_GUID { + metadata_offset = Some(entry.file_offset); + metadata_length = Some(entry.length); + } else if entry.flags.required() { + return Err(CorruptionType::UnknownRequiredRegion.into()); + } + // Unknown non-required regions are silently ignored. + } + + let bat_offset = bat_offset.ok_or(CorruptionType::MissingBatOrMetadataRegion)?; + let bat_length = bat_length.ok_or(CorruptionType::MissingBatOrMetadataRegion)?; + let metadata_offset = metadata_offset.ok_or(CorruptionType::MissingBatOrMetadataRegion)?; + let metadata_length = metadata_length.ok_or(CorruptionType::MissingBatOrMetadataRegion)?; + + Ok(ParsedRegions { + bat_offset, + bat_length, + metadata_offset, + metadata_length, + rewrite_data: if needs_rewrite { + Some(table.clone()) + } else { + None + }, + }) +} + +/// Write the region table to both on-disk slots via the write-ahead log. +/// +/// Called during [`VhdxBuilder::writable`](crate::open::VhdxBuilder::writable) +/// when one region table was corrupt or the two copies didn't match. Acquires +/// log permits, sends the pages through [`PageCache::commit_raw`], and returns +/// the LSN. The caller must wait for the LSN and flush to make the writes +/// durable. +pub(crate) async fn rewrite_region_tables( + cache: &PageCache, + log_permits: &crate::log_permits::LogPermits, + table: F::Buffer, +) -> Result { + assert_eq!( + table.as_ref().len(), + format::REGION_TABLE_SIZE as usize, + "region table must be exactly {} bytes", + format::REGION_TABLE_SIZE + ); + + let log_data_page_size = format::LOG_SECTOR_SIZE as usize; + let pages_per_table = format::REGION_TABLE_SIZE as usize / log_data_page_size; + let total_pages = pages_per_table * 2; + let table = Arc::new(table); + let mut pages = Vec::with_capacity(2); + + for base_offset in [format::REGION_TABLE_OFFSET, format::ALT_REGION_TABLE_OFFSET] { + pages.push(LogData::new(base_offset, table.clone())); + } + + log_permits.acquire(total_pages).await?; + Ok(cache.commit_raw(pages, None)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::error::OpenErrorInner; + use crate::open::VhdxFile; + use crate::tests::support::InMemoryFile; + use pal_async::DefaultDriver; + use pal_async::async_test; + use zerocopy::IntoBytes; + + #[async_test] + async fn parse_valid_region_tables() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = parse_region_tables(&file).await.unwrap(); + + // Metadata at 2 MiB, BAT at 3 MiB (based on create layout). 
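+        // (rewrite_data being None below additionally confirms that both
+        // on-disk table copies were valid and identical, so no repair is
+        // pending.)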
+ assert_eq!(regions.metadata_offset, 2 * format::MB1); + assert_eq!( + regions.metadata_length, + format::DEFAULT_METADATA_REGION_SIZE + ); + assert_eq!(regions.bat_offset, 3 * format::MB1); + assert!(regions.rewrite_data.is_none()); + } + + #[async_test] + async fn parse_one_corrupt_table() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Corrupt the first region table's CRC. + let mut buf = vec![0u8; format::REGION_TABLE_SIZE as usize]; + file.read_at(format::REGION_TABLE_OFFSET, &mut buf) + .await + .unwrap(); + buf[10] ^= 0xFF; + file.write_at(format::REGION_TABLE_OFFSET, &buf) + .await + .unwrap(); + + let regions = parse_region_tables(&file).await.unwrap(); + assert!(regions.rewrite_data.is_some()); + // Should still parse successfully using table 2. + assert_eq!(regions.metadata_offset, 2 * format::MB1); + } + + #[async_test] + async fn parse_both_corrupt() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Corrupt both region tables. + for offset in [format::REGION_TABLE_OFFSET, format::ALT_REGION_TABLE_OFFSET] { + let mut buf = vec![0u8; format::REGION_TABLE_SIZE as usize]; + file.read_at(offset, &mut buf).await.unwrap(); + buf[10] ^= 0xFF; + file.write_at(offset, &buf).await.unwrap(); + } + + let result = parse_region_tables(&file).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::RegionTablesBothCorrupt + ))) + )); + } + + /// Helper to build a custom region table and write it to both locations. + async fn write_custom_region_table(file: &InMemoryFile, entries: &[RegionTableEntry]) { + let mut buf = vec![0u8; format::REGION_TABLE_SIZE as usize]; + let header = RegionTableHeader { + signature: format::REGION_TABLE_SIGNATURE, + checksum: 0, + entry_count: entries.len() as u32, + reserved: 0, + }; + let h_bytes = header.as_bytes(); + buf[..h_bytes.len()].copy_from_slice(h_bytes); + + let entry_start = size_of::(); + for (i, entry) in entries.iter().enumerate() { + let off = entry_start + i * size_of::(); + let e_bytes = entry.as_bytes(); + buf[off..off + e_bytes.len()].copy_from_slice(e_bytes); + } + + let crc = format::compute_checksum(&buf, 4); + buf[4..8].copy_from_slice(&crc.to_le_bytes()); + + file.write_at(format::REGION_TABLE_OFFSET, &buf) + .await + .unwrap(); + file.write_at(format::ALT_REGION_TABLE_OFFSET, &buf) + .await + .unwrap(); + } + + #[async_test] + async fn parse_missing_bat_region() { + let file = InMemoryFile::new(format::HEADER_AREA_SIZE); + // Only metadata region, no BAT. 
+ let entries = vec![RegionTableEntry { + guid: format::METADATA_REGION_GUID, + file_offset: 2 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }]; + write_custom_region_table(&file, &entries).await; + + let result = parse_region_tables(&file).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::MissingBatOrMetadataRegion + ))) + )); + } + + #[async_test] + async fn parse_duplicate_region() { + let file = InMemoryFile::new(format::HEADER_AREA_SIZE); + let entries = vec![ + RegionTableEntry { + guid: format::BAT_REGION_GUID, + file_offset: 2 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + RegionTableEntry { + guid: format::BAT_REGION_GUID, + file_offset: 3 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + ]; + write_custom_region_table(&file, &entries).await; + + let result = parse_region_tables(&file).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::DuplicateRegionEntry + ))) + )); + } + + async fn corrupt_region_table_bytes( + file: &InMemoryFile, + entries: &[RegionTableEntry], + corrupt: impl FnOnce(&mut [u8]), + ) { + write_custom_region_table(file, entries).await; + + let mut buf = vec![0u8; format::REGION_TABLE_SIZE as usize]; + file.read_at(format::REGION_TABLE_OFFSET, &mut buf) + .await + .unwrap(); + corrupt(&mut buf); + let crc = format::compute_checksum(&buf, 4); + buf[4..8].copy_from_slice(&crc.to_le_bytes()); + + file.write_at(format::REGION_TABLE_OFFSET, &buf) + .await + .unwrap(); + file.write_at(format::ALT_REGION_TABLE_OFFSET, &buf) + .await + .unwrap(); + } + + #[async_test] + async fn parse_region_table_with_nonzero_reserved_header() { + let file = InMemoryFile::new(format::HEADER_AREA_SIZE); + let entries = vec![ + RegionTableEntry { + guid: format::BAT_REGION_GUID, + file_offset: 2 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + RegionTableEntry { + guid: format::METADATA_REGION_GUID, + file_offset: 3 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + ]; + corrupt_region_table_bytes(&file, &entries, |buf| { + buf[12..16].copy_from_slice(&1_u32.to_le_bytes()); + }) + .await; + + let result = parse_region_tables(&file).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::ReservedRegionTableFieldNonzero + ))) + )); + } + + #[async_test] + async fn parse_region_table_with_reserved_entry_flags() { + let file = InMemoryFile::new(format::HEADER_AREA_SIZE); + let entries = vec![ + RegionTableEntry { + guid: format::BAT_REGION_GUID, + file_offset: 2 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + RegionTableEntry { + guid: format::METADATA_REGION_GUID, + file_offset: 3 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + ]; + corrupt_region_table_bytes(&file, &entries, |buf| { + let entry_flags_offset = size_of::() + 28; + buf[entry_flags_offset..entry_flags_offset + 4].copy_from_slice(&3_u32.to_le_bytes()); + }) + .await; + + let result = parse_region_tables(&file).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + 
CorruptionType::ReservedRegionTableFieldNonzero + ))) + )); + } + + #[async_test] + async fn parse_overlapping_regions() { + let file = InMemoryFile::new(format::HEADER_AREA_SIZE); + // Two regions that overlap at the 2 MiB mark. + let entries = vec![ + RegionTableEntry { + guid: format::BAT_REGION_GUID, + file_offset: 2 * format::MB1, + length: 2 * format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + RegionTableEntry { + guid: format::METADATA_REGION_GUID, + file_offset: 3 * format::MB1, + length: format::MB1 as u32, + flags: format::RegionTableEntryFlags::new().with_required(true), + }, + ]; + write_custom_region_table(&file, &entries).await; + + let result = parse_region_tables(&file).await; + assert!(matches!( + result, + Err(OpenError(OpenErrorInner::Corrupt( + CorruptionType::OffsetOrLengthInRegionTable + ))) + )); + } + + #[async_test] + async fn rewrite_repairs_corrupt_table(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Corrupt the first region table. + let mut buf = vec![0u8; format::REGION_TABLE_SIZE as usize]; + file.read_at(format::REGION_TABLE_OFFSET, &mut buf) + .await + .unwrap(); + buf[10] ^= 0xFF; + file.write_at(format::REGION_TABLE_OFFSET, &buf) + .await + .unwrap(); + + // Opening writable should detect and repair the mismatch via the log. + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let file_ref = vhdx.file.clone(); + vhdx.close().await.unwrap(); + + // Parse again — both should match now. + let regions2 = parse_region_tables(&*file_ref).await.unwrap(); + assert!( + regions2.rewrite_data.is_none(), + "tables should match after rewrite" + ); + + // Verify both on-disk copies are identical. + let mut t1 = vec![0u8; format::REGION_TABLE_SIZE as usize]; + let mut t2 = vec![0u8; format::REGION_TABLE_SIZE as usize]; + file_ref + .read_at(format::REGION_TABLE_OFFSET, &mut t1) + .await + .unwrap(); + file_ref + .read_at(format::ALT_REGION_TABLE_OFFSET, &mut t2) + .await + .unwrap(); + assert_eq!(t1, t2, "both region tables should be identical"); + } +} diff --git a/vm/devices/storage/vhdx/src/sector_bitmap.rs b/vm/devices/storage/vhdx/src/sector_bitmap.rs new file mode 100644 index 0000000000..b29e1309ef --- /dev/null +++ b/vm/devices/storage/vhdx/src/sector_bitmap.rs @@ -0,0 +1,688 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Sector bitmap read logic for partially-present VHDX blocks. +//! +//! A sector bitmap is a 1-MiB block of bits where each bit represents one +//! logical sector. Bit = 1 means the sector's data is present in this VHDX +//! file; bit = 0 means the sector is transparent (should be read from the +//! parent disk in a differencing chain). +//! +//! The bitmap is cached in 4-KiB pages via the [`PageCache`]. Each page +//! covers `4096 * 8 = 32768` sectors. + +use crate::AsyncFile; +use crate::cache::PageKey; +use crate::cache::WriteMode; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::format::BatEntryState; +use crate::format::CACHE_PAGE_SIZE; +use crate::format::SECTORS_PER_CHUNK; +use crate::io::ReadRange; +use crate::open::VhdxFile; +use bitvec::prelude::*; + +/// Cache tag for sector bitmap pages. +/// +/// SBM pages are at absolute file offsets (not region-relative), so the +/// base offset for this tag is 0. +pub(crate) const SBM_TAG: u8 = 2; + +/// Number of sectors tracked per bitmap cache page (4 KiB * 8 bits = 32768). 
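+/// With 512-byte logical sectors that is 16 MiB of guest address space per
+/// cached page, and the full 1 MiB bitmap block (2^23 bits) covers one chunk
+/// of 2^23 sectors.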
+const SECTORS_PER_BITMAP_PAGE: u64 = CACHE_PAGE_SIZE * 8; + +impl VhdxFile { + /// Resolve a read for a partially-present block by reading the sector bitmap. + /// + /// For each sector in the range, checks the corresponding bit in the sector + /// bitmap. Emits runs of [`ReadRange::Data`] (bit=1, sector present in file) + /// and [`ReadRange::Unmapped`] (bit=0, sector transparent to parent). + /// + /// # Arguments + /// + /// * `data_file_offset` - The file offset of the data block (from the + /// payload BAT entry). Used to compute file offsets for present sectors. + /// * `virtual_offset` - The virtual disk byte offset of the start of this + /// sub-request (already clamped to a single block). + /// * `length` - The length in bytes (already clamped to a single block). + /// * `ranges` - Output vector to append ranges to. + pub(crate) async fn resolve_partial_block_read( + &self, + data_file_offset: u64, + virtual_offset: u64, + length: u32, + ranges: &mut Vec, + ) -> Result<(), VhdxIoError> { + // 1. Compute sector coordinates. + let sector_number = virtual_offset / self.logical_sector_size as u64; + let chunk_number = (sector_number / SECTORS_PER_CHUNK) as u32; + let sector_count = length as u64 / self.logical_sector_size as u64; + + // 2. Get sector bitmap block mapping (synchronous). + // The SBM is guaranteed to be allocated — validated during BAT + // loading (PartiallyPresentWithoutSectorBitmap check) and + // maintained at runtime by ensure_sbm_allocated. + let sbm_mapping = self.bat.get_sector_bitmap_mapping(chunk_number); + assert!( + sbm_mapping.bat_state() == BatEntryState::FullyPresent, + "SBM for chunk {chunk_number} must be allocated for PartiallyPresent block" + ); + + // 3. Iterate over bitmap pages (outer loop for multi-page support). + let mut remaining_sectors = sector_count; + let mut current_virtual_offset = virtual_offset; + + while remaining_sectors > 0 { + // Recompute bitmap page coordinates for current position. + let cur_sector = current_virtual_offset / self.logical_sector_size as u64; + let cur_chunk_sector = cur_sector % SECTORS_PER_CHUNK; + let cur_page_number = cur_chunk_sector / SECTORS_PER_BITMAP_PAGE; + let start_bit = cur_chunk_sector % SECTORS_PER_BITMAP_PAGE; + let bits_in_this_page = + std::cmp::min(start_bit + remaining_sectors, SECTORS_PER_BITMAP_PAGE); + + // Acquire the bitmap page for this portion. + let page_file_offset = sbm_mapping.file_offset() + cur_page_number * CACHE_PAGE_SIZE; + { + let guard = self + .cache + .acquire_read(PageKey { + tag: SBM_TAG, + offset: page_file_offset, + }) + .await + .map_err(VhdxIoErrorInner::ReadSectorBitmap)?; + + // Scan bits within this page using BitSlice for word-level acceleration. + let bits = BitSlice::::from_slice(&*guard); + let window = &bits[start_bit as usize..bits_in_this_page as usize]; + let mut pos = 0usize; + let len = window.len(); + while pos < len { + // Find first set bit (data present). + let one = window[pos..].first_one().map_or(len, |i| pos + i); + if one > pos { + let unmapped_sectors = (one - pos) as u64; + let unmapped_bytes = unmapped_sectors * self.logical_sector_size as u64; + ranges.push(ReadRange::Unmapped { + guest_offset: current_virtual_offset, + length: unmapped_bytes as u32, + }); + current_virtual_offset += unmapped_bytes; + } + + if one < len { + // Find first clear bit (end of data run). 
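+                    // (Example: a window of bits 1,1,0,1 with `one` = 0 gives
+                    // `next_zero` = 2, so a two-sector Data range is emitted
+                    // and the scan resumes at bit 2.)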
+ let next_zero = window[one..].first_zero().map_or(len, |i| one + i); + let data_sectors = (next_zero - one) as u64; + let data_bytes = data_sectors * self.logical_sector_size as u64; + let block_offset = (current_virtual_offset % self.block_size as u64) as u32; + let file_offset = data_file_offset + block_offset as u64; + ranges.push(ReadRange::Data { + guest_offset: current_virtual_offset, + length: data_bytes as u32, + file_offset, + }); + current_virtual_offset += data_bytes; + pos = next_zero; + } else { + pos = len; + } + } + } + + // Advance to next page. + let sectors_processed = bits_in_this_page - start_bit; + remaining_sectors -= sectors_processed; + } + + Ok(()) + } + + /// Set or clear sector bitmap bits for a range of sectors. + /// + /// For each sector in the virtual range, sets (or clears) the corresponding + /// bit in the sector bitmap. The bitmap page is acquired in Modify mode + /// and written through to disk on release. + /// + /// # Arguments + /// + /// * `virtual_offset` - Virtual disk byte offset of the start of the range. + /// * `length` - Length in bytes. + /// * `set` - If true, set bits (mark sectors present); if false, clear bits. + pub(crate) async fn set_sector_bitmap_bits( + &self, + virtual_offset: u64, + length: u32, + set: bool, + ) -> Result<(), VhdxIoError> { + let sector_number = virtual_offset / self.logical_sector_size as u64; + let chunk_number = (sector_number / SECTORS_PER_CHUNK) as u32; + let sector_count = length as u64 / self.logical_sector_size as u64; + + // Get sector bitmap block mapping (synchronous). + // The SBM is guaranteed to be allocated — validated during BAT + // loading and maintained at runtime by ensure_sbm_allocated. + let sbm_mapping = self.bat.get_sector_bitmap_mapping(chunk_number); + assert!( + sbm_mapping.bat_state() == BatEntryState::FullyPresent, + "SBM for chunk {chunk_number} must be allocated for PartiallyPresent block" + ); + + let mut remaining_sectors = sector_count; + let mut current_virtual_offset = virtual_offset; + + while remaining_sectors > 0 { + let cur_sector = current_virtual_offset / self.logical_sector_size as u64; + let cur_chunk_sector = cur_sector % SECTORS_PER_CHUNK; + let cur_page_number = cur_chunk_sector / SECTORS_PER_BITMAP_PAGE; + let start_bit = cur_chunk_sector % SECTORS_PER_BITMAP_PAGE; + let bits_in_this_page = + std::cmp::min(start_bit + remaining_sectors, SECTORS_PER_BITMAP_PAGE); + + let page_file_offset = sbm_mapping.file_offset() + cur_page_number * CACHE_PAGE_SIZE; + + // If the range covers the entire page, skip the disk read. + let full_page = start_bit == 0 && bits_in_this_page == SECTORS_PER_BITMAP_PAGE; + let mode = if full_page { + WriteMode::Overwrite + } else { + WriteMode::Modify + }; + + let mut guard = self + .cache + .acquire_write( + PageKey { + tag: SBM_TAG, + offset: page_file_offset, + }, + mode, + ) + .await + .map_err(VhdxIoErrorInner::SectorBitmapCache)?; + + if full_page { + // Overwrite entire page without reading existing data. + // Overwriting pages are zero-initialized by the cache. + if set || !guard.is_overwriting() { + guard.fill(if set { 0xFF } else { 0x00 }); + } + } else { + // Check via read-only Deref whether any bits actually differ. + // If not, DerefMut is never called, the page stays clean, + // and no write-back occurs. 
+ let bits = BitSlice::::from_slice(&*guard); + let window = &bits[start_bit as usize..bits_in_this_page as usize]; + let needs_change = if set { !window.all() } else { window.any() }; + + if needs_change { + let bits_mut = BitSlice::::from_slice_mut(&mut *guard); + bits_mut[start_bit as usize..bits_in_this_page as usize].fill(set); + } + } + + let sectors_processed = bits_in_this_page - start_bit; + remaining_sectors -= sectors_processed; + current_virtual_offset += sectors_processed * self.logical_sector_size as u64; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::AsyncFileExt; + use crate::bat::Bat; + use crate::create::{self, CreateParams}; + use crate::format; + use crate::format::BatEntry; + use crate::io::ReadRange; + use crate::open::VhdxFile; + use crate::region; + use crate::tests::support::InMemoryFile; + use pal_async::DefaultDriver; + use pal_async::async_test; + use zerocopy::IntoBytes; + + /// Create a differencing VHDX with block 0 set to PartiallyPresent and + /// a sector bitmap at a known file offset. + /// + /// The `bitmap_data` should be exactly 4096 bytes of bitmap data for the + /// first bitmap page. + /// + /// Returns `(VhdxFile, data_block_file_offset, sbm_block_file_offset)`. + async fn create_partial_block_vhdx( + bitmap_data: &[u8; 4096], + ) -> (VhdxFile, u64, u64) { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + // Parse regions to find the BAT offset. + let regions = region::parse_region_tables(&file).await.unwrap(); + let bat_offset = regions.bat_offset; + + // Compute entry indices. With 2 MiB blocks, 512-byte sectors, + // chunk_ratio = 2048. Block 0 is payload entry 0. SBM entry for + // chunk 0 is at index chunk_ratio = 2048. + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + true, + format::MB1 as u32, + ) + .unwrap(); + let payload_index = bat.payload_entry_index(0); + let sbm_index = bat.sector_bitmap_entry_index(0); + + // Place data block at 8 MiB (file_offset_mb = 8). + let data_block_offset = 8 * format::MB1; + let data_entry = BatEntry::new() + .with_state(BatEntryState::PartiallyPresent as u8) + .with_file_offset_mb(data_block_offset >> 20); + file.write_at(bat_offset + payload_index as u64 * 8, data_entry.as_bytes()) + .await + .unwrap(); + + // Place SBM block at 10 MiB (file_offset_mb = 10). + let sbm_block_offset = 10 * format::MB1; + let sbm_entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(sbm_block_offset >> 20); + file.write_at(bat_offset + sbm_index as u64 * 8, sbm_entry.as_bytes()) + .await + .unwrap(); + + // Write the bitmap data at the SBM page offset (first page of SBM block). + file.write_at(sbm_block_offset, bitmap_data).await.unwrap(); + + // Open the VHDX. + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + (vhdx, data_block_offset, sbm_block_offset) + } + + #[async_test] + async fn partial_block_all_present() { + // All bits set → single Data range. + let bitmap = [0xFFu8; 4096]; + let (vhdx, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + // Read first 4096 bytes (8 sectors * 512) of block 0. 
+ vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 0, + length: 4096, + file_offset: data_offset, + } + ); + } + + #[async_test] + async fn partial_block_all_transparent() { + // All bits clear → single Unmapped range. + let bitmap = [0x00u8; 4096]; + let (vhdx, _, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Unmapped { + guest_offset: 0, + length: 4096, + } + ); + } + + #[async_test] + async fn partial_block_mixed() { + // First 4 sectors (bits 0-3) set, next 4 (bits 4-7) clear. + // Byte 0 = 0x0F (bits 0-3 set, 4-7 clear). + let mut bitmap = [0x00u8; 4096]; + bitmap[0] = 0x0F; + let (vhdx, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + // Read 8 sectors = 4096 bytes. + vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 2); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 0, + length: 2048, // 4 sectors * 512 + file_offset: data_offset, + } + ); + assert_eq!( + ranges[1], + ReadRange::Unmapped { + guest_offset: 2048, + length: 2048, + } + ); + } + + #[async_test] + async fn partial_block_alternating() { + // Alternating: sector 0 set, 1 clear, 2 set, 3 clear, ... + // Byte pattern: 0b01010101 = 0x55 → bits 0,2,4,6 set + let mut bitmap = [0x00u8; 4096]; + bitmap[0] = 0x55; + let (vhdx, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + // Read 8 sectors = 4096 bytes. + vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + // 0x55 = 0b01010101: bits 0,2,4,6 set; bits 1,3,5,7 clear. 
+ // Expected: Data(0) Unmapped(1) Data(2) Unmapped(3) + // Data(4) Unmapped(5) Data(6) Unmapped(7) + assert_eq!(ranges.len(), 8); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 0, + length: 512, + file_offset: data_offset, + } + ); + assert_eq!( + ranges[1], + ReadRange::Unmapped { + guest_offset: 512, + length: 512, + } + ); + assert_eq!( + ranges[2], + ReadRange::Data { + guest_offset: 1024, + length: 512, + file_offset: data_offset + 1024, + } + ); + assert_eq!( + ranges[3], + ReadRange::Unmapped { + guest_offset: 1536, + length: 512, + } + ); + assert_eq!( + ranges[4], + ReadRange::Data { + guest_offset: 2048, + length: 512, + file_offset: data_offset + 2048, + } + ); + assert_eq!( + ranges[5], + ReadRange::Unmapped { + guest_offset: 2560, + length: 512, + } + ); + assert_eq!( + ranges[6], + ReadRange::Data { + guest_offset: 3072, + length: 512, + file_offset: data_offset + 3072, + } + ); + assert_eq!( + ranges[7], + ReadRange::Unmapped { + guest_offset: 3584, + length: 512, + } + ); + } + + #[async_test] + async fn partial_block_single_sector_present() { + // Only bit 2 set: Unmapped(0,1024) Data(1024,512) Unmapped(1536,2560) + let mut bitmap = [0x00u8; 4096]; + bitmap[0] = 0x04; // bit 2 set + let (vhdx, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 3); + assert_eq!( + ranges[0], + ReadRange::Unmapped { + guest_offset: 0, + length: 1024, // 2 sectors + } + ); + assert_eq!( + ranges[1], + ReadRange::Data { + guest_offset: 1024, + length: 512, + file_offset: data_offset + 1024, + } + ); + assert_eq!( + ranges[2], + ReadRange::Unmapped { + guest_offset: 1536, + length: 2560, // 5 sectors + } + ); + } + + #[async_test] + async fn partial_block_first_and_last_sector() { + // Bits 0 and 7 set in byte 0: Data(0,512) Unmapped(512,3072) Data(3584,512) + let mut bitmap = [0x00u8; 4096]; + bitmap[0] = 0x81; // bits 0 and 7 set + let (vhdx, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 3); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 0, + length: 512, + file_offset: data_offset, + } + ); + assert_eq!( + ranges[1], + ReadRange::Unmapped { + guest_offset: 512, + length: 3072, // 6 sectors + } + ); + assert_eq!( + ranges[2], + ReadRange::Data { + guest_offset: 3584, + length: 512, + file_offset: data_offset + 3584, + } + ); + } + + #[async_test] + async fn partial_block_read_at_offset() { + // All bits set. Read starting at sector 4 (offset 2048 within block). + let bitmap = [0xFFu8; 4096]; + let (vhdx, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + let mut ranges = Vec::new(); + // Read 4 sectors starting at byte offset 2048. + vhdx.resolve_read(2048, 2048, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 2048, + length: 2048, + file_offset: data_offset + 2048, + } + ); + } + + #[async_test] + async fn partial_block_unallocated_sbm_error() { + // Set up a PartiallyPresent data block but leave the SBM entry as NotPresent. 
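+        // (Per the validation done at BAT load time -- the
+        // PartiallyPresentWithoutSectorBitmap check referenced in
+        // resolve_partial_block_read -- a PartiallyPresent payload block is
+        // only valid when its chunk's sector-bitmap block is FullyPresent,
+        // so the open below is expected to fail.)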
+ let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + + let regions = region::parse_region_tables(&file).await.unwrap(); + let bat_offset = regions.bat_offset; + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + true, + format::MB1 as u32, + ) + .unwrap(); + let payload_index = bat.payload_entry_index(0); + + // Set block 0 to PartiallyPresent but do NOT set the SBM entry. + let data_block_offset = 8 * format::MB1; + let data_entry = BatEntry::new() + .with_state(BatEntryState::PartiallyPresent as u8) + .with_file_offset_mb(data_block_offset >> 20); + file.write_at(bat_offset + payload_index as u64 * 8, data_entry.as_bytes()) + .await + .unwrap(); + + // Extend file to cover the data block offset + block size. + let needed = data_block_offset + format::DEFAULT_BLOCK_SIZE as u64; + file.set_file_size(needed).await.unwrap(); + + // Open should fail because the PartiallyPresent block has no + // corresponding SBM allocation. + let result = VhdxFile::open(file).read_only().await; + assert!( + result.is_err(), + "open should reject PartiallyPresent block without SBM" + ); + } + + #[async_test] + async fn set_sector_bitmap_bits_roundtrip(driver: DefaultDriver) { + // Create a differencing VHDX with all-zero bitmap (all transparent). + // This test writes SBM bits, so it needs a writable VhdxFile. + let bitmap = [0x00u8; 4096]; + let (_, data_offset, _) = create_partial_block_vhdx(&bitmap).await; + + // Re-create the same setup but open writable. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let regions = region::parse_region_tables(&file).await.unwrap(); + let bat_offset = regions.bat_offset; + let bat = Bat::new( + format::GB1, + format::DEFAULT_BLOCK_SIZE, + 512, + true, + format::MB1 as u32, + ) + .unwrap(); + let payload_index = bat.payload_entry_index(0); + let sbm_index = bat.sector_bitmap_entry_index(0); + + let data_block_offset = 8 * format::MB1; + let data_entry = BatEntry::new() + .with_state(BatEntryState::PartiallyPresent as u8) + .with_file_offset_mb(data_block_offset >> 20); + file.write_at(bat_offset + payload_index as u64 * 8, data_entry.as_bytes()) + .await + .unwrap(); + + let sbm_block_offset = 10 * format::MB1; + let sbm_entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(sbm_block_offset >> 20); + file.write_at(bat_offset + sbm_index as u64 * 8, sbm_entry.as_bytes()) + .await + .unwrap(); + + file.write_at(sbm_block_offset, &bitmap).await.unwrap(); + + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Verify initial state: sectors 0-7 are transparent. + let mut ranges = Vec::new(); + vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Unmapped { + guest_offset: 0, + length: 4096, + } + ); + + // Set bits for sectors 0-3 (first 2048 bytes). + vhdx.set_sector_bitmap_bits( + 0, // virtual_offset + 2048, // length (4 sectors * 512) + true, // set + ) + .await + .unwrap(); + + // Now read again: first 4 sectors should be Data, last 4 Unmapped. 
+ let mut ranges2 = Vec::new(); + vhdx.resolve_read(0, 4096, &mut ranges2).await.unwrap(); + assert_eq!(ranges2.len(), 2); + assert_eq!( + ranges2[0], + ReadRange::Data { + guest_offset: 0, + length: 2048, + file_offset: data_offset, + } + ); + assert_eq!( + ranges2[1], + ReadRange::Unmapped { + guest_offset: 2048, + length: 2048, + } + ); + } +} diff --git a/vm/devices/storage/vhdx/src/space.rs b/vm/devices/storage/vhdx/src/space.rs new file mode 100644 index 0000000000..efba84cef0 --- /dev/null +++ b/vm/devices/storage/vhdx/src/space.rs @@ -0,0 +1,2000 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Free space management for VHDX files. +//! +//! Tracks which megabyte-granularity regions of the file are free, in-use, +//! or soft-anchored (from trimmed blocks). Implements a four-priority +//! allocation strategy: +//! +//! 1. **Free space pool** — reuse interior free blocks +//! 2. **Near-EOF space** — allocate from zeroed space before file end +//! 3. **Soft-anchored blocks** — reclaim trimmed blocks (in-memory only) +//! 4. **Extend EOF** — grow the file +//! +//! The bitmap uses 1-bit-per-megabyte granularity with SET = free / anchored +//! and CLEAR = in-use. + +use crate::bat::Bat; +use crate::error::CorruptionType; +use crate::error::OpenError; +use crate::error::OpenErrorInner; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::format::BatEntryState; +use crate::format::MB1; +use bitfield_struct::bitfield; +use bitvec::prelude::*; +use parking_lot::Mutex; +use std::collections::HashMap; + +/// Default EOF extension length: 32 MiB. +const DEFAULT_EOF_EXTENSION_LENGTH: u32 = 32 * MB1 as u32; + +// --------------------------------------------------------------------------- +// SpaceBitmap — RTL_BITMAP equivalent +// --------------------------------------------------------------------------- + +/// Bitmap wrapper providing `RTL_BITMAP`-equivalent operations. +/// +/// Uses [`BitVec`] with LSB-first bit ordering on `u64` words for +/// word-level accelerated operations. +/// SET bits (1) denote the property tracked by the containing structure +/// (free, anchored, or trimmed); CLEAR bits (0) denote the opposite. +#[derive(Clone)] +struct SpaceBitmap { + bits: BitVec, +} + +impl SpaceBitmap { + /// Create a new bitmap with `bit_count` bits, all initially clear. + fn new(bit_count: usize) -> Self { + SpaceBitmap { + bits: bitvec![u64, Lsb0; 0; bit_count], + } + } + + /// Number of valid bits. + fn len(&self) -> usize { + self.bits.len() + } + + /// Set a single bit. + fn set_bit(&mut self, index: usize) { + self.bits.set(index, true); + } + + /// Clear a single bit. + fn clear_bit(&mut self, index: usize) { + self.bits.set(index, false); + } + + /// Check whether a single bit is set. + fn check_bit(&self, index: usize) -> bool { + self.bits[index] + } + + /// Set a contiguous range of bits `[start..start+count)`. + fn set_range(&mut self, start: usize, count: usize) { + self.bits[start..start + count].fill(true); + } + + /// Clear a contiguous range of bits `[start..start+count)`. + fn clear_range(&mut self, start: usize, count: usize) { + self.bits[start..start + count].fill(false); + } + + /// Check whether all bits in `[start..start+count)` are set. + fn are_bits_set(&self, start: usize, count: usize) -> bool { + count == 0 || self.bits[start..start + count].all() + } + + /// Check whether all bits in `[start..start+count)` are clear. 
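+    /// An empty range (`count == 0`) vacuously returns true, mirroring
+    /// [`Self::are_bits_set`].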
+ fn are_bits_clear(&self, start: usize, count: usize) -> bool { + count == 0 || self.bits[start..start + count].not_any() + } + + /// Find the first contiguous run of `count` SET bits, starting the + /// scan at `hint`. Returns `None` if no such run exists. + /// + /// Scans `[hint..len)` first, then `[0..hint)`. Uses word-level + /// `first_one` / `first_zero` operations for efficient run detection. + fn find_set_bits(&self, count: usize, hint: usize) -> Option { + let total = self.bits.len(); + if count == 0 || count > total { + return None; + } + let hint = hint.min(total); + + // Pass 1: [hint..total) + if let Some(idx) = Self::find_run(&self.bits, count, hint, total) { + return Some(idx); + } + // Pass 2: [0..hint) — only the region not covered by pass 1. + if hint > 0 { + if let Some(idx) = Self::find_run(&self.bits, count, 0, hint) { + return Some(idx); + } + } + None + } + + /// Set all valid bits. + fn set_all(&mut self) { + self.bits.fill(true); + } + + /// Resize the bitmap to `new_bit_count`. New bits are cleared. + /// Preserves existing data up to `min(old_count, new_count)`. + fn resize(&mut self, new_bit_count: usize) { + self.bits.resize(new_bit_count, false); + } + + /// Find a contiguous run of `count` SET bits within `[start..end)`. + fn find_run( + bits: &BitSlice, + count: usize, + start: usize, + end: usize, + ) -> Option { + if end - start < count { + return None; + } + let window = &bits[start..end]; + let mut pos = 0; + while pos + count <= window.len() { + // Skip clear bits — find next set bit. + let run_start = match window[pos..].first_one() { + Some(i) => pos + i, + None => return None, + }; + if run_start + count > window.len() { + return None; + } + // Find end of the set-bit run. + let run_end = window[run_start..] + .first_zero() + .map_or(window.len(), |i| run_start + i); + if run_end - run_start >= count { + return Some(start + run_start); + } + pos = run_end; + } + None + } +} + +// --------------------------------------------------------------------------- +// Sub-structures +// --------------------------------------------------------------------------- + +/// Free space pool state. Tracks 1-bit-per-megabyte: SET = free. +struct FreeSpacePool { + bitmap: SpaceBitmap, + lowest_bit_hint: u32, + /// Fast-path flag: if true, skip free-pool scan for block-sized allocations. + no_free_blocks: bool, +} + +/// Anchored space state. Tracks 1-bit-per-megabyte: SET = soft-anchored. +struct AnchoredSpacePool { + bitmap: SpaceBitmap, + lowest_bit_hint: u32, +} + +/// Tracks which data blocks have been trimmed but still hold a +/// "soft anchor" to their file space. +/// +/// When a block is trimmed with `TrimMode::FileSpace`, the BAT entry +/// transitions to Unmapped but the `file_megabyte` field is preserved. +/// The space is *not* released to the free pool. This avoids the cost +/// of zeroing + flushing the space before a future BAT commit, because +/// the space still contains only the block's own old data — no +/// cross-block data leak is possible on power failure. +/// +/// Bitmap: 1-bit-per-block-number, SET = has soft-anchored file offset. +struct TrimmedBlockTracker { + bitmap: SpaceBitmap, + lowest_block_number_hint: u32, + num_trimmed_blocks: u32, +} + +/// EOF geometry state — describes where new space comes from. +/// +/// These fields are only mutated under the `allocation_lock` (the async +/// `futures::lock::Mutex<()>` on `VhdxFile` that serializes the +/// allocate→TFP→write sequence). 
They live outside `FreeSpaceInner` +/// so they don't contend with the sync mutex. +pub(crate) struct EofState { + /// Current file length (always MB1-aligned). + pub file_length: u64, + /// Highest in-use file offset. + pub last_file_offset: u64, + /// Offset at which all data beyond is guaranteed zero. + pub zero_offset: u64, + /// Minimum chunk for EOF extension (constant after init). + pub eof_extension_length: u32, +} + +/// Internal mutable state of the free space tracker. +struct FreeSpaceInner { + free_space: FreeSpacePool, + anchored_space: AnchoredSpacePool, + trimmed_blocks: TrimmedBlockTracker, + + /// Block size in bytes. + block_size: u32, + /// Number of data blocks. + data_block_count: u32, +} + +// --------------------------------------------------------------------------- +// FreeSpaceTracker — public API +// --------------------------------------------------------------------------- + +/// Free space tracker for VHDX files. All internal state is protected by +/// a synchronous `parking_lot::Mutex`. +/// +/// This mutex must **never** be held across `.await` points. The outer +/// `allocation_lock` (an async mutex on `VhdxFile`) serializes the full +/// allocation sequence including any file I/O. +pub(crate) struct FreeSpaceTracker { + inner: Mutex, + /// Block alignment (0 or power of 2 ≤ block_size). Constant after construction. + block_alignment: u32, +} + +/// Flags for [`VhdxFile::allocate_space()`]. +#[bitfield(u8)] +#[derive(PartialEq, Eq)] +pub(crate) struct AllocateFlags { + /// Align the allocation to `block_alignment`. + #[bits(1)] + pub aligned: bool, + /// Zero the allocated region if not already zeroed on disk. + #[bits(1)] + pub zero: bool, + #[bits(6)] + _reserved: u8, +} + +/// Describes the state of newly allocated space. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum SpaceState { + /// Fresh space from file extension — zeroed on disk. Safe to commit + /// BAT before flushing the data write (no data leak possible). + Zero, + /// Recycled space containing the same block's own old data. Safe to + /// commit BAT before flushing (a power failure only exposes the + /// block's own stale data, not another block's). NOT zero. + OwnStale, + /// Recycled space that may contain another block's data. Must flush + /// data writes before committing BAT to prevent cross-block data + /// leaks on power failure. NOT zero. + CrossStale, +} + +impl SpaceState { + /// Safe to commit BAT entry before data flush completes? + pub fn is_safe(self) -> bool { + matches!(self, Self::Zero | Self::OwnStale) + } + + /// Guaranteed zeroed on disk? + pub fn is_zero(self) -> bool { + matches!(self, Self::Zero) + } +} + +/// Result from a successful space allocation. +pub(crate) struct AllocateResult { + /// File byte offset of the allocated region. + pub file_offset: u64, + /// State of the allocated space. + pub state: SpaceState, + /// If this allocation reclaimed a cross-block soft anchor, the old + /// block number whose `file_megabyte` must be cleared by the caller. + pub unanchored_block: Option, +} + +impl FreeSpaceTracker { + /// Create and initialize the free space tracker. + /// + /// Called during `VhdxFile::open_inner()`, before the BAT parse. Sets all file + /// space as free, then marks the header area, log, BAT, and metadata + /// regions as in-use. + /// + /// Returns both the tracker and the initial [`EofState`]. 
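+    /// The BAT and metadata lengths are rounded up to 1 MiB before being
+    /// marked, matching the 1-bit-per-megabyte granularity of the space
+    /// bitmaps.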
+ pub fn new( + file_length: u64, + block_size: u32, + block_alignment: u32, + header_area_size: u64, + log_offset: u64, + log_length: u32, + bat_offset: u64, + bat_length: u32, + metadata_offset: u64, + metadata_length: u32, + data_block_count: u32, + ) -> Result<(Self, EofState), OpenError> { + // Validate alignment. + if block_alignment != 0 && !block_alignment.is_power_of_two() { + return Err(OpenErrorInner::InvalidParameter( + crate::error::InvalidFormatReason::BlockAlignmentNotPowerOfTwo, + ) + .into()); + } + let effective_alignment = if block_alignment > block_size { + 0 + } else { + block_alignment + }; + // File length must be MB1-aligned. + let aligned_file_length = (file_length + MB1 - 1) & !(MB1 - 1); + let bit_count = (aligned_file_length / MB1) as usize; + + // Create bitmaps. + let mut free_space_bitmap = SpaceBitmap::new(bit_count); + let anchored_space_bitmap = SpaceBitmap::new(bit_count); + let trimmed_block_bitmap = SpaceBitmap::new(data_block_count as usize); + + // Mark entire file as free. + free_space_bitmap.set_all(); + + let mut eof_state = EofState { + file_length: aligned_file_length, + last_file_offset: 0, + zero_offset: 0, + eof_extension_length: DEFAULT_EOF_EXTENSION_LENGTH, + }; + + let mut inner = FreeSpaceInner { + free_space: FreeSpacePool { + bitmap: free_space_bitmap, + lowest_bit_hint: 0, + no_free_blocks: false, + }, + anchored_space: AnchoredSpacePool { + bitmap: anchored_space_bitmap, + lowest_bit_hint: bit_count as u32, + }, + trimmed_blocks: TrimmedBlockTracker { + bitmap: trimmed_block_bitmap, + lowest_block_number_hint: data_block_count, + num_trimmed_blocks: 0, + }, + block_size, + data_block_count, + }; + + // Mark header area as in-use. + inner.mark_range_in_use_inner(&mut eof_state, 0, header_area_size as u32)?; + + // Mark log as in-use. + if log_length > 0 { + inner.mark_range_in_use_inner(&mut eof_state, log_offset, log_length)?; + } + + // Mark BAT region as in-use. + // BAT length is rounded up to MB1 for space tracking. + let bat_length_aligned = round_up_mb1(bat_length as u64) as u32; + inner.mark_range_in_use_inner(&mut eof_state, bat_offset, bat_length_aligned)?; + + // Mark metadata region as in-use. + let metadata_length_aligned = round_up_mb1(metadata_length as u64) as u32; + inner.mark_range_in_use_inner(&mut eof_state, metadata_offset, metadata_length_aligned)?; + + Ok(( + FreeSpaceTracker { + inner: Mutex::new(inner), + block_alignment: effective_alignment, + }, + eof_state, + )) + } + + /// Block alignment (0 or power of 2). Constant after construction. + pub fn block_alignment(&self) -> u32 { + self.block_alignment + } + + /// Mark a file range as in-use during BAT parse. + /// + /// Validates that the range doesn't overlap with an already-in-use range + /// and doesn't extend past EOF. + pub fn mark_range_in_use( + &self, + eof: &mut EofState, + offset: u64, + length: u32, + ) -> Result<(), CorruptionType> { + self.inner + .lock() + .mark_range_in_use_inner(eof, offset, length) + } + + /// Mark a trimmed block as soft-anchored during BAT parse. + pub fn mark_trimmed_block( + &self, + block_number: u32, + file_offset: u64, + block_size: u32, + ) -> Result<(), CorruptionType> { + self.inner + .lock() + .mark_trimmed_block_inner(block_number, file_offset, block_size) + } + + /// Finalize after BAT parse. Separates EOF free space from pool free space. + /// + /// Blocks from `ZeroOffset` to `FileLength` are "near-EOF free space" + /// (tracked separately, not in the bitmap pool). 
Clear those bits from + /// the FreeSpace bitmap. + pub fn complete_initialization(&self, eof: &EofState) { + let mut inner = self.inner.lock(); + let bit_base = (eof.zero_offset / MB1) as usize; + let bit_count = ((eof.file_length - eof.zero_offset) / MB1) as usize; + if bit_count > 0 { + debug_assert!(inner.free_space.bitmap.are_bits_set(bit_base, bit_count)); + inner.free_space.bitmap.clear_range(bit_base, bit_count); + } + } + + /// Try to allocate using priorities 1–3, with access to the BAT state + /// for soft-anchor lookup (priority 3). + pub fn try_allocate_with_bat( + &self, + eof: &mut EofState, + size: u32, + aligned: bool, + bat: &Bat, + ) -> Option { + self.try_allocate_inner(eof, size, aligned, Some(bat)) + } + + /// Try all three in-memory allocation priorities. + fn try_allocate_inner( + &self, + eof: &mut EofState, + size: u32, + aligned: bool, + bat: Option<&Bat>, + ) -> Option { + let mut inner = self.inner.lock(); + // Priority 1: free space pool. + if let Some(offset) = inner.free_space_pool_alloc(eof, size) { + return Some(AllocateResult { + file_offset: offset, + state: SpaceState::CrossStale, + unanchored_block: None, + }); + } + + // Priority 2: near-EOF space (between ZeroOffset and FileLength). + let aligned_zero_offset = if aligned && self.block_alignment != 0 { + round_up(eof.zero_offset, self.block_alignment as u64) + } else { + eof.zero_offset + }; + + if eof.file_length >= aligned_zero_offset + size as u64 { + let offset = aligned_zero_offset; + eof.zero_offset = aligned_zero_offset + size as u64; + eof.last_file_offset = eof.zero_offset; + return Some(AllocateResult { + file_offset: offset, + state: SpaceState::Zero, + unanchored_block: None, + }); + } + + // Priority 3: soft-anchored space from trimmed blocks. + // + // Only considers blocks in TrimmedBlockTracker, which are populated + // by flush() — so they are always durable. The caller must clear + // the old block's file_megabyte in BatState and write its BAT page + // to cache. + if size <= inner.block_size { + if let Some(bat) = bat { + if let Some((file_offset, block_number)) = + inner.find_and_unanchor_in_memory_inner(bat) + { + // If the allocated block is larger than needed, release excess. + if size < inner.block_size { + let excess_offset = file_offset + size as u64; + let excess_size = inner.block_size - size; + inner.release_inner(excess_offset, excess_size); + } + return Some(AllocateResult { + file_offset, + state: SpaceState::CrossStale, + unanchored_block: Some(block_number), + }); + } + } + } + + // Priority 4: caller must extend EOF. + None + } + + /// Release space back to the free pool. + pub fn release(&self, offset: u64, size: u32) { + self.inner.lock().release_inner(offset, size); + } + + /// Unmark a trimmed block (when its space is reclaimed). + #[must_use] + pub fn unmark_trimmed_block( + &self, + block_number: u32, + file_offset: u64, + block_size: u32, + ) -> bool { + let mut inner = self.inner.lock(); + inner.unmark_trimmed_block_inner(block_number, file_offset, block_size) + } + + /// Compute truncation target size. + pub fn truncate_target(&self, eof: &EofState, is_fully_allocated: bool) -> u64 { + let inner = self.inner.lock(); + let mut target = eof.last_file_offset; + if is_fully_allocated { + let excess = inner.compute_excess_block_count(eof, target); + let extra = (excess as u64) * inner.block_size as u64; + target = (target + extra).min(eof.file_length); + } + target + } + + /// Update state after truncation. 
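+    /// The new length is rounded up to 1 MiB, both space bitmaps are shrunk
+    /// to match, and `zero_offset` is clamped so it never points past the new
+    /// end of file.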
+ pub fn apply_truncate(&self, eof: &mut EofState, new_file_length: u64) { + let mut inner = self.inner.lock(); + let aligned = (new_file_length + MB1 - 1) & !(MB1 - 1); + let new_bit_count = (aligned / MB1) as usize; + let old_bit_count = inner.free_space.bitmap.len(); + + if new_bit_count < old_bit_count { + inner.free_space.bitmap.resize(new_bit_count); + inner.anchored_space.bitmap.resize(new_bit_count); + } + + eof.file_length = aligned; + eof.zero_offset = eof.zero_offset.min(aligned); + } +} + +impl EofState { + /// Compute the target file size for EOF extension. + /// + /// Includes `eof_extension_length` minimum chunk. + pub fn required_file_length(&self, block_alignment: u32, size: u32, aligned: bool) -> u64 { + let aligned_zero_offset = if aligned && block_alignment != 0 { + round_up(self.zero_offset, block_alignment as u64) + } else { + self.zero_offset + }; + let target = aligned_zero_offset + size as u64; + let min_target = self.file_length + self.eof_extension_length as u64; + target.max(min_target) + } + + /// Update state after file extension completed. + /// + /// Resizes bitmaps if needed and updates `file_length`. + pub fn complete_file_extend(&mut self, tracker: &FreeSpaceTracker, new_file_length: u64) { + let mut inner = tracker.inner.lock(); + let aligned = (new_file_length + MB1 - 1) & !(MB1 - 1); + let new_bit_count = (aligned / MB1) as usize; + let old_bit_count = inner.free_space.bitmap.len(); + + if new_bit_count > old_bit_count { + // Grow by at least 125% to avoid O(n²) behavior. + let target_bits = (old_bit_count + old_bit_count / 4).max(new_bit_count); + inner.free_space.bitmap.resize(target_bits); + inner.anchored_space.bitmap.resize(target_bits); + } + + self.file_length = aligned; + } +} + +// --------------------------------------------------------------------------- +// Internal helpers (operate on FreeSpaceInner, called under lock) +// --------------------------------------------------------------------------- + +/// Round `value` up to the nearest multiple of `alignment`. +fn round_up(value: u64, alignment: u64) -> u64 { + (value + alignment - 1) & !(alignment - 1) +} + +/// Round `value` up to the nearest MB1 boundary. +fn round_up_mb1(value: u64) -> u64 { + round_up(value, MB1) +} + +impl FreeSpaceInner { + /// Mark a file range as in-use during parse (internal, no lock). + fn mark_range_in_use_inner( + &mut self, + eof: &mut EofState, + offset: u64, + length: u32, + ) -> Result<(), CorruptionType> { + debug_assert!(offset.is_multiple_of(MB1), "offset must be MB1-aligned"); + debug_assert!( + (length as u64).is_multiple_of(MB1), + "length must be MB1-aligned" + ); + + if length == 0 { + return Ok(()); + } + + // Check range is within file. + if eof.file_length < offset || eof.file_length - offset < length as u64 { + return Err(CorruptionType::RangeBeyondEof); + } + + let bit_base = (offset / MB1) as usize; + let bit_count = length as usize / MB1 as usize; + + // Overlap check: all bits must currently be SET (free). + if !self.free_space.bitmap.are_bits_set(bit_base, bit_count) { + return Err(CorruptionType::RangeCollision); + } + + // Mark as in-use (clear the bits). + self.free_space.bitmap.clear_range(bit_base, bit_count); + + // Update last_file_offset and zero_offset. 
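+        // `zero_offset` marks the start of the trailing region treated as
+        // zeroed "near-EOF free space" (separated out by
+        // complete_initialization); an in-use range that reaches past it
+        // pushes it forward so that tail never overlaps allocated data.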
+ let range_end = offset + length as u64; + if range_end > eof.last_file_offset { + eof.last_file_offset = range_end; + if eof.last_file_offset > eof.zero_offset { + eof.zero_offset = eof.last_file_offset; + } + } + + Ok(()) + } + + /// Mark a trimmed block as soft-anchored (internal, no lock). + fn mark_trimmed_block_inner( + &mut self, + block_number: u32, + file_offset: u64, + block_size: u32, + ) -> Result<(), CorruptionType> { + debug_assert!(block_number < self.data_block_count); + debug_assert!(block_size.is_multiple_of(MB1 as u32)); + + // Check: already marked as trimmed? + if self.trimmed_blocks.bitmap.check_bit(block_number as usize) { + return Err(CorruptionType::TrimmedRangeCollision); + } + + // Check: anchored space bits must be clear (no collision with another anchor). + let bit_base = (file_offset / MB1) as usize; + let bit_count = block_size as usize / MB1 as usize; + if !self + .anchored_space + .bitmap + .are_bits_clear(bit_base, bit_count) + { + return Err(CorruptionType::TrimmedRangeCollision); + } + + // Mark in trimmed block tracker. + self.trimmed_blocks.bitmap.set_bit(block_number as usize); + self.trimmed_blocks.num_trimmed_blocks += 1; + self.trimmed_blocks.lowest_block_number_hint = self + .trimmed_blocks + .lowest_block_number_hint + .min(block_number); + + // Mark in anchored space bitmap. + self.anchored_space.bitmap.set_range(bit_base, bit_count); + self.anchored_space.lowest_bit_hint = + self.anchored_space.lowest_bit_hint.min(bit_base as u32); + + Ok(()) + } + + /// Unmark a trimmed block (internal, no lock). + #[must_use] + fn unmark_trimmed_block_inner( + &mut self, + block_number: u32, + file_offset: u64, + block_size: u32, + ) -> bool { + debug_assert!(block_number < self.data_block_count); + debug_assert!(block_size.is_multiple_of(MB1 as u32)); + + // If not marked, someone else already claimed it. + if !self.trimmed_blocks.bitmap.check_bit(block_number as usize) { + return false; + } + + self.trimmed_blocks.bitmap.clear_bit(block_number as usize); + self.trimmed_blocks.num_trimmed_blocks -= 1; + + let bit_base = (file_offset / MB1) as usize; + let bit_count = block_size as usize / MB1 as usize; + debug_assert!( + self.anchored_space.bitmap.are_bits_set(bit_base, bit_count), + "anchored space bits must be set for trimmed block {block_number} at offset {file_offset:#x}" + ); + self.anchored_space.bitmap.clear_range(bit_base, bit_count); + + true + } + + /// Release space to the free pool (internal, no lock). + fn release_inner(&mut self, offset: u64, size: u32) { + debug_assert!(offset.is_multiple_of(MB1)); + debug_assert!((size as u64).is_multiple_of(MB1)); + + let bit_base = (offset / MB1) as usize; + let bit_count = size as usize / MB1 as usize; + + if bit_base + bit_count > self.free_space.bitmap.len() { + // Defensive: can't release beyond bitmap size. + return; + } + + debug_assert!(self.free_space.bitmap.are_bits_clear(bit_base, bit_count)); + self.free_space.bitmap.set_range(bit_base, bit_count); + self.free_space.no_free_blocks = false; + + if (bit_base as u32) < self.free_space.lowest_bit_hint { + self.free_space.lowest_bit_hint = bit_base as u32; + } + } + + /// Priority 1: free space pool allocation (internal, no lock). + fn free_space_pool_alloc(&mut self, eof: &mut EofState, length: u32) -> Option { + debug_assert!((length as u64).is_multiple_of(MB1)); + let bit_count = length as usize / MB1 as usize; + + // Fast-path skip for block-sized allocations. 
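+        // `no_free_blocks` is a sticky hint: it is set below when a
+        // block-sized search fails and cleared again by release_inner(), so
+        // this skip can never hide space that was later returned to the pool.
+        // Allocations larger than a block are rare; smaller ones always scan.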
+ if length >= self.block_size && self.free_space.no_free_blocks { + return None; + } + + let result = self + .free_space + .bitmap + .find_set_bits(bit_count, self.free_space.lowest_bit_hint as usize); + + match result { + Some(bit_base) => { + // Claim the space. + self.free_space.bitmap.clear_range(bit_base, bit_count); + self.free_space.lowest_bit_hint = (bit_base + bit_count) as u32; + let max_offset = (bit_base + bit_count) as u64 * MB1; + if eof.last_file_offset < max_offset { + eof.last_file_offset = max_offset; + } + Some(bit_base as u64 * MB1) + } + None => { + if length <= self.block_size { + self.free_space.no_free_blocks = true; + } + None + } + } + } + + /// Find and unanchor an in-memory-only soft-anchored block. + fn find_and_unanchor_in_memory_inner(&mut self, bat: &Bat) -> Option<(u64, u32)> { + if self.trimmed_blocks.num_trimmed_blocks == 0 { + return None; + } + + let block_size = self.block_size; + + // Try to find an in-memory-only soft-anchored block by scanning + // the TrimmedBlock bitmap. + let mut trimmed_found = 0u32; + let total_trimmed = self.trimmed_blocks.num_trimmed_blocks; + let mut hint = self.trimmed_blocks.lowest_block_number_hint as usize; + + while trimmed_found < total_trimmed { + let block_number = match self.trimmed_blocks.bitmap.find_set_bits(1, hint) { + Some(n) => n, + None => break, + }; + + trimmed_found += 1; + let mapping = bat.get_block_mapping(block_number as u32); + + // Block must be soft-anchored: unmapped/undefined state with non-zero file_megabyte. + let state = mapping.bat_state(); + let is_unmapped = state == BatEntryState::Unmapped + || state == BatEntryState::Undefined + || state == BatEntryState::Zero + || state == BatEntryState::NotPresent; + + debug_assert!( + is_unmapped && mapping.file_megabyte() != 0, + "trimmed block {block_number} is not soft-anchored" + ); + + // Check if it's in-memory only (not on-disk anchored). + // Only blocks in TrimmedBlockTracker are considered here, and + // those are only populated by flush() after WAL durability, so + // the on-disk BAT already reflects the trim. Cross-block reclaim + // is safe — the caller just needs to clear the old block's + // file_megabyte and write its BAT page to cache. + let file_offset = mapping.file_megabyte() as u64 * MB1; + + // Unmark the trimmed block. + if self.unmark_trimmed_block_inner(block_number as u32, file_offset, block_size) { + return Some((file_offset, block_number as u32)); + } + + hint = block_number + 1; + } + + None + } + + /// Compute excess block count (blocks that won't fit given current space). + fn compute_excess_block_count(&self, eof: &EofState, max_offset: u64) -> u32 { + // Count unallocated blocks. + let total = self.data_block_count; + // Available space: count of free bits in free space bitmap + anchored space + // + space from zero_offset to file_length. + let mut available_mb: u64 = 0; + + // Count free bits up to the bitmap. + for i in 0..self.free_space.bitmap.len() { + if self.free_space.bitmap.check_bit(i) { + available_mb += 1; + } + } + + // Count anchored bits. + for i in 0..self.anchored_space.bitmap.len() { + if self.anchored_space.bitmap.check_bit(i) { + available_mb += 1; + } + } + + // EOF space. 
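+        // The zeroed tail between min(zero_offset, max_offset) and the
+        // current file length also counts as available; it can still hold
+        // future blocks without growing the file.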
+        let zero = eof.zero_offset.min(max_offset);
+        if eof.file_length > zero {
+            available_mb += (eof.file_length - zero) / MB1;
+        }
+
+        let block_mb = self.block_size as u64 / MB1;
+        let available_blocks = available_mb / block_mb;
+        let needed = total as u64;
+        if needed > available_blocks {
+            (needed - available_blocks) as u32
+        } else {
+            0
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Deferred space releases
+// ---------------------------------------------------------------------------
+
+/// Maximum number of deferred entries before trim forces a flush.
+const DEFERRED_QUOTA: usize = 1024;
+
+/// A space release that is deferred until its BAT change is durable on disk.
+///
+/// Without deferral, a crash could "teleport" data: a new block's data
+/// appears at an old block's file offset because the old block's BAT
+/// reverts to FullyPresent on replay.
+struct DeferredRelease {
+    file_offset: u64,
+    size: u32,
+    anchor: bool,
+}
+
+/// Entry in the deferred releases tracker, with generation stamp.
+struct DeferredEntry {
+    release: DeferredRelease,
+    /// `None` = not yet committed to a WAL entry.
+    /// `Some(gen)` = committed in flush generation `gen`.
+    committed_gen: Option<u64>,
+}
+
+/// Tracks deferred space releases with generation-based promotion.
+///
+/// All state is behind a single `parking_lot::Mutex` — never held across
+/// `.await`. The generation counter ensures entries are only promoted
+/// after the flush that committed them reaches WAL durability.
+pub(crate) struct DeferredReleases {
+    inner: Mutex<DeferredInner>,
+}
+
+struct DeferredInner {
+    entries: HashMap<u32, DeferredEntry>,
+    /// Monotonically increasing. Bumped by each flush before commit.
+    generation: u64,
+}
+
+impl DeferredReleases {
+    pub fn new() -> Self {
+        Self {
+            inner: Mutex::new(DeferredInner {
+                entries: HashMap::new(),
+                generation: 0,
+            }),
+        }
+    }
+
+    /// Insert or replace a deferred release for a block.
+    /// The entry starts uncommitted (no generation stamp).
+    pub fn insert(&self, block_number: u32, file_offset: u64, size: u32, anchor: bool) {
+        self.inner.lock().entries.insert(
+            block_number,
+            DeferredEntry {
+                release: DeferredRelease {
+                    file_offset,
+                    size,
+                    anchor,
+                },
+                committed_gen: None,
+            },
+        );
+    }
+
+    /// Remove a deferred entry for same-block reclaim.
+    /// Returns the file offset if found. Same-block reclaim is always
+    /// safe (OwnStale) regardless of committed state.
+    pub fn remove(&self, block_number: u32) -> Option<u64> {
+        self.inner
+            .lock()
+            .entries
+            .remove(&block_number)
+            .map(|e| e.release.file_offset)
+    }
+
+    /// Check whether a deferred entry exists for a block, and remove
+    /// it if so. Returns true if removed.
+    pub fn cancel(&self, block_number: u32) -> bool {
+        self.inner.lock().entries.remove(&block_number).is_some()
+    }
+
+    /// Returns true if the number of deferred entries has reached the
+    /// quota and a flush should be triggered to free them.
+    pub fn needs_flush(&self) -> bool {
+        self.inner.lock().entries.len() >= DEFERRED_QUOTA
+    }
+
+    /// Stamp all uncommitted entries with the current generation and
+    /// bump the generation. Called at the start of flush(), before
+    /// `commit()`. Returns the generation that was stamped.
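The generation stamping described above is the crux of deferred promotion: a release may only reach the free pool after the flush that covered it is durable. A minimal, self-contained sketch of that ordering, using a hypothetical `MiniDeferred` type rather than the `DeferredReleases` type from this patch:

```rust
use std::collections::HashMap;

/// Hypothetical stand-in for generation-stamped deferral; not the patch's type.
struct MiniDeferred {
    // block number -> committed generation (None = not yet covered by a flush)
    entries: HashMap<u32, Option<u64>>,
    generation: u64,
}

impl MiniDeferred {
    fn new() -> Self {
        Self { entries: HashMap::new(), generation: 0 }
    }

    /// A trim records a pending release; its space is not yet reusable.
    fn insert(&mut self, block: u32) {
        self.entries.insert(block, None);
    }

    /// Start of flush: stamp everything still uncommitted with this generation.
    fn stamp_uncommitted(&mut self) -> u64 {
        self.generation += 1;
        let g = self.generation;
        for slot in self.entries.values_mut() {
            if slot.is_none() {
                *slot = Some(g);
            }
        }
        g
    }

    /// Once the WAL write for `up_to` is durable: promote the covered entries.
    fn drain_committed(&mut self, up_to: u64) -> Vec<u32> {
        let mut out = Vec::new();
        self.entries.retain(|&block, committed| {
            if committed.is_some_and(|g| g <= up_to) {
                out.push(block);
                false // promoted: drop from the map
            } else {
                true // still waiting for a later flush
            }
        });
        out
    }
}

fn main() {
    let mut d = MiniDeferred::new();
    d.insert(7); // trim of block 7 observed before the flush
    let g1 = d.stamp_uncommitted(); // flush begins and stamps it
    d.insert(9); // a later trim; not covered by this flush
    // ... WAL entry for generation g1 becomes durable ...
    assert_eq!(d.drain_committed(g1), vec![7]); // only block 7 is promoted
    assert_eq!(d.entries.len(), 1); // block 9 stays deferred until the next flush
}
```

The same two-phase shape is what the doc comments describe for `flush()`: stamp before the WAL commit, drain only after the commit is known durable, so a crash in between leaves the space deferred rather than prematurely reusable.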
+    pub fn stamp_uncommitted(&self) -> u64 {
+        let mut inner = self.inner.lock();
+        inner.generation += 1;
+        let flush_gen = inner.generation;
+        for entry in inner.entries.values_mut() {
+            if entry.committed_gen.is_none() {
+                entry.committed_gen = Some(flush_gen);
+            }
+        }
+        flush_gen
+    }
+
+    /// Drain all entries committed at or before the given generation.
+    /// Returns (block_number, file_offset, size, anchor) tuples for
+    /// promotion to the FreeSpaceTracker.
+    pub fn drain_committed(&self, up_to_gen: u64) -> Vec<(u32, u64, u32, bool)> {
+        let mut inner = self.inner.lock();
+        let mut drained = Vec::new();
+        inner.entries.retain(|&block, entry| {
+            if entry.committed_gen.is_some_and(|g| g <= up_to_gen) {
+                drained.push((
+                    block,
+                    entry.release.file_offset,
+                    entry.release.size,
+                    entry.release.anchor,
+                ));
+                false // remove from map
+            } else {
+                true // keep
+            }
+        });
+        drained
+    }
+}
+
+// ---------------------------------------------------------------------------
+// VhdxFile — space allocation
+// ---------------------------------------------------------------------------
+
+use crate::AsyncFile;
+use crate::bat::BlockMapping;
+use crate::bat::BlockType;
+use crate::open::VhdxFile;
+
+impl VhdxFile {
+    /// Allocate space for a new block. Async — may extend the file.
+    ///
+    /// Called under `allocation_lock` (the `FreeSpaceWorkerLock` equivalent).
+    /// The caller must pass `&mut EofState` obtained from locking
+    /// `allocation_lock`.
+    /// Tries pool → near-EOF → anchored, extends file and retries if needed.
+    ///
+    /// When `flags` includes [`AllocateFlags::ZERO`], the allocated region
+    /// is guaranteed to be zeroed on disk before returning. Near-EOF
+    /// allocations are inherently zero; pool/anchor allocations get an
+    /// explicit zero-write.
+    ///
+    /// When `flags` includes [`AllocateFlags::ALIGNED`], the allocation is
+    /// aligned to `block_alignment`.
+    pub(crate) async fn allocate_space(
+        &self,
+        eof: &mut EofState,
+        size: u32,
+        flags: AllocateFlags,
+    ) -> Result<AllocateResult, VhdxIoError> {
+        debug_assert!(
+            (size as u64).is_multiple_of(MB1),
+            "allocation size must be MB1-aligned"
+        );
+
+        loop {
+            // Try priorities 1–3 (pool, near-EOF, anchored).
+            let result =
+                self.free_space
+                    .try_allocate_with_bat(eof, size, flags.aligned(), &self.bat);
+
+            if let Some(alloc) = result {
+                // If this was a cross-block soft-anchor reclaim, clear the
+                // old block's file_megabyte in BatState and write its BAT
+                // page to cache. The old block's trim is already durable
+                // (TrimmedBlockTracker is only populated after flush), so
+                // no extra flush is needed — just BAT write ordering.
+                if let Some(old_block) = alloc.unanchored_block {
+                    let old_mapping = self.bat.get_block_mapping(old_block);
+                    let cleared_mapping = BlockMapping::new()
+                        .with_bat_state(old_mapping.bat_state())
+                        .with_transitioning_to_fully_present(false)
+                        .with_file_megabyte(0);
+                    self.bat.set_block_mapping(old_block, cleared_mapping);
+
+                    // Write old block's BAT page to cache (async).
+                    // LOCK AUDIT: allocation_lock held.
+                    self.bat
+                        .write_block_mapping(
+                            &self.cache,
+                            BlockType::Payload,
+                            old_block,
+                            cleared_mapping,
+                            None,
+                        )
+                        .await?;
+                }
+
+                if flags.zero() && !alloc.state.is_zero() {
+                    // Space from pool/anchor may contain stale data — zero it.
+                    self.file
+                        .zero_range(alloc.file_offset, size as u64)
+                        .await
+                        .map_err(|e| VhdxIoErrorInner::ZeroBlock {
+                            err: e,
+                            file_offset: alloc.file_offset,
+                        })?;
+                }
+                return Ok(alloc);
+            }
+
+            // Priority 4: extend EOF.
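+            // required_file_length grows the file by at least
+            // `eof_extension_length` beyond its current length, so the retry
+            // at the bottom of the loop is guaranteed to succeed from
+            // near-EOF space (priority 2).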
+ let block_alignment = self.free_space.block_alignment(); + let target = eof.required_file_length(block_alignment, size, flags.aligned()); + // LOCK AUDIT: bat_state read-lock dropped (end of block above). allocation_lock held (async Mutex — OK across .await). + self.file + .set_file_size(target) + .await + .map_err(|e| VhdxIoErrorInner::ExtendFile { + err: e, + target_file_size: target, + })?; + eof.complete_file_extend(&self.free_space, target); + // Retry — will succeed from near-EOF space. + } + } + + /// Truncate the file to reclaim unused trailing space. + /// + /// Shrinks the file to just past the highest in-use offset, rounded + /// up to MB1. For fully-allocated (fixed) disks, reserves extra + /// space for blocks that haven't been allocated yet. + /// + /// Called during [`close()`](Self::close) after all WAL entries are + /// drained. Must NOT be called while the log task is running. + pub(crate) async fn truncate_file(&self) -> Result<(), VhdxIoError> { + let mut eof = self.allocation_lock.lock().await; + let target = self + .free_space + .truncate_target(&eof, self.is_fully_allocated()); + + // Only shrink, never grow. And don't bother if the savings + // are less than the EOF extension length (avoids thrashing + // on files that are close to their minimum size). + if target < eof.file_length && eof.file_length - target >= eof.eof_extension_length as u64 { + // Round up to MB1. + let target_aligned = (target + MB1 - 1) & !(MB1 - 1); + self.file.set_file_size(target_aligned).await.map_err(|e| { + VhdxIoErrorInner::TruncateFile { + err: e, + target_file_size: target_aligned, + } + })?; + self.free_space.apply_truncate(&mut eof, target_aligned); + } + + Ok(()) + } + + /// Compute the cache [`PageKey`] for the BAT page containing the given + /// payload block's entry. + /// + /// Used by crash-consistency tests to inspect `pre_log_fsn` on BAT pages. + #[cfg(test)] + pub(crate) fn bat_page_key_for_block(&self, block_number: u32) -> crate::cache::PageKey { + use crate::bat::BAT_TAG; + use crate::format::CACHE_PAGE_SIZE; + + let entry_index = self.bat.payload_entry_index(block_number); + let page_offset = (entry_index as u64 * 8) & !(CACHE_PAGE_SIZE - 1); + crate::cache::PageKey { + tag: BAT_TAG, + offset: page_offset, + } + } +} + +// --------------------------------------------------------------------------- +// Test-only helpers on FreeSpaceTracker +// --------------------------------------------------------------------------- + +#[cfg(test)] +impl FreeSpaceTracker { + /// Try to allocate space using priorities 1–3 (pool, near-EOF, anchored) + /// without a BAT state (skips priority 3). + pub fn try_allocate( + &self, + eof: &mut EofState, + size: u32, + aligned: bool, + ) -> Option { + self.try_allocate_inner(eof, size, aligned, None) + } + + /// Find and unanchor a soft-anchored block (in-memory only anchors). + pub fn find_and_unanchor_in_memory(&self, bat: &Bat) -> Option<(u64, u32)> { + let mut inner = self.inner.lock(); + inner.find_and_unanchor_in_memory_inner(bat) + } + + /// Check if a range is in use (for debug/validation). 
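For reference, the page-key computation in `bat_page_key_for_block` above is just the entry's byte offset (8 bytes per BAT entry) rounded down to a cache-page boundary. A worked example, assuming a 4 KiB cache page purely for illustration (the real value is `crate::format::CACHE_PAGE_SIZE`, defined elsewhere in this patch):

```rust
fn main() {
    // Assumed page size for illustration only; the real constant is
    // crate::format::CACHE_PAGE_SIZE.
    const PAGE: u64 = 4096;

    // Each BAT entry is 8 bytes. Entry 600 lives at byte 4800 within the
    // BAT region, so its cache page starts at 4096.
    let entry_index: u64 = 600;
    let page_offset = (entry_index * 8) & !(PAGE - 1);
    assert_eq!(page_offset, 4096);

    // With a 4 KiB page, entries 0..=511 share page 0; entry 512 starts page 1.
    assert_eq!((511u64 * 8) & !(PAGE - 1), 0);
    assert_eq!((512u64 * 8) & !(PAGE - 1), 4096);
}
```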
+ pub fn is_range_in_use(&self, eof: &EofState, offset: u64, length: u32) -> bool { + let inner = self.inner.lock(); + debug_assert!(offset.is_multiple_of(MB1)); + debug_assert!((length as u64).is_multiple_of(MB1)); + + if eof.file_length < offset || eof.file_length - offset < length as u64 { + return true; + } + + let bit_base = (offset / MB1) as usize; + let bit_count = length as usize / MB1 as usize; + !inner.free_space.bitmap.are_bits_set(bit_base, bit_count) + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + use crate::bat::BlockMapping; + use crate::format::BatEntryState; + + // -- Bitmap unit tests -- + + #[test] + fn bitmap_set_clear_range() { + let mut bm = SpaceBitmap::new(128); + assert!(bm.are_bits_clear(0, 128)); + + bm.set_range(10, 20); + assert!(bm.are_bits_set(10, 20)); + assert!(bm.are_bits_clear(0, 10)); + assert!(bm.are_bits_clear(30, 98)); + + bm.clear_range(15, 5); + assert!(bm.are_bits_set(10, 5)); + assert!(bm.are_bits_clear(15, 5)); + assert!(bm.are_bits_set(20, 10)); + } + + #[test] + fn bitmap_find_set_bits() { + let mut bm = SpaceBitmap::new(64); + // Create a run of 8 set bits starting at index 20. + bm.set_range(20, 8); + + assert_eq!(bm.find_set_bits(8, 0), Some(20)); + assert_eq!(bm.find_set_bits(8, 20), Some(20)); + assert_eq!(bm.find_set_bits(9, 0), None); + assert_eq!(bm.find_set_bits(1, 25), Some(25)); + } + + #[test] + fn bitmap_find_set_bits_wraps_hint() { + let mut bm = SpaceBitmap::new(64); + // Run at the beginning. + bm.set_range(0, 4); + + // Hint past the run — should wrap and find it. + assert_eq!(bm.find_set_bits(4, 50), Some(0)); + } + + /// Regression: find_set_bits must find a valid non-wrapping run at + /// the bitmap start even when the scan first encounters a wrapping + /// candidate that spans the bitmap end→start boundary. + /// + /// Bitmap (8 bits): [1,1,1,1,0,0,1,1] + /// ^-------^ valid run of 4 at index 0 + /// ^--^ bits 6-7 set + /// + /// With hint=5, the scan visits: 5(0),6(1),7(1),0(1),1(1) — a run + /// of 4 starting at index 6, but it wraps (6+4=10>8). After + /// rejecting the wrap, bits 0-3 must still be found as a valid run. + #[test] + fn bitmap_find_set_bits_rejected_wrap_finds_later_run() { + let mut bm = SpaceBitmap::new(8); + bm.set_range(0, 4); // bits 0,1,2,3 + bm.set_range(6, 2); // bits 6,7 + + // Hint=5: scan starts at 5, wraps, should find run at 0. + assert_eq!( + bm.find_set_bits(4, 5), + Some(0), + "should find non-wrapping run [0..4) after rejecting wrap at 6" + ); + } + + #[test] + fn bitmap_are_bits_set_clear() { + let mut bm = SpaceBitmap::new(32); + bm.set_all(); + assert!(bm.are_bits_set(0, 32)); + assert!(!bm.are_bits_clear(0, 32)); + + bm.clear_bit(16); + assert!(!bm.are_bits_set(0, 32)); + assert!(!bm.are_bits_set(16, 1)); + assert!(bm.are_bits_clear(16, 1)); + } + + #[test] + fn bitmap_empty_and_full() { + let bm_empty = SpaceBitmap::new(0); + assert_eq!(bm_empty.len(), 0); + assert_eq!(bm_empty.find_set_bits(1, 0), None); + + let mut bm = SpaceBitmap::new(1); + assert!(bm.are_bits_clear(0, 1)); + bm.set_bit(0); + assert!(bm.are_bits_set(0, 1)); + } + + // -- FreeSpaceTracker initialization tests -- + + /// Helper: create a tracker for a small test file. 
+ fn make_test_tracker(file_mb: u64, block_size_mb: u32) -> (FreeSpaceTracker, EofState) { + make_test_tracker_aligned(file_mb, block_size_mb, 0) + } + + fn make_test_tracker_aligned( + file_mb: u64, + block_size_mb: u32, + block_alignment: u32, + ) -> (FreeSpaceTracker, EofState) { + let file_length = file_mb * MB1; + let block_size = block_size_mb * MB1 as u32; + let data_block_count = 16; // arbitrary for testing + + FreeSpaceTracker::new( + file_length, + block_size, + block_alignment, + MB1, // header_area_size = 1 MB + MB1, // log_offset = 1 MB + MB1 as u32, // log_length = 1 MB + 2 * MB1, // bat_offset = 2 MB + MB1 as u32, // bat_length = 1 MB + 3 * MB1, // metadata_offset = 3 MB + MB1 as u32, // metadata_length = 1 MB + data_block_count, + ) + .unwrap() + } + + #[test] + fn init_marks_header_in_use() { + let (tracker, eof) = make_test_tracker(10, 2); + // Header area (0..1MB) should be in-use. + assert!(tracker.is_range_in_use(&eof, 0, MB1 as u32)); + } + + #[test] + fn init_marks_regions_in_use() { + let (tracker, eof) = make_test_tracker(10, 2); + // Log (1..2MB), BAT (2..3MB), metadata (3..4MB) should be in-use. + assert!(tracker.is_range_in_use(&eof, MB1, MB1 as u32)); + assert!(tracker.is_range_in_use(&eof, 2 * MB1, MB1 as u32)); + assert!(tracker.is_range_in_use(&eof, 3 * MB1, MB1 as u32)); + } + + #[test] + fn overlap_detection() { + let (tracker, mut eof) = make_test_tracker(10, 2); + // Try to mark the header area again — should fail with RangeCollision. + let result = tracker.mark_range_in_use(&mut eof, 0, MB1 as u32); + assert!(matches!(result, Err(CorruptionType::RangeCollision))); + } + + #[test] + fn range_beyond_eof_detected() { + let (tracker, mut eof) = make_test_tracker(10, 2); + // Try to mark a range that extends beyond file length. + let result = tracker.mark_range_in_use(&mut eof, 9 * MB1, 2 * MB1 as u32); + assert!(matches!(result, Err(CorruptionType::RangeBeyondEof))); + } + + // -- Allocation priority tests -- + + #[test] + fn allocate_from_free_pool() { + let (tracker, mut eof) = make_test_tracker(10, 2); + // Mark offset 4MB in-use (simulating BAT parse finding a block there). + tracker + .mark_range_in_use(&mut eof, 4 * MB1, MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + // Now zero_offset = 5*MB. Near-EOF = 5..10 MB (5 MB). + // Bit 4 is in-use (cleared). Release it back to pool. + tracker.release(4 * MB1, MB1 as u32); + + // Priority 1: should find the released space. + let result = tracker.try_allocate(&mut eof, MB1 as u32, false); + assert!(result.is_some()); + let r = result.unwrap(); + assert_eq!(r.file_offset, 4 * MB1); + assert!(!r.state.is_safe()); + } + + #[test] + fn allocate_from_eof_space() { + let (tracker, mut eof) = make_test_tracker(10, 2); + tracker.complete_initialization(&eof); + + // After initialization, zero_offset = 4*MB, file_length = 10*MB. + // Near-EOF space = 6 MB. + let result = tracker.try_allocate(&mut eof, 2 * MB1 as u32, false); + assert!(result.is_some()); + let r = result.unwrap(); + assert_eq!(r.file_offset, 4 * MB1); + assert!(r.state.is_safe()); // Beyond old zero_offset. + } + + #[test] + fn allocate_extends_eof() { + // Create a tracker with only 4MB (all in-use by regions). + let (tracker, mut eof) = make_test_tracker(4, 2); + tracker.complete_initialization(&eof); + + // No free space, no near-EOF space. + let result = tracker.try_allocate(&mut eof, MB1 as u32, false); + assert!(result.is_none()); + + // Compute required length and extend. 
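+        // required_file_length adds at least DEFAULT_EOF_EXTENSION_LENGTH on
+        // top of the current 4 MB file, which is why the assertion below only
+        // checks that the target exceeds 4 * MB1.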
+ let target = eof.required_file_length(tracker.block_alignment(), MB1 as u32, false); + assert!(target > 4 * MB1); + + eof.complete_file_extend(&tracker, target); + + // Now retry — should succeed from near-EOF. + let result = tracker.try_allocate(&mut eof, MB1 as u32, false); + assert!(result.is_some()); + assert!(result.unwrap().state.is_safe()); + } + + #[test] + fn allocate_alignment() { + // 20MB file, 4MB block size, 4MB alignment. + let (tracker, mut eof) = make_test_tracker_aligned(20, 4, 4 * MB1 as u32); + tracker.complete_initialization(&eof); + + // zero_offset = 4MB (after regions). + // Aligned allocation from EOF: should be at 4MB (already aligned). + let result = tracker.try_allocate(&mut eof, 4 * MB1 as u32, true); + assert!(result.is_some()); + let r = result.unwrap(); + assert_eq!(r.file_offset % (4 * MB1), 0); + } + + #[test] + fn allocate_sets_no_free_blocks_flag() { + let (tracker, mut eof) = make_test_tracker(10, 2); + tracker.complete_initialization(&eof); + + // Exhaust near-EOF space with pool allocations — first exhaust pool. + // After init, pool is empty (regions fill 0..4MB, rest is EOF space). + // Try pool-only: allocate 1MB from pool (should fail, and set flag). + // But near-EOF will succeed before we get to that. + // + // Instead, fill up all space and verify the flag works. + // Allocate all 6 MB of EOF space. + for _ in 0..6 { + tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + } + // Now no space left. + let result = tracker.try_allocate(&mut eof, MB1 as u32, false); + assert!(result.is_none()); + } + + // -- Soft anchoring tests -- + + /// Create a minimal `Bat` for soft-anchor tests with one anchored block. + /// + /// Uses 2 MiB block size, 512-byte sectors, no parent. The + /// `data_block_count` parameter controls how many payload entries + /// the BAT has. + fn make_test_bat_with_anchored_block( + block_number: u32, + file_megabyte: u32, + data_block_count: u32, + ) -> Bat { + let block_size = 2 * MB1 as u32; + let disk_size = data_block_count as u64 * block_size as u64; + let mut bat = Bat::new(disk_size, block_size, 512, false, MB1 as u32).unwrap(); + bat.init_test_payload_mappings(); + bat.set_block_mapping( + block_number, + BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(file_megabyte), + ); + bat + } + + #[test] + fn mark_and_find_anchored_block() { + let (tracker, _eof) = make_test_tracker(20, 2); + // Mark block 3 as trimmed at file offset 6*MB. + tracker + .mark_trimmed_block(3, 6 * MB1, 2 * MB1 as u32) + .unwrap(); + + // Verify anchored space bits are set. + let inner = tracker.inner.lock(); + assert!(inner.anchored_space.bitmap.are_bits_set(6, 2)); + assert!(inner.trimmed_blocks.bitmap.check_bit(3)); + assert_eq!(inner.trimmed_blocks.num_trimmed_blocks, 1); + } + + #[test] + fn unmark_trimmed_block() { + let (tracker, _eof) = make_test_tracker(20, 2); + tracker + .mark_trimmed_block(3, 6 * MB1, 2 * MB1 as u32) + .unwrap(); + assert!(tracker.unmark_trimmed_block(3, 6 * MB1, 2 * MB1 as u32)); + + let inner = tracker.inner.lock(); + assert!(inner.anchored_space.bitmap.are_bits_clear(6, 2)); + assert!(!inner.trimmed_blocks.bitmap.check_bit(3)); + assert_eq!(inner.trimmed_blocks.num_trimmed_blocks, 0); + } + + #[test] + fn find_and_unanchor_in_memory() { + let (tracker, _eof) = make_test_tracker(20, 2); + // Mark block 5 as trimmed at file offset 8*MB. 
+ tracker + .mark_trimmed_block(5, 8 * MB1, 2 * MB1 as u32) + .unwrap(); + + let bat = make_test_bat_with_anchored_block(5, 8, 16); + + let result = tracker.find_and_unanchor_in_memory(&bat); + assert!(result.is_some()); + let (offset, block_num) = result.unwrap(); + assert_eq!(offset, 8 * MB1); + assert_eq!(block_num, 5); + + // After unanchoring, the trimmed block should be unmarked. + let inner = tracker.inner.lock(); + assert!(!inner.trimmed_blocks.bitmap.check_bit(5)); + assert_eq!(inner.trimmed_blocks.num_trimmed_blocks, 0); + } + + #[test] + fn anchored_space_before_eof_extend() { + // Set up a full file with no free pool and no EOF space, + // but with a soft-anchored block. + let (tracker, mut eof) = make_test_tracker(10, 2); + + // Mark block 2 as trimmed at offset 6*MB. + tracker + .mark_trimmed_block(2, 6 * MB1, 2 * MB1 as u32) + .unwrap(); + // Mark remaining free space as in-use so pool is empty. + tracker + .mark_range_in_use(&mut eof, 4 * MB1, MB1 as u32) + .unwrap(); + tracker + .mark_range_in_use(&mut eof, 5 * MB1, MB1 as u32) + .unwrap(); + tracker + .mark_range_in_use(&mut eof, 8 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + + let bat = make_test_bat_with_anchored_block(2, 6, 16); + + // Should find anchored space (priority 3) instead of extending EOF. + let result = tracker.try_allocate_with_bat(&mut eof, 2 * MB1 as u32, false, &bat); + assert!(result.is_some()); + let r = result.unwrap(); + assert_eq!(r.file_offset, 6 * MB1); + } + + // -- Release tests -- + + #[test] + fn release_then_reallocate() { + let (tracker, mut eof) = make_test_tracker(10, 2); + tracker.complete_initialization(&eof); + + // Allocate from EOF space. + let r1 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + let offset = r1.file_offset; + + // Release it back to free pool. + tracker.release(offset, MB1 as u32); + + // Allocate again — should reuse the released space. + let r2 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r2.file_offset, offset); + } + + // -- Truncation test -- + + #[test] + fn truncate_shrinks_bitmaps() { + let (tracker, mut eof) = make_test_tracker(10, 2); + tracker.complete_initialization(&eof); + + assert_eq!(eof.file_length, 10 * MB1); + + tracker.apply_truncate(&mut eof, 6 * MB1); + assert_eq!(eof.file_length, 6 * MB1); + } + + // -- Bitmap resize test -- + + #[test] + fn bitmap_resize_preserves_data() { + let mut bm = SpaceBitmap::new(32); + bm.set_range(10, 10); + + bm.resize(64); + assert_eq!(bm.len(), 64); + assert!(bm.are_bits_set(10, 10)); + assert!(bm.are_bits_clear(20, 44)); + + bm.resize(16); + assert_eq!(bm.len(), 16); + assert!(bm.are_bits_set(10, 6)); // only 10..16 remains + } + + // -- Priority cascade test -- + + #[test] + fn priority_cascade_pool_then_eof_then_anchor_then_extend() { + // Walk through all 4 priorities in sequence. + let (tracker, mut eof) = make_test_tracker(10, 2); + + // Mark 4..5 MB in-use (a data block during BAT parse). + tracker + .mark_range_in_use(&mut eof, 4 * MB1, MB1 as u32) + .unwrap(); + // Mark 5..7 MB in-use, then mark as soft-anchored (trimmed block 1). + // Always mark in-use first, then mark as trimmed. + tracker + .mark_range_in_use(&mut eof, 5 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker + .mark_trimmed_block(1, 5 * MB1, 2 * MB1 as u32) + .unwrap(); + // Mark 7..8 MB in-use. + tracker + .mark_range_in_use(&mut eof, 7 * MB1, MB1 as u32) + .unwrap(); + + tracker.complete_initialization(&eof); + // zero_offset = 8 MB, file_length = 10 MB. 
+ // Pool: empty (all bits 0..8 are cleared). Near-EOF: 8..10 (2 MB). + + // Release bit 4 back to pool. + tracker.release(4 * MB1, MB1 as u32); + + // Create BAT for soft-anchor lookup. + let bat = make_test_bat_with_anchored_block(1, 5, 16); + + // Priority 1: pool (offset 4 MB). + let r1 = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r1.file_offset, 4 * MB1); + assert!(!r1.state.is_safe()); + + // Pool now empty. Priority 2: near-EOF (offset 8 MB). + let r2 = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r2.file_offset, 8 * MB1); + assert!(r2.state.is_safe()); + + // Take the second EOF MB too. + let r3 = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r3.file_offset, 9 * MB1); + assert!(r3.state.is_safe()); + + // Pool and EOF exhausted. Priority 3: soft-anchored (offset 5 MB). + // The block is 2 MB but we only need 1 MB — excess goes to pool. + let r4 = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r4.file_offset, 5 * MB1); + assert!(!r4.state.is_safe()); + + // The excess 1 MB from the anchored block should now be in pool. + let r5 = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r5.file_offset, 6 * MB1); + assert!(!r5.state.is_safe()); + + // Everything exhausted. Priority 4: returns None. + let r6 = tracker.try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat); + assert!(r6.is_none()); + + // Extend EOF, then retry. + let target = eof.required_file_length(tracker.block_alignment(), MB1 as u32, false); + eof.complete_file_extend(&tracker, target); + let r7 = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert!(r7.state.is_safe()); + assert_eq!(r7.file_offset, 10 * MB1); + } + + // -- Aligned allocation from pool test -- + + #[test] + fn aligned_alloc_from_pool() { + // 20 MB file, 4 MB block size, 4 MB alignment. + let (tracker, mut eof) = make_test_tracker_aligned(20, 4, 4 * MB1 as u32); + + // Mark 4..8 MB in-use, then release to create a 4MB pool hole at an aligned offset. + tracker + .mark_range_in_use(&mut eof, 4 * MB1, 4 * MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + tracker.release(4 * MB1, 4 * MB1 as u32); + + // Pool allocation ignores alignment (alignment only applies to near-EOF). + let result = tracker + .try_allocate(&mut eof, 4 * MB1 as u32, true) + .unwrap(); + assert_eq!(result.file_offset, 4 * MB1); + assert!(!result.state.is_safe()); + } + + // -- Unaligned EOF skip test -- + + #[test] + fn aligned_alloc_skips_unaligned_eof_offset() { + // 20 MB file, 4 MB block size, 4 MB alignment. + let (tracker, mut eof) = make_test_tracker_aligned(20, 4, 4 * MB1 as u32); + + // Mark 4..5 MB in-use. This pushes zero_offset to 5 MB (not 4MB-aligned). + tracker + .mark_range_in_use(&mut eof, 4 * MB1, MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + // zero_offset = 5 MB. Aligned to 4 MB → round up to 8 MB. + // So the allocation should come from offset 8 MB (skipping 5..8). 
+ + let result = tracker + .try_allocate(&mut eof, 4 * MB1 as u32, true) + .unwrap(); + assert_eq!(result.file_offset, 8 * MB1); + assert!(result.state.is_safe()); + } + + // -- Bitmap resize on file extend -- + + #[test] + fn complete_file_extend_grows_bitmaps() { + let (tracker, mut eof) = make_test_tracker(4, 2); + tracker.complete_initialization(&eof); + + // Bitmap should be 4 bits (4 MB / 1 MB). + { + let inner = tracker.inner.lock(); + assert!(inner.free_space.bitmap.len() >= 4); + } + + // Extend to 100 MB. + eof.complete_file_extend(&tracker, 100 * MB1); + assert_eq!(eof.file_length, 100 * MB1); + + { + let inner = tracker.inner.lock(); + // Bitmap must have grown to at least 100 bits. + assert!(inner.free_space.bitmap.len() >= 100); + assert!(inner.anchored_space.bitmap.len() >= 100); + } + + // Near-EOF space should now be available. + let result = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert!(result.state.is_safe()); + } + + // -- no_free_blocks flag reset on release -- + + #[test] + fn no_free_blocks_flag_resets_on_release() { + let (tracker, mut eof) = make_test_tracker(6, 2); + tracker.complete_initialization(&eof); + + // Exhaust all space: 2 MB of near-EOF (6-4=2). + tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert!(tracker.try_allocate(&mut eof, MB1 as u32, false).is_none()); + + // The no_free_blocks flag should be set now. + { + let inner = tracker.inner.lock(); + assert!(inner.free_space.no_free_blocks); + } + + // Release 1 MB back. + tracker.release(4 * MB1, MB1 as u32); + + // Flag should be cleared. + { + let inner = tracker.inner.lock(); + assert!(!inner.free_space.no_free_blocks); + } + + // Should be able to allocate again. + let result = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(result.file_offset, 4 * MB1); + assert!(!result.state.is_safe()); + } + + // -- Fragmented pool test -- + + #[test] + fn fragmented_pool_allocates_from_lowest_hint() { + let (tracker, mut eof) = make_test_tracker(20, 2); + // Mark a contiguous range 4..11 MB in-use during BAT parse. + tracker + .mark_range_in_use(&mut eof, 4 * MB1, 7 * MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + // zero_offset = 11 MB. Near-EOF = 11..20 (9 MB). + // Pool: empty (bits 0..11 all cleared). + + // Release scattered 1MB blocks to create fragmentation. + tracker.release(10 * MB1, MB1 as u32); + tracker.release(8 * MB1, MB1 as u32); + tracker.release(6 * MB1, MB1 as u32); + tracker.release(4 * MB1, MB1 as u32); + + // Pool should find the lowest free bit first. + let r1 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r1.file_offset, 4 * MB1); + + let r2 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r2.file_offset, 6 * MB1); + + let r3 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r3.file_offset, 8 * MB1); + + let r4 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r4.file_offset, 10 * MB1); + + // Pool exhausted — next allocation comes from near-EOF. + let r5 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r5.file_offset, 11 * MB1); + assert!(r5.state.is_safe()); + } + + // -- Multi-MB allocation from pool -- + + #[test] + fn pool_allocates_contiguous_multi_mb() { + let (tracker, mut eof) = make_test_tracker(20, 2); + // Mark a contiguous 4 MB region (bits 4..8) in-use, then release. 
+ tracker + .mark_range_in_use(&mut eof, 4 * MB1, 4 * MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + tracker.release(4 * MB1, 4 * MB1 as u32); + + // Now request a 3 MB allocation from pool — should find the 4MB hole. + let result = tracker + .try_allocate(&mut eof, 3 * MB1 as u32, false) + .unwrap(); + assert_eq!(result.file_offset, 4 * MB1); + assert!(!result.state.is_safe()); + + // 1 MB of the hole (bit 7) is still in pool. + let r2 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r2.file_offset, 7 * MB1); + } + + // -- Truncation clamps zero_offset -- + + #[test] + fn truncate_clamps_zero_offset() { + let (tracker, mut eof) = make_test_tracker(10, 2); + tracker.complete_initialization(&eof); + // zero_offset = 4 MB, file_length = 10 MB. + + // Allocate some EOF space to advance zero_offset. + tracker + .try_allocate(&mut eof, 3 * MB1 as u32, false) + .unwrap(); + assert_eq!(eof.zero_offset, 7 * MB1); + + // Truncate file to 5 MB. + tracker.apply_truncate(&mut eof, 5 * MB1); + assert_eq!(eof.file_length, 5 * MB1); + // zero_offset should be clamped to file_length. + assert!(eof.zero_offset <= 5 * MB1); + } + + // -- Multiple anchored blocks: only one reclaimed per allocate -- + + #[test] + fn multiple_anchored_blocks_reclaimed_one_at_a_time() { + let (tracker, mut eof) = make_test_tracker(20, 2); + // Mark anchored regions in-use first (standard sequence), + // then mark as trimmed. + tracker + .mark_range_in_use(&mut eof, 6 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker + .mark_trimmed_block(2, 6 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker + .mark_range_in_use(&mut eof, 10 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker + .mark_trimmed_block(5, 10 * MB1, 2 * MB1 as u32) + .unwrap(); + + // Fill all remaining space so pool + EOF are empty. + tracker + .mark_range_in_use(&mut eof, 4 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker + .mark_range_in_use(&mut eof, 8 * MB1, 2 * MB1 as u32) + .unwrap(); + tracker + .mark_range_in_use(&mut eof, 12 * MB1, 8 * MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + + // BAT with both blocks anchored. + let bat = make_test_bat_with_anchored_block(2, 6, 16); + bat.set_block_mapping( + 5, + BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(10), + ); + + // First allocate gets block 2 (lowest block number). + let r1 = tracker + .try_allocate_with_bat(&mut eof, 2 * MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r1.file_offset, 6 * MB1); + assert!(!r1.state.is_safe()); + + // Second allocate gets block 5. + let r2 = tracker + .try_allocate_with_bat(&mut eof, 2 * MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r2.file_offset, 10 * MB1); + assert!(!r2.state.is_safe()); + + // No more anchored blocks. + assert!( + tracker + .try_allocate_with_bat(&mut eof, 2 * MB1 as u32, false, &bat) + .is_none() + ); + } + + // -- Anchored block larger than requested: excess goes to pool -- + + #[test] + fn anchored_block_excess_released_to_pool() { + let (tracker, mut eof) = make_test_tracker(10, 2); // block_size = 2 MB + // Anchor block 0 at offset 4..6 MB. + tracker + .mark_trimmed_block(0, 4 * MB1, 2 * MB1 as u32) + .unwrap(); + // Fill the rest. + tracker + .mark_range_in_use(&mut eof, 6 * MB1, 4 * MB1 as u32) + .unwrap(); + tracker.complete_initialization(&eof); + + let bat = make_test_bat_with_anchored_block(0, 4, 16); + + // Request only 1 MB from a 2 MB anchored block. 
+ let r = tracker + .try_allocate_with_bat(&mut eof, MB1 as u32, false, &bat) + .unwrap(); + assert_eq!(r.file_offset, 4 * MB1); + + // The excess 1 MB (at offset 5 MB) should now be in the free pool. + let r2 = tracker.try_allocate(&mut eof, MB1 as u32, false).unwrap(); + assert_eq!(r2.file_offset, 5 * MB1); + assert!(!r2.state.is_safe()); + } + + // -- required_file_length respects alignment -- + + #[test] + fn required_file_length_with_alignment() { + let (tracker, eof) = make_test_tracker_aligned(4, 4, 4 * MB1 as u32); + tracker.complete_initialization(&eof); + // zero_offset = 4 MB (already aligned). + + let target = eof.required_file_length(tracker.block_alignment(), 4 * MB1 as u32, true); + // Should be at least file_length + extension_length. + assert!(target >= 4 * MB1 + DEFAULT_EOF_EXTENSION_LENGTH as u64); + // And aligned target should fit the request. + assert!(target >= 4 * MB1 + 4 * MB1); + } + + // -- Zero-length mark is a no-op -- + + #[test] + fn mark_zero_length_is_noop() { + let (tracker, mut eof) = make_test_tracker(10, 2); + assert!(tracker.mark_range_in_use(&mut eof, 4 * MB1, 0).is_ok()); + // The range should still be free. + assert!(!tracker.is_range_in_use(&eof, 4 * MB1, MB1 as u32)); + } +} diff --git a/vm/devices/storage/vhdx/src/tests/crash_tests.rs b/vm/devices/storage/vhdx/src/tests/crash_tests.rs new file mode 100644 index 0000000000..60d0b2aefb --- /dev/null +++ b/vm/devices/storage/vhdx/src/tests/crash_tests.rs @@ -0,0 +1,1132 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Crash consistency tests for the VHDX write path. +//! +//! Tests are organized into three categories: +//! 1. **Structural tests** — verify `pre_log_fsn` values on BAT pages +//! after allocation decisions. +//! 2. **Ordering tests** — verify flush barriers between data writes and +//! WAL writes via the write log. +//! 3. **End-to-end crash recovery tests** — simulate crashes with +//! `CrashTestFile` and verify replay recovers correctly. + +use crate::AsyncFile; +use crate::AsyncFileExt; +use crate::format; +use crate::open::VhdxFile; +use crate::tests::support::CrashTestFile; +use crate::tests::support::InMemoryFile; +use pal_async::DefaultDriver; +use pal_async::async_test; + +/// Helper: write a data pattern via the write path. +async fn write_pattern(vhdx: &VhdxFile, offset: u64, len: usize, value: u8) { + let write_buf = vec![value; len]; + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(offset, len as u32, &mut ranges) + .await + .unwrap(); + for range in &ranges { + match range { + crate::WriteRange::Data { + file_offset, + length, + .. + } => { + vhdx.file + .write_at(*file_offset, &write_buf[..(*length as usize)]) + .await + .unwrap(); + } + crate::WriteRange::Zero { + file_offset, + length, + } => { + let zeros = vec![0u8; *length as usize]; + vhdx.file.write_at(*file_offset, &zeros).await.unwrap(); + } + } + } + guard.complete().await.unwrap(); +} + +/// Helper: read data at a guest offset via the read path. 
+async fn read_pattern(vhdx: &VhdxFile, offset: u64, len: usize) -> Vec { + let mut buf = vec![0u8; len]; + let mut ranges = Vec::new(); + let _guard = vhdx + .resolve_read(offset, len as u32, &mut ranges) + .await + .unwrap(); + for range in &ranges { + match range { + crate::ReadRange::Data { + guest_offset, + file_offset, + length, + } => { + let start = (*guest_offset - offset) as usize; + let end = start + *length as usize; + vhdx.file + .read_at(*file_offset, &mut buf[start..end]) + .await + .unwrap(); + } + crate::ReadRange::Zero { + guest_offset, + length, + } => { + let start = (*guest_offset - offset) as usize; + let end = start + *length as usize; + buf[start..end].fill(0); + } + crate::ReadRange::Unmapped { .. } => {} + } + } + buf +} + +// ============================================================================= +// Structural tests: verify pre_log_fsn values on BAT pages +// ============================================================================= + +/// Near-EOF allocation: SpaceState::Zero → no pre_log_fsn on BAT page. +/// +/// First write to a new VHDX allocates from near-EOF space, which is +/// already zeroed (durable zeros). No flush barrier is needed. +#[async_test] +async fn bat_page_no_fsn_safe_near_eof(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write one full block (block_size is 1 MiB by default). + let block_size = vhdx.block_size as usize; + write_pattern(&vhdx, 0, block_size, 0xAA).await; + + // Check that the BAT page for block 0 has NO pre_log_fsn. + let page_key = vhdx.bat_page_key_for_block(0); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + assert_eq!( + fsn, None, + "near-EOF allocation should NOT set pre_log_fsn (SpaceState::Zero)" + ); + + vhdx.close().await.unwrap(); +} + +/// Free-pool reuse: SpaceState::CrossStale → pre_log_fsn set. +/// +/// Allocate a block, trim it (release to free pool), then allocate again. +/// The second allocation reuses free-pool space, which is NOT safe (contains +/// old data from the previously trimmed block). +#[async_test] +async fn bat_page_has_fsn_unsafe_free_pool(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Allocate block 0 with data. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Trim block 0 to release its space to the free pool. + let trim_request = + crate::trim::TrimRequest::new(crate::trim::TrimMode::FreeSpace, 0, block_size); + vhdx.trim(trim_request).await.unwrap(); + vhdx.flush().await.unwrap(); + + // Now write to block 1 — space should come from free pool. + write_pattern(&vhdx, block_size, block_size as usize, 0xBB).await; + + // The BAT page for block 1 should have a pre_log_fsn set because + // the allocation reused free-pool space (SpaceState::CrossStale (unsafe)). + let page_key = vhdx.bat_page_key_for_block(1); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + + assert!( + fsn.is_some(), + "free-pool reuse should set pre_log_fsn (SpaceState::CrossStale)" + ); + + vhdx.close().await.unwrap(); +} + +/// Overwrite existing FullyPresent block: no allocation → no pre_log_fsn. +/// +/// Writing to an already-allocated block should not set any FSN constraint +/// because the BAT entry doesn't change. 
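Taken together, these structural tests pin down one rule: a BAT page needs a `pre_log_fsn` barrier only when the allocation behind it reuses space that may still hold another block's stale data. A compact restatement of that rule, using hypothetical names rather than this patch's `SpaceState`:

```rust
/// Hypothetical stand-in for the allocation outcomes these tests exercise.
#[derive(Clone, Copy)]
enum AllocKind {
    NearEof,      // durable zeros past zero_offset
    OwnStale,     // the block's own previous space (soft-anchor reclaim)
    CrossStale,   // free-pool space last owned by a different block
    NoAllocation, // overwrite of an already-present block
}

/// The rule the structural tests encode: only cross-block stale space
/// forces a flush barrier before the BAT page may be written.
fn needs_pre_log_fsn(kind: AllocKind) -> bool {
    matches!(kind, AllocKind::CrossStale)
}

fn main() {
    assert!(!needs_pre_log_fsn(AllocKind::NearEof)); // bat_page_no_fsn_safe_near_eof
    assert!(needs_pre_log_fsn(AllocKind::CrossStale)); // bat_page_has_fsn_unsafe_free_pool
    assert!(!needs_pre_log_fsn(AllocKind::NoAllocation)); // bat_page_no_fsn_existing_block
    assert!(!needs_pre_log_fsn(AllocKind::OwnStale)); // bat_page_no_fsn_safe_soft_anchor
}
```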
+#[async_test] +async fn bat_page_no_fsn_existing_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as usize; + + // First write allocates the block. + write_pattern(&vhdx, 0, block_size, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Second write overwrites the same block — no new allocation. + write_pattern(&vhdx, 0, block_size, 0xBB).await; + + // BAT page should have no pre_log_fsn (no allocation happened). + let page_key = vhdx.bat_page_key_for_block(0); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + assert_eq!( + fsn, None, + "overwrite of existing block should NOT set pre_log_fsn" + ); + + vhdx.close().await.unwrap(); +} + +/// Soft-anchor reclaim: SpaceState::Zero → no pre_log_fsn. +/// +/// Trim a block with FileSpace mode (creates soft anchor), then write the +/// same block again. The allocation reclaims the soft-anchored space, which +/// is the block's own old data — SpaceState::OwnStale (safe). +#[async_test] +async fn bat_page_no_fsn_safe_soft_anchor(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Allocate block 0 with data. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Trim block 0 with FileSpace mode — creates a soft anchor. + let trim_request = + crate::trim::TrimRequest::new(crate::trim::TrimMode::FileSpace, 0, block_size); + vhdx.trim(trim_request).await.unwrap(); + vhdx.flush().await.unwrap(); + + // Write to block 0 again — should reclaim the soft-anchored space. + write_pattern(&vhdx, 0, block_size as usize, 0xBB).await; + + // The BAT page for block 0 should have NO pre_log_fsn because the + // allocation reused the block's own old space (SpaceState::OwnStale (safe)). + let page_key = vhdx.bat_page_key_for_block(0); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + assert_eq!( + fsn, None, + "soft-anchor reclaim should NOT set pre_log_fsn (SpaceState::Zero)" + ); + + vhdx.close().await.unwrap(); +} + +/// Partial write triggering non-TFP allocation with unsafe space → +/// pre_log_fsn set. +/// +/// Write a partial block (less than full block) where the allocation +/// comes from the free pool. Since the space contains stale data from +/// another block, SpaceState::CrossStale (unsafe) → the non-TFP path sets pre_log_fsn. +#[async_test] +async fn bat_page_has_fsn_partial_unsafe(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Allocate block 0 with data. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Trim block 0 to release its space to the free pool. + let trim_request = + crate::trim::TrimRequest::new(crate::trim::TrimMode::FreeSpace, 0, block_size); + vhdx.trim(trim_request).await.unwrap(); + vhdx.flush().await.unwrap(); + + // Write a partial block at block 1 (less than full block_size). This + // triggers the non-TFP allocation path. Space comes from free pool → + // SpaceState::CrossStale (unsafe). + let partial_size = 4096; + write_pattern(&vhdx, block_size, partial_size, 0xCC).await; + + // The BAT page for block 1 should have pre_log_fsn set. 
+ let page_key = vhdx.bat_page_key_for_block(1); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + assert!( + fsn.is_some(), + "partial write with unsafe free-pool space should set pre_log_fsn" + ); + + vhdx.close().await.unwrap(); +} + +/// Partial write with safe space → no pre_log_fsn. +/// +/// A partial write to a new block allocated from near-EOF (safe) space +/// should NOT set pre_log_fsn, because near-EOF space contains durable zeros. +#[async_test] +async fn bat_page_no_fsn_partial_safe(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Partial write to block 0 (less than full block). Space comes from + // near-EOF → SpaceState::OwnStale (safe). + let partial_size = 4096; + write_pattern(&vhdx, 0, partial_size, 0xDD).await; + + // The BAT page for block 0 should have NO pre_log_fsn. + let page_key = vhdx.bat_page_key_for_block(0); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + assert_eq!( + fsn, None, + "partial write with safe near-EOF space should NOT set pre_log_fsn" + ); + + vhdx.close().await.unwrap(); +} + +// ============================================================================= +// End-to-end crash recovery tests using CrashTestFile +// ============================================================================= + +/// Write + flush → crash → replay recovers data. +#[async_test] +async fn crash_after_flush_data_survives(driver: DefaultDriver) { + // Create a VHDX on InMemoryFile first, then transfer to CrashTestFile. + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + + // Write one block of data. + let block_size = vhdx.block_size as usize; + write_pattern(&vhdx, 0, block_size, 0xAB).await; + + // Flush to make data durable. + vhdx.flush().await.unwrap(); + + // Crash — get durable state, then abort (ensures log task exits). + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Reopen from durable state (log replay will happen). + let recovered_file = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered_file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Verify data survived. + let read_buf = read_pattern(&vhdx2, 0, block_size).await; + assert!( + read_buf.iter().all(|&b| b == 0xAB), + "data should survive crash after flush" + ); +} + +/// Write without flush → crash → data lost. +#[async_test] +async fn crash_no_flush_data_lost(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as usize; + + // Write but do NOT flush. + write_pattern(&vhdx, 0, block_size, 0xCD).await; + + // Crash — get durable state. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Reopen from durable state. + let recovered_file = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered_file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Data should be lost (read as zeros for unallocated block). 
+ let read_buf = read_pattern(&vhdx2, 0, block_size).await; + assert!( + read_buf.iter().all(|&b| b == 0), + "data should be lost without flush before crash" + ); +} + +/// Write + flush + close → reopen → clean (no replay needed). +#[async_test] +async fn clean_close_no_replay(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as usize; + + write_pattern(&vhdx, 0, block_size, 0xEE).await; + vhdx.flush().await.unwrap(); + + // Close cleanly. + let file_ref = vhdx.file.clone(); + vhdx.close().await.unwrap(); + let durable = file_ref.durable_snapshot(); + + // Reopen — should NOT need log replay. + let recovered_file = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered_file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let read_buf = read_pattern(&vhdx2, 0, block_size).await; + assert!( + read_buf.iter().all(|&b| b == 0xEE), + "data should survive clean close + reopen" + ); +} + +/// Crash after flush, reopen, write more, flush, crash again → +/// both rounds of data survive. +#[async_test] +async fn crash_recovery_then_more_writes(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + let block_size: u64; + + // Round 1: write, flush, crash. + let durable1 = { + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + block_size = vhdx.block_size as u64; + + write_pattern(&vhdx, 0, block_size as usize, 0x11).await; + vhdx.flush().await.unwrap(); + + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + durable + }; + + // Round 2: recover, write more, flush, crash again. + let durable2 = { + let crash_file = CrashTestFile::from_durable(durable1); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + + write_pattern(&vhdx, block_size, block_size as usize, 0x22).await; + vhdx.flush().await.unwrap(); + + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + durable + }; + + // Verify both rounds of data survive. + let recovered_file = InMemoryFile::from_snapshot(durable2); + let vhdx = VhdxFile::open(recovered_file) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let buf0 = read_pattern(&vhdx, 0, block_size as usize).await; + assert!( + buf0.iter().all(|&b| b == 0x11), + "round 1 data should survive" + ); + + let buf1 = read_pattern(&vhdx, block_size, block_size as usize).await; + assert!( + buf1.iter().all(|&b| b == 0x22), + "round 2 data should survive" + ); +} + +/// Multiple blocks → flush → crash → all survive. +#[async_test] +async fn crash_multi_block_all_survive(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write three blocks with different patterns. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + write_pattern(&vhdx, block_size, block_size as usize, 0xBB).await; + write_pattern(&vhdx, block_size * 2, block_size as usize, 0xCC).await; + + vhdx.flush().await.unwrap(); + + // Crash. 
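+    // (durable_snapshot() captures only the bytes CrashTestFile has seen
+    // flushed, and abort() stops the log task without writing a clean header,
+    // so together they model a power loss at this point.)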
+ let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Verify all three blocks survived. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let buf0 = read_pattern(&vhdx, 0, block_size as usize).await; + assert!(buf0.iter().all(|&b| b == 0xAA)); + let buf1 = read_pattern(&vhdx, block_size, block_size as usize).await; + assert!(buf1.iter().all(|&b| b == 0xBB)); + let buf2 = read_pattern(&vhdx, block_size * 2, block_size as usize).await; + assert!(buf2.iter().all(|&b| b == 0xCC)); +} + +/// Sequential writes with flushes → crash after second flush → +/// first two blocks survive, third (unflushed) lost. +#[async_test] +async fn crash_interleaved_flush_partial(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write block 0, flush. + write_pattern(&vhdx, 0, block_size as usize, 0x11).await; + vhdx.flush().await.unwrap(); + + // Write block 1, flush. + write_pattern(&vhdx, block_size, block_size as usize, 0x22).await; + vhdx.flush().await.unwrap(); + + // Write block 2, do NOT flush. + write_pattern(&vhdx, block_size * 2, block_size as usize, 0x33).await; + + // Crash. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Block 0 and 1 should survive. + let buf0 = read_pattern(&vhdx, 0, block_size as usize).await; + assert!(buf0.iter().all(|&b| b == 0x11), "block 0 should survive"); + let buf1 = read_pattern(&vhdx, block_size, block_size as usize).await; + assert!(buf1.iter().all(|&b| b == 0x22), "block 1 should survive"); + + // Block 2 should be lost (zeros). + let buf2 = read_pattern(&vhdx, block_size * 2, block_size as usize).await; + assert!( + buf2.iter().all(|&b| b == 0), + "block 2 (unflushed) should be lost" + ); +} + +/// Large write spanning multiple blocks → flush → crash → all survive. +#[async_test] +async fn crash_spanning_write_survives(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write 3 blocks at once (spanning write). + let total_len = (block_size * 3) as usize; + write_pattern(&vhdx, 0, total_len, 0xDD).await; + vhdx.flush().await.unwrap(); + + // Crash. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let read_buf = read_pattern(&vhdx, 0, total_len).await; + assert!( + read_buf.iter().all(|&b| b == 0xDD), + "spanning write data should survive crash after flush" + ); +} + +/// Write → flush → apply completes → crash (log_guid still set) → +/// replay is idempotent. +/// +/// After flush, the log task writes WAL entries and applies them to the +/// BAT region. If we crash at that point, log_guid is still set (close +/// never ran), so the next open replays the log. 
Since the entries are +/// already applied, replay is idempotent — the data should be correct. +#[async_test] +async fn crash_after_apply_replay_idempotent(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as usize; + + // Write two blocks with distinct patterns. + write_pattern(&vhdx, 0, block_size, 0xA1).await; + write_pattern(&vhdx, block_size as u64, block_size, 0xA2).await; + vhdx.flush().await.unwrap(); + + // Take a durable snapshot — log entries are applied by now. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // First replay — should succeed. + let recovered1 = InMemoryFile::from_snapshot(durable.clone()); + let vhdx1 = VhdxFile::open(recovered1) + .allow_replay(true) + .read_only() + .await + .unwrap(); + let buf0 = read_pattern(&vhdx1, 0, block_size).await; + assert!( + buf0.iter().all(|&b| b == 0xA1), + "block 0 after first replay" + ); + let buf1 = read_pattern(&vhdx1, block_size as u64, block_size).await; + assert!( + buf1.iter().all(|&b| b == 0xA2), + "block 1 after first replay" + ); + + // Second replay from the same durable snapshot — should be idempotent. + let recovered2 = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered2) + .allow_replay(true) + .read_only() + .await + .unwrap(); + let buf0b = read_pattern(&vhdx2, 0, block_size).await; + assert!( + buf0b.iter().all(|&b| b == 0xA1), + "block 0 after second replay (idempotent)" + ); + let buf1b = read_pattern(&vhdx2, block_size as u64, block_size).await; + assert!( + buf1b.iter().all(|&b| b == 0xA2), + "block 1 after second replay (idempotent)" + ); +} + +/// Overwrite same block → flush → crash → latest data survives. +#[async_test] +async fn crash_overwrite_latest_wins(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as usize; + + // Write block 0 with 0xAA, flush. + write_pattern(&vhdx, 0, block_size, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Overwrite block 0 with 0xBB, flush. + write_pattern(&vhdx, 0, block_size, 0xBB).await; + vhdx.flush().await.unwrap(); + + // Crash. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let read_buf = read_pattern(&vhdx, 0, block_size).await; + assert!( + read_buf.iter().all(|&b| b == 0xBB), + "latest overwrite should survive" + ); +} + +/// Crash without close → reopen → log_guid set → replay → correct. +#[async_test] +async fn drop_without_close_triggers_replay(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as usize; + + write_pattern(&vhdx, 0, block_size, 0xEE).await; + vhdx.flush().await.unwrap(); + + // Abort without close (simulates unclean shutdown). 
+    let durable = vhdx.file.durable_snapshot();
+    vhdx.abort().await;
+
+    // The header still has log_guid set (dirty file).
+    let recovered = InMemoryFile::from_snapshot(durable.clone());
+    // Opening read-only when dirty should fail with LogReplayRequired.
+    let result = VhdxFile::open(InMemoryFile::from_snapshot(durable.clone()))
+        .read_only()
+        .await;
+    assert!(result.is_err(), "read-only open of dirty file should fail");
+
+    // Open again with replay allowed — log replay should happen.
+    let vhdx2 = VhdxFile::open(recovered)
+        .allow_replay(true)
+        .read_only()
+        .await
+        .unwrap();
+    let read_buf = read_pattern(&vhdx2, 0, block_size).await;
+    assert!(
+        read_buf.iter().all(|&b| b == 0xEE),
+        "data should survive after log replay"
+    );
+}
+
+// =============================================================================
+// Ordering tests: verify flush barrier placement
+// =============================================================================
+
+/// Core ordering test: unsafe allocation → flush barrier between
+/// data write and WAL write.
+///
+/// We verify that a flush after a free-pool reuse allocation produces
+/// additional backing-file flushes (the barrier between the data writes and
+/// the subsequent WAL writes), observed via the file's flush counter.
+#[async_test]
+async fn flush_between_data_and_wal_unsafe(driver: DefaultDriver) {
+    let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let snapshot = mem_file.snapshot();
+
+    let crash_file = CrashTestFile::from_durable(snapshot);
+    let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap();
+    let block_size = vhdx.block_size as u64;
+
+    // Allocate block 0 (near-EOF, safe).
+    write_pattern(&vhdx, 0, block_size as usize, 0xAA).await;
+    vhdx.flush().await.unwrap();
+
+    // Trim block 0 to release to free pool.
+    let trim_request =
+        crate::trim::TrimRequest::new(crate::trim::TrimMode::FreeSpace, 0, block_size);
+    vhdx.trim(trim_request).await.unwrap();
+    vhdx.flush().await.unwrap();
+
+    // Record the flush count before the unsafe allocation.
+    let pre_flush_count = vhdx.file.flush_count();
+
+    // Write block 1 — should reuse free-pool space (unsafe).
+    write_pattern(&vhdx, block_size, block_size as usize, 0xBB).await;
+
+    // Flush — this triggers the log path.
+    vhdx.flush().await.unwrap();
+
+    // Post-flush count should be greater (at least one flush for data + one for WAL).
+    let post_flush_count = vhdx.file.flush_count();
+    assert!(
+        post_flush_count > pre_flush_count,
+        "should have flushed after unsafe allocation: pre={}, post={}",
+        pre_flush_count,
+        post_flush_count
+    );
+
+    vhdx.close().await.unwrap();
+}
+
+/// Safe allocation → no extra flush barrier needed.
+///
+/// Near-EOF allocations produce SpaceState::Zero, so no pre_log_fsn
+/// is set. The number of flushes should be minimal.
+#[async_test]
+async fn no_extra_flush_safe_allocation(driver: DefaultDriver) {
+    let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let snapshot = mem_file.snapshot();
+
+    let crash_file = CrashTestFile::from_durable(snapshot);
+    let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap();
+    let block_size = vhdx.block_size as u64;
+
+    // All allocations are near-EOF (safe). No extra flushes needed.
+    write_pattern(&vhdx, 0, block_size as usize, 0xAA).await;
+
+    // The BAT page should have no pre_log_fsn constraint.
+ let page_key = vhdx.bat_page_key_for_block(0); + let fsn = vhdx.cache.get_pre_log_fsn(page_key); + assert_eq!( + fsn, None, + "safe near-EOF allocation should not set pre_log_fsn" + ); + + vhdx.close().await.unwrap(); +} + +/// Multiple blocks in one write, mixed safe/unsafe → barrier present +/// for the unsafe block. +/// +/// Allocate block 0 (safe), trim it to free pool, then write a spanning +/// write covering block 1 (unsafe, from free pool) and block 2 (safe, +/// near-EOF). The unsafe block should have pre_log_fsn set. +#[async_test] +async fn mixed_safe_unsafe_has_barrier(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Allocate block 0. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Trim block 0 to release to free pool. + let trim_request = + crate::trim::TrimRequest::new(crate::trim::TrimMode::FreeSpace, 0, block_size); + vhdx.trim(trim_request).await.unwrap(); + vhdx.flush().await.unwrap(); + + // Write a spanning write covering blocks 1 and 2. + // Block 1's space may come from the free pool (unsafe). + // Block 2's space comes from near-EOF (safe). + let total_len = (block_size * 2) as usize; + write_pattern(&vhdx, block_size, total_len, 0xCC).await; + + // At least one of the BAT pages should have pre_log_fsn set (the + // unsafe block). Check that the write_guard's needs_flush_before_log + // was set by verifying the BAT page for the unsafe block has FSN. + let page_key_1 = vhdx.bat_page_key_for_block(1); + let page_key_2 = vhdx.bat_page_key_for_block(2); + let fsn_1 = vhdx.cache.get_pre_log_fsn(page_key_1); + let fsn_2 = vhdx.cache.get_pre_log_fsn(page_key_2); + + // At least one block should have FSN (the one allocated from free pool). + // Both blocks share the TFP path, so needs_flush_before_log is set for + // the entire WriteIoGuard. The FSN is applied to all TFP blocks in + // complete_write_inner. + assert!( + fsn_1.is_some() || fsn_2.is_some(), + "mixed safe/unsafe spanning write should set pre_log_fsn on at least one BAT page" + ); + + vhdx.close().await.unwrap(); +} + +// ============================================================================= +// Header update tests +// ============================================================================= + +/// After flush, header sequence_number has advanced. +#[async_test] +async fn flush_advances_header_sequence(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let seq_before = vhdx.header_state.sequence_number().await; + + write_pattern(&vhdx, 0, vhdx.block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // The enable_write_mode call during the first write bumps the sequence number. + let seq_after = vhdx.header_state.sequence_number().await; + assert!( + seq_after > seq_before, + "sequence number should advance after write: before={}, after={}", + seq_before, + seq_after + ); + + vhdx.close().await.unwrap(); +} + +/// close() writes clean header (log_guid = ZERO). 
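+///
+/// A zeroed log_guid tells the next opener there is nothing to replay, so the
+/// file can be opened read-only without requiring log replay.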
+#[async_test] +async fn close_header_is_clean(driver: DefaultDriver) { + use zerocopy::FromBytes; + + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let file_ref = vhdx.file.clone(); + + write_pattern(&vhdx, 0, vhdx.block_size as usize, 0xBB).await; + vhdx.flush().await.unwrap(); + vhdx.close().await.unwrap(); + + // Read both headers, find the current one (highest seq#). + let mut buf1 = vec![0u8; format::HEADER_SIZE as usize]; + file_ref + .read_at(format::HEADER_OFFSET_1, &mut buf1) + .await + .unwrap(); + let mut buf2 = vec![0u8; format::HEADER_SIZE as usize]; + file_ref + .read_at(format::HEADER_OFFSET_2, &mut buf2) + .await + .unwrap(); + + let h1 = format::Header::read_from_prefix(&buf1).ok().map(|(h, _)| h); + let h2 = format::Header::read_from_prefix(&buf2).ok().map(|(h, _)| h); + + let current = match (&h1, &h2) { + (Some(a), Some(b)) if b.sequence_number >= a.sequence_number => b, + (Some(a), _) => a, + (_, Some(b)) => b, + _ => panic!("no valid headers"), + }; + assert_eq!( + current.log_guid, + guid::Guid::ZERO, + "after close, current header should have log_guid = ZERO" + ); +} + +/// Header alternation: writes alternate between header slots 1 and 2. +#[async_test] +async fn headers_alternate_between_slots(driver: DefaultDriver) { + use zerocopy::FromBytes; + + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + + // Read initial header state. + let mut buf1_init = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_1, &mut buf1_init) + .await + .unwrap(); + let h1_init = format::Header::read_from_prefix(&buf1_init) + .ok() + .map(|(h, _)| h); + let mut buf2_init = vec![0u8; format::HEADER_SIZE as usize]; + file.read_at(format::HEADER_OFFSET_2, &mut buf2_init) + .await + .unwrap(); + let h2_init = format::Header::read_from_prefix(&buf2_init) + .ok() + .map(|(h, _)| h); + + let seq1_init = h1_init.as_ref().map_or(0, |h| h.sequence_number); + let seq2_init = h2_init.as_ref().map_or(0, |h| h.sequence_number); + + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let file_ref = vhdx.file.clone(); + + // After open_writable, one header slot was updated with log_guid. + let mut buf1_after = vec![0u8; format::HEADER_SIZE as usize]; + file_ref + .read_at(format::HEADER_OFFSET_1, &mut buf1_after) + .await + .unwrap(); + let h1_after = format::Header::read_from_prefix(&buf1_after) + .ok() + .map(|(h, _)| h); + + let mut buf2_after = vec![0u8; format::HEADER_SIZE as usize]; + file_ref + .read_at(format::HEADER_OFFSET_2, &mut buf2_after) + .await + .unwrap(); + let h2_after = format::Header::read_from_prefix(&buf2_after) + .ok() + .map(|(h, _)| h); + + let seq1_after = h1_after.as_ref().map_or(0, |h| h.sequence_number); + let seq2_after = h2_after.as_ref().map_or(0, |h| h.sequence_number); + + // One slot should have a higher sequence number than before. 
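+    // (VHDX keeps two header copies; an update overwrites the stale slot with
+    // a higher sequence number, so a torn header write can never clobber the
+    // current header.)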
+ let slot1_updated = seq1_after > seq1_init; + let slot2_updated = seq2_after > seq2_init; + assert!( + slot1_updated || slot2_updated, + "one header slot should be updated after open_writable" + ); + assert!( + !(slot1_updated && slot2_updated), + "only one header slot should be updated (alternation)" + ); + + vhdx.close().await.unwrap(); +} + +// ============================================================================= +// Deferred space reclaim tests +// ============================================================================= + +/// Trim block A (FileSpace), then write the same block again (same-block +/// reclaim from deferred list). The write should reuse A's offset without +/// needing a flush — OwnStale. +#[async_test] +async fn deferred_same_block_reclaim(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write block 0. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + let original_offset = vhdx.bat.get_block_mapping(0).file_megabyte(); + assert!(original_offset > 0); + + // Trim block 0 with FileSpace (creates deferred anchor). + let trim_req = crate::trim::TrimRequest::new(crate::trim::TrimMode::FileSpace, 0, block_size); + vhdx.trim(trim_req).await.unwrap(); + + // Write block 0 again — should reclaim from deferred list (no flush). + write_pattern(&vhdx, 0, block_size as usize, 0xBB).await; + + // Block 0 should be FullyPresent at the same offset. + let new_offset = { + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), format::BatEntryState::FullyPresent); + mapping.file_megabyte() + }; + assert_eq!(original_offset, new_offset, "should reuse same offset"); + + // Verify data. + let buf = read_pattern(&vhdx, 0, block_size as usize).await; + assert!(buf.iter().all(|&b| b == 0xBB)); + + vhdx.close().await.unwrap(); +} + +/// Trim block A (FileSpace), crash before flush. On reopen, A should +/// still be FullyPresent with its data intact — the trim was never durable. +#[async_test] +async fn deferred_trim_crash_no_data_loss(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write block 0. + write_pattern(&vhdx, 0, block_size as usize, 0xDD).await; + vhdx.flush().await.unwrap(); + + // Trim block 0 with FileSpace — deferred, NOT flushed. + let trim_req = crate::trim::TrimRequest::new(crate::trim::TrimMode::FileSpace, 0, block_size); + vhdx.trim(trim_req).await.unwrap(); + + // Crash — get durable state. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Reopen from durable state. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Block 0 should still have its data (trim wasn't durable). + let buf = read_pattern(&vhdx2, 0, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == 0xDD), + "data should survive crash when trim wasn't flushed" + ); +} + +/// Trim block A (FileSpace), write block B using separate space, crash +/// before flush. A should keep its data, B's write should be lost. +/// No data teleportation. 
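+///
+/// ("Teleportation" would mean block B's BAT entry ends up pointing at space
+/// that still holds block A's bytes, so A's data reappears under B's address.)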
+#[async_test] +async fn deferred_no_teleportation_on_crash(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write blocks 0 and 1. + write_pattern(&vhdx, 0, block_size as usize, 0x11).await; + write_pattern(&vhdx, block_size, block_size as usize, 0x22).await; + vhdx.flush().await.unwrap(); + + // Trim block 0 — deferred, not flushed. + let trim_req = crate::trim::TrimRequest::new(crate::trim::TrimMode::FileSpace, 0, block_size); + vhdx.trim(trim_req).await.unwrap(); + + // Write block 1 with new data — this uses block 1's existing offset + // (overwrite, no allocation needed). + write_pattern(&vhdx, block_size, block_size as usize, 0x33).await; + + // Do NOT flush. Crash. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Reopen. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Block 0 should still have original data (trim wasn't durable). + let buf0 = read_pattern(&vhdx2, 0, block_size as usize).await; + assert!( + buf0.iter().all(|&b| b == 0x11), + "block 0 data should be intact after crash (trim not durable)" + ); + + // Block 1: may have old (0x22) or new (0x33) data depending on + // whether the overwrite was flushed. Either is acceptable. + // What is NOT acceptable: block 1 reading as 0x11 (block 0's data). + let buf1 = read_pattern(&vhdx2, block_size, block_size as usize).await; + assert!( + buf1.iter().all(|&b| b == 0x22) || buf1.iter().all(|&b| b == 0x33), + "block 1 should have its own data, not block 0's" + ); +} + +/// Trim + flush + write + flush + reopen: verify clean ownership. +#[async_test] +async fn deferred_trim_flush_write_flush_reopen(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size as u64; + + // Write block 0, flush. + write_pattern(&vhdx, 0, block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Trim block 0, flush (trim becomes durable). + let trim_req = crate::trim::TrimRequest::new(crate::trim::TrimMode::FileSpace, 0, block_size); + vhdx.trim(trim_req).await.unwrap(); + vhdx.flush().await.unwrap(); + + // Write block 0 again (same-block reclaim of durable anchor), flush. + write_pattern(&vhdx, 0, block_size as usize, 0xBB).await; + vhdx.flush().await.unwrap(); + + // Graceful close. + let durable = vhdx.file.durable_snapshot(); + vhdx.close().await.unwrap(); + + // Reopen and verify. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let buf = read_pattern(&vhdx2, 0, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == 0xBB), + "block 0 should have new data after trim+write+flush cycle" + ); +} diff --git a/vm/devices/storage/vhdx/src/tests/io_tests.rs b/vm/devices/storage/vhdx/src/tests/io_tests.rs new file mode 100644 index 0000000000..ec3746ad09 --- /dev/null +++ b/vm/devices/storage/vhdx/src/tests/io_tests.rs @@ -0,0 +1,3709 @@ +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT License. + +use crate::AsyncFile; +use crate::AsyncFileExt; +use crate::create::{self, CreateParams}; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::format; +use crate::format::BatEntry; +use crate::format::BatEntryState; +use crate::format::MB1; +use crate::header::WriteMode; +use crate::io::ReadRange; +use crate::io::WriteRange; +use crate::open::VhdxFile; +use crate::region; +use crate::tests::support::InMemoryFile; +use crate::tests::support::IoInterceptor; +use guid::Guid; +use pal_async::DefaultDriver; +use pal_async::async_test; +use std::borrow::Borrow; +use std::future::Future; +use std::sync::Arc; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; +use zerocopy::IntoBytes; + +#[async_test] +async fn read_empty_disk_returns_zero() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: 0, + length: 4096, + } + ); +} + +#[async_test] +async fn read_zero_length() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 0, &mut ranges).await.unwrap(); + + assert!(ranges.is_empty()); +} + +#[async_test] +async fn read_beyond_end_of_disk() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + // Read 512 bytes past the end (both offset and length are sector-aligned). + let result = vhdx + .resolve_read(format::GB1 - 512, 1024, &mut ranges) + .await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::BeyondEndOfDisk)) + )); +} + +#[async_test] +async fn read_at_disk_end_exact() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx + .resolve_read(format::GB1 - 4096, 4096, &mut ranges) + .await + .unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: format::GB1 - 4096, + length: 4096, + } + ); +} + +#[async_test] +async fn read_fully_present_block() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + let bat_offset = regions.bat_offset; + + // Write a FullyPresent BAT entry for block 0 at file_offset_mb = 4. + let entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(4); + file.write_at(bat_offset, entry.as_bytes()).await.unwrap(); + + // Extend file to cover the allocated range. 
+ let needed = 4 * MB1 + format::DEFAULT_BLOCK_SIZE as u64; + file.set_file_size(needed).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 0, + length: 4096, + file_offset: 4 * MB1, + } + ); +} + +#[async_test] +async fn read_spanning_two_blocks() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let block_size = vhdx.block_size() as u64; + let mut ranges = Vec::new(); + // Read last 512 bytes of block 0 and first 512 bytes of block 1. + let _guard = vhdx + .resolve_read((block_size - 512) as u64, 1024, &mut ranges) + .await + .unwrap(); + + assert_eq!(ranges.len(), 2); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: block_size - 512, + length: 512, + } + ); + assert_eq!( + ranges[1], + ReadRange::Zero { + guest_offset: block_size, + length: 512, + } + ); +} + +#[async_test] +async fn read_spanning_multiple_blocks() { + // Use a small disk with 1 MiB blocks so spans are easier to test. + let file = InMemoryFile::new(0); + let block_size = MB1 as u32; + let mut params = CreateParams { + disk_size: 4 * MB1, + block_size, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + // Read across blocks 0, 1, 2: start at 512 KiB, length = 2 MiB. + // Block 0: 512 KiB remaining. Block 1: full 1 MiB. Block 2: 512 KiB. + let start = MB1 / 2; // middle of block 0 + let len = (2 * MB1) as u32; // spans 3 blocks + let _guard = vhdx.resolve_read(start, len, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 3); + // Block 0: remaining half + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: start, + length: (MB1 / 2) as u32, + } + ); + // Block 1: full block + assert_eq!( + ranges[1], + ReadRange::Zero { + guest_offset: MB1, + length: block_size, + } + ); + // Block 2: first half + assert_eq!( + ranges[2], + ReadRange::Zero { + guest_offset: 2 * MB1, + length: (MB1 / 2) as u32, + } + ); +} + +#[async_test] +async fn read_unaligned_within_block() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Set block 0 to FullyPresent at file_offset_mb = 4. + let entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(4); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + // Extend file to cover the allocated range. + let needed = 4 * MB1 + format::DEFAULT_BLOCK_SIZE as u64; + file.set_file_size(needed).await.unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mut ranges = Vec::new(); + // Read 512 bytes starting at sector 10 (offset 5120). 
+ let _guard = vhdx.resolve_read(5120, 512, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Data { + guest_offset: 5120, + length: 512, + file_offset: 4 * MB1 + 5120, + } + ); +} + +#[async_test] +async fn read_differencing_not_present() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Unmapped { + guest_offset: 0, + length: 4096, + } + ); +} + +#[async_test] +async fn read_zero_state_block() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Set block 0 to Zero state. + let entry = BatEntry::new() + .with_state(BatEntryState::Zero as u8) + .with_file_offset_mb(0); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: 0, + length: 4096, + } + ); +} + +#[async_test] +async fn read_unmapped_block() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Set block 0 to Unmapped (trimmed) state. + let entry = BatEntry::new() + .with_state(BatEntryState::Unmapped as u8) + .with_file_offset_mb(0); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: 0, + length: 4096, + } + ); +} + +#[async_test] +async fn read_undefined_state_block() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Set block 0 to Undefined state (value 1). + let entry = BatEntry::new() + .with_state(BatEntryState::Undefined as u8) + .with_file_offset_mb(0); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: 0, + length: 4096, + } + ); +} + +#[async_test] +async fn read_entire_disk() { + // Small disk: 4 MiB with 2 MiB blocks = 2 blocks. + let disk_size = 4 * MB1; + let (file, _) = InMemoryFile::create_test_vhdx(disk_size).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx + .resolve_read(0, disk_size as u32, &mut ranges) + .await + .unwrap(); + + // 2 blocks, each produces one Zero range. 
+ assert_eq!(ranges.len(), 2); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: 0, + length: format::DEFAULT_BLOCK_SIZE, + } + ); + assert_eq!( + ranges[1], + ReadRange::Zero { + guest_offset: format::DEFAULT_BLOCK_SIZE as u64, + length: format::DEFAULT_BLOCK_SIZE, + } + ); +} + +#[async_test] +async fn read_4k_sector_disk() { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + logical_sector_size: 4096, + physical_sector_size: 4096, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + // Read one 4K sector. + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + assert_eq!(ranges.len(), 1); + assert_eq!( + ranges[0], + ReadRange::Zero { + guest_offset: 0, + length: 4096, + } + ); + + // Unaligned read should fail. + let mut ranges2 = Vec::new(); + let result = vhdx.resolve_read(512, 4096, &mut ranges2).await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::UnalignedIo)) + )); +} + +// ---- Write tests ---- + +#[async_test] +async fn write_to_empty_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 4096, &mut ranges).await.unwrap(); + + // Should allocate a new block. With SpaceState::Zero (near-EOF + // extension space), zero padding is skipped — only Data emitted. + // Writing 4096 bytes at offset 0 in block: + // Data(0, 4096, file_offset) + assert!(!ranges.is_empty()); + // First should be Data + match ranges[0] { + WriteRange::Data { + guest_offset, + length, + file_offset, + } => { + assert_eq!(guest_offset, 0); + assert_eq!(length, 4096); + // file_offset should be MB-aligned. + assert!(file_offset > 0); + assert_eq!(file_offset % MB1, 0); + } + _ => panic!("expected Data range, got {:?}", ranges[0]), + } + // With safe data, trailing zero padding is skipped. + // If not safe, a trailing Zero range would follow. + if ranges.len() > 1 { + match ranges[1] { + WriteRange::Zero { + file_offset, + length, + } => { + assert_eq!(length, format::DEFAULT_BLOCK_SIZE - 4096); + assert!(file_offset > 0); + } + _ => panic!("expected Zero range, got {:?}", ranges[1]), + } + } +} + +#[async_test] +async fn write_to_fully_present_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let regions = region::parse_region_tables(&file).await.unwrap(); + + // Write a FullyPresent BAT entry for block 0 at file_offset_mb = 4. + let entry = BatEntry::new() + .with_state(BatEntryState::FullyPresent as u8) + .with_file_offset_mb(4); + file.write_at(regions.bat_offset, entry.as_bytes()) + .await + .unwrap(); + + // Extend file to cover the allocated range. + let needed = 4 * MB1 + format::DEFAULT_BLOCK_SIZE as u64; + file.set_file_size(needed).await.unwrap(); + + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 4096, &mut ranges).await.unwrap(); + + // Should write directly to the existing block — single Data range. 
+    assert_eq!(ranges.len(), 1);
+    assert_eq!(
+        ranges[0],
+        WriteRange::Data {
+            guest_offset: 0,
+            length: 4096,
+            file_offset: 4 * MB1,
+        }
+    );
+}
+
+#[async_test]
+async fn write_spanning_two_blocks(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    let block_size = vhdx.block_size() as u64;
+    let mut ranges = Vec::new();
+    // Write last 512 bytes of block 0 and first 512 bytes of block 1.
+    let _guard = vhdx
+        .resolve_write((block_size - 512) as u64, 1024, &mut ranges)
+        .await
+        .unwrap();
+
+    // Each block needs allocation. Filter out the data ranges.
+    let data_ranges: Vec<_> = ranges
+        .iter()
+        .filter(|r| matches!(r, WriteRange::Data { .. }))
+        .collect();
+    assert_eq!(data_ranges.len(), 2, "expected 2 Data ranges for 2 blocks");
+
+    // First Data: last 512 bytes of block 0.
+    match data_ranges[0] {
+        WriteRange::Data {
+            guest_offset,
+            length,
+            ..
+        } => {
+            assert_eq!(*guest_offset, block_size - 512);
+            assert_eq!(*length, 512);
+        }
+        _ => unreachable!(),
+    }
+    // Second Data: first 512 bytes of block 1.
+    match data_ranges[1] {
+        WriteRange::Data {
+            guest_offset,
+            length,
+            ..
+        } => {
+            assert_eq!(*guest_offset, block_size);
+            assert_eq!(*length, 512);
+        }
+        _ => unreachable!(),
+    }
+}
+
+#[async_test]
+async fn write_then_read_roundtrip(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    // Step 1: resolve_write to get file offsets.
+    let mut write_ranges = Vec::new();
+    let guard = vhdx.resolve_write(0, 512, &mut write_ranges).await.unwrap();
+
+    // Step 2: Write actual data at the returned Data offsets.
+    let pattern: Vec<u8> = (0..512u16).map(|i| (i % 256) as u8).collect();
+    for wr in &write_ranges {
+        match wr {
+            WriteRange::Data {
+                file_offset,
+                length,
+                ..
+            } => {
+                vhdx.file
+                    .write_at(*file_offset, &pattern[..(*length as usize)])
+                    .await
+                    .unwrap();
+            }
+            WriteRange::Zero {
+                file_offset,
+                length,
+            } => {
+                let zeros = vec![0u8; *length as usize];
+                vhdx.file.write_at(*file_offset, &zeros).await.unwrap();
+            }
+        }
+    }
+
+    // Step 3: complete via guard.
+    guard.complete().await.unwrap();
+
+    // Step 4: resolve_read at the same offset.
+    let mut read_ranges = Vec::new();
+    let _guard = vhdx.resolve_read(0, 512, &mut read_ranges).await.unwrap();
+
+    // Should now be Data (block was allocated).
+    assert_eq!(read_ranges.len(), 1);
+    match &read_ranges[0] {
+        ReadRange::Data {
+            file_offset,
+            length,
+            ..
+        } => {
+            assert_eq!(*length, 512);
+            let mut buf = vec![0u8; 512];
+            vhdx.file.read_at(*file_offset, &mut buf).await.unwrap();
+            assert_eq!(buf, pattern);
+        }
+        other => panic!("expected Data read range, got {:?}", other),
+    }
+}
+
+#[async_test]
+async fn write_partial_block(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    // Write 512 bytes at offset 4096 within block 0.
+    let mut ranges = Vec::new();
+    let _guard = vhdx.resolve_write(4096, 512, &mut ranges).await.unwrap();
+
+    // With safe data (near-EOF or extension space), Zero padding is
+    // skipped. Only expect the Data range.
+    // Without safe data, we'd see: Zero(leading 4096), Data(512), Zero(trailing).
+    assert!(!ranges.is_empty());
+    // Find the Data range.
+    let data_range = ranges
+        .iter()
+        .find(|r| matches!(r, WriteRange::Data { ..
})) + .expect("expected at least one Data range"); + match data_range { + WriteRange::Data { + guest_offset, + length, + .. + } => { + assert_eq!(*guest_offset, 4096); + assert_eq!(*length, 512); + } + _ => unreachable!(), + } +} + +#[async_test] +async fn write_full_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write exactly one full block (no padding needed). + let mut ranges = Vec::new(); + let _guard = vhdx + .resolve_write(0, format::DEFAULT_BLOCK_SIZE, &mut ranges) + .await + .unwrap(); + + // Should be exactly one Data range — no zero padding. + assert_eq!(ranges.len(), 1); + match ranges[0] { + WriteRange::Data { + guest_offset, + length, + file_offset, + } => { + assert_eq!(guest_offset, 0); + assert_eq!(length, format::DEFAULT_BLOCK_SIZE); + assert!(file_offset > 0); + assert_eq!(file_offset % MB1, 0); + } + _ => panic!("expected Data range, got {:?}", ranges[0]), + } +} + +#[async_test] +async fn write_zero_length(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 0, &mut ranges).await.unwrap(); + assert!(ranges.is_empty()); +} + +#[async_test] +async fn write_beyond_end_of_disk(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let mut ranges = Vec::new(); + let result = vhdx + .resolve_write(format::GB1 - 512, 1024, &mut ranges) + .await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::BeyondEndOfDisk)) + )); +} + +#[async_test] +async fn write_read_only() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let mut ranges = Vec::new(); + let result = vhdx.resolve_write(0, 4096, &mut ranges).await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::ReadOnly)) + )); +} + +#[async_test] +async fn write_large_spanning_many_blocks(driver: DefaultDriver) { + // 4 MiB disk with 1 MiB blocks → 4 blocks. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: 4 * MB1, + block_size: MB1 as u32, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write 3 MiB starting at offset 512 KiB (spans blocks 0,1,2,3). + let start = MB1 / 2; + let length = (3 * MB1) as u32; + let mut ranges = Vec::new(); + let _guard = vhdx + .resolve_write(start, length, &mut ranges) + .await + .unwrap(); + + let data_ranges: Vec<_> = ranges + .iter() + .filter(|r| matches!(r, WriteRange::Data { .. })) + .collect(); + // Should span 4 blocks: partial block 0, full block 1, full block 2, partial block 3. + assert_eq!(data_ranges.len(), 4); + + // Verify guest offsets and lengths. + let block_size = MB1; + match data_ranges[0] { + WriteRange::Data { + guest_offset, + length, + .. + } => { + assert_eq!(*guest_offset, start); + assert_eq!(*length as u64, block_size - start); + } + _ => unreachable!(), + } + match data_ranges[1] { + WriteRange::Data { + guest_offset, + length, + .. 
+ } => { + assert_eq!(*guest_offset, block_size); + assert_eq!(*length as u64, block_size); + } + _ => unreachable!(), + } + match data_ranges[2] { + WriteRange::Data { + guest_offset, + length, + .. + } => { + assert_eq!(*guest_offset, 2 * block_size); + assert_eq!(*length as u64, block_size); + } + _ => unreachable!(), + } + match data_ranges[3] { + WriteRange::Data { + guest_offset, + length, + .. + } => { + assert_eq!(*guest_offset, 3 * block_size); + assert_eq!(*length as u64, start); // remaining half of last block + } + _ => unreachable!(), + } +} + +#[async_test] +async fn first_write_updates_header(driver: DefaultDriver) { + let (file, params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let original_data_guid = params.data_write_guid; + assert_eq!(vhdx.data_write_guid(), original_data_guid); + + // Perform a write — this triggers enable_write_mode. + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 512, &mut ranges).await.unwrap(); + + // data_write_guid should have changed. + let new_data_guid = vhdx.data_write_guid(); + assert_ne!(new_data_guid, original_data_guid); + assert_ne!(new_data_guid, Guid::ZERO); +} + +#[async_test] +async fn second_write_no_header_update(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // First write — triggers header update. + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 512, &mut ranges).await.unwrap(); + let guid_after_first = vhdx.data_write_guid(); + + // Second write — should NOT update header again. + let mut ranges2 = Vec::new(); + let _guard2 = vhdx.resolve_write(512, 512, &mut ranges2).await.unwrap(); + let guid_after_second = vhdx.data_write_guid(); + + assert_eq!(guid_after_first, guid_after_second); +} + +#[async_test] +async fn file_writable_only_does_not_change_data_guid(driver: DefaultDriver) { + let (file, params) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let original_data_guid = params.data_write_guid; + + // Enable FileWritable mode (metadata-only modification). + vhdx.enable_write_mode(WriteMode::FileWritable) + .await + .unwrap(); + + // data_write_guid should NOT have changed. + assert_eq!(vhdx.data_write_guid(), original_data_guid); + + // But the write mode should be set (subsequent DataWritable will escalate). + assert_eq!( + vhdx.header_state.write_mode(), + Some(WriteMode::FileWritable) + ); +} + +// --- TFP mechanics, write integration, and error path tests --- + +/// Interceptor with toggleable failure for mid-test fault injection. 
+struct ToggleableInterceptor {
+    fail_writes: Arc<AtomicBool>,
+    fail_set_file_size: Arc<AtomicBool>,
+}
+
+impl IoInterceptor for ToggleableInterceptor {
+    fn before_write(&self, _offset: u64, _data: &[u8]) -> Result<(), std::io::Error> {
+        if self.fail_writes.load(Ordering::SeqCst) {
+            return Err(std::io::Error::other("injected write failure"));
+        }
+        Ok(())
+    }
+
+    fn before_set_file_size(&self, _size: u64) -> Result<(), std::io::Error> {
+        if self.fail_set_file_size.load(Ordering::SeqCst) {
+            return Err(std::io::Error::other("injected set_file_size failure"));
+        }
+        Ok(())
+    }
+}
+
+#[async_test]
+async fn resolve_write_sets_tfp_on_full_block(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+    let block_size = vhdx.block_size();
+
+    let mut ranges = Vec::new();
+    let _guard = vhdx
+        .resolve_write(0, block_size, &mut ranges)
+        .await
+        .unwrap();
+
+    // Full-block write should set TFP on block 0.
+    let mapping = vhdx.bat.get_block_mapping(0);
+    assert!(
+        mapping.transitioning_to_fully_present(),
+        "full-block resolve_write should set TFP"
+    );
+    assert!(
+        mapping.file_megabyte() > 0,
+        "allocated block should have non-zero file offset"
+    );
+}
+
+#[async_test]
+async fn resolve_write_no_tfp_on_partial_block(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    let mut ranges = Vec::new();
+    let _guard = vhdx.resolve_write(0, 512, &mut ranges).await.unwrap();
+
+    // Partial-block write should NOT set TFP — BAT committed immediately.
+    let mapping = vhdx.bat.get_block_mapping(0);
+    assert!(
+        !mapping.transitioning_to_fully_present(),
+        "partial-block resolve_write should not set TFP"
+    );
+    assert_eq!(
+        mapping.bat_state(),
+        BatEntryState::FullyPresent,
+        "partial allocation should set FullyPresent immediately"
+    );
+}
+
+#[async_test]
+async fn write_read_roundtrip_multi_block(driver: DefaultDriver) {
+    let file = InMemoryFile::new(0);
+    let mut params = CreateParams {
+        disk_size: 4 * MB1,
+        block_size: MB1 as u32,
+        ..Default::default()
+    };
+    create::create(&file, &mut params).await.unwrap();
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    let block_size = vhdx.block_size() as u64;
+    // Write 2 full blocks starting at offset 0.
+    let length = (2 * block_size) as u32;
+    let mut write_ranges = Vec::new();
+    let guard = vhdx
+        .resolve_write(0, length, &mut write_ranges)
+        .await
+        .unwrap();
+
+    // Write recognizable pattern to each Data range.
+    for wr in &write_ranges {
+        match wr {
+            WriteRange::Data {
+                guest_offset,
+                length,
+                file_offset,
+            } => {
+                let pattern: Vec<u8> = (0..*length)
+                    .map(|i| ((guest_offset + i as u64) % 251) as u8)
+                    .collect();
+                vhdx.file.write_at(*file_offset, &pattern).await.unwrap();
+            }
+            WriteRange::Zero {
+                file_offset,
+                length,
+            } => {
+                let zeros = vec![0u8; *length as usize];
+                vhdx.file.write_at(*file_offset, &zeros).await.unwrap();
+            }
+        }
+    }
+    guard.complete().await.unwrap();
+
+    // Read back both blocks.
+    let mut read_ranges = Vec::new();
+    let _guard = vhdx
+        .resolve_read(0, length, &mut read_ranges)
+        .await
+        .unwrap();
+
+    for rr in &read_ranges {
+        match rr {
+            ReadRange::Data {
+                guest_offset,
+                length,
+                file_offset,
+            } => {
+                let mut buf = vec![0u8; *length as usize];
+                vhdx.file.read_at(*file_offset, &mut buf).await.unwrap();
+                let expected: Vec<u8> = (0..*length)
+                    .map(|i| ((guest_offset + i as u64) % 251) as u8)
+                    .collect();
+                assert_eq!(
+                    buf, expected,
+                    "data mismatch at guest offset {guest_offset}"
+                );
+            }
+            ReadRange::Zero { .. } => {
+                panic!("expected Data range after write, got Zero");
+            }
+            ReadRange::Unmapped { .. } => {
+                panic!("expected Data range after write, got Unmapped");
+            }
+        }
+    }
+}
+
+#[async_test]
+async fn write_to_already_allocated_no_growth(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let regions = region::parse_region_tables(&file).await.unwrap();
+
+    // Pre-allocate block 0 as FullyPresent at offset 100 MB.
+    let entry = BatEntry::new()
+        .with_state(BatEntryState::FullyPresent as u8)
+        .with_file_offset_mb(100);
+    file.write_at(regions.bat_offset, entry.as_bytes())
+        .await
+        .unwrap();
+
+    // Ensure file is big enough to cover that offset.
+    let needed_size = 100 * MB1 + format::DEFAULT_BLOCK_SIZE as u64;
+    file.set_file_size(needed_size).await.unwrap();
+
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+    let eof_before = vhdx.allocation_lock.lock().await.file_length;
+
+    let mut ranges = Vec::new();
+    let _guard = vhdx.resolve_write(0, 4096, &mut ranges).await.unwrap();
+
+    // No new allocation should occur — verify file length unchanged.
+    let eof_after = vhdx.allocation_lock.lock().await.file_length;
+    assert_eq!(
+        eof_before, eof_after,
+        "eof should not change for existing block"
+    );
+
+    // Should point to the existing block.
+    assert_eq!(ranges.len(), 1);
+    match ranges[0] {
+        WriteRange::Data { file_offset, .. } => {
+            assert_eq!(file_offset, 100 * MB1);
+        }
+        _ => panic!("expected Data range"),
+    }
+}
+
+#[async_test]
+async fn write_flush_persists_bat(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    // Write and complete a full block.
+    let block_size = vhdx.block_size();
+    let mut ranges = Vec::new();
+    let guard = vhdx
+        .resolve_write(0, block_size, &mut ranges)
+        .await
+        .unwrap();
+    guard.complete().await.unwrap();
+    vhdx.flush().await.unwrap();
+
+    // Snapshot immediately after flush — proves flush persisted the BAT.
+    // Log GUID is still set, so reopen will do log replay.
+    let snapshot = vhdx.file.snapshot();
+
+    // Reopen from snapshot (log replay recovers the state).
+    let recovered = InMemoryFile::from_snapshot(snapshot);
+    let vhdx2 = VhdxFile::open(recovered).writable(&driver).await.unwrap();
+    let mapping = vhdx2.bat.get_block_mapping(0);
+    assert_eq!(
+        mapping.bat_state(),
+        BatEntryState::FullyPresent,
+        "BAT should show FullyPresent after flush + reopen"
+    );
+    assert!(
+        mapping.file_megabyte() > 0,
+        "BAT should have non-zero offset after flush + reopen"
+    );
+}
+
+#[async_test]
+async fn complete_write_clears_tfp(driver: DefaultDriver) {
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+    let block_size = vhdx.block_size();
+
+    // resolve_write should set TFP.
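+    // (TFP = transitioning-to-fully-present: the allocation is visible in the
+    // in-memory BAT, but the entry only becomes FullyPresent when the write
+    // guard completes; dropping the guard reverts it.)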
+ let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + { + let mapping = vhdx.bat.get_block_mapping(0); + assert!(mapping.transitioning_to_fully_present()); + } + + // guard.complete() should clear TFP. + guard.complete().await.unwrap(); + + { + let mapping = vhdx.bat.get_block_mapping(0); + assert!( + !mapping.transitioning_to_fully_present(), + "TFP should be cleared after complete_write" + ); + assert_eq!( + mapping.bat_state(), + BatEntryState::FullyPresent, + "block should be FullyPresent after complete" + ); + } +} + +#[async_test] +async fn complete_write_writes_bat_to_disk(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + // Get the allocated offset from in-memory BAT. + let expected_mb = vhdx.bat.get_block_mapping(0).file_megabyte(); + + guard.complete().await.unwrap(); + vhdx.flush().await.unwrap(); + + // Snapshot after flush — proves complete + flush persisted the BAT. + let snapshot = vhdx.file.snapshot(); + + // Reopen from snapshot (log replay recovers the state). + let recovered = InMemoryFile::from_snapshot(snapshot); + let vhdx2 = VhdxFile::open(recovered).writable(&driver).await.unwrap(); + let mapping = vhdx2.bat.get_block_mapping(0); + assert_eq!( + mapping.bat_state(), + BatEntryState::FullyPresent, + "BAT should be FullyPresent after flush + reopen" + ); + assert_eq!( + mapping.file_megabyte(), + expected_mb, + "BAT file offset should match after flush + reopen" + ); +} + +#[async_test] +async fn resolve_write_extends_file(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let size_before = vhdx.file.file_size().await.unwrap(); + + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 512, &mut ranges).await.unwrap(); + + let size_after = vhdx.file.file_size().await.unwrap(); + assert!( + size_after > size_before, + "file should grow after allocating a new block \ + (before={size_before}, after={size_after})" + ); +} + +#[async_test] +async fn abort_write_reverts_bat(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // resolve_write for a full block → sets TFP. + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + // Abort (drop guard without complete) → reverts in-memory BAT. + drop(guard); + + // Block should be back to NotPresent with zero offset. + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::NotPresent); + assert_eq!(mapping.file_offset(), 0); + + vhdx.close().await.unwrap(); +} + +#[async_test] +async fn abort_write_clears_tfp(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + // TFP should be set. 
+ { + assert!( + vhdx.bat + .get_block_mapping(0) + .transitioning_to_fully_present() + ); + } + + // Abort (drop guard without complete). + drop(guard); + + // TFP should be cleared and state reverted to NotPresent. + { + let mapping = vhdx.bat.get_block_mapping(0); + assert!( + !mapping.transitioning_to_fully_present(), + "TFP should be cleared after abort" + ); + assert_eq!( + mapping.bat_state(), + BatEntryState::NotPresent, + "should revert to original NotPresent state" + ); + assert_eq!( + mapping.file_megabyte(), + 0, + "should revert file_megabyte to 0" + ); + } +} + +#[async_test] +async fn abort_write_allows_subsequent_write(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // First write: allocate and abort. + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + drop(guard); + + // Second write: should succeed (no TFP blocking). + let mut ranges2 = Vec::new(); + let guard2 = vhdx + .resolve_write(0, block_size, &mut ranges2) + .await + .unwrap(); + guard2.complete().await.unwrap(); + + // Block should be FullyPresent now. + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent); + assert!(!mapping.transitioning_to_fully_present()); +} + +#[async_test] +async fn complete_write_notifies_on_cache_failure(driver: DefaultDriver) { + // With write-back mode (no write-through), cache writes during + // complete() only mark pages dirty in the cache. The actual disk + // write happens on flush through the log task. So complete() + // itself should succeed even with write failures enabled. + let (orig_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let data = orig_file.snapshot(); + + let fail_writes = Arc::new(AtomicBool::new(false)); + let interceptor = Arc::new(ToggleableInterceptor { + fail_writes: fail_writes.clone(), + fail_set_file_size: Arc::new(AtomicBool::new(false)), + }); + let file = InMemoryFile::with_interceptor(0, interceptor); + file.write_at(0, &data).await.unwrap(); + + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // resolve_write succeeds (writes for header update, set_file_size). + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + // Enable write failure. + fail_writes.store(true, Ordering::SeqCst); + + // complete() should succeed — commit() is a no-op in write-back mode, + // and dirty pages are marked in cache without file I/O. + let result = guard.complete().await; + assert!( + result.is_ok(), + "complete() should succeed in write-back mode even with write failures" + ); + + // TFP should be cleared and state set to FullyPresent. + { + let mapping = vhdx.bat.get_block_mapping(0); + assert!( + !mapping.transitioning_to_fully_present(), + "TFP should be cleared after complete" + ); + assert_eq!( + mapping.bat_state(), + BatEntryState::FullyPresent, + "state should be FullyPresent after complete" + ); + } + + // Re-enable writes. + fail_writes.store(false, Ordering::SeqCst); + + // A subsequent resolve_write should work (not hang on TFP). 
+ let mut ranges2 = Vec::new(); + let _guard2 = vhdx + .resolve_write(0, block_size, &mut ranges2) + .await + .unwrap(); +} + +#[async_test] +async fn resolve_write_error_reverts_tfp(driver: DefaultDriver) { + // Create VHDX normally, then snapshot to new file with toggleable interceptor. + let (orig_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let data = orig_file.snapshot(); + + let fail_set_file_size = Arc::new(AtomicBool::new(false)); + let interceptor = Arc::new(ToggleableInterceptor { + fail_writes: Arc::new(AtomicBool::new(false)), + fail_set_file_size: fail_set_file_size.clone(), + }); + let file = InMemoryFile::with_interceptor(0, interceptor); + file.write_at(0, &data).await.unwrap(); + + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // Enable set_file_size failure. + fail_set_file_size.store(true, Ordering::SeqCst); + + // resolve_write should fail when set_file_size fails during allocation. + let mut ranges = Vec::new(); + let result = vhdx.resolve_write(0, block_size, &mut ranges).await; + assert!( + result.is_err(), + "resolve_write should fail when set_file_size fails" + ); + + // TFP should be reverted. + { + let mapping = vhdx.bat.get_block_mapping(0); + assert!( + !mapping.transitioning_to_fully_present(), + "TFP should be reverted on resolve_write error" + ); + } + + // Disable failure, retry should succeed. + fail_set_file_size.store(false, Ordering::SeqCst); + + let mut ranges2 = Vec::new(); + let _guard = vhdx + .resolve_write(0, block_size, &mut ranges2) + .await + .unwrap(); +} + +/// Verify that a new allocation from near-EOF (safe data) omits zero +/// padding, while an allocation from the free pool does emit zero padding. +#[async_test] +async fn safe_data_skips_zero_padding(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size() as u64; + + // Step 1: Partial write to block 0 at guest_offset=0, len=512. + // Allocation comes from near-EOF → SpaceState::Zero → no zero ranges. + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_write(0, 512, &mut ranges).await.unwrap(); + + let zero_ranges: Vec<_> = ranges + .iter() + .filter(|r| matches!(r, WriteRange::Zero { .. })) + .collect(); + assert!( + zero_ranges.is_empty(), + "near-EOF allocation should skip zero padding, but got {} Zero ranges", + zero_ranges.len(), + ); + + // Extract the block base offset (block_offset=0 since guest_offset=0). + let allocated_offset = match ranges[0] { + WriteRange::Data { file_offset, .. } => file_offset, + _ => panic!("expected Data range"), + }; + + // Step 2: Release the allocated space back to pool. + // (Intentionally creating an inconsistency for testing purposes.) + vhdx.free_space + .release(allocated_offset, vhdx.block_size() as u32); + + // Step 3: Partial write to block 1 at block-aligned guest offset. + // Should allocate from pool (unsafe data) → zero ranges emitted. + let mut ranges2 = Vec::new(); + let _guard2 = vhdx + .resolve_write(block_size, 512, &mut ranges2) + .await + .unwrap(); + + let zero_ranges2: Vec<_> = ranges2 + .iter() + .filter(|r| matches!(r, WriteRange::Zero { .. })) + .collect(); + assert!( + !zero_ranges2.is_empty(), + "pool allocation should emit zero padding, but got 0 Zero ranges", + ); +} + +// ---- Concurrent I/O stress tests ---- + +/// Wrapper around `InMemoryFile` that yields once on `set_file_size`. 
+///
+/// `InMemoryFile`'s async methods are synchronous (return Ready
+/// immediately), so `futures::join!` won't interleave two
+/// `resolve_write` calls. This wrapper yields once inside
+/// `set_file_size` (waking itself before returning `Pending`), creating
+/// a yield point during `allocate_space` while the `allocation_lock` is
+/// held.
+struct YieldingFile {
+    inner: InMemoryFile,
+}
+
+impl AsyncFile for YieldingFile {
+    type Buffer = Vec<u8>;
+
+    fn alloc_buffer(&self, len: usize) -> Vec<u8> {
+        self.inner.alloc_buffer(len)
+    }
+
+    async fn read_into(&self, offset: u64, buf: Vec<u8>) -> Result<Vec<u8>, std::io::Error> {
+        self.inner.read_into(offset, buf).await
+    }
+
+    async fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Vec<u8>> + Send + 'static,
+    ) -> Result<(), std::io::Error> {
+        self.inner.write_from(offset, buf).await
+    }
+
+    async fn flush(&self) -> Result<(), std::io::Error> {
+        self.inner.flush().await
+    }
+    async fn file_size(&self) -> Result<u64, std::io::Error> {
+        self.inner.file_size().await
+    }
+    async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> {
+        // Yield once to allow other futures to run, then resume.
+        // We must wake ourselves before returning Pending, otherwise
+        // the executor won't re-poll us (deadlock).
+        let mut yielded = false;
+        std::future::poll_fn(|cx| {
+            if !yielded {
+                yielded = true;
+                cx.waker().wake_by_ref();
+                std::task::Poll::Pending
+            } else {
+                std::task::Poll::Ready(())
+            }
+        })
+        .await;
+        self.inner.set_file_size(size).await
+    }
+}
+
+/// Helper: create a VHDX with custom block size on an `InMemoryFile`,
+/// returning the file and params.
+async fn create_vhdx_with_block_size(
+    disk_size: u64,
+    block_size: u32,
+) -> (InMemoryFile, CreateParams) {
+    let file = InMemoryFile::new(0);
+    let mut params = CreateParams {
+        disk_size,
+        block_size,
+        ..Default::default()
+    };
+    create::create(&file, &mut params).await.unwrap();
+    (file, params)
+}
+
+/// Helper: perform a full write-complete cycle on a single block.
+async fn write_block(
+    vhdx: &VhdxFile,
+    guest_offset: u64,
+    length: u32,
+    pattern_byte: u8,
+) {
+    let mut ranges = Vec::new();
+    let guard = vhdx
+        .resolve_write(guest_offset, length, &mut ranges)
+        .await
+        .unwrap();
+
+    // Write pattern data at each Data range, zero at each Zero range.
+    for wr in &ranges {
+        match wr {
+            WriteRange::Data {
+                file_offset,
+                length,
+                ..
+            } => {
+                let data = vec![pattern_byte; *length as usize];
+                vhdx.file.write_at(*file_offset, &data).await.unwrap();
+            }
+            WriteRange::Zero {
+                file_offset,
+                length,
+            } => {
+                let zeros = vec![0u8; *length as usize];
+                vhdx.file.write_at(*file_offset, &zeros).await.unwrap();
+            }
+        }
+    }
+
+    guard.complete().await.unwrap();
+}
+
+/// Helper: read a block and verify the pattern byte.
+async fn verify_block_pattern(
+    vhdx: &VhdxFile,
+    guest_offset: u64,
+    length: u32,
+    expected_byte: u8,
+) {
+    let mut ranges = Vec::new();
+    let _guard = vhdx
+        .resolve_read(guest_offset, length, &mut ranges)
+        .await
+        .unwrap();
+
+    for rr in &ranges {
+        match rr {
+            ReadRange::Data {
+                file_offset,
+                length,
+                ..
+            } => {
+                let mut buf = vec![0u8; *length as usize];
+                vhdx.file.read_at(*file_offset, &mut buf).await.unwrap();
+                assert!(
+                    buf.iter().all(|&b| b == expected_byte),
+                    "expected all bytes to be 0x{:02x} at file_offset {}, \
+                     but found mismatch",
+                    expected_byte,
+                    file_offset,
+                );
+            }
+            ReadRange::Zero { .. } => {
+                assert_eq!(expected_byte, 0, "expected data but got Zero range");
+            }
+            ReadRange::Unmapped { ..
} => { + panic!("unexpected Unmapped range in non-differencing disk"); + } + } + } +} + +#[async_test] +async fn concurrent_reads_same_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Pre-allocate block 0 with known data. + write_block(&*vhdx, 0, block_size, 0xAA).await; + + // Spawn 10 concurrent reads to the same block. + let futures: Vec<_> = (0..10) + .map(|_| { + let vhdx = vhdx.clone(); + async move { + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, block_size, &mut ranges).await.unwrap(); + assert_eq!(ranges.len(), 1); + match &ranges[0] { + ReadRange::Data { + guest_offset, + length, + file_offset, + } => { + assert_eq!(*guest_offset, 0); + assert_eq!(*length, block_size); + assert!(*file_offset > 0); + } + other => panic!("expected Data range, got {:?}", other), + } + ranges + } + }) + .collect(); + + let results = futures::future::join_all(futures).await; + + // All results should be identical. + let first = &results[0]; + for result in &results[1..] { + assert_eq!(first, result); + } +} + +#[async_test] +async fn concurrent_reads_different_blocks(driver: DefaultDriver) { + let (file, _) = create_vhdx_with_block_size(4 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Pre-allocate blocks 0, 1, 2. + for i in 0..3u8 { + write_block(&*vhdx, i as u64 * block_size as u64, block_size, 0x10 + i).await; + } + + // Spawn 3 concurrent reads, one per block. + let futures: Vec<_> = (0..3u32) + .map(|i| { + let vhdx = vhdx.clone(); + let bs = block_size; + async move { + let mut ranges = Vec::new(); + let _guard = vhdx + .resolve_read(i as u64 * bs as u64, bs, &mut ranges) + .await + .unwrap(); + assert_eq!(ranges.len(), 1); + match &ranges[0] { + ReadRange::Data { file_offset, .. } => { + assert!(*file_offset > 0); + } + other => panic!("expected Data range for block {}, got {:?}", i, other), + } + } + }) + .collect(); + + futures::future::join_all(futures).await; +} + +#[async_test] +async fn concurrent_writes_different_blocks(driver: DefaultDriver) { + // 8 MiB disk with 1 MiB blocks → 8 blocks. + let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Spawn 4 concurrent tasks, each writing to a unique block. + let futures: Vec<_> = (0..4u8) + .map(|i| { + let vhdx = vhdx.clone(); + let bs = block_size; + async move { + let offset = i as u64 * bs as u64; + let pattern = 0x40 + i; + write_block(&*vhdx, offset, bs, pattern).await; + } + }) + .collect(); + + futures::future::join_all(futures).await; + + // Verify each block reads back the correct pattern. + for i in 0..4u8 { + let offset = i as u64 * block_size as u64; + verify_block_pattern(&*vhdx, offset, block_size, 0x40 + i).await; + } +} + +#[async_test] +async fn concurrent_writes_same_block(driver: DefaultDriver) { + // This test exercises concurrent writes to the same unallocated block. + // The correct behavior is serialization: + // 1. task_a: resolve_write → acquires allocation lock → allocates + // → sets TFP → returns ranges + // 2. task_a: complete_write → clears TFP → FullyPresent → notifies + // 3. 
task_b: resolve_write → was waiting for TFP to clear (either + // in the read phase or after acquiring the lock). Once cleared, + // sees FullyPresent → emits Data range → returns. + // + // Uses YieldingFile to force a yield during set_file_size (inside + // allocate_space), creating the interleaving where task_b's read + // phase may see NotPresent before task_a sets TFP. + + let (inner_file, _) = create_vhdx_with_block_size(4 * MB1, MB1 as u32).await; + let data = inner_file.snapshot(); + + let yielding_file = YieldingFile { + inner: InMemoryFile::new(0), + }; + yielding_file.inner.write_at(0, &data).await.unwrap(); + + let vhdx = Arc::new( + VhdxFile::open(yielding_file) + .writable(&driver) + .await + .unwrap(), + ); + let block_size = vhdx.block_size(); + + // Both tasks write to block 0 (offset 0, full block). + // task_a does resolve + complete as a unit so TFP clears and + // task_b (serialized behind task_a) can proceed. + let vhdx_a = vhdx.clone(); + let vhdx_b = vhdx.clone(); + + let task_a = async { + let mut ranges = Vec::new(); + let guard = vhdx_a + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + guard.complete().await.unwrap(); + ranges + }; + + let task_b = async { + let mut ranges = Vec::new(); + let _guard = vhdx_b + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + ranges + }; + + let (ranges_a, ranges_b) = futures::join!(task_a, task_b); + + // Both should have produced data ranges. + assert!(!ranges_a.is_empty(), "task_a produced no ranges"); + assert!(!ranges_b.is_empty(), "task_b produced no ranges"); + + // Block should be FullyPresent. + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent); + assert!(!mapping.transitioning_to_fully_present()); +} + +#[async_test] +async fn concurrent_flush_requests(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Write to a block, complete. + write_block(&*vhdx, 0, block_size, 0xBB).await; + + // Spawn 5 concurrent flush calls. + let futures: Vec<_> = (0..5) + .map(|_| { + let vhdx = vhdx.clone(); + async move { + vhdx.flush().await.unwrap(); + } + }) + .collect(); + + futures::future::join_all(futures).await; +} + +#[async_test] +async fn stress_random_writes_no_corruption(driver: DefaultDriver) { + // 8 MiB disk with 1 MiB blocks → 8 blocks. + let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Spawn 8 tasks, each claiming a unique block. + let futures: Vec<_> = (0..8u8) + .map(|i| { + let vhdx = vhdx.clone(); + let bs = block_size; + async move { + let offset = i as u64 * bs as u64; + let pattern = 0x80 + i; + write_block(&*vhdx, offset, bs, pattern).await; + vhdx.flush().await.unwrap(); + } + }) + .collect(); + + futures::future::join_all(futures).await; + + // Verify all blocks. 
+ for i in 0..8u8 { + let offset = i as u64 * block_size as u64; + verify_block_pattern(&*vhdx, offset, block_size, 0x80 + i).await; + } +} + +#[async_test] +async fn concurrent_read_and_write_same_block(driver: DefaultDriver) { + let (file, _) = create_vhdx_with_block_size(4 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Pre-allocate block 0 with known data. + write_block(&*vhdx, 0, block_size, 0xCC).await; + + // Concurrent: read block 0, write block 1. + let vhdx_r = vhdx.clone(); + let vhdx_w = vhdx.clone(); + + let read_task = async move { + let mut ranges = Vec::new(); + let _guard = vhdx_r + .resolve_read(0, block_size, &mut ranges) + .await + .unwrap(); + assert_eq!(ranges.len(), 1); + match &ranges[0] { + ReadRange::Data { .. } => {} + other => panic!("expected Data range, got {:?}", other), + } + }; + + let write_task = async move { + let offset = block_size as u64; + write_block(&*vhdx_w, offset, block_size, 0xDD).await; + }; + + futures::join!(read_task, write_task); + + // Verify block 0 still has original data. + verify_block_pattern(&*vhdx, 0, block_size, 0xCC).await; + // Verify block 1 has new data. + verify_block_pattern(&*vhdx, block_size as u64, block_size, 0xDD).await; +} + +// ---- IoGuard refcount tracking tests ---- + +#[async_test] +async fn read_guard_increments_refcount(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // Pre-allocate block 0 so it's FullyPresent. + write_block(&vhdx, 0, block_size, 0xAA).await; + + // Resolve a read — refcount should be 1 while guard is alive. + let mut ranges = Vec::new(); + let guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(vhdx.bat.io_refcount(0), 1); + + // Drop the guard — refcount should go back to 0. + drop(guard); + + assert_eq!(vhdx.bat.io_refcount(0), 0); +} + +#[async_test] +async fn read_guard_drop_decrements_refcount(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // Pre-allocate block 0. + write_block(&vhdx, 0, block_size, 0xBB).await; + + let mut ranges = Vec::new(); + let guard = vhdx.resolve_read(0, block_size, &mut ranges).await.unwrap(); + + // Refcount is 1 while guard is held. + assert_eq!(vhdx.bat.io_refcount(0), 1); + + // Drop explicitly. + drop(guard); + + // Refcount back to 0. + assert_eq!(vhdx.bat.io_refcount(0), 0); +} + +#[async_test] +async fn read_guard_multiple_blocks(driver: DefaultDriver) { + let (file, _) = create_vhdx_with_block_size(4 * MB1, MB1 as u32).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // Write 3 blocks. + write_block(&vhdx, 0, block_size, 0x11).await; + write_block(&vhdx, block_size as u64, block_size, 0x22).await; + write_block(&vhdx, 2 * block_size as u64, block_size, 0x33).await; + + // Read spanning all 3 blocks. 
+ let mut ranges = Vec::new(); + let guard = vhdx + .resolve_read(0, 3 * block_size, &mut ranges) + .await + .unwrap(); + + assert_eq!(vhdx.bat.io_refcount(0), 1); + assert_eq!(vhdx.bat.io_refcount(1), 1); + assert_eq!(vhdx.bat.io_refcount(2), 1); + + drop(guard); + + assert_eq!(vhdx.bat.io_refcount(0), 0); + assert_eq!(vhdx.bat.io_refcount(1), 0); + assert_eq!(vhdx.bat.io_refcount(2), 0); +} + +#[async_test] +async fn read_guard_zero_range_has_refcount() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + // Read an unallocated (Zero) block — refcount is still incremented + // (harmless, since trim won't touch unallocated blocks). + let mut ranges = Vec::new(); + let guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + assert_eq!(vhdx.bat.io_refcount(0), 1); + + drop(guard); + + assert_eq!(vhdx.bat.io_refcount(0), 0); +} + +#[async_test] +async fn write_guard_complete_drops_refcount(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + // Refcount should be 1. + assert_eq!(vhdx.bat.io_refcount(0), 1); + + // Write data and complete. + for wr in &ranges { + match wr { + WriteRange::Data { + file_offset, + length, + .. + } => { + let data = vec![0xEE; *length as usize]; + vhdx.file.write_at(*file_offset, &data).await.unwrap(); + } + WriteRange::Zero { + file_offset, + length, + } => { + let zeros = vec![0u8; *length as usize]; + vhdx.file.write_at(*file_offset, &zeros).await.unwrap(); + } + } + } + + guard.complete().await.unwrap(); + + // After complete + drop, refcount should be 0 and block should be FullyPresent. + assert_eq!(vhdx.bat.io_refcount(0), 0); + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent); +} + +#[async_test] +async fn write_guard_drop_aborts_and_decrements_refcount(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + + // Refcount should be 1. + assert_eq!(vhdx.bat.io_refcount(0), 1); + + // Drop without calling complete() — abort. + drop(guard); + + // Refcount should be 0, block should be back to NotPresent. + assert_eq!(vhdx.bat.io_refcount(0), 0); + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::NotPresent); +} + +#[async_test] +async fn concurrent_read_guards_same_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // Pre-allocate block 0. + write_block(&vhdx, 0, block_size, 0xFF).await; + + // Two concurrent reads on the same block. + let mut ranges1 = Vec::new(); + let mut ranges2 = Vec::new(); + let guard1 = vhdx.resolve_read(0, 4096, &mut ranges1).await.unwrap(); + let guard2 = vhdx.resolve_read(0, 4096, &mut ranges2).await.unwrap(); + + // Refcount should be 2. + assert_eq!(vhdx.bat.io_refcount(0), 2); + + // Drop first guard — refcount should be 1. 
+ drop(guard1); + assert_eq!(vhdx.bat.io_refcount(0), 1); + + // Drop second guard — refcount should be 0. + drop(guard2); + assert_eq!(vhdx.bat.io_refcount(0), 0); +} + +// ---- Concurrent write+trim and mixed-workload stress tests ---- + +use crate::trim::TrimMode; +use crate::trim::TrimRequest; + +#[async_test] +async fn concurrent_write_and_trim_same_block(driver: DefaultDriver) { + // Setup: 8 MiB disk, 1 MiB blocks. + let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Write block 0 with pattern 0xAA, complete. + write_block(&*vhdx, 0, block_size, 0xAA).await; + + // Concurrently: write block 0 with 0xEE + trim block 0 (FileSpace). + let vhdx_w = vhdx.clone(); + let vhdx_t = vhdx.clone(); + + let (write_result, trim_result) = futures::join!( + async { + write_block(&*vhdx_w, 0, block_size, 0xEE).await; + Ok::<(), VhdxIoError>(()) + }, + async { + vhdx_t + .trim(TrimRequest::new(TrimMode::FileSpace, 0, block_size as u64)) + .await + } + ); + + write_result.unwrap(); + trim_result.unwrap(); + + // Check what actually happened by examining block state. + let mapping = vhdx.bat.get_block_mapping(0); + match mapping.bat_state() { + BatEntryState::Unmapped => { + // Trim won — read should return zeros. + verify_block_pattern(&*vhdx, 0, block_size, 0x00).await; + } + BatEntryState::FullyPresent => { + // Write won — read should return 0xEE. + verify_block_pattern(&*vhdx, 0, block_size, 0xEE).await; + } + other => panic!("unexpected state: {other:?}"), + } +} + +#[async_test] +async fn concurrent_trim_then_rewrite(driver: DefaultDriver) { + // Setup: 8 MiB disk, 1 MiB blocks. + let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Write block 0 with pattern 0xAA. + write_block(&*vhdx, 0, block_size, 0xAA).await; + + // Sequential: trim → rewrite. Verify the trim→re-allocate path. + vhdx.trim(TrimRequest::new(TrimMode::FileSpace, 0, block_size as u64)) + .await + .unwrap(); + + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!( + mapping.bat_state(), + BatEntryState::Unmapped, + "block should be Unmapped after trim" + ); + + // Re-write with pattern 0xBB. + write_block(&*vhdx, 0, block_size, 0xBB).await; + + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent); + verify_block_pattern(&*vhdx, 0, block_size, 0xBB).await; + + // Now do trim + write concurrently. + let vhdx_t = vhdx.clone(); + let vhdx_w = vhdx.clone(); + + let (trim_result, write_result) = futures::join!( + async { + vhdx_t + .trim(TrimRequest::new(TrimMode::FileSpace, 0, block_size as u64)) + .await + }, + async { + write_block(&*vhdx_w, 0, block_size, 0xCC).await; + Ok::<(), VhdxIoError>(()) + } + ); + + trim_result.unwrap(); + write_result.unwrap(); + + // Verify no panics and data is consistent. + let mapping = vhdx.bat.get_block_mapping(0); + match mapping.bat_state() { + BatEntryState::Unmapped => { + verify_block_pattern(&*vhdx, 0, block_size, 0x00).await; + } + BatEntryState::FullyPresent => { + verify_block_pattern(&*vhdx, 0, block_size, 0xCC).await; + } + other => panic!("unexpected state: {other:?}"), + } +} + +#[async_test] +async fn mixed_workload_stress(driver: DefaultDriver) { + // 8 MiB disk with 1 MiB blocks → 8 blocks. 
+    let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await;
+    let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap());
+    let block_size = vhdx.block_size();
+    let num_blocks: u32 = 8;
+
+    // Shadow state: None = unwritten/trimmed (expect zeros), Some(pattern) = last written pattern.
+    let shadow: Arc<parking_lot::Mutex<Vec<Option<u8>>>> =
+        Arc::new(parking_lot::Mutex::new(vec![None; num_blocks as usize]));
+
+    let num_tasks: u32 = 8;
+    let iters_per_task: u8 = 16;
+
+    let tasks: Vec<_> = (0..num_tasks)
+        .map(|task_id| {
+            let vhdx = vhdx.clone();
+            let shadow = shadow.clone();
+            let bs = block_size;
+
+            async move {
+                for iter in 0..iters_per_task {
+                    let block = (task_id.wrapping_mul(3).wrapping_add(iter as u32)) % num_blocks;
+                    let pattern = ((task_id as u16 * 16 + iter as u16) as u8) | 0x01; // always nonzero
+                    let block_offset = block as u64 * bs as u64;
+
+                    let op = (task_id as u8).wrapping_add(iter) % 10;
+                    match op {
+                        0..=4 => {
+                            // Write (50%)
+                            write_block(&*vhdx, block_offset, bs, pattern).await;
+                            shadow.lock()[block as usize] = Some(pattern);
+                        }
+                        5..=7 => {
+                            // Read + verify (30%)
+                            let expected = shadow.lock()[block as usize];
+                            let mut ranges = Vec::new();
+                            let guard = vhdx
+                                .resolve_read(block_offset, bs, &mut ranges)
+                                .await
+                                .unwrap();
+                            for rr in &ranges {
+                                match rr {
+                                    ReadRange::Data {
+                                        file_offset,
+                                        length,
+                                        ..
+                                    } => {
+                                        let mut buf = vec![0u8; *length as usize];
+                                        vhdx.file.read_at(*file_offset, &mut buf).await.unwrap();
+                                        let exp = expected.unwrap_or_else(|| {
+                                            panic!(
+                                                "task {task_id} iter {iter}: shadow says \
+                                                 None but got Data range"
+                                            )
+                                        });
+                                        assert!(
+                                            buf.iter().all(|&b| b == exp),
+                                            "task {task_id} iter {iter}: expected \
+                                             0x{exp:02x}, got mismatch"
+                                        );
+                                    }
+                                    ReadRange::Zero { .. } => {
+                                        assert!(
+                                            expected.is_none(),
+                                            "task {task_id} iter {iter}: got Zero but \
+                                             expected Some({:02x})",
+                                            expected.unwrap()
+                                        );
+                                    }
+                                    ReadRange::Unmapped { .. } => {
+                                        panic!("unexpected Unmapped on non-differencing disk");
+                                    }
+                                }
+                            }
+                            drop(guard);
+                        }
+                        8 => {
+                            // Trim (10%)
+                            vhdx.trim(TrimRequest::new(
+                                TrimMode::FileSpace,
+                                block_offset,
+                                bs as u64,
+                            ))
+                            .await
+                            .unwrap();
+                            shadow.lock()[block as usize] = None;
+                        }
+                        9 => {
+                            // Flush (10%)
+                            vhdx.flush().await.unwrap();
+                        }
+                        _ => unreachable!(),
+                    }
+                }
+            }
+        })
+        .collect();
+
+    futures::future::join_all(tasks).await;
+
+    // Post-check: verify every block against final shadow state.
+    let final_shadow = shadow.lock().clone();
+    for block in 0..num_blocks {
+        let block_offset = block as u64 * block_size as u64;
+        let expected = final_shadow[block as usize];
+        match expected {
+            Some(pattern) => {
+                verify_block_pattern(&*vhdx, block_offset, block_size, pattern).await;
+            }
+            None => {
+                // Should be zeros.
+                let mut ranges = Vec::new();
+                let _guard = vhdx
+                    .resolve_read(block_offset, block_size, &mut ranges)
+                    .await
+                    .unwrap();
+                for rr in &ranges {
+                    match rr {
+                        ReadRange::Zero { .. } => {}
+                        ReadRange::Data {
+                            file_offset,
+                            length,
+                            ..
+                        } => {
+                            let mut buf = vec![0u8; *length as usize];
+                            vhdx.file.read_at(*file_offset, &mut buf).await.unwrap();
+                            assert!(
+                                buf.iter().all(|&b| b == 0),
+                                "block {block}: shadow says None but data is non-zero"
+                            );
+                        }
+                        ReadRange::Unmapped { .. } => {
+                            panic!("unexpected Unmapped on non-differencing disk");
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+#[async_test]
+async fn concurrent_partial_writes_same_block(driver: DefaultDriver) {
+    // 8 MiB disk, 1 MiB blocks.
+ let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Pre-allocate block 0 with pattern 0xAA. + write_block(&*vhdx, 0, block_size, 0xAA).await; + + let half = block_size / 2; + let vhdx_a = vhdx.clone(); + let vhdx_b = vhdx.clone(); + + // Concurrently write first half with 0xBB, second half with 0xCC. + let ((), ()) = futures::join!( + async { + // Task A: write first half. + let mut ranges = Vec::new(); + let guard = vhdx_a.resolve_write(0, half, &mut ranges).await.unwrap(); + for wr in &ranges { + match wr { + WriteRange::Data { + file_offset, + length, + .. + } => { + let data = vec![0xBB; *length as usize]; + vhdx_a.file.write_at(*file_offset, &data).await.unwrap(); + } + WriteRange::Zero { + file_offset, + length, + } => { + let zeros = vec![0u8; *length as usize]; + vhdx_a.file.write_at(*file_offset, &zeros).await.unwrap(); + } + } + } + guard.complete().await.unwrap(); + }, + async { + // Task B: write second half. + let mut ranges = Vec::new(); + let guard = vhdx_b + .resolve_write(half as u64, half, &mut ranges) + .await + .unwrap(); + for wr in &ranges { + match wr { + WriteRange::Data { + file_offset, + length, + .. + } => { + let data = vec![0xCC; *length as usize]; + vhdx_b.file.write_at(*file_offset, &data).await.unwrap(); + } + WriteRange::Zero { + file_offset, + length, + } => { + let zeros = vec![0u8; *length as usize]; + vhdx_b.file.write_at(*file_offset, &zeros).await.unwrap(); + } + } + } + guard.complete().await.unwrap(); + } + ); + + // Read back full block: first half should be 0xBB, second half 0xCC. + let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, block_size, &mut ranges).await.unwrap(); + + for rr in &ranges { + match rr { + ReadRange::Data { + guest_offset, + length, + file_offset, + } => { + let mut buf = vec![0u8; *length as usize]; + vhdx.file.read_at(*file_offset, &mut buf).await.unwrap(); + + // Determine expected pattern based on position within block. + for (i, &byte) in buf.iter().enumerate() { + let pos = (*guest_offset as usize) + i; + let expected = if pos < half as usize { 0xBB } else { 0xCC }; + assert_eq!( + byte, expected, + "byte at guest offset {pos}: expected 0x{expected:02x}, got 0x{byte:02x}" + ); + } + } + other => panic!("expected Data range, got {other:?}"), + } + } +} + +#[async_test] +async fn concurrent_write_flush_trim_interleaved(driver: DefaultDriver) { + // Setup: 8 MiB disk, 1 MiB blocks. + let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Write block 0 with 0xDD, complete. + write_block(&*vhdx, 0, block_size, 0xDD).await; + + let vhdx_f = vhdx.clone(); + let vhdx_t = vhdx.clone(); + let vhdx_r = vhdx.clone(); + + // Concurrently: flush + trim block 0 + read block 1 (unallocated → zeros). + let (flush_result, trim_result, read_result) = futures::join!( + async { vhdx_f.flush().await }, + async { + vhdx_t + .trim(TrimRequest::new(TrimMode::FileSpace, 0, block_size as u64)) + .await + }, + async { + let mut ranges = Vec::new(); + let _guard = vhdx_r + .resolve_read(block_size as u64, block_size, &mut ranges) + .await + .unwrap(); + // Block 1 is unallocated → should be Zero. + for rr in &ranges { + assert!( + matches!(rr, ReadRange::Zero { .. 
}),
+                    "block 1 should be Zero, got {rr:?}"
+                );
+            }
+            Ok::<(), VhdxIoError>(())
+        }
+    );
+
+    flush_result.unwrap();
+    trim_result.unwrap();
+    read_result.unwrap();
+
+    // Verify block 0 state is consistent.
+    let mapping = vhdx.bat.get_block_mapping(0);
+    match mapping.bat_state() {
+        BatEntryState::Unmapped => {
+            // Trim completed — read should return zeros.
+            verify_block_pattern(&*vhdx, 0, block_size, 0x00).await;
+        }
+        BatEntryState::FullyPresent => {
+            // Flush completed before trim could run — data preserved.
+            verify_block_pattern(&*vhdx, 0, block_size, 0xDD).await;
+        }
+        other => panic!("unexpected state: {other:?}"),
+    }
+}
+
+#[async_test]
+async fn stress_write_trim_cycle(driver: DefaultDriver) {
+    // 8 MiB disk with 1 MiB blocks → 8 blocks.
+    let (file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await;
+    let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap());
+    let block_size = vhdx.block_size();
+
+    let num_writer_tasks: u32 = 4;
+    let num_reader_tasks: u32 = 2;
+    let iters_per_writer: u8 = 8;
+
+    // Shadow state: None = unwritten/trimmed (zeros), Some(pattern) = last written.
+    let shadow: Arc<parking_lot::Mutex<Vec<Option<u8>>>> =
+        Arc::new(parking_lot::Mutex::new(vec![None; 4]));
+
+    // Writer tasks: write → trim → write again on block `task_id`.
+    let writer_tasks: Vec<_> = (0..num_writer_tasks)
+        .map(|task_id| {
+            let vhdx = vhdx.clone();
+            let shadow = shadow.clone();
+            let bs = block_size;
+
+            async move {
+                for iter in 0..iters_per_writer {
+                    let block_offset = task_id as u64 * bs as u64;
+                    let pattern_a = ((task_id as u16 * 32 + iter as u16 * 2) as u8) | 0x01;
+                    let pattern_b = ((task_id as u16 * 32 + iter as u16 * 2 + 1) as u8) | 0x01;
+
+                    // Write with pattern_a.
+                    write_block(&*vhdx, block_offset, bs, pattern_a).await;
+                    shadow.lock()[task_id as usize] = Some(pattern_a);
+
+                    // Trim.
+                    vhdx.trim(TrimRequest::new(
+                        TrimMode::FileSpace,
+                        block_offset,
+                        bs as u64,
+                    ))
+                    .await
+                    .unwrap();
+                    shadow.lock()[task_id as usize] = None;
+
+                    // Write with pattern_b.
+                    write_block(&*vhdx, block_offset, bs, pattern_b).await;
+                    shadow.lock()[task_id as usize] = Some(pattern_b);
+                }
+            }
+        })
+        .collect();
+
+    // Reader tasks: continuously read all 4 blocks, verify consistency.
+    let reader_tasks: Vec<_> = (0..num_reader_tasks)
+        .map(|_reader_id| {
+            let vhdx = vhdx.clone();
+            let shadow = shadow.clone();
+            let bs = block_size;
+
+            async move {
+                // Read all 4 blocks multiple times.
+                for _round in 0..16 {
+                    for block in 0..4u32 {
+                        let block_offset = block as u64 * bs as u64;
+                        let expected = shadow.lock()[block as usize];
+
+                        let mut ranges = Vec::new();
+                        let guard = vhdx
+                            .resolve_read(block_offset, bs, &mut ranges)
+                            .await
+                            .unwrap();
+                        for rr in &ranges {
+                            match rr {
+                                ReadRange::Data {
+                                    file_offset,
+                                    length,
+                                    ..
+                                } => {
+                                    let mut buf = vec![0u8; *length as usize];
+                                    vhdx.file.read_at(*file_offset, &mut buf).await.unwrap();
+                                    match expected {
+                                        Some(exp) => {
+                                            assert!(
+                                                buf.iter().all(|&b| b == exp),
+                                                "reader block {block}: expected \
+                                                 0x{exp:02x}, got mismatch"
+                                            );
+                                        }
+                                        None => {
+                                            assert!(
+                                                buf.iter().all(|&b| b == 0),
+                                                "reader block {block}: expected zeros, \
+                                                 got non-zero data"
+                                            );
+                                        }
+                                    }
+                                }
+                                ReadRange::Zero { .. } => {
+                                    assert!(
+                                        expected.is_none(),
+                                        "reader block {block}: got Zero but expected \
+                                         Some({:02x})",
+                                        expected.unwrap()
+                                    );
+                                }
+                                ReadRange::Unmapped { .. } => {
+                                    panic!("unexpected Unmapped on non-differencing disk");
+                                }
+                            }
+                        }
+                        drop(guard);
+                    }
+                }
+            }
+        })
+        .collect();
+
+    // Run all tasks concurrently.
+    let all_tasks: Vec<_> = writer_tasks
+        .into_iter()
+        .map(|t| Box::pin(t) as std::pin::Pin<Box<dyn std::future::Future<Output = ()>>>)
+        .chain(
+            reader_tasks
+                .into_iter()
+                .map(|t| Box::pin(t) as std::pin::Pin<Box<dyn std::future::Future<Output = ()>>>),
+        )
+        .collect();
+
+    futures::future::join_all(all_tasks).await;
+
+    // Post-check: verify final state of all 4 blocks.
+    let final_shadow = shadow.lock().clone();
+    for block in 0..4u32 {
+        let block_offset = block as u64 * block_size as u64;
+        match final_shadow[block as usize] {
+            Some(pattern) => {
+                verify_block_pattern(&*vhdx, block_offset, block_size, pattern).await;
+            }
+            None => {
+                verify_block_pattern(&*vhdx, block_offset, block_size, 0x00).await;
+            }
+        }
+    }
+}
+
+// ---- SBM allocation tests ----
+
+#[async_test]
+async fn partial_write_diff_disk_allocates_sbm(driver: DefaultDriver) {
+    // A sub-block write to a NotPresent block in a differencing disk
+    // should allocate the SBM block and set the payload to PartiallyPresent.
+    let file = InMemoryFile::new(0);
+    let mut params = CreateParams {
+        disk_size: format::GB1,
+        has_parent: true,
+        ..Default::default()
+    };
+    create::create(&file, &mut params).await.unwrap();
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    // Partial write: 4096 bytes at offset 0 (sub-block).
+    write_block(&vhdx, 0, 4096, 0xAB).await;
+
+    // Block 0 should be PartiallyPresent.
+    let mapping = vhdx.bat.get_block_mapping(0);
+    assert_eq!(mapping.bat_state(), BatEntryState::PartiallyPresent);
+
+    // SBM block for chunk 0 should be FullyPresent (allocated).
+    let sbm_mapping = vhdx.bat.get_sector_bitmap_mapping(0);
+    assert_eq!(sbm_mapping.bat_state(), BatEntryState::FullyPresent);
+
+    // Read the written range — should return Data.
+    let mut ranges = Vec::new();
+    let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap();
+    let has_data = ranges.iter().any(|r| matches!(r, ReadRange::Data { .. }));
+    assert!(has_data, "written sectors should return Data");
+
+    // Read an unwritten range in the same block — should return Unmapped.
+    let mut ranges2 = Vec::new();
+    let _guard2 = vhdx.resolve_read(4096, 512, &mut ranges2).await.unwrap();
+    assert_eq!(ranges2.len(), 1);
+    assert!(
+        matches!(ranges2[0], ReadRange::Unmapped { .. }),
+        "unwritten sectors in diff disk should return Unmapped"
+    );
+}
+
+#[async_test]
+async fn partial_write_diff_disk_sbm_bits_set_correctly(driver: DefaultDriver) {
+    // Write 4096 bytes (sectors 0-7 for 512-byte sectors) to a diff disk.
+    // Verify that the written sectors read as Data and unwritten ones as Unmapped.
+    let file = InMemoryFile::new(0);
+    let mut params = CreateParams {
+        disk_size: format::GB1,
+        has_parent: true,
+        ..Default::default()
+    };
+    create::create(&file, &mut params).await.unwrap();
+    let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+    write_block(&vhdx, 0, 4096, 0xCD).await;
+
+    // Sectors 0-7 should be Data.
+    let mut ranges = Vec::new();
+    let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap();
+    assert_eq!(ranges.len(), 1);
+    match &ranges[0] {
+        ReadRange::Data {
+            guest_offset,
+            length,
+            ..
+        } => {
+            assert_eq!(*guest_offset, 0);
+            assert_eq!(*length, 4096);
+        }
+        other => panic!("expected Data, got {:?}", other),
+    }
+
+    // Sector 8 onward should be Unmapped (transparent to parent).
+ let mut ranges2 = Vec::new(); + let _guard2 = vhdx.resolve_read(4096, 512, &mut ranges2).await.unwrap(); + assert_eq!(ranges2.len(), 1); + assert_eq!( + ranges2[0], + ReadRange::Unmapped { + guest_offset: 4096, + length: 512, + } + ); +} + +#[async_test] +async fn full_block_write_diff_disk_no_sbm(driver: DefaultDriver) { + // A full-block write to a diff disk should set FullyPresent, not allocate SBM. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size(); + + // Full-block write. + write_block(&vhdx, 0, block_size, 0xEE).await; + + // Block 0 should be FullyPresent (TFP path). + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent); + + // SBM block for chunk 0 should NOT be allocated. + let sbm_mapping = vhdx.bat.get_sector_bitmap_mapping(0); + assert_ne!( + sbm_mapping.bat_state(), + BatEntryState::FullyPresent, + "full-block write should not allocate SBM" + ); +} + +#[async_test] +async fn second_partial_write_same_chunk_reuses_sbm(driver: DefaultDriver) { + // Two partial writes to different blocks in the same chunk should + // reuse the same SBM block. + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: format::GB1, + has_parent: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size() as u64; + + // First partial write to block 0. + write_block(&vhdx, 0, 4096, 0x11).await; + + let sbm_mapping_1 = vhdx.bat.get_sector_bitmap_mapping(0); + assert_eq!(sbm_mapping_1.bat_state(), BatEntryState::FullyPresent); + let sbm_offset_1 = sbm_mapping_1.file_offset(); + + // Second partial write to block 1 (same chunk). + write_block(&vhdx, block_size, 4096, 0x22).await; + + let sbm_mapping_2 = vhdx.bat.get_sector_bitmap_mapping(0); + assert_eq!(sbm_mapping_2.bat_state(), BatEntryState::FullyPresent); + let sbm_offset_2 = sbm_mapping_2.file_offset(); + + // SBM should be reused (same file offset). + assert_eq!( + sbm_offset_1, sbm_offset_2, + "SBM block should be reused, not reallocated" + ); + + // Both blocks should read back correctly. + let mut ranges0 = Vec::new(); + let _g0 = vhdx.resolve_read(0, 4096, &mut ranges0).await.unwrap(); + assert!(matches!(ranges0[0], ReadRange::Data { .. })); + + let mut ranges1 = Vec::new(); + let _g1 = vhdx + .resolve_read(block_size, 4096, &mut ranges1) + .await + .unwrap(); + assert!(matches!(ranges1[0], ReadRange::Data { .. })); +} + +#[async_test] +async fn partial_write_non_diff_disk_no_sbm(driver: DefaultDriver) { + // A sub-block write to a non-differencing disk should set FullyPresent + // and NOT allocate any SBM block. + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Partial write: 4096 bytes at offset 0. + write_block(&vhdx, 0, 4096, 0x77).await; + + // Block should be FullyPresent (not PartiallyPresent). + let mapping = vhdx.bat.get_block_mapping(0); + assert_eq!(mapping.bat_state(), BatEntryState::FullyPresent); + + // SBM should NOT be allocated. + // For non-diff disks, sector_bitmap_block_count may be 0, + // so we check via bat_state directly. 
+    let sbm_count = vhdx.bat.sector_bitmap_block_count;
+    if sbm_count > 0 {
+        let sbm_mapping = vhdx.bat.get_sector_bitmap_mapping(0);
+        assert_ne!(
+            sbm_mapping.bat_state(),
+            BatEntryState::FullyPresent,
+            "non-diff disk should not allocate SBM"
+        );
+    }
+
+    // Unwritten sectors within the block should read as Zero (not Unmapped).
+    let mut ranges = Vec::new();
+    let _guard = vhdx.resolve_read(4096, 512, &mut ranges).await.unwrap();
+    assert_eq!(ranges.len(), 1);
+    match &ranges[0] {
+        ReadRange::Data { .. } => {
+            // Data range is fine — zero-padded data within an allocated block.
+        }
+        ReadRange::Zero { .. } => {
+            // Zero range is also acceptable (block may be zero-padded).
+        }
+        ReadRange::Unmapped { .. } => {
+            panic!("non-diff disk should never return Unmapped for allocated block");
+        }
+    }
+}
+
+// -----------------------------------------------------------------------
+// File poisoning tests
+// -----------------------------------------------------------------------
+
+/// Interceptor with atomic flags for runtime fault injection.
+struct DynamicFailInterceptor {
+    fail_writes: AtomicBool,
+    fail_flushes: AtomicBool,
+}
+
+impl DynamicFailInterceptor {
+    fn new() -> Self {
+        Self {
+            fail_writes: AtomicBool::new(false),
+            fail_flushes: AtomicBool::new(false),
+        }
+    }
+}
+
+impl IoInterceptor for DynamicFailInterceptor {
+    fn before_write(&self, _offset: u64, _data: &[u8]) -> Result<(), std::io::Error> {
+        if self.fail_writes.load(Ordering::Relaxed) {
+            return Err(std::io::Error::other("injected write failure"));
+        }
+        Ok(())
+    }
+
+    fn before_flush(&self) -> Result<(), std::io::Error> {
+        if self.fail_flushes.load(Ordering::Relaxed) {
+            return Err(std::io::Error::other("injected flush failure"));
+        }
+        Ok(())
+    }
+}
+
+/// Helper: create a writable VHDX with a dynamic fault interceptor.
+async fn create_writable_with_faults(
+    driver: &DefaultDriver,
+) -> (VhdxFile, Arc<DynamicFailInterceptor>) {
+    // Create a clean VHDX, snapshot it, reopen with interceptor.
+    let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await;
+    let snapshot = file.snapshot();
+
+    let interceptor = Arc::new(DynamicFailInterceptor::new());
+    let file2 = InMemoryFile::with_interceptor(snapshot.len() as u64, interceptor.clone());
+    file2.write_at(0, &snapshot).await.unwrap();
+
+    let vhdx = VhdxFile::open(file2).writable(driver).await.unwrap();
+    (vhdx, interceptor)
+}
+
+#[async_test]
+async fn flush_io_error_poisons_file(driver: DefaultDriver) {
+    let (vhdx, interceptor) = create_writable_with_faults(&driver).await;
+
+    // Write some data successfully.
+    let data = [0xAAu8; 4096];
+    let mut ranges = Vec::new();
+    let guard = vhdx.resolve_write(0, 4096, &mut ranges).await.unwrap();
+    for range in &ranges {
+        if let WriteRange::Data {
+            file_offset,
+            length,
+            ..
+        } = range
+        {
+            vhdx.file
+                .write_at(*file_offset, &data[..*length as usize])
+                .await
+                .unwrap();
+        }
+    }
+    guard.complete().await.unwrap();
+
+    // Now inject flush failure.
+    interceptor.fail_flushes.store(true, Ordering::Relaxed);
+
+    // Flush should fail.
+    let result = vhdx.flush().await;
+    assert!(result.is_err(), "flush should fail with injected error");
+
+    // Disable the fault — shouldn't matter, file is poisoned.
+    interceptor.fail_flushes.store(false, Ordering::Relaxed);
+
+    // Subsequent writes should be rejected with Failed.
+ { + let mut ranges = Vec::new(); + let result = vhdx.resolve_write(0, 4096, &mut ranges).await; + assert!( + matches!(result, Err(VhdxIoError(VhdxIoErrorInner::Failed(_)))), + "write after poison should return Failed" + ); + } + + // Reads should also be rejected. + { + let mut ranges = Vec::new(); + let result = vhdx.resolve_read(0, 4096, &mut ranges).await; + assert!( + matches!(result, Err(VhdxIoError(VhdxIoErrorInner::Failed(_)))), + "read after poison should return Failed" + ); + } + + vhdx.abort().await; +} + +#[async_test] +async fn apply_write_error_poisons_file(driver: DefaultDriver) { + let (vhdx, interceptor) = create_writable_with_faults(&driver).await; + + // Write one block successfully and flush to ensure the pipeline works. + let data = [0xBBu8; 4096]; + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(0, 4096, &mut ranges).await.unwrap(); + for range in &ranges { + if let WriteRange::Data { + file_offset, + length, + .. + } = range + { + vhdx.file + .write_at(*file_offset, &data[..*length as usize]) + .await + .unwrap(); + } + } + guard.complete().await.unwrap(); + vhdx.flush().await.unwrap(); + + // Now inject write failures — this will hit the log task when + // it tries to write the WAL entry, and/or the apply task when + // it tries to write pages to their final file offsets. + interceptor.fail_writes.store(true, Ordering::Relaxed); + + // Write to a different block to generate new dirty BAT pages. + let block_size = vhdx.block_size() as u64; + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(block_size, 4096, &mut ranges) + .await + .unwrap(); + for range in &ranges { + if let WriteRange::Data { + file_offset, + length, + .. + } = range + { + let _ = vhdx + .file + .write_at(*file_offset, &data[..*length as usize]) + .await; + } + } + guard.complete().await.unwrap(); + + // Flush sends to the log pipeline. The log task's WAL write + // will hit the injected failure and poison the file. + let _ = vhdx.flush().await; + + // Clear the fault — the file should stay poisoned regardless. + interceptor.fail_writes.store(false, Ordering::Relaxed); + + // A second flush attempt synchronizes with the poisoned pipeline + // and ensures the error has propagated. + let _ = vhdx.flush().await; + + // The file should now be poisoned. Try an operation. + { + let mut ranges = Vec::new(); + let result = vhdx.resolve_write(0, 4096, &mut ranges).await; + assert!( + matches!(result, Err(VhdxIoError(VhdxIoErrorInner::Failed(_)))), + "write after apply failure should return Failed" + ); + } + + vhdx.abort().await; +} + +#[async_test] +async fn poison_error_message_preserved(driver: DefaultDriver) { + let (vhdx, interceptor) = create_writable_with_faults(&driver).await; + + // Write data. + let data = [0xCCu8; 4096]; + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(0, 4096, &mut ranges).await.unwrap(); + for range in &ranges { + if let WriteRange::Data { + file_offset, + length, + .. + } = range + { + vhdx.file + .write_at(*file_offset, &data[..*length as usize]) + .await + .unwrap(); + } + } + guard.complete().await.unwrap(); + + // Inject flush failure and flush. + interceptor.fail_flushes.store(true, Ordering::Relaxed); + let _ = vhdx.flush().await; + + // The error message should contain something useful. 
+    let result = vhdx.failed.check();
+    match result {
+        Err(VhdxIoError(VhdxIoErrorInner::Failed(pf))) => {
+            assert!(
+                !pf.to_string().is_empty(),
+                "poison error message should not be empty"
+            );
+        }
+        other => panic!("expected Failed, got: {other:?}"),
+    }
+
+    vhdx.abort().await;
+}
+
+// ---- Post-Log Crash Consistency Tests ----
+//
+// These tests exercise crash recovery scenarios that aren't covered by
+// the basic crash tests or concurrent tests. They focus on:
+// 1. Unsafe (free-pool) allocation → flush → crash → no data teleportation
+// 2. High-volume log pipeline saturation → crash → replay
+// 3. Repeated crash-recovery cycles with writable reopen
+
+use crate::tests::support::CrashTestFile;
+
+/// Helper: write a data pattern via the write path.
+async fn write_pattern_p16(vhdx: &VhdxFile, offset: u64, len: usize, value: u8) {
+    let write_buf = vec![value; len];
+    let mut ranges = Vec::new();
+    let guard = vhdx
+        .resolve_write(offset, len as u32, &mut ranges)
+        .await
+        .unwrap();
+    for range in &ranges {
+        match range {
+            WriteRange::Data {
+                file_offset,
+                length,
+                ..
+            } => {
+                vhdx.file
+                    .write_at(*file_offset, &write_buf[..(*length as usize)])
+                    .await
+                    .unwrap();
+            }
+            WriteRange::Zero {
+                file_offset,
+                length,
+            } => {
+                let zeros = vec![0u8; *length as usize];
+                vhdx.file.write_at(*file_offset, &zeros).await.unwrap();
+            }
+        }
+    }
+    guard.complete().await.unwrap();
+}
+
+/// Helper: read data at a guest offset via the read path.
+async fn read_pattern_p16(vhdx: &VhdxFile, offset: u64, len: usize) -> Vec<u8> {
+    let mut buf = vec![0u8; len];
+    let mut ranges = Vec::new();
+    let _guard = vhdx
+        .resolve_read(offset, len as u32, &mut ranges)
+        .await
+        .unwrap();
+    for range in &ranges {
+        match range {
+            ReadRange::Data {
+                guest_offset,
+                file_offset,
+                length,
+            } => {
+                let start = (*guest_offset - offset) as usize;
+                let end = start + *length as usize;
+                vhdx.file
+                    .read_at(*file_offset, &mut buf[start..end])
+                    .await
+                    .unwrap();
+            }
+            ReadRange::Zero {
+                guest_offset,
+                length,
+            } => {
+                let start = (*guest_offset - offset) as usize;
+                let end = start + *length as usize;
+                buf[start..end].fill(0);
+            }
+            ReadRange::Unmapped { .. } => {}
+        }
+    }
+    buf
+}
+
+/// Unsafe (free-pool) allocation → flush → crash → no data teleportation.
+///
+/// Allocate block A, trim it to the free pool, then write block B which
+/// reuses A's freed space. Flush (so the WAL + FSN barrier are exercised),
+/// then crash and replay. Verify:
+/// - Block B has its own data (not A's old data)
+/// - Block A reads as zeros (trimmed)
+/// - No data from A "teleports" to B via stale on-disk content
+///
+/// This is the end-to-end crash test for the pre_log_fsn barrier mechanism.
+/// Existing tests verify the barrier is *set* (bat_page_has_fsn_unsafe_free_pool)
+/// and that a flush *occurs* (flush_between_data_and_wal_unsafe), but no
+/// existing test verifies that data is correct after crash+replay when
+/// the barrier was needed.
+#[async_test]
+async fn crash_unsafe_reuse_no_teleportation(driver: DefaultDriver) {
+    let (mem_file, _) = create_vhdx_with_block_size(4 * MB1, MB1 as u32).await;
+    let snapshot = mem_file.snapshot();
+
+    let crash_file = CrashTestFile::from_durable(snapshot);
+    let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap();
+    let block_size = vhdx.block_size() as u64;
+
+    // Step 1: Allocate block 0 with pattern 0xAA (near-EOF, safe).
+ write_pattern_p16(&vhdx, 0, block_size as usize, 0xAA).await; + vhdx.flush().await.unwrap(); + + // Step 2: Trim block 0 with FreeSpace mode → releases to free pool. + let trim_req = TrimRequest::new(TrimMode::FreeSpace, 0, block_size); + vhdx.trim(trim_req).await.unwrap(); + vhdx.flush().await.unwrap(); + + // Step 3: Write block 1 — should reuse block 0's freed space. + // This is the unsafe allocation (SpaceState::CrossStale) that + // requires a pre_log_fsn barrier. + write_pattern_p16(&vhdx, block_size, block_size as usize, 0xBB).await; + vhdx.flush().await.unwrap(); + + // Crash. + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Recover and verify. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Block 0 should be zeros (trimmed with FreeSpace → Unmapped/Zero). + let buf0 = read_pattern_p16(&vhdx2, 0, block_size as usize).await; + assert!( + buf0.iter().all(|&b| b == 0), + "block 0 should be zeros after FreeSpace trim + crash" + ); + + // Block 1 should have 0xBB (not 0xAA — no teleportation). + let buf1 = read_pattern_p16(&vhdx2, block_size, block_size as usize).await; + assert!( + buf1.iter().all(|&b| b == 0xBB), + "block 1 should have 0xBB, not stale data from block 0" + ); +} + +/// High-volume log pipeline stress + crash + replay. +/// +/// Writes many blocks through the full commit→log→apply pipeline (enough +/// to trigger LogFull retry and circular buffer wrapping), then crashes +/// and replays. Verifies all flushed data survives and the log replays +/// correctly even after heavy use. +/// +/// This combines the load profile of `log_pipeline_stress` (500 blocks) +/// with CrashTestFile crash semantics, which no existing test does. +#[async_test] +async fn crash_high_volume_pipeline(driver: DefaultDriver) { + const BLOCK_COUNT: usize = 100; + const BLOCK_SIZE: u64 = 2 * MB1; + const WRITE_LEN: usize = 4096; + + let disk_size = BLOCK_SIZE * (BLOCK_COUNT as u64 + 1); + let (mem_file, _) = create_vhdx_with_block_size(disk_size, BLOCK_SIZE as u32).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashTestFile::from_durable(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + + // Write 100 distinct blocks. The cache will trigger batch-full commits + // as dirty pages accumulate, and the log task will hit LogFull and + // retry as the circular buffer fills. + for i in 0..BLOCK_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let pattern = (i & 0xFF) as u8; + write_pattern_p16(&vhdx, offset, WRITE_LEN, pattern).await; + } + + // Flush everything — drives all batches through commit→log→apply. + vhdx.flush().await.unwrap(); + + // Crash (no clean close — log_guid remains set). + let durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + + // Recover with log replay. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Verify every block survived. + for i in 0..BLOCK_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let expected = (i & 0xFF) as u8; + let buf = read_pattern_p16(&vhdx2, offset, WRITE_LEN).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {i}: expected 0x{expected:02X}, got 0x{:02X}", + buf[0], + ); + } +} + +/// Repeated crash-recovery cycles with writable reopen. +/// +/// Each cycle: open writable → write new data → flush → crash → verify. 
+/// The next cycle reopens writable from the crashed state. This tests +/// that log replay produces a file that can be opened writable again +/// (new log set up, new sequence numbers, etc.) without corruption +/// accumulating over multiple cycles. +/// +/// Existing tests (`crash_recovery_then_more_writes`) do 2 rounds but +/// always reopen read-only for verification. This test reopens writable +/// for 5 consecutive cycles, verifying the full open-write-crash-recover +/// lifecycle. +#[async_test] +async fn crash_repeated_writable_recovery_cycles(driver: DefaultDriver) { + const CYCLES: usize = 5; + + let (mem_file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let mut durable = mem_file.snapshot(); + let block_size = MB1; + + for cycle in 0..CYCLES { + // Open writable from the (possibly crashed) durable state. + let crash_file = CrashTestFile::from_durable(durable); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + + // Write to a different block each cycle. + let offset = cycle as u64 * block_size; + let pattern = (0x10 + cycle as u8) | 0x01; // nonzero + write_pattern_p16(&vhdx, offset, block_size as usize, pattern).await; + vhdx.flush().await.unwrap(); + + // Verify all blocks from this and previous cycles are correct. + for prev in 0..=cycle { + let prev_offset = prev as u64 * block_size; + let prev_pattern = (0x10 + prev as u8) | 0x01; + let buf = read_pattern_p16(&vhdx, prev_offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == prev_pattern), + "cycle {cycle}, block {prev}: expected 0x{prev_pattern:02x}, \ + got 0x{:02x}", + buf[0] + ); + } + + // Crash. + durable = vhdx.file.durable_snapshot(); + vhdx.abort().await; + } + + // Final verification: open read-only from the last crash, verify + // all 5 blocks from all cycles survived. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + for cycle in 0..CYCLES { + let offset = cycle as u64 * block_size; + let pattern = (0x10 + cycle as u8) | 0x01; + let buf = read_pattern_p16(&vhdx, offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == pattern), + "final verify block {cycle}: expected 0x{pattern:02x}, got 0x{:02x}", + buf[0] + ); + } +} + +// ---- Concurrent crash tests using YieldingCrashFile ---- +// +// These tests use YieldingCrashFile to create genuine interleaving between +// the log task, apply task, and user write tasks. The yield points cause +// the apply task to yield during write_at, allowing other tasks to make +// progress. Crash snapshots taken at these interleaving points exercise +// the recovery path under partial-apply conditions. + +use crate::tests::support::YieldingCrashFile; + +/// Concurrent writers with interleaved apply + crash + replay. +/// +/// Two tasks write to different blocks concurrently while the apply task +/// yields between its writes (via `yield_on_write`). This creates a +/// genuine interleaving: one task's data may be at its final offset while +/// another task's WAL entry exists but hasn't been applied yet. After +/// crash + replay, all flushed data must be present. +#[async_test] +async fn concurrent_writes_interleaved_apply_crash(driver: DefaultDriver) { + let (mem_file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let snapshot = mem_file.snapshot(); + + // yield_on_write=true: apply task yields before each page write, + // allowing the log task to process another batch mid-apply. 
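+    // (Parameter order for the helper is from_durable(data, yield_on_write,
+    // yield_on_flush) — see tests/support.rs — so `true, false` below enables
+    // only the write-side yield point.)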
+ let file = YieldingCrashFile::from_durable(snapshot, true, false); + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Two concurrent writers to different blocks. + { + let vhdx_a = vhdx.clone(); + let vhdx_b = vhdx.clone(); + let bs = block_size; + + let ((), ()) = futures::join!( + async { + write_block(&*vhdx_a, 0, bs, 0xAA).await; + }, + async { + write_block(&*vhdx_b, bs as u64, bs, 0xBB).await; + } + ); + } + + // Flush to make everything durable. + vhdx.flush().await.unwrap(); + + // Take durable snapshot and crash. + let durable = vhdx.file.durable_snapshot(); + Arc::into_inner(vhdx).expect("no other refs").abort().await; + + // Recover and verify both blocks survived. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let buf0 = read_pattern_p16(&vhdx2, 0, block_size as usize).await; + assert!( + buf0.iter().all(|&b| b == 0xAA), + "block 0 should have 0xAA after interleaved apply + crash" + ); + + let buf1 = read_pattern_p16(&vhdx2, block_size as u64, block_size as usize).await; + assert!( + buf1.iter().all(|&b| b == 0xBB), + "block 1 should have 0xBB after interleaved apply + crash" + ); +} + +/// Interleaved flush + write + crash. +/// +/// `yield_on_flush=true` causes flush to yield, allowing a concurrent +/// writer to make progress (its write reaches the log task) before the +/// flush's file-level flush completes. After crash, the pre-flush data +/// must be durable; the concurrent write may or may not survive. +#[async_test] +async fn interleaved_flush_and_write_crash(driver: DefaultDriver) { + let (mem_file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let snapshot = mem_file.snapshot(); + + // yield_on_flush=true: flush yields, allowing concurrent writer to run. + let file = YieldingCrashFile::from_durable(snapshot, false, true); + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Write block 0 — this data must survive the flush. + write_block(&*vhdx, 0, block_size, 0xCC).await; + + // Concurrent: flush (yields during file flush) + write block 1. + { + let vhdx_f = vhdx.clone(); + let vhdx_w = vhdx.clone(); + let bs = block_size; + + let ((), ()) = futures::join!( + async { + vhdx_f.flush().await.unwrap(); + }, + async { + write_block(&*vhdx_w, bs as u64, bs, 0xDD).await; + } + ); + } + + // Final flush to ensure the concurrent write is also durable. + vhdx.flush().await.unwrap(); + + // Crash. + let durable = vhdx.file.durable_snapshot(); + Arc::into_inner(vhdx).expect("no other refs").abort().await; + + // Recover. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Block 0 must survive (was written before the first flush). + let buf0 = read_pattern_p16(&vhdx2, 0, block_size as usize).await; + assert!( + buf0.iter().all(|&b| b == 0xCC), + "block 0 should have 0xCC (pre-flush data must survive)" + ); + + // Block 1 should also survive (final flush made it durable). + let buf1 = read_pattern_p16(&vhdx2, block_size as u64, block_size as usize).await; + assert!( + buf1.iter().all(|&b| b == 0xDD), + "block 1 should have 0xDD after final flush" + ); +} + +/// Stress test: many interleaved writers with yielding apply + crash. 
+/// +/// 8 tasks each write to a unique block with `yield_on_write=true`, +/// creating maximum interleaving between the apply task and log task. +/// After flush + crash + replay, all data must be intact. +#[async_test] +async fn stress_interleaved_apply_crash(driver: DefaultDriver) { + let (mem_file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let snapshot = mem_file.snapshot(); + + let file = YieldingCrashFile::from_durable(snapshot, true, false); + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // 8 concurrent writers, each to a unique block. + let write_futures: Vec<_> = (0..8u8) + .map(|i| { + let vhdx = vhdx.clone(); + let bs = block_size; + async move { + let offset = i as u64 * bs as u64; + let pattern = 0x50 + i; + write_block(&*vhdx, offset, bs, pattern).await; + } + }) + .collect(); + + futures::future::join_all(write_futures).await; + + vhdx.flush().await.unwrap(); + + // Crash. + let durable = vhdx.file.durable_snapshot(); + let vhdx = Arc::into_inner(vhdx).expect("no other refs"); + vhdx.abort().await; + + // Recover and verify all 8 blocks. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + for i in 0..8u8 { + let offset = i as u64 * block_size as u64; + let expected = 0x50 + i; + let buf = read_pattern_p16(&vhdx2, offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {i}: expected 0x{expected:02x}, got 0x{:02x}", + buf[0] + ); + } +} + +/// Interleaved trim + write + crash with yield points. +/// +/// Write all blocks, flush. Then concurrently trim some blocks and write +/// others with `yield_on_write=true`. Flush, crash, and verify the +/// expected state (trimmed blocks are zeros, written blocks have data). +#[async_test] +async fn interleaved_trim_write_crash(driver: DefaultDriver) { + let (mem_file, _) = create_vhdx_with_block_size(8 * MB1, MB1 as u32).await; + let snapshot = mem_file.snapshot(); + + let file = YieldingCrashFile::from_durable(snapshot, true, false); + let vhdx = Arc::new(VhdxFile::open(file).writable(&driver).await.unwrap()); + let block_size = vhdx.block_size(); + + // Step 1: Write all 8 blocks with initial data. + for i in 0..8u8 { + let offset = i as u64 * block_size as u64; + write_block(&*vhdx, offset, block_size, 0x10 + i).await; + } + vhdx.flush().await.unwrap(); + + // Step 2: Concurrently trim blocks 0-3 and write blocks 4-7. + let trim_futures: Vec<_> = (0..4u8) + .map(|i| { + let vhdx = vhdx.clone(); + let bs = block_size; + async move { + let offset = i as u64 * bs as u64; + vhdx.trim(TrimRequest::new(TrimMode::FileSpace, offset, bs as u64)) + .await + .unwrap(); + } + }) + .collect(); + + let write_futures: Vec<_> = (4..8u8) + .map(|i| { + let vhdx = vhdx.clone(); + let bs = block_size; + async move { + let offset = i as u64 * bs as u64; + write_block(&*vhdx, offset, bs, 0x90 + i).await; + } + }) + .collect(); + + let ((), ()) = futures::join!( + async { + futures::future::join_all(trim_futures).await; + }, + async { + futures::future::join_all(write_futures).await; + } + ); + + vhdx.flush().await.unwrap(); + + // Crash. + let durable = vhdx.file.durable_snapshot(); + let vhdx = Arc::into_inner(vhdx).expect("no other refs"); + vhdx.abort().await; + + // Recover. 
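+    // (abort() skips the clean-close path, so the on-disk log_guid is still
+    // set; the read-only reopen below therefore needs allow_replay(true).)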
+ let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Blocks 0-3: trimmed → zeros. + for i in 0..4u8 { + let offset = i as u64 * block_size as u64; + let buf = read_pattern_p16(&vhdx2, offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == 0), + "block {i}: expected zeros (trimmed), got 0x{:02x}", + buf[0] + ); + } + + // Blocks 4-7: overwritten with new data. + for i in 4..8u8 { + let offset = i as u64 * block_size as u64; + let expected = 0x90 + i; + let buf = read_pattern_p16(&vhdx2, offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {i}: expected 0x{expected:02x}, got 0x{:02x}", + buf[0] + ); + } +} + +// ---- Selective durability crash tests ---- +// +// These tests use CrashAfterFlushFile to crash at specific points in +// the WAL pipeline. Unlike CrashTestFile (where flush is all-or-nothing), +// CrashAfterFlushFile can be armed to fail after N more flushes, +// simulating crashes between the WAL flush and the apply flush. + +use crate::tests::support::CrashAfterFlushFile; + +/// Write + flush with crash armed after 1 flush. +/// +/// The VhdxFile::flush() path does: commit → log task writes WAL → +/// flush_sequencer.flush() (1 file.flush()) → apply task writes BAT. +/// +/// With arm(1), the flush_sequencer's flush succeeds (WAL + user data +/// durable), but a subsequent flush (or the apply write itself) fails. +/// The apply task's BAT write may or may not succeed in volatile, but +/// the BAT is NOT durable. On recovery, WAL replay must restore the +/// BAT page. +#[async_test] +async fn crash_wal_durable_apply_lost(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashAfterFlushFile::new(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size() as usize; + + // Write one block. + write_pattern_p16(&vhdx, 0, block_size, 0xAB).await; + + // Arm the crash: allow 1 more flush (the flush_sequencer's flush + // that makes WAL + user data durable), then fail everything. + vhdx.file.arm(1); + + // Flush — the WAL flush succeeds; subsequent ops fail. + // This may return Ok (if the crash hits after the sequencer flush) + // or Err (if the apply task races and triggers the error). + let _ = vhdx.flush().await; + + // Take the durable snapshot. The WAL entry and user data should be + // durable. The BAT page may NOT be at its final offset. + let durable = vhdx.file.durable_snapshot(); + + // Don't call abort() — the file is poisoned, tasks may be in error state. + // Just drop everything and recover from durable state. + drop(vhdx); + + // Recover: open with replay. The WAL should restore the BAT page. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Verify the data survived via WAL replay. + let buf = read_pattern_p16(&vhdx2, 0, block_size).await; + assert!( + buf.iter().all(|&b| b == 0xAB), + "data should survive via WAL replay when apply is lost: got 0x{:02x}", + buf[0] + ); +} + +/// Write + flush with crash armed after 0 flushes. +/// +/// The next flush fails immediately. Nothing new is durable. +/// Recovery should see the original empty state. 
+#[async_test] +async fn crash_before_wal_flush_data_lost(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashAfterFlushFile::new(snapshot.clone()); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size() as usize; + + // Write a block. + write_pattern_p16(&vhdx, 0, block_size, 0xCD).await; + + // Arm: next flush fails. The WAL flush won't succeed, so nothing + // new is durable. + vhdx.file.arm(0); + + // Flush will fail. + let result = vhdx.flush().await; + assert!(result.is_err(), "flush should fail with armed crash"); + + // Durable state should be the pre-write state. + let durable = vhdx.file.durable_snapshot(); + drop(vhdx); + + // Recover and verify data is NOT present (was never durable). + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + let buf = read_pattern_p16(&vhdx2, 0, block_size).await; + assert!( + buf.iter().all(|&b| b == 0), + "data should be lost when WAL flush fails: got 0x{:02x}", + buf[0] + ); +} + +/// Multiple writes, flush, then arm and write more. +/// +/// First batch: write blocks 0-2, flush (all durable). Second batch: +/// write blocks 3-4, arm(1), flush (WAL durable, apply may fail). +/// Recovery should see blocks 0-2 (clean) and blocks 3-4 (via replay). +#[async_test] +async fn crash_partial_pipeline_multi_batch(driver: DefaultDriver) { + let (mem_file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let snapshot = mem_file.snapshot(); + + let crash_file = CrashAfterFlushFile::new(snapshot); + let vhdx = VhdxFile::open(crash_file).writable(&driver).await.unwrap(); + let block_size = vhdx.block_size() as u64; + + // Batch 1: write blocks 0-2, flush normally (unarmed). + for i in 0..3u8 { + let offset = i as u64 * block_size; + write_pattern_p16(&vhdx, offset, block_size as usize, 0x10 + i).await; + } + vhdx.flush().await.unwrap(); + + // Batch 2: write blocks 3-4, arm, flush. + for i in 3..5u8 { + let offset = i as u64 * block_size; + write_pattern_p16(&vhdx, offset, block_size as usize, 0x20 + i).await; + } + + // Arm: 1 more flush (WAL flush succeeds), then crash. + vhdx.file.arm(1); + let _ = vhdx.flush().await; + + let durable = vhdx.file.durable_snapshot(); + drop(vhdx); + + // Recover. + let recovered = InMemoryFile::from_snapshot(durable); + let vhdx2 = VhdxFile::open(recovered) + .allow_replay(true) + .read_only() + .await + .unwrap(); + + // Blocks 0-2: from batch 1 (fully durable before arm). + for i in 0..3u8 { + let offset = i as u64 * block_size; + let expected = 0x10 + i; + let buf = read_pattern_p16(&vhdx2, offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == expected), + "batch 1 block {i}: expected 0x{expected:02x}, got 0x{:02x}", + buf[0] + ); + } + + // Blocks 3-4: from batch 2 (WAL durable, may need replay). 
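+    // (Batch 2 used pattern 0x20 + i, so blocks 3 and 4 should read back as
+    // 0x23 and 0x24 respectively.)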
+ for i in 3..5u8 { + let offset = i as u64 * block_size; + let expected = 0x20 + i; + let buf = read_pattern_p16(&vhdx2, offset, block_size as usize).await; + assert!( + buf.iter().all(|&b| b == expected), + "batch 2 block {i}: expected 0x{expected:02x}, got 0x{:02x}", + buf[0] + ); + } +} diff --git a/vm/devices/storage/vhdx/src/tests/mod.rs b/vm/devices/storage/vhdx/src/tests/mod.rs new file mode 100644 index 0000000000..2bfba82243 --- /dev/null +++ b/vm/devices/storage/vhdx/src/tests/mod.rs @@ -0,0 +1,547 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +pub mod support; + +#[cfg(test)] +mod crash_tests; +#[cfg(test)] +mod io_tests; +#[cfg(test)] +mod trim_tests; + +#[cfg(test)] +mod integration { + use crate::AsyncFile; + use crate::cache::PageCache; + use crate::create::{self, CreateParams}; + use crate::format; + use crate::header; + use crate::known_meta; + use crate::metadata::METADATA_TAG; + use crate::metadata::MetadataTable; + use crate::region; + use crate::tests::support::InMemoryFile; + use guid::Guid; + use pal_async::async_test; + use std::sync::Arc; + + fn metadata_cache(file: InMemoryFile, metadata_offset: u64) -> PageCache { + let mut cache = PageCache::new(Arc::new(file), None, None, 0); + cache.register_tag(METADATA_TAG, metadata_offset); + cache + } + + #[async_test] + async fn create_then_parse_full_roundtrip() { + let disk_size = 2 * format::GB1; + let mut params = CreateParams { + disk_size, + block_size: 2 * format::MB1 as u32, + logical_sector_size: 512, + physical_sector_size: 4096, + ..CreateParams::default() + }; + let file = InMemoryFile::new(0); + create::create(&file, &mut params).await.unwrap(); + let file_length = file.file_size().await.unwrap(); + + // 1. Parse headers. + let parsed_header = header::parse_headers(&file, file_length).await.unwrap(); + assert_eq!(parsed_header.log_guid, Guid::ZERO); + assert_ne!(parsed_header.file_write_guid, Guid::ZERO); + assert_ne!(parsed_header.data_write_guid, Guid::ZERO); + + // 2. Parse region tables. + let regions = region::parse_region_tables(&file).await.unwrap(); + assert!(regions.rewrite_data.is_none()); + assert!(regions.bat_offset > 0); + assert!(regions.metadata_offset > 0); + + // 3. Read metadata table. + let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length) + .await + .unwrap(); + + // 4. Verify known metadata. + known_meta::verify_known_metadata(&table, false).unwrap(); + + // 5. Read known metadata. 
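+        // (Known metadata items are fetched through a PageCache with
+        // METADATA_TAG registered at the metadata region's offset — see the
+        // metadata_cache() helper above.)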
+        let cache = metadata_cache(file, regions.metadata_offset);
+        let meta = known_meta::read_known_metadata(&cache, &table)
+            .await
+            .unwrap();
+
+        assert_eq!(meta.disk_size, disk_size);
+        assert_eq!(meta.block_size, 2 * format::MB1 as u32);
+        assert_eq!(meta.logical_sector_size, 512);
+        assert_eq!(meta.physical_sector_size, 4096);
+        assert!(!meta.has_parent);
+        assert!(!meta.leave_blocks_allocated);
+        assert_ne!(meta.page_83_data, Guid::ZERO);
+    }
+
+    #[async_test]
+    async fn create_differencing_then_parse() {
+        let mut params = CreateParams {
+            disk_size: format::GB1,
+            has_parent: true,
+            ..CreateParams::default()
+        };
+        let file = InMemoryFile::new(0);
+        create::create(&file, &mut params).await.unwrap();
+        let file_length = file.file_size().await.unwrap();
+
+        let _header = header::parse_headers(&file, file_length).await.unwrap();
+        let regions = region::parse_region_tables(&file).await.unwrap();
+        let table = MetadataTable::read(&file, regions.metadata_offset, regions.metadata_length)
+            .await
+            .unwrap();
+
+        known_meta::verify_known_metadata(&table, false).unwrap();
+        let cache = metadata_cache(file, regions.metadata_offset);
+        let meta = known_meta::read_known_metadata(&cache, &table)
+            .await
+            .unwrap();
+
+        assert!(meta.has_parent);
+    }
+}
+
+#[cfg(test)]
+mod log_task_integration {
+    use crate::format;
+    use crate::open::VhdxFile;
+    use crate::tests::support::InMemoryFile;
+    use crate::{AsyncFile, AsyncFileExt};
+    use pal_async::DefaultDriver;
+    use pal_async::async_test;
+    use zerocopy::FromBytes;
+
+    /// Helper: create a test VHDX file with default parameters.
+    async fn create_test_vhdx_file(disk_size: u64) -> InMemoryFile {
+        let (file, _) = InMemoryFile::create_test_vhdx(disk_size).await;
+        file
+    }
+
+    /// Helper: write a data pattern via the write path.
+    async fn write_pattern(vhdx: &VhdxFile, offset: u64, len: usize, value: u8) {
+        let write_buf = vec![value; len];
+        let mut ranges = Vec::new();
+        let guard = vhdx
+            .resolve_write(offset, len as u32, &mut ranges)
+            .await
+            .unwrap();
+        for range in &ranges {
+            match range {
+                crate::WriteRange::Data {
+                    file_offset,
+                    length,
+                    ..
+                } => {
+                    vhdx.file
+                        .write_at(*file_offset, &write_buf[..(*length as usize)])
+                        .await
+                        .unwrap();
+                }
+                crate::WriteRange::Zero {
+                    file_offset,
+                    length,
+                } => {
+                    let zeros = vec![0u8; *length as usize];
+                    vhdx.file.write_at(*file_offset, &zeros).await.unwrap();
+                }
+            }
+        }
+        guard.complete().await.unwrap();
+    }
+
+    /// Helper: read data at a guest offset via the read path.
+    async fn read_pattern(vhdx: &VhdxFile, offset: u64, len: usize) -> Vec<u8> {
+        let mut buf = vec![0u8; len];
+        let mut ranges = Vec::new();
+        let _guard = vhdx
+            .resolve_read(offset, len as u32, &mut ranges)
+            .await
+            .unwrap();
+        for range in &ranges {
+            match range {
+                crate::ReadRange::Data {
+                    guest_offset,
+                    file_offset,
+                    length,
+                } => {
+                    let start = (*guest_offset - offset) as usize;
+                    let end = start + *length as usize;
+                    vhdx.file
+                        .read_at(*file_offset, &mut buf[start..end])
+                        .await
+                        .unwrap();
+                }
+                crate::ReadRange::Zero {
+                    guest_offset,
+                    length,
+                } => {
+                    let start = (*guest_offset - offset) as usize;
+                    let end = start + *length as usize;
+                    buf[start..end].fill(0);
+                }
+                crate::ReadRange::Unmapped { .. } => {}
+            }
+        }
+        buf
+    }
+
+    #[async_test]
+    async fn open_writable_and_close(driver: DefaultDriver) {
+        let file = create_test_vhdx_file(format::GB1).await;
+        let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+
+        // Verify the file is opened in writable mode with a log task.
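+        // (log_state is expected to be Some(..) only for writable opens;
+        // compare open_read_only_no_spawner below, which asserts it is None.)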
+ assert!(!vhdx.read_only); + assert!(vhdx.log_state.is_some()); + + // Close should succeed cleanly. + vhdx.close().await.unwrap(); + } + + #[async_test] + async fn open_writable_sets_log_guid(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // The file should have log_guid set (the header was written during open). + // We verify by reading the header from the file. + let mut buf = vec![0u8; format::HEADER_SIZE as usize]; + // Read both headers and check at least one has log_guid != 0. + vhdx.file + .read_at(format::HEADER_OFFSET_1, &mut buf) + .await + .unwrap(); + let h1 = format::Header::read_from_prefix(&buf).ok().map(|(h, _)| h); + vhdx.file + .read_at(format::HEADER_OFFSET_2, &mut buf) + .await + .unwrap(); + let h2 = format::Header::read_from_prefix(&buf).ok().map(|(h, _)| h); + + let has_log_guid = h1.as_ref().is_some_and(|h| h.log_guid != guid::Guid::ZERO) + || h2.as_ref().is_some_and(|h| h.log_guid != guid::Guid::ZERO); + assert!(has_log_guid, "log_guid should be set after open_writable"); + + vhdx.close().await.unwrap(); + } + + #[async_test] + async fn close_clears_log_guid(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let file_ref = vhdx.file.clone(); + + // Close the file. + vhdx.close().await.unwrap(); + + // After close, both headers should have log_guid == ZERO + // (at least the current one). + let mut buf1 = vec![0u8; format::HEADER_SIZE as usize]; + file_ref + .read_at(format::HEADER_OFFSET_1, &mut buf1) + .await + .unwrap(); + let mut buf2 = vec![0u8; format::HEADER_SIZE as usize]; + file_ref + .read_at(format::HEADER_OFFSET_2, &mut buf2) + .await + .unwrap(); + + use zerocopy::FromBytes as _; + let h1 = format::Header::read_from_prefix(&buf1).ok().map(|(h, _)| h); + let h2 = format::Header::read_from_prefix(&buf2).ok().map(|(h, _)| h); + + // The current header (highest sequence_number) should have ZERO log_guid. + let current = match (&h1, &h2) { + (Some(a), Some(b)) if b.sequence_number >= a.sequence_number => b, + (Some(a), _) => a, + (_, Some(b)) => b, + _ => panic!("no valid headers"), + }; + assert_eq!(current.log_guid, guid::Guid::ZERO); + } + + #[async_test] + async fn write_flush_close_reopen(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + + // Open with log, write data, flush, close. + let file_arc = { + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + write_pattern(&vhdx, 0, 4096, 0xAB).await; + vhdx.flush().await.unwrap(); + let file_arc = vhdx.file.clone(); + vhdx.close().await.unwrap(); + file_arc + }; + + // Reopen (no log needed since we closed cleanly) and verify data. + { + let vhdx = VhdxFile::open(InMemoryFile::from_snapshot(file_arc.snapshot())) + .read_only() + .await + .unwrap(); + let read_buf = read_pattern(&vhdx, 0, 4096).await; + assert!(read_buf.iter().all(|&b| b == 0xAB)); + } + } + + #[async_test] + async fn close_then_reopen_is_clean(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + + // Open with log, do nothing, close. + let file_arc = { + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let file_arc = vhdx.file.clone(); + vhdx.close().await.unwrap(); + file_arc + }; + + // Reopen — should succeed without log replay. 
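+        // (allow_replay(true) is deliberately not passed here: a clean close()
+        // should have cleared log_guid, leaving nothing to replay — see
+        // close_clears_log_guid above.)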
+ let vhdx = VhdxFile::open(InMemoryFile::from_snapshot(file_arc.snapshot())) + .read_only() + .await + .unwrap(); + assert!(vhdx.read_only); + } + + #[async_test] + async fn open_read_only_no_spawner() { + let file = create_test_vhdx_file(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + assert!(vhdx.read_only); + assert!(vhdx.log_state.is_none()); + } + + #[async_test] + async fn flush_returns_fsn(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write data to dirty some cache pages. + write_pattern(&vhdx, 0, 4096, 0xEE).await; + + // Commit should return a valid FSN via the cache. + let _fsn = vhdx.cache.commit().unwrap(); + // FSN can be 0 if no dirty pages (BAT may or may not be dirty depending + // on cache state). Just verify no errors. + + vhdx.close().await.unwrap(); + } + + #[async_test] + async fn multiple_writes_single_flush(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Multiple writes at different offsets. + write_pattern(&vhdx, 0, 4096, 0x11).await; + write_pattern(&vhdx, 4096, 4096, 0x22).await; + write_pattern(&vhdx, 8192, 4096, 0x33).await; + + // Single flush should handle all dirty pages. + vhdx.flush().await.unwrap(); + let file_arc = vhdx.file.clone(); + vhdx.close().await.unwrap(); + + // Reopen and verify. + let vhdx2 = VhdxFile::open(InMemoryFile::from_snapshot(file_arc.snapshot())) + .read_only() + .await + .unwrap(); + + let buf0 = read_pattern(&vhdx2, 0, 4096).await; + assert!(buf0.iter().all(|&b| b == 0x11), "first write mismatch"); + let buf1 = read_pattern(&vhdx2, 4096, 4096).await; + assert!(buf1.iter().all(|&b| b == 0x22), "second write mismatch"); + let buf2 = read_pattern(&vhdx2, 8192, 4096).await; + assert!(buf2.iter().all(|&b| b == 0x33), "third write mismatch"); + } + + /// Write to 200+ distinct data blocks, flush, close, reopen, and verify. + /// + /// This exercises the eager commit path: with 200+ distinct block writes + /// the cache will accumulate more dirty BAT pages than `MAX_COMMIT_PAGES`, + /// triggering automatic intermediate commits before the final flush. + #[async_test] + async fn large_write_survives_close_reopen(driver: DefaultDriver) { + const BLOCK_COUNT: usize = 200; + // Default block size is 2 MiB; place one 4 KiB write in each block. + const BLOCK_SIZE: u64 = 2 * format::MB1; + const WRITE_LEN: usize = 4096; + + let disk_size = BLOCK_SIZE * (BLOCK_COUNT as u64 + 1); + let file = create_test_vhdx_file(disk_size).await; + + // Open with log, write a distinct pattern into each of 200 blocks. + let file_arc = { + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + for i in 0..BLOCK_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let pattern = (i & 0xFF) as u8; + write_pattern(&vhdx, offset, WRITE_LEN, pattern).await; + } + vhdx.flush().await.unwrap(); + let file_arc = vhdx.file.clone(); + vhdx.close().await.unwrap(); + file_arc + }; + + // Reopen from snapshot and verify every block. 
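+        // (The snapshot is taken after close() completed, so it reflects a
+        // flushed, cleanly closed file; the expected fill byte for block i is
+        // (i & 0xFF), matching the write loop above.)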
+ { + let vhdx = VhdxFile::open(InMemoryFile::from_snapshot(file_arc.snapshot())) + .read_only() + .await + .unwrap(); + for i in 0..BLOCK_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let expected = (i & 0xFF) as u8; + let buf = read_pattern(&vhdx, offset, WRITE_LEN).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {} mismatch: expected 0x{:02X}, got 0x{:02X}", + i, + expected, + buf[0], + ); + } + } + } + + /// After flush + close, all permits should have been released. + /// Verifies the apply task releases permits (not commit). + #[async_test] + async fn permits_released_after_apply(driver: DefaultDriver) { + let file = create_test_vhdx_file(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write to several distinct blocks so multiple BAT pages are dirtied. + for i in 0..10u64 { + let offset = i * 2 * format::MB1; // each in a different block + write_pattern(&vhdx, offset, 4096, (i & 0xFF) as u8).await; + } + + // Flush commits dirty pages → log → apply → permits released. + vhdx.flush().await.unwrap(); + + // Write 10 more blocks. If permits weren't released, this would + // eventually block (deadlock). The fact that it completes proves + // permits are flowing back from the apply task. + for i in 10..20u64 { + let offset = i * 2 * format::MB1; + write_pattern(&vhdx, offset, 4096, (i & 0xFF) as u8).await; + } + vhdx.flush().await.unwrap(); + + // Verify all data survived. + for i in 0..20u64 { + let offset = i * 2 * format::MB1; + let expected = (i & 0xFF) as u8; + let buf = read_pattern(&vhdx, offset, 4096).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {i} mismatch after permit recycling" + ); + } + + vhdx.close().await.unwrap(); + } + + /// Pump many batches through the full pipeline, exceeding the log's + /// circular buffer capacity. The log task must handle LogFull by + /// waiting for the apply task to drain, advancing tails, and retrying. + #[async_test] + async fn many_commits_forward_progress(driver: DefaultDriver) { + const BATCH_COUNT: usize = 20; + const BLOCK_SIZE: u64 = 2 * format::MB1; + + let disk_size = BLOCK_SIZE * (BATCH_COUNT as u64 + 1); + let file = create_test_vhdx_file(disk_size).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Each iteration writes to a new block (dirtying its BAT page), + // then flushes. This forces commit → log → apply for each batch. + // With a 1 MiB log the circular buffer will fill up, exercising + // the LogFull retry path in handle_commit. + for i in 0..BATCH_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let pattern = (i & 0xFF) as u8; + write_pattern(&vhdx, offset, 4096, pattern).await; + vhdx.flush().await.unwrap(); + } + + let file_arc = vhdx.file.clone(); + vhdx.close().await.unwrap(); + + // Reopen read-only and verify every block. + let vhdx2 = VhdxFile::open(InMemoryFile::from_snapshot(file_arc.snapshot())) + .read_only() + .await + .unwrap(); + for i in 0..BATCH_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let expected = (i & 0xFF) as u8; + let buf = read_pattern(&vhdx2, offset, 4096).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {i} mismatch: expected 0x{expected:02X}, got 0x{:02X}", + buf[0], + ); + } + } + + /// Stress test: pump a large number of writes through the pipeline + /// without individual flushes, then flush once at the end. 
This + /// exercises batch-full commit (automatic commits when dirty_count + /// reaches MAX_COMMIT_PAGES) combined with the log task's LogFull + /// retry and permit backpressure from the apply task. + #[async_test] + async fn log_pipeline_stress(driver: DefaultDriver) { + const BLOCK_COUNT: usize = 500; + const BLOCK_SIZE: u64 = 2 * format::MB1; + const WRITE_LEN: usize = 4096; + + let disk_size = BLOCK_SIZE * (BLOCK_COUNT as u64 + 1); + let file = create_test_vhdx_file(disk_size).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write to 500 distinct blocks without flushing. The cache will + // trigger batch-full commits as dirty pages accumulate, and the + // log task will hit LogFull and retry as the circular buffer + // fills. The apply task must release permits to keep the + // pipeline moving. + for i in 0..BLOCK_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let pattern = (i & 0xFF) as u8; + write_pattern(&vhdx, offset, WRITE_LEN, pattern).await; + } + + vhdx.flush().await.unwrap(); + let file_arc = vhdx.file.clone(); + vhdx.close().await.unwrap(); + + // Reopen and verify. + let vhdx2 = VhdxFile::open(InMemoryFile::from_snapshot(file_arc.snapshot())) + .read_only() + .await + .unwrap(); + for i in 0..BLOCK_COUNT { + let offset = i as u64 * BLOCK_SIZE; + let expected = (i & 0xFF) as u8; + let buf = read_pattern(&vhdx2, offset, WRITE_LEN).await; + assert!( + buf.iter().all(|&b| b == expected), + "block {i} mismatch: expected 0x{expected:02X}, got 0x{:02X}", + buf[0], + ); + } + } +} diff --git a/vm/devices/storage/vhdx/src/tests/support.rs b/vm/devices/storage/vhdx/src/tests/support.rs new file mode 100644 index 0000000000..7deaccdc58 --- /dev/null +++ b/vm/devices/storage/vhdx/src/tests/support.rs @@ -0,0 +1,813 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Test support utilities: in-memory file backing store and I/O interceptors. + +use crate::{AsyncFile, AsyncFileExt}; +use parking_lot::Mutex; +use std::borrow::Borrow; +use std::sync::Arc; + +/// Trait for intercepting I/O operations in tests. +/// +/// Default implementations return `Ok(())` (no interception). +pub trait IoInterceptor: Send + Sync { + /// Called before a read operation. + fn before_read(&self, offset: u64, len: usize) -> Result<(), std::io::Error> { + let _ = (offset, len); + Ok(()) + } + + /// Called before a write operation. + fn before_write(&self, offset: u64, data: &[u8]) -> Result<(), std::io::Error> { + let _ = (offset, data); + Ok(()) + } + + /// Called before a flush operation. + fn before_flush(&self) -> Result<(), std::io::Error> { + Ok(()) + } + + /// Called before a set_file_size operation. + fn before_set_file_size(&self, size: u64) -> Result<(), std::io::Error> { + let _ = size; + Ok(()) + } + + /// Returns `true` if the write should be silently discarded (data not + /// written). The default is `false`. + fn should_discard_write(&self, offset: u64, data: &[u8]) -> bool { + let _ = (offset, data); + false + } +} + +/// An interceptor that fails all I/O operations of specified types. +pub struct FailingInterceptor { + /// Whether reads should fail. + pub fail_reads: bool, + /// Whether writes should fail. + pub fail_writes: bool, + /// Whether flushes should fail. + pub fail_flushes: bool, + /// Whether set_file_size should fail. 
+    pub fail_set_file_size: bool,
+}
+
+impl IoInterceptor for FailingInterceptor {
+    fn before_read(&self, _offset: u64, _len: usize) -> Result<(), std::io::Error> {
+        if self.fail_reads {
+            return Err(std::io::Error::other("injected I/O failure"));
+        }
+        Ok(())
+    }
+
+    fn before_write(&self, _offset: u64, _data: &[u8]) -> Result<(), std::io::Error> {
+        if self.fail_writes {
+            return Err(std::io::Error::other("injected I/O failure"));
+        }
+        Ok(())
+    }
+
+    fn before_flush(&self) -> Result<(), std::io::Error> {
+        if self.fail_flushes {
+            return Err(std::io::Error::other("injected I/O failure"));
+        }
+        Ok(())
+    }
+
+    fn before_set_file_size(&self, _size: u64) -> Result<(), std::io::Error> {
+        if self.fail_set_file_size {
+            return Err(std::io::Error::other("injected I/O failure"));
+        }
+        Ok(())
+    }
+}
+
+/// An interceptor that silently discards writes.
+///
+/// Reads and flushes pass through normally. Writes appear to succeed
+/// but the underlying data is not modified. This simulates a crash
+/// where writes were in flight but not persisted.
+pub struct DiscardWritesInterceptor;
+
+impl IoInterceptor for DiscardWritesInterceptor {
+    fn should_discard_write(&self, _offset: u64, _data: &[u8]) -> bool {
+        true
+    }
+}
+
+/// In-memory file backing store for tests.
+///
+/// Supports optional I/O interception for failure injection and write
+/// discarding (used in crash tests).
+pub struct InMemoryFile {
+    inner: Mutex<InMemoryFileInner>,
+    interceptor: Option<Arc<dyn IoInterceptor>>,
+}
+
+struct InMemoryFileInner {
+    data: Vec<u8>,
+}
+
+impl InMemoryFile {
+    /// Creates a zero-filled file of the given size.
+    pub fn new(size: u64) -> Self {
+        Self {
+            inner: Mutex::new(InMemoryFileInner {
+                data: vec![0u8; size as usize],
+            }),
+            interceptor: None,
+        }
+    }
+
+    /// Creates a zero-filled file with an I/O interceptor.
+    pub fn with_interceptor(size: u64, interceptor: Arc<dyn IoInterceptor>) -> Self {
+        Self {
+            inner: Mutex::new(InMemoryFileInner {
+                data: vec![0u8; size as usize],
+            }),
+            interceptor: Some(interceptor),
+        }
+    }
+
+    /// Returns a clone of the current file contents.
+    pub fn snapshot(&self) -> Vec<u8> {
+        self.inner.lock().data.clone()
+    }
+
+    /// Create an `InMemoryFile` from existing data (e.g. a snapshot).
+    pub fn from_snapshot(data: Vec<u8>) -> InMemoryFile {
+        InMemoryFile {
+            inner: Mutex::new(InMemoryFileInner { data }),
+            interceptor: None,
+        }
+    }
+
+    /// Create a VHDX file in memory with the given disk size and default parameters.
+    ///
+    /// Returns the `InMemoryFile` and the validated `CreateParams`.
+    pub async fn create_test_vhdx(disk_size: u64) -> (InMemoryFile, crate::create::CreateParams) {
+        let file = InMemoryFile::new(0);
+        let mut params = crate::create::CreateParams {
+            disk_size,
+            ..Default::default()
+        };
+        crate::create::create(&file, &mut params).await.unwrap();
+        (file, params)
+    }
+}
+
+impl AsyncFile for InMemoryFile {
+    type Buffer = Vec<u8>;
+
+    fn alloc_buffer(&self, len: usize) -> Vec<u8> {
+        vec![0u8; len]
+    }
+
+    async fn read_into(&self, offset: u64, mut buf: Vec<u8>) -> Result<Vec<u8>, std::io::Error> {
+        if let Some(interceptor) = &self.interceptor {
+            interceptor.before_read(offset, buf.len())?;
+        }
+        let inner = self.inner.lock();
+        let offset = offset as usize;
+        let file_len = inner.data.len();
+        for (i, byte) in buf.iter_mut().enumerate() {
+            let pos = offset + i;
+            *byte = if pos < file_len { inner.data[pos] } else { 0 };
+        }
+        Ok(buf)
+    }
+
+    async fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Vec<u8>> + Send + 'static,
+    ) -> Result<(), std::io::Error> {
+        let buf = buf.borrow();
+        if let Some(interceptor) = &self.interceptor {
+            interceptor.before_write(offset, buf.as_ref())?;
+            if interceptor.should_discard_write(offset, buf.as_ref()) {
+                return Ok(());
+            }
+        }
+        let mut inner = self.inner.lock();
+        let offset = offset as usize;
+        let end = offset + buf.len();
+        if end > inner.data.len() {
+            inner.data.resize(end, 0);
+        }
+        inner.data[offset..end].copy_from_slice(buf.as_ref());
+        Ok(())
+    }
+
+    async fn flush(&self) -> Result<(), std::io::Error> {
+        if let Some(interceptor) = &self.interceptor {
+            interceptor.before_flush()?;
+        }
+        Ok(())
+    }
+
+    async fn file_size(&self) -> Result<u64, std::io::Error> {
+        Ok(self.inner.lock().data.len() as u64)
+    }
+
+    async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> {
+        if let Some(interceptor) = &self.interceptor {
+            interceptor.before_set_file_size(size)?;
+        }
+        let mut inner = self.inner.lock();
+        inner.data.resize(size as usize, 0);
+        Ok(())
+    }
+}
+
+/// A file implementation that separates volatile and durable state,
+/// with a write log for verifying operation ordering.
+///
+/// - `write_at()` → writes to volatile only (reads see it, but it won't
+///   survive a crash).
+/// - `flush()` → copies volatile to durable (survives crash).
+/// - `crash()` → returns durable state; volatile-only writes are lost.
+/// - `from_durable(data)` → creates a new file from a crash snapshot.
+///
+/// The write log records every `write_at`, `flush`, and `set_file_size`
+/// call, enabling ordering tests that verify flush barriers exist between
+/// data writes and WAL writes.
+pub struct CrashTestFile {
+    inner: Mutex<CrashTestFileInner>,
+}
+
+impl std::fmt::Debug for CrashTestFile {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let inner = self.inner.lock();
+        f.debug_struct("CrashTestFile")
+            .field("durable_len", &inner.durable.len())
+            .field("volatile_len", &inner.volatile.len())
+            .field("flush_count", &inner.flush_count)
+            .finish()
+    }
+}
+
+struct CrashTestFileInner {
+    /// Data that has survived flush — survives power failure.
+    durable: Vec<u8>,
+    /// Data as seen by reads — includes unflushed writes.
+    volatile: Vec<u8>,
+    /// How many flush() calls have occurred.
+    flush_count: u64,
+}
+
+impl CrashTestFile {
+    /// Create a CrashTestFile from existing durable data (e.g. from a crash snapshot).
+    pub fn from_durable(data: Vec<u8>) -> Self {
+        Self {
+            inner: Mutex::new(CrashTestFileInner {
+                volatile: data.clone(),
+                durable: data,
+                flush_count: 0,
+            }),
+        }
+    }
+
+    /// Snapshot durable state without consuming the file.
+    pub fn durable_snapshot(&self) -> Vec<u8> {
+        self.inner.lock().durable.clone()
+    }
+
+    /// How many flushes have occurred.
+    pub fn flush_count(&self) -> u64 {
+        self.inner.lock().flush_count
+    }
+}
+
+/// A crash-test file that yields during `write_at` and/or `flush`,
+/// allowing other tasks to interleave.
+///
+/// This combines `CrashTestFile`'s durable/volatile split with
+/// `YieldingFile`'s yield-point mechanism. When a yield is configured,
+/// the file yields (returns Pending once) at the start of the operation,
+/// allowing other spawned tasks to run. This creates genuine interleaving
+/// between the log task, apply task, and user write tasks.
+///
+/// # Use cases
+///
+/// - **`yield_on_write = true`**: The apply task yields before each
+///   `write_at`, allowing the log task to process another commit. This
+///   creates a crash point where one batch's applies are in progress
+///   while another batch is being logged.
+///
+/// - **`yield_on_flush = true`**: The flush path yields, allowing
+///   concurrent writes to reach the log task before the flush completes.
+pub struct YieldingCrashFile {
+    inner: Mutex<CrashTestFileYieldInner>,
+}
+
+struct CrashTestFileYieldInner {
+    durable: Vec<u8>,
+    volatile: Vec<u8>,
+    flush_count: u64,
+    yield_on_write: bool,
+    yield_on_flush: bool,
+}
+
+impl YieldingCrashFile {
+    /// Create a `YieldingCrashFile` from existing durable data.
+    pub fn from_durable(data: Vec<u8>, yield_on_write: bool, yield_on_flush: bool) -> Self {
+        Self {
+            inner: Mutex::new(CrashTestFileYieldInner {
+                volatile: data.clone(),
+                durable: data,
+                flush_count: 0,
+                yield_on_write,
+                yield_on_flush,
+            }),
+        }
+    }
+
+    /// Snapshot durable state without consuming the file.
+    pub fn durable_snapshot(&self) -> Vec<u8> {
+        self.inner.lock().durable.clone()
+    }
+}
+
+/// A crash-test file where the crash point is armed dynamically.
+///
+/// Before arming, the file behaves like a normal `CrashTestFile`: writes
+/// go to volatile, flush copies volatile→durable.
+///
+/// After [`arm(n)`](Self::arm) is called, the file will allow exactly `n`
+/// more flushes to succeed (making data durable), then start failing all
+/// writes and flushes with I/O errors. The durable state is frozen at
+/// the last successful flush.
+///
+/// # Typical usage
+///
+/// ```ignore
+/// // Create and open writable (flushes during open are unaffected).
+/// let file = CrashAfterFlushFile::new(snapshot);
+/// let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap();
+///
+/// // Do some writes.
+/// write_block(&vhdx, 0, bs, 0xAA).await;
+///
+/// // Arm: allow 1 more flush (the WAL flush), then crash.
+/// vhdx.file.arm(1);
+///
+/// // This flush will: commit → log task writes WAL → flush_sequencer
+/// // calls file.flush() (succeeds, armed count decrements to 0) →
+/// // apply task tries to write → I/O error → file poisoned.
+/// let _ = vhdx.flush().await; // may fail if apply races
+/// ```
+pub struct CrashAfterFlushFile {
+    inner: Mutex<CrashAfterFlushInner>,
+}
+
+struct CrashAfterFlushInner {
+    /// Data that has survived flush — survives power failure.
+    durable: Vec<u8>,
+    /// Data as seen by reads — includes unflushed writes.
+    volatile: Vec<u8>,
+    /// How many flushes have occurred.
+    flush_count: u64,
+    /// When Some(n), allow n more flushes then crash. None = not armed.
+    remaining_flushes: Option<u64>,
+    /// Whether the crash has been triggered.
+    crashed: bool,
+}
+
+impl CrashAfterFlushFile {
+    /// Create a new crash-armed file from existing data.
+    /// The file starts unarmed; call [`arm()`](Self::arm) to set the crash point.
+    pub fn new(data: Vec<u8>) -> Self {
+        Self {
+            inner: Mutex::new(CrashAfterFlushInner {
+                volatile: data.clone(),
+                durable: data,
+                flush_count: 0,
+                remaining_flushes: None,
+                crashed: false,
+            }),
+        }
+    }
+
+    /// Arm the crash: allow `n` more successful flushes, then fail.
+    ///
+    /// - `arm(0)` — the next flush fails immediately.
+    /// - `arm(1)` — the next flush succeeds (makes data durable), then
+    ///   the one after that fails.
+    pub fn arm(&self, remaining_flushes: u64) {
+        let mut inner = self.inner.lock();
+        inner.remaining_flushes = Some(remaining_flushes);
+    }
+
+    /// Snapshot durable state without consuming the file.
+    pub fn durable_snapshot(&self) -> Vec<u8> {
+        self.inner.lock().durable.clone()
+    }
+}
+
+impl AsyncFile for CrashAfterFlushFile {
+    type Buffer = Vec<u8>;
+
+    fn alloc_buffer(&self, len: usize) -> Vec<u8> {
+        vec![0u8; len]
+    }
+
+    async fn read_into(&self, offset: u64, mut buf: Vec<u8>) -> Result<Vec<u8>, std::io::Error> {
+        let inner = self.inner.lock();
+        let offset = offset as usize;
+        let file_len = inner.volatile.len();
+        for (i, byte) in buf.iter_mut().enumerate() {
+            let pos = offset + i;
+            *byte = if pos < file_len {
+                inner.volatile[pos]
+            } else {
+                0
+            };
+        }
+        Ok(buf)
+    }
+
+    async fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Vec<u8>> + Send + 'static,
+    ) -> Result<(), std::io::Error> {
+        let buf = buf.borrow();
+        let mut inner = self.inner.lock();
+        if inner.crashed {
+            return Err(std::io::Error::other("crash: disk unavailable"));
+        }
+        let off = offset as usize;
+        let end = off + buf.len();
+        if end > inner.volatile.len() {
+            inner.volatile.resize(end, 0);
+        }
+        inner.volatile[off..end].copy_from_slice(buf.as_ref());
+        Ok(())
+    }
+
+    async fn flush(&self) -> Result<(), std::io::Error> {
+        let mut inner = self.inner.lock();
+        if inner.crashed {
+            return Err(std::io::Error::other("crash: disk unavailable"));
+        }
+        // Check if armed and out of remaining flushes.
+        if let Some(ref remaining) = inner.remaining_flushes {
+            if *remaining == 0 {
+                // Crash NOW — don't make data durable, fail the flush.
+                inner.crashed = true;
+                return Err(std::io::Error::other("crash: disk unavailable"));
+            }
+        }
+        // Make data durable.
+        inner.durable = inner.volatile.clone();
+        inner.flush_count += 1;
+        // Decrement remaining flushes.
+        if let Some(ref mut remaining) = inner.remaining_flushes {
+            *remaining -= 1;
+        }
+        Ok(())
+    }
+
+    async fn file_size(&self) -> Result<u64, std::io::Error> {
+        Ok(self.inner.lock().volatile.len() as u64)
+    }
+
+    async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> {
+        let mut inner = self.inner.lock();
+        if inner.crashed {
+            return Err(std::io::Error::other("crash: disk unavailable"));
+        }
+        inner.volatile.resize(size as usize, 0);
+        inner.durable.resize(size as usize, 0);
+        Ok(())
+    }
+}
+
+/// Yield once to allow other tasks to run, then resume.
+async fn yield_once() {
+    let mut yielded = false;
+    std::future::poll_fn(|cx| {
+        if !yielded {
+            yielded = true;
+            cx.waker().wake_by_ref();
+            std::task::Poll::Pending
+        } else {
+            std::task::Poll::Ready(())
+        }
+    })
+    .await;
+}
+
+impl AsyncFile for YieldingCrashFile {
+    type Buffer = Vec<u8>;
+
+    fn alloc_buffer(&self, len: usize) -> Vec<u8> {
+        vec![0u8; len]
+    }
+
+    async fn read_into(&self, offset: u64, mut buf: Vec<u8>) -> Result<Vec<u8>, std::io::Error> {
+        let inner = self.inner.lock();
+        let offset = offset as usize;
+        let file_len = inner.volatile.len();
+        for (i, byte) in buf.iter_mut().enumerate() {
+            let pos = offset + i;
+            *byte = if pos < file_len {
+                inner.volatile[pos]
+            } else {
+                0
+            };
+        }
+        Ok(buf)
+    }
+
+    async fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Vec<u8>> + Send + 'static,
+    ) -> Result<(), std::io::Error> {
+        let should_yield = self.inner.lock().yield_on_write;
+        if should_yield {
+            yield_once().await;
+        }
+        let buf = buf.borrow();
+        let mut inner = self.inner.lock();
+        let off = offset as usize;
+        let end = off + buf.len();
+        if end > inner.volatile.len() {
+            inner.volatile.resize(end, 0);
+        }
+        inner.volatile[off..end].copy_from_slice(buf.as_ref());
+        Ok(())
+    }
+
+    async fn flush(&self) -> Result<(), std::io::Error> {
+        let should_yield = self.inner.lock().yield_on_flush;
+        if should_yield {
+            yield_once().await;
+        }
+
+        let mut inner = self.inner.lock();
+        inner.durable = inner.volatile.clone();
+        inner.flush_count += 1;
+        Ok(())
+    }
+
+    async fn file_size(&self) -> Result<u64, std::io::Error> {
+        Ok(self.inner.lock().volatile.len() as u64)
+    }
+
+    async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> {
+        let mut inner = self.inner.lock();
+        inner.volatile.resize(size as usize, 0);
+        inner.durable.resize(size as usize, 0);
+        Ok(())
+    }
+}
+
+impl AsyncFile for CrashTestFile {
+    type Buffer = Vec<u8>;
+
+    fn alloc_buffer(&self, len: usize) -> Vec<u8> {
+        vec![0u8; len]
+    }
+
+    async fn read_into(&self, offset: u64, mut buf: Vec<u8>) -> Result<Vec<u8>, std::io::Error> {
+        let inner = self.inner.lock();
+        let offset = offset as usize;
+        let file_len = inner.volatile.len();
+        for (i, byte) in buf.iter_mut().enumerate() {
+            let pos = offset + i;
+            *byte = if pos < file_len {
+                inner.volatile[pos]
+            } else {
+                0
+            };
+        }
+        Ok(buf)
+    }
+
+    async fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Vec<u8>> + Send + 'static,
+    ) -> Result<(), std::io::Error> {
+        let buf = buf.borrow();
+        let mut inner = self.inner.lock();
+        let off = offset as usize;
+        let end = off + buf.len();
+        if end > inner.volatile.len() {
+            inner.volatile.resize(end, 0);
+        }
+        inner.volatile[off..end].copy_from_slice(buf.as_ref());
+        Ok(())
+    }
+
+    async fn flush(&self) -> Result<(), std::io::Error> {
+        let mut inner = self.inner.lock();
+        // Copy volatile to durable (all unflushed writes become durable).
+        inner.durable = inner.volatile.clone();
+        inner.flush_count += 1;
+        Ok(())
+    }
+
+    async fn file_size(&self) -> Result<u64, std::io::Error> {
+        // Return volatile size (latest state as seen by reads).
+        Ok(self.inner.lock().volatile.len() as u64)
+    }
+
+    async fn set_file_size(&self, size: u64) -> Result<(), std::io::Error> {
+        let mut inner = self.inner.lock();
+        // File size changes are immediately durable (metadata is sync).
+ inner.volatile.resize(size as usize, 0); + inner.durable.resize(size as usize, 0); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pal_async::async_test; + + #[async_test] + async fn write_then_read() { + let file = InMemoryFile::new(1024); + let data = b"hello, vhdx!"; + file.write_at(100, data).await.unwrap(); + + let mut buf = vec![0u8; data.len()]; + file.read_at(100, &mut buf).await.unwrap(); + assert_eq!(&buf, data); + } + + #[async_test] + async fn read_zeros_on_new_file() { + let file = InMemoryFile::new(256); + let mut buf = vec![0xFFu8; 256]; + file.read_at(0, &mut buf).await.unwrap(); + assert!(buf.iter().all(|&b| b == 0)); + } + + #[async_test] + async fn read_beyond_eof_zero_fills() { + let file = InMemoryFile::new(8); + // Write known data to the entire file. + file.write_at(0, &[1, 2, 3, 4, 5, 6, 7, 8]).await.unwrap(); + + // Read a range that extends 4 bytes past EOF. + let mut buf = vec![0xFFu8; 12]; + file.read_at(0, &mut buf).await.unwrap(); + assert_eq!(&buf[..8], &[1, 2, 3, 4, 5, 6, 7, 8]); + assert_eq!(&buf[8..], &[0, 0, 0, 0]); + + // Read entirely beyond EOF. + let mut buf2 = vec![0xFFu8; 4]; + file.read_at(100, &mut buf2).await.unwrap(); + assert!(buf2.iter().all(|&b| b == 0)); + } + + #[async_test] + async fn write_beyond_eof_grows() { + let file = InMemoryFile::new(4); + assert_eq!(file.file_size().await.unwrap(), 4); + + file.write_at(8, b"hi").await.unwrap(); + assert_eq!(file.file_size().await.unwrap(), 10); + + // Gap between old EOF (4) and write offset (8) should be zeros. + let mut gap = vec![0xFFu8; 4]; + file.read_at(4, &mut gap).await.unwrap(); + assert!(gap.iter().all(|&b| b == 0)); + + // Written data should be present. + let mut buf = vec![0u8; 2]; + file.read_at(8, &mut buf).await.unwrap(); + assert_eq!(&buf, b"hi"); + } + + #[async_test] + async fn set_file_size_grow() { + let file = InMemoryFile::new(4); + file.write_at(0, &[1, 2, 3, 4]).await.unwrap(); + + file.set_file_size(8).await.unwrap(); + assert_eq!(file.file_size().await.unwrap(), 8); + + let mut buf = vec![0xFFu8; 8]; + file.read_at(0, &mut buf).await.unwrap(); + assert_eq!(&buf, &[1, 2, 3, 4, 0, 0, 0, 0]); + } + + #[async_test] + async fn set_file_size_shrink() { + let file = InMemoryFile::new(8); + file.write_at(0, &[1, 2, 3, 4, 5, 6, 7, 8]).await.unwrap(); + + file.set_file_size(4).await.unwrap(); + assert_eq!(file.file_size().await.unwrap(), 4); + + let snapshot = file.snapshot(); + assert_eq!(&snapshot, &[1, 2, 3, 4]); + } + + #[async_test] + async fn file_size_reports_correctly() { + let file = InMemoryFile::new(100); + assert_eq!(file.file_size().await.unwrap(), 100); + + file.set_file_size(200).await.unwrap(); + assert_eq!(file.file_size().await.unwrap(), 200); + + file.set_file_size(50).await.unwrap(); + assert_eq!(file.file_size().await.unwrap(), 50); + } + + #[async_test] + async fn snapshot_returns_copy() { + let file = InMemoryFile::new(4); + file.write_at(0, &[1, 2, 3, 4]).await.unwrap(); + + let snap = file.snapshot(); + assert_eq!(&snap, &[1, 2, 3, 4]); + + // Subsequent write should not affect the snapshot. 
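+        // (snapshot() clones the backing Vec<u8>, so the buffer returned
+        // above cannot alias the file's live contents.)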
+ file.write_at(0, &[9, 9, 9, 9]).await.unwrap(); + assert_eq!(&snap, &[1, 2, 3, 4]); + } + + #[async_test] + async fn failing_interceptor_read() { + let file = InMemoryFile::with_interceptor( + 64, + Arc::new(FailingInterceptor { + fail_reads: true, + fail_writes: false, + fail_flushes: false, + fail_set_file_size: false, + }), + ); + + let mut buf = vec![0u8; 8]; + let result = file.read_at(0, &mut buf).await; + assert!(result.is_err()); + } + + #[async_test] + async fn failing_interceptor_write() { + let file = InMemoryFile::with_interceptor( + 64, + Arc::new(FailingInterceptor { + fail_reads: false, + fail_writes: true, + fail_flushes: false, + fail_set_file_size: false, + }), + ); + + let result = file.write_at(0, &[1, 2, 3, 4]).await; + assert!(result.is_err()); + + // File should not be modified. + let snapshot = file.snapshot(); + assert!(snapshot.iter().all(|&b| b == 0)); + } + + #[async_test] + async fn failing_interceptor_flush() { + let file = InMemoryFile::with_interceptor( + 64, + Arc::new(FailingInterceptor { + fail_reads: false, + fail_writes: false, + fail_flushes: true, + fail_set_file_size: false, + }), + ); + + let result = file.flush().await; + assert!(result.is_err()); + } + + #[async_test] + async fn discard_writes_interceptor() { + let file = InMemoryFile::with_interceptor(8, Arc::new(DiscardWritesInterceptor)); + + // Write should appear to succeed. + file.write_at(0, &[1, 2, 3, 4]).await.unwrap(); + + // But the data should not actually be written. + let mut buf = vec![0xFFu8; 4]; + file.read_at(0, &mut buf).await.unwrap(); + assert!(buf.iter().all(|&b| b == 0)); + } + + #[async_test] + async fn flush_is_noop() { + let file = InMemoryFile::new(64); + file.flush().await.unwrap(); + } +} diff --git a/vm/devices/storage/vhdx/src/tests/trim_tests.rs b/vm/devices/storage/vhdx/src/tests/trim_tests.rs new file mode 100644 index 0000000000..f2cb201ace --- /dev/null +++ b/vm/devices/storage/vhdx/src/tests/trim_tests.rs @@ -0,0 +1,710 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +use crate::AsyncFileExt; +use crate::create::{self, CreateParams}; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::format; +use crate::format::BatEntryState; +use crate::format::MB1; +use crate::io::ReadRange; +use crate::io::WriteRange; +use crate::open::VhdxFile; +use crate::tests::support::InMemoryFile; +use crate::trim::{TrimMode, TrimRequest}; +use pal_async::DefaultDriver; +use pal_async::async_test; + +/// Helper to create a disk and write a full block, returning the VhdxFile. +async fn create_and_write_block( + disk_size: u64, + block_number: u32, + driver: &DefaultDriver, +) -> VhdxFile { + let (file, _) = InMemoryFile::create_test_vhdx(disk_size).await; + let vhdx = VhdxFile::open(file).writable(driver).await.unwrap(); + let block_offset = block_number as u64 * vhdx.block_size() as u64; + let block_size = vhdx.block_size(); + + // Write a full block of data. + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(block_offset, block_size, &mut ranges) + .await + .unwrap(); + + // Perform the writes (we don't actually need to write data for BAT testing). + for range in &ranges { + match range { + WriteRange::Data { + file_offset, + length, + .. 
+ } => { + let buf = vec![0xAA; *length as usize]; + vhdx.file.write_at(*file_offset, &buf).await.unwrap(); + } + WriteRange::Zero { + file_offset, + length, + } => { + let buf = vec![0u8; *length as usize]; + vhdx.file.write_at(*file_offset, &buf).await.unwrap(); + } + } + } + guard.complete().await.unwrap(); + vhdx +} + +/// Helper to verify a block's BAT state. +fn assert_block_state(vhdx: &VhdxFile, block_number: u32, expected: BatEntryState) { + let mapping = vhdx.bat.get_block_mapping(block_number); + let actual = mapping.bat_state(); + assert_eq!( + actual, expected, + "block {block_number}: expected {expected:?}, got {actual:?}" + ); +} + +/// Helper to check if a block has a non-zero file megabyte (soft anchor). +fn block_has_file_offset(vhdx: &VhdxFile, block_number: u32) -> bool { + vhdx.bat.get_block_mapping(block_number).file_megabyte() != 0 +} + +/// Helper to write data to ranges returned by resolve_write. +async fn write_ranges(vhdx: &VhdxFile, ranges: &[WriteRange], pattern: u8) { + for range in ranges { + match range { + WriteRange::Data { + file_offset, + length, + .. + } => { + let buf = vec![pattern; *length as usize]; + vhdx.file.write_at(*file_offset, &buf).await.unwrap(); + } + WriteRange::Zero { + file_offset, + length, + } => { + let buf = vec![0u8; *length as usize]; + vhdx.file.write_at(*file_offset, &buf).await.unwrap(); + } + } + } +} + +// ---- Basic Trim Tests ---- + +#[async_test] +async fn trim_full_block_file_space(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + assert_block_state(&vhdx, 0, BatEntryState::FullyPresent); + assert!(block_has_file_offset(&vhdx, 0)); + + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); + // Soft anchor preserved. + assert!(block_has_file_offset(&vhdx, 0)); +} + +#[async_test] +async fn trim_full_block_free_space(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + vhdx.trim(TrimRequest::new( + TrimMode::FreeSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + + assert_block_state(&vhdx, 0, BatEntryState::Undefined); + // FreeSpace on FullyPresent clears file offset (releases space). + assert!(!block_has_file_offset(&vhdx, 0)); +} + +#[async_test] +async fn trim_full_block_zero(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + vhdx.trim(TrimRequest::new( + TrimMode::Zero, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + + assert_block_state(&vhdx, 0, BatEntryState::Zero); + assert!(!block_has_file_offset(&vhdx, 0)); +} + +#[async_test] +async fn trim_full_block_make_transparent(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + vhdx.trim(TrimRequest::new( + TrimMode::MakeTransparent, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + + assert_block_state(&vhdx, 0, BatEntryState::NotPresent); + assert!(!block_has_file_offset(&vhdx, 0)); +} + +#[async_test] +async fn trim_remove_soft_anchors(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + // First trim with FileSpace to create a soft anchor. + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); + assert!(block_has_file_offset(&vhdx, 0)); + + // Now remove the soft anchor. 
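+    // RemoveSoftAnchors is documented in trim.rs as clearing the stale file offset
+    // without changing the BAT state, so the entry should remain Unmapped afterwards.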
+ vhdx.trim(TrimRequest::new( + TrimMode::RemoveSoftAnchors, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); + assert!(!block_has_file_offset(&vhdx, 0)); +} + +#[async_test] +async fn trim_already_trimmed_idempotent(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); + + // Second trim with FileSpace → no-op. + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); +} + +#[async_test] +async fn trim_undefined_block_file_space_noop(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Block 0 starts as NotPresent on a fresh non-differencing disk. + assert_block_state(&vhdx, 0, BatEntryState::NotPresent); + + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + + // FileSpace is a no-op for NotPresent → should still be NotPresent. + assert_block_state(&vhdx, 0, BatEntryState::NotPresent); +} + +#[async_test] +async fn trim_zero_block_noop(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // First: write and trim to Zero to get a Zero block. + let block_size = vhdx.block_size(); + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(0, block_size, &mut ranges) + .await + .unwrap(); + write_ranges(&vhdx, &ranges, 0).await; + guard.complete().await.unwrap(); + + vhdx.trim(TrimRequest::new(TrimMode::Zero, 0, block_size as u64)) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Zero); + + // Second Zero trim → no-op. + vhdx.trim(TrimRequest::new(TrimMode::Zero, 0, block_size as u64)) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Zero); +} + +// ---- Range Tests ---- + +#[async_test] +async fn trim_cross_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let bs = vhdx.block_size(); + + // Write blocks 0, 1, 2. + for block in 0..3u32 { + let offset = block as u64 * bs as u64; + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(offset, bs, &mut ranges).await.unwrap(); + write_ranges(&vhdx, &ranges, 0xBB).await; + guard.complete().await.unwrap(); + } + + // Trim all 3 blocks at once. + vhdx.trim(TrimRequest::new(TrimMode::FileSpace, 0, 3 * bs as u64)) + .await + .unwrap(); + + for block in 0..3u32 { + assert_block_state(&vhdx, block, BatEntryState::Unmapped); + assert!(block_has_file_offset(&vhdx, block)); + } +} + +#[async_test] +async fn trim_partial_range_skips_edges(driver: DefaultDriver) { + let file = InMemoryFile::new(0); + let bs = MB1 as u32; // Use 1 MiB blocks for easier testing. + let mut params = CreateParams { + disk_size: 4 * MB1, + block_size: bs, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write blocks 0, 1, 2. 
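+    // Writing a full block through resolve_write allocates backing space for it,
+    // so blocks 0-2 should all be FullyPresent before the partial-range trim.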
+ for block in 0..3u32 { + let offset = block as u64 * bs as u64; + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(offset, bs, &mut ranges).await.unwrap(); + write_ranges(&vhdx, &ranges, 0xCC).await; + guard.complete().await.unwrap(); + } + + // Trim from mid-block-0 through mid-block-2 → only block 1 is trimmed. + let trim_offset = MB1 / 2; // mid-block-0 + let trim_length = 2 * MB1; // covers block 1 fully, partial block 0 and 2 + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + trim_offset, + trim_length, + )) + .await + .unwrap(); + + assert_block_state(&vhdx, 0, BatEntryState::FullyPresent); // partial → not trimmed + assert_block_state(&vhdx, 1, BatEntryState::Unmapped); // fully covered → trimmed + assert_block_state(&vhdx, 2, BatEntryState::FullyPresent); // partial → not trimmed +} + +#[async_test] +async fn trim_entire_disk(driver: DefaultDriver) { + let file = InMemoryFile::new(0); + let bs = MB1 as u32; + let mut params = CreateParams { + disk_size: 4 * MB1, + block_size: bs, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write all 4 blocks. + for block in 0..4u32 { + let offset = block as u64 * bs as u64; + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(offset, bs, &mut ranges).await.unwrap(); + write_ranges(&vhdx, &ranges, 0xDD).await; + guard.complete().await.unwrap(); + } + + // Trim the entire disk. + vhdx.trim(TrimRequest::new(TrimMode::FileSpace, 0, 4 * MB1)) + .await + .unwrap(); + + for block in 0..4u32 { + assert_block_state(&vhdx, block, BatEntryState::Unmapped); + } +} + +#[async_test] +async fn trim_at_disk_end_rounds_up(driver: DefaultDriver) { + // The disk size may not be an exact multiple of block size. + // If trim range ends exactly at disk_size, we round up. + let file = InMemoryFile::new(0); + let bs = MB1 as u32; + // 3.5 MiB disk with 1 MiB blocks → 4 blocks (last block is partial). + let disk_size = 3 * MB1 + MB1 / 2; + let mut params = CreateParams { + disk_size, + block_size: bs, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // Write block 3 (the last, partial block). + let block3_offset = 3 * MB1; + // Write less than a full block (only the valid portion). + let write_size = (MB1 / 2) as u32; + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_write(block3_offset, write_size, &mut ranges) + .await + .unwrap(); + write_ranges(&vhdx, &ranges, 0xEE).await; + guard.complete().await.unwrap(); + + // Trim from block 3 to end of disk. + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + block3_offset, + disk_size - block3_offset, + )) + .await + .unwrap(); + + // Block 3 should be trimmed (disk end rounding kicks in). + assert_block_state(&vhdx, 3, BatEntryState::Unmapped); +} + +// ---- Read-After-Trim Tests ---- + +#[async_test] +async fn read_after_trim_returns_zeros(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + // Verify data is present. + let mut ranges = Vec::new(); + let guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + assert!(matches!(ranges[0], ReadRange::Data { .. })); + drop(guard); + + // Trim. + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + + // Read after trim → zeros. 
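+    // After the trim, resolve_read should describe the range as ReadRange::Zero:
+    // no file I/O is issued and the caller zero-fills the guest buffer itself.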
+ let mut ranges = Vec::new(); + let _guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + assert_eq!(ranges.len(), 1); + assert!( + matches!(ranges[0], ReadRange::Zero { .. }), + "expected Zero range after trim, got {:?}", + ranges[0] + ); +} + +#[async_test] +async fn trim_then_write_reallocates(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + // Trim with FileSpace (soft anchor). + vhdx.trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await + .unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); + + // Write again — should reallocate (possibly reusing soft anchor). + let bs = vhdx.block_size(); + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(0, bs, &mut ranges).await.unwrap(); + write_ranges(&vhdx, &ranges, 0xFF).await; + guard.complete().await.unwrap(); + + assert_block_state(&vhdx, 0, BatEntryState::FullyPresent); +} + +// ---- Fully-Allocated Disk Tests ---- + +#[async_test] +async fn trim_fixed_disk_file_space_noop(driver: DefaultDriver) { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: 4 * MB1, + block_size: MB1 as u32, + is_fully_allocated: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // FileSpace trim on fixed → no-op. + vhdx.trim(TrimRequest::new(TrimMode::FileSpace, 0, 4 * MB1)) + .await + .unwrap(); + + // Blocks should be unchanged. + let mapping = vhdx.bat.get_block_mapping(0); + let state = mapping.bat_state(); + // On a fully-allocated disk, blocks start as Undefined (not yet written). + // The FileSpace mode is a no-op, so they stay the same. + assert_ne!(state, BatEntryState::Unmapped); +} + +#[async_test] +async fn trim_fixed_disk_make_transparent_allowed(driver: DefaultDriver) { + let file = InMemoryFile::new(0); + let mut params = CreateParams { + disk_size: 4 * MB1, + block_size: MB1 as u32, + is_fully_allocated: true, + ..Default::default() + }; + create::create(&file, &mut params).await.unwrap(); + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // MakeTransparent on fixed → allowed. + vhdx.trim(TrimRequest::new(TrimMode::MakeTransparent, 0, 4 * MB1)) + .await + .unwrap(); + + // Blocks should be NotPresent (MakeTransparent succeeded). + assert_block_state(&vhdx, 0, BatEntryState::NotPresent); +} + +// ---- Concurrent Safety Tests ---- + +#[async_test] +async fn trim_waits_for_in_flight_read(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + // Acquire a read guard on block 0 to hold its refcount. + let mut ranges = Vec::new(); + let read_guard = vhdx.resolve_read(0, 4096, &mut ranges).await.unwrap(); + + // Spawn trim concurrently. It should block until the guard is dropped. + let trim_done = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); + let trim_done2 = trim_done.clone(); + + let (trim_result, _) = futures::join!( + async { + let r = vhdx + .trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await; + trim_done2.store(true, std::sync::atomic::Ordering::SeqCst); + r + }, + async { + // After a yield, drop the read guard. + // The trim should be able to see the refcount eventually. + // Yield to let the trim task run. 
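+            // futures::join! polls its branches in order, so the trim branch has
+            // already been polled and is parked waiting on block 0's refcount by
+            // the time this branch runs.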
+ std::future::poll_fn(|cx| { + cx.waker().wake_by_ref(); + std::task::Poll::Ready(()) + }) + .await; + assert!( + !trim_done.load(std::sync::atomic::Ordering::SeqCst), + "trim should not complete while read guard is held" + ); + drop(read_guard); + } + ); + + trim_result.unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); +} + +#[async_test] +async fn trim_waits_for_in_flight_write(driver: DefaultDriver) { + let vhdx = create_and_write_block(format::GB1, 0, &driver).await; + + // Acquire a write guard on block 0. + let mut ranges = Vec::new(); + let write_guard = vhdx + .resolve_write(0, vhdx.block_size(), &mut ranges) + .await + .unwrap(); + + let trim_done = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); + let trim_done2 = trim_done.clone(); + + let (trim_result, _) = futures::join!( + async { + let r = vhdx + .trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await; + trim_done2.store(true, std::sync::atomic::Ordering::SeqCst); + r + }, + async { + // Yield to let the trim task run. + std::future::poll_fn(|cx| { + cx.waker().wake_by_ref(); + std::task::Poll::Ready(()) + }) + .await; + assert!( + !trim_done.load(std::sync::atomic::Ordering::SeqCst), + "trim should not complete while write guard is held" + ); + // Complete the write so the guard drops after. + write_guard.complete().await.unwrap(); + } + ); + + trim_result.unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); +} + +#[async_test] +async fn trim_concurrent_different_block(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + let bs = vhdx.block_size(); + + // Write blocks 0 and 1. + for block in 0..2u32 { + let offset = block as u64 * bs as u64; + let mut ranges = Vec::new(); + let guard = vhdx.resolve_write(offset, bs, &mut ranges).await.unwrap(); + write_ranges(&vhdx, &ranges, 0xAA).await; + guard.complete().await.unwrap(); + } + + // Trim block 0, read block 1 concurrently. + let (trim_result, read_result) = futures::join!( + vhdx.trim(TrimRequest::new(TrimMode::FileSpace, 0, bs as u64)), + async { + let mut ranges = Vec::new(); + let guard = vhdx + .resolve_read(bs as u64, 4096, &mut ranges) + .await + .unwrap(); + let result = ranges.clone(); + drop(guard); + result + } + ); + + trim_result.unwrap(); + assert_block_state(&vhdx, 0, BatEntryState::Unmapped); + assert!(matches!(read_result[0], ReadRange::Data { .. 
})); +} + +// ---- Validation Tests ---- + +#[async_test] +async fn trim_read_only_fails() { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).read_only().await.unwrap(); + + let result = vhdx + .trim(TrimRequest::new( + TrimMode::FileSpace, + 0, + vhdx.block_size() as u64, + )) + .await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::ReadOnly)) + )); +} + +#[async_test] +async fn trim_unaligned_offset_fails(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let result = vhdx + .trim(TrimRequest::new(TrimMode::FileSpace, 1, 512)) + .await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::UnalignedIo)) + )); +} + +#[async_test] +async fn trim_beyond_disk_fails(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let result = vhdx + .trim(TrimRequest::new( + TrimMode::FileSpace, + format::GB1 - 512, + 1024, + )) + .await; + assert!(matches!( + result, + Err(VhdxIoError(VhdxIoErrorInner::BeyondEndOfDisk)) + )); +} + +#[async_test] +async fn trim_beyond_disk_ok_with_skip(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + // With skip_disk_size_check, goes beyond but computes no included blocks → ok. + let result = vhdx + .trim( + TrimRequest::new(TrimMode::FileSpace, format::GB1 - 512, 1024) + .skip_disk_size_check(true), + ) + .await; + assert!(result.is_ok()); +} + +#[async_test] +async fn trim_zero_length_noop(driver: DefaultDriver) { + let (file, _) = InMemoryFile::create_test_vhdx(format::GB1).await; + let vhdx = VhdxFile::open(file).writable(&driver).await.unwrap(); + + let result = vhdx.trim(TrimRequest::new(TrimMode::FileSpace, 0, 0)).await; + assert!(result.is_ok()); +} diff --git a/vm/devices/storage/vhdx/src/trim.rs b/vm/devices/storage/vhdx/src/trim.rs new file mode 100644 index 0000000000..7ad7962967 --- /dev/null +++ b/vm/devices/storage/vhdx/src/trim.rs @@ -0,0 +1,578 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! VHDX block trim (unmap) support. +//! +//! Implements the `VhdxFile::trim()` method that transitions blocks to +//! unmapped states, releasing file space back to the free pool or +//! soft-anchoring it for later reuse. + +use crate::AsyncFile; +use crate::bat::BlockMapping; +use crate::bat::BlockType; +use crate::error::VhdxIoError; +use crate::error::VhdxIoErrorInner; +use crate::format::BatEntryState; +use crate::format::MB1; +use crate::header::WriteMode; +use crate::open::VhdxFile; + +/// Trim mode determining the target block state. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TrimMode { + /// Move blocks to the Unmapped (trimmed) state, keeping file offset + /// as a soft anchor for potential reuse. + /// + /// Denied for Undefined blocks: a block that was never written + /// should stay Undefined (preserves backup semantics — backup tools + /// skip Undefined blocks, but not Unmapped ones). + FileSpace, + + /// Move blocks to the Undefined state. Soft anchor may be kept or + /// cleared depending on the original state. + FreeSpace, + + /// Move blocks to the Zero state, clearing the file offset. 
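+    /// Reads of a Zero block return zeros directly, without consulting the data
+    /// region or (for differencing disks) the parent.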
+ Zero, + + /// Move blocks to the NotPresent (transparent) state, clearing the + /// file offset. For differencing disks, reads fall through to parent. + /// + /// Allowed on fully-allocated (fixed) disks. + MakeTransparent, + + /// Remove soft anchors from trimmed/undefined blocks without changing + /// their state. Clears file_megabyte if the block is soft-anchored. + /// + /// Allowed on fully-allocated (fixed) disks. Does not change data + /// content, so DataWriteGuid is not updated. + RemoveSoftAnchors, +} + +/// Builder for a trim operation on a VHDX file. +/// +/// Created via [`VhdxFile::trim`]. Required parameters (`mode`, `offset`, +/// `length`) are provided at construction; optional flags default to the +/// safe/common values and can be overridden with builder methods. +#[derive(Debug, Clone)] +pub struct TrimRequest { + mode: TrimMode, + offset: u64, + length: u64, + skip_disk_size_check: bool, + skip_write_guid_change: bool, +} + +impl TrimRequest { + /// Create a new trim request. + /// + /// * `mode` - Determines the target block state. + /// * `offset` - Virtual disk byte offset (must be sector-aligned). + /// * `length` - Length in bytes (must be sector-aligned). + pub fn new(mode: TrimMode, offset: u64, length: u64) -> Self { + Self { + mode, + offset, + length, + skip_disk_size_check: false, + skip_write_guid_change: false, + } + } + + /// Skip bounds checking against the virtual disk size. + pub fn skip_disk_size_check(mut self, skip: bool) -> Self { + self.skip_disk_size_check = skip; + self + } + + /// Don't update DataWriteGuid when trimming. + pub fn skip_write_guid_change(mut self, skip: bool) -> Self { + self.skip_write_guid_change = skip; + self + } +} + +/// Returns true if the given trim mode is allowed on fully-allocated (fixed) disks. +fn mode_allowed_on_fixed(mode: TrimMode) -> bool { + matches!( + mode, + TrimMode::MakeTransparent | TrimMode::RemoveSoftAnchors + ) +} + +/// Returns true if this trim mode should skip the DataWriteGuid update. +fn mode_skips_write_guid(mode: TrimMode) -> bool { + matches!(mode, TrimMode::RemoveSoftAnchors) +} + +/// Convert a block mapping according to the trim mode. +/// +/// Returns the new mapping, which may be identical to `old` (no-op). +fn convert_mapping(mode: TrimMode, old: BlockMapping) -> BlockMapping { + let state = old.bat_state(); + match mode { + TrimMode::FileSpace => convert_file_space(state, old), + TrimMode::FreeSpace => convert_free_space(state, old), + TrimMode::Zero => convert_zero(state, old), + TrimMode::MakeTransparent => convert_make_transparent(state, old), + TrimMode::RemoveSoftAnchors => convert_remove_soft_anchors(old), + } +} + +/// FileSpace: FullyPresent/PartiallyPresent → Unmapped (keep soft anchor). +/// All other states are no-ops. +fn convert_file_space(state: BatEntryState, old: BlockMapping) -> BlockMapping { + match state { + BatEntryState::FullyPresent | BatEntryState::PartiallyPresent => { + BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(old.file_megabyte()) // keep as soft anchor + } + _ => old, // NotPresent, Undefined, Zero, Unmapped → no change + } +} + +/// FreeSpace: FullyPresent/PartiallyPresent → Undefined (clear offset, release space). +/// Zero → Undefined (clear offset). +/// Unmapped → Undefined (keep soft anchor). +/// Others → no change. 
+fn convert_free_space(state: BatEntryState, old: BlockMapping) -> BlockMapping { + match state { + BatEntryState::FullyPresent | BatEntryState::PartiallyPresent => { + // Release space — clear file offset. + BlockMapping::new() + .with_bat_state(BatEntryState::Undefined) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(0) + } + BatEntryState::Zero => BlockMapping::new() + .with_bat_state(BatEntryState::Undefined) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(0), + BatEntryState::Unmapped => { + // Keep soft anchor if present. + BlockMapping::new() + .with_bat_state(BatEntryState::Undefined) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(old.file_megabyte()) + } + _ => old, // NotPresent, Undefined → no change + } +} + +/// Zero: any state → Zero (clear file offset). +fn convert_zero(state: BatEntryState, old: BlockMapping) -> BlockMapping { + match state { + BatEntryState::Zero if old.file_megabyte() == 0 => old, // already Zero with no offset + _ => { + debug_assert!( + !old.transitioning_to_fully_present(), + "cannot trim TFP block to Zero" + ); + BlockMapping::new() + .with_bat_state(BatEntryState::Zero) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(0) + } + } +} + +/// MakeTransparent: any state → NotPresent (clear file offset). +fn convert_make_transparent(state: BatEntryState, old: BlockMapping) -> BlockMapping { + match state { + BatEntryState::NotPresent if old.file_megabyte() == 0 => old, // already NotPresent + _ => BlockMapping::new() + .with_bat_state(BatEntryState::NotPresent) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(0), + } +} + +/// RemoveSoftAnchors: clear file offset if soft-anchored, otherwise no-op. +fn convert_remove_soft_anchors(old: BlockMapping) -> BlockMapping { + if old.is_soft_anchored() { + BlockMapping::new() + .with_bat_state(old.bat_state()) + .with_transitioning_to_fully_present(false) + .with_file_megabyte(0) + } else { + old + } +} + +/// Compute the block range fully included in a byte range. +/// +/// Returns `(start_block, block_count)`. Only blocks whose entire extent +/// falls within `[offset..offset+length)` are included. Leading and +/// trailing partial blocks are skipped. +fn included_blocks(offset: u64, length: u64, block_size: u64) -> (u32, u32) { + if length == 0 { + return (0, 0); + } + // First fully-included block: round UP to next block boundary. + let start = offset.div_ceil(block_size) as u32; + // First block NOT included: round DOWN. + let end = ((offset + length) / block_size) as u32; + if end <= start { + (start, 0) + } else { + (start, end - start) + } +} + +impl VhdxFile { + /// Trim (unmap) a range of virtual disk blocks. + /// + /// Transitions blocks to unmapped/zero/transparent state depending on + /// the mode specified in `request`. Only blocks fully covered by the + /// range are trimmed. + /// + /// # Errors + /// + /// Returns an error if: + /// - The file is read-only + /// - Offset or length is not aligned to the logical sector size + /// - The range extends beyond the disk size (unless `skip_disk_size_check`) + /// - The file is in a permanently failed state + pub async fn trim(&self, request: TrimRequest) -> Result<(), VhdxIoError> { + self.failed.check()?; + + let TrimRequest { + mode, + offset, + length, + skip_disk_size_check, + skip_write_guid_change, + } = request; + + // 1. Check read-only. + if self.read_only { + return Err(VhdxIoErrorInner::ReadOnly.into()); + } + + // 2. 
Zero-length — immediate success. + if length == 0 { + return Ok(()); + } + + // 3. Validate alignment to logical sector size. + if !offset.is_multiple_of(self.logical_sector_size as u64) + || !length.is_multiple_of(self.logical_sector_size as u64) + { + return Err(VhdxIoErrorInner::UnalignedIo.into()); + } + + // 4. Validate bounds (unless skipped). + if !skip_disk_size_check { + if offset + .checked_add(length) + .is_none_or(|end| end > self.disk_size) + { + return Err(VhdxIoErrorInner::BeyondEndOfDisk.into()); + } + } + + // 5. If fully-allocated (fixed) disk and mode doesn't allow it: no-op. + if self.is_fully_allocated() && !mode_allowed_on_fixed(mode) { + return Ok(()); + } + + // 6. Enable write mode. + // All trim modes modify the file (BAT entries), so FileWritable + // is always needed. DataWritable is additionally needed when the + // mode changes user-visible data (everything except + // RemoveSoftAnchors) and the caller hasn't opted out. + if !skip_write_guid_change && !mode_skips_write_guid(mode) { + self.enable_write_mode(WriteMode::DataWritable) + .await + .map_err(VhdxIoErrorInner::WriteHeader)?; + } else { + self.enable_write_mode(WriteMode::FileWritable) + .await + .map_err(VhdxIoErrorInner::WriteHeader)?; + } + + // 7. Compute effective length: if trim extends to exactly disk_size, + // round up to cover the full last block. + let effective_length = if !skip_disk_size_check && offset + length == self.disk_size { + let block_size = self.block_size as u64; + let full_disk_size = crate::create::round_up(self.disk_size, block_size); + full_disk_size - offset + } else { + length + }; + + // 8. Compute included blocks. + let (start_block, block_count) = + included_blocks(offset, effective_length, self.block_size as u64); + if block_count == 0 { + return Ok(()); + } + let end_block = start_block + block_count; + + // 9. Main trim loop. + // + // For each block, we atomically claim it (CAS 0 → SENTINEL), + // preventing any new I/O from reading stale mappings. Then we + // read + convert the mapping, write the BAT, handle space + // management, and release the claim. + let mut current_block = start_block; + loop { + if current_block >= end_block { + return Ok(()); + } + + // 9a. Claim the block: set trim-pending to block new I/O, + // wait for in-flight I/Os to drain, then take exclusive + // ownership. + let claim = self.bat.claim_for_trim(current_block).await; + + // 9b. Block is claimed — no new I/O can start on it. + // Read the mapping and compute the trim conversion. + let old_mapping = self.bat.get_block_mapping(current_block); + let new_mapping = convert_mapping(mode, old_mapping); + + if old_mapping == new_mapping { + // No-op — release claim and advance. + current_block += 1; + continue; + } + + // 9c. Write BAT entry to cache (also updates in-memory BAT atomically). + // LOCK AUDIT: Trim claim held (not a sync lock). Safe to await. + self.bat + .write_block_mapping( + &self.cache, + BlockType::Payload, + current_block, + new_mapping, + None, + ) + .await?; + + // 9e. Handle space management based on old→new transition. + // + // Space releases are deferred until the BAT change is durable + // on disk. Without deferral, a crash could teleport data from + // a new block into the old block's offset. 
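+            // Concretely: the freed megabyte is only handed back to the allocator
+            // once flush() has persisted the new BAT entry, so a stale on-disk
+            // entry can never point at space that has been reassigned elsewhere.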
+ let old_anchored = old_mapping.is_soft_anchored(); + let new_anchored = new_mapping.is_soft_anchored(); + let old_file_mb = old_mapping.file_megabyte(); + let new_file_mb = new_mapping.file_megabyte(); + let old_file_offset = old_file_mb as u64 * MB1; + let block_size = self.block_size; + + if old_anchored && new_anchored { + // Same anchor — assert same file offset, no space management. + debug_assert_eq!(old_file_mb, new_file_mb); + } else if old_anchored && !new_anchored { + // Was soft-anchored → no longer: unmark/cancel + defer release. + let was_deferred = self.deferred_releases.cancel(current_block); + if !was_deferred { + assert!( + self.free_space.unmark_trimmed_block( + current_block, + old_file_offset, + block_size, + ), + "soft-anchored block {current_block} not tracked as trimmed" + ); + } + self.deferred_releases + .insert(current_block, old_file_offset, block_size, false); + } else if !old_anchored && new_anchored { + // Was not anchored → now soft-anchored: defer the anchor. + self.deferred_releases + .insert(current_block, old_file_offset, block_size, true); + } else { + // Neither was nor becomes anchored. + if old_file_mb != 0 { + self.deferred_releases.insert( + current_block, + old_file_offset, + block_size, + false, + ); + } + } + + // 9f. Release the trim claim — I/O can resume on this block. + drop(claim); + + // Quota check: force flush if too many deferred releases. + if self.deferred_releases.needs_flush() { + self.flush().await?; + } + + current_block += 1; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // ---- included_blocks unit tests ---- + + #[test] + fn included_blocks_full_coverage() { + // Range exactly covers blocks 0..3 (3 blocks). + let block_size = 2 * MB1; + let (start, count) = included_blocks(0, 3 * block_size, block_size); + assert_eq!(start, 0); + assert_eq!(count, 3); + } + + #[test] + fn included_blocks_partial_edges() { + // Start mid-block-0, end mid-block-2 → only block 1 included. + let block_size = 2 * MB1; + let (start, count) = included_blocks(MB1, 2 * block_size, block_size); + assert_eq!(start, 1); // block 0 is partial + assert_eq!(count, 1); // only block 1 fully covered + } + + #[test] + fn included_blocks_zero_length() { + let (start, count) = included_blocks(0, 0, 2 * MB1); + assert_eq!(start, 0); + assert_eq!(count, 0); + } + + #[test] + fn included_blocks_too_small() { + // Range is less than one block → no blocks included. 
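+        // start = ceil(1 MiB / 2 MiB) = 1 and end = floor((1 MiB + 1 MiB) / 2 MiB) = 1,
+        // so end <= start and the returned count is 0.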
+ let block_size = 2 * MB1; + let (_start, count) = included_blocks(MB1, MB1, block_size); + assert_eq!(count, 0); + } + + // ---- Conversion function unit tests ---- + + #[test] + fn convert_file_space_mappings() { + // FullyPresent → Unmapped (keep offset) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte(4); + let r = convert_mapping(TrimMode::FileSpace, m); + assert_eq!(r.bat_state(), BatEntryState::Unmapped); + assert_eq!(r.file_megabyte(), 4); + + // Undefined → Undefined (no change) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Undefined) + .with_file_megabyte(0); + let r = convert_mapping(TrimMode::FileSpace, m); + assert_eq!(r, m); + + // Unmapped → Unmapped (no change) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(5); + let r = convert_mapping(TrimMode::FileSpace, m); + assert_eq!(r, m); + } + + #[test] + fn convert_free_space_mappings() { + // FullyPresent → Undefined (clear offset) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte(4); + let r = convert_mapping(TrimMode::FreeSpace, m); + assert_eq!(r.bat_state(), BatEntryState::Undefined); + assert_eq!(r.file_megabyte(), 0); + + // Unmapped → Undefined (keep anchor) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(5); + let r = convert_mapping(TrimMode::FreeSpace, m); + assert_eq!(r.bat_state(), BatEntryState::Undefined); + assert_eq!(r.file_megabyte(), 5); + } + + #[test] + fn convert_zero_mappings() { + // FullyPresent → Zero (clear offset) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte(4); + let r = convert_mapping(TrimMode::Zero, m); + assert_eq!(r.bat_state(), BatEntryState::Zero); + assert_eq!(r.file_megabyte(), 0); + + // Zero (no offset) → Zero (no change) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Zero) + .with_file_megabyte(0); + let r = convert_mapping(TrimMode::Zero, m); + assert_eq!(r, m); + } + + #[test] + fn convert_make_transparent_mappings() { + // FullyPresent → NotPresent + let m = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte(4); + let r = convert_mapping(TrimMode::MakeTransparent, m); + assert_eq!(r.bat_state(), BatEntryState::NotPresent); + assert_eq!(r.file_megabyte(), 0); + + // NotPresent → NotPresent (no change) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::NotPresent) + .with_file_megabyte(0); + let r = convert_mapping(TrimMode::MakeTransparent, m); + assert_eq!(r, m); + } + + #[test] + fn convert_remove_soft_anchors_mappings() { + // Unmapped with offset → clear offset + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(5); + let r = convert_mapping(TrimMode::RemoveSoftAnchors, m); + assert_eq!(r.bat_state(), BatEntryState::Unmapped); + assert_eq!(r.file_megabyte(), 0); + + // FullyPresent → no change (not soft-anchored) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte(4); + let r = convert_mapping(TrimMode::RemoveSoftAnchors, m); + assert_eq!(r, m); + } + + #[test] + fn is_soft_anchored_checks() { + // Unmapped with offset → anchored + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(5); + assert!(m.is_soft_anchored()); + + // Undefined with offset → anchored + let m = BlockMapping::new() + 
.with_bat_state(BatEntryState::Undefined) + .with_file_megabyte(3); + assert!(m.is_soft_anchored()); + + // Unmapped with no offset → not anchored + let m = BlockMapping::new() + .with_bat_state(BatEntryState::Unmapped) + .with_file_megabyte(0); + assert!(!m.is_soft_anchored()); + + // FullyPresent with offset → not anchored (wrong state) + let m = BlockMapping::new() + .with_bat_state(BatEntryState::FullyPresent) + .with_file_megabyte(4); + assert!(!m.is_soft_anchored()); + } +} diff --git a/vm/devices/storage/vhdx/tests/native_cross_validation.rs b/vm/devices/storage/vhdx/tests/native_cross_validation.rs new file mode 100644 index 0000000000..a71566d4b9 --- /dev/null +++ b/vm/devices/storage/vhdx/tests/native_cross_validation.rs @@ -0,0 +1,1920 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Native API cross-validation smoke tests. +//! +//! These tests exercise the first interaction between the Rust VHDX parser +//! and the Windows native VHD stack. They are deliberately limited in scope +//! to surface format-level bugs that need to be diagnosed and fixed before +//! writing a full test suite. +//! +//! **All tests are gated with `#[cfg(windows)]`.** +//! +//! ## Format bugs discovered and fixed +//! +//! (Updated as bugs are found during cross-validation.) + +#![cfg(windows)] +// UNSAFETY: Windows FFI calls for virtual disk APIs and raw disk I/O. +#![expect(unsafe_code)] + +use pal_async::DefaultDriver; +use parking_lot::Mutex; +use std::borrow::Borrow; +use std::io; +use std::path::Path; +use std::sync::Arc; +use vhdx::AsyncFile; +use vhdx::ReadRange; +use vhdx::TrimMode; +use vhdx::TrimRequest; +use vhdx::WriteRange; + +use windows::Win32::Foundation::CloseHandle; +use windows::Win32::Foundation::HANDLE; +use windows::Win32::Storage::FileSystem::ReadFile; +use windows::Win32::Storage::FileSystem::WriteFile; +use windows::Win32::Storage::Vhd::ATTACH_VIRTUAL_DISK_FLAG; +use windows::Win32::Storage::Vhd::ATTACH_VIRTUAL_DISK_FLAG_NO_DRIVE_LETTER; +use windows::Win32::Storage::Vhd::ATTACH_VIRTUAL_DISK_FLAG_NO_LOCAL_HOST; +use windows::Win32::Storage::Vhd::AttachVirtualDisk; +use windows::Win32::Storage::Vhd::CREATE_VIRTUAL_DISK_FLAG_NONE; +use windows::Win32::Storage::Vhd::CREATE_VIRTUAL_DISK_PARAMETERS; +use windows::Win32::Storage::Vhd::CREATE_VIRTUAL_DISK_VERSION_2; +use windows::Win32::Storage::Vhd::CreateVirtualDisk; +use windows::Win32::Storage::Vhd::DetachVirtualDisk; +use windows::Win32::Storage::Vhd::OPEN_VIRTUAL_DISK_FLAG_NONE; +use windows::Win32::Storage::Vhd::OPEN_VIRTUAL_DISK_PARAMETERS; +use windows::Win32::Storage::Vhd::OPEN_VIRTUAL_DISK_VERSION_2; +use windows::Win32::Storage::Vhd::OpenVirtualDisk; +use windows::Win32::Storage::Vhd::VIRTUAL_DISK_ACCESS_MASK; +use windows::Win32::Storage::Vhd::VIRTUAL_STORAGE_TYPE; +use windows::Win32::System::IO::GetOverlappedResult; +use windows::Win32::System::IO::OVERLAPPED; +use windows::Win32::System::Threading::CreateEventW; +use windows::core::PCWSTR; + +// --------------------------------------------------------------------- +// StdFile — blocking AsyncFile adapter for integration tests +// --------------------------------------------------------------------- + +/// Blocking `AsyncFile` impl backed by `std::fs::File`. +/// Suitable for tests only — all operations block the current thread. 
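+/// A real consumer would supply a genuinely asynchronous `AsyncFile`; this
+/// adapter simply funnels every operation through a mutex-guarded `std::fs::File`.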
+struct StdFile {
+    file: Mutex<std::fs::File>,
+}
+
+impl StdFile {
+    fn open(path: &Path, read_only: bool) -> io::Result<Self> {
+        let file = std::fs::OpenOptions::new()
+            .read(true)
+            .write(!read_only)
+            .open(path)?;
+        Ok(Self {
+            file: Mutex::new(file),
+        })
+    }
+
+    fn create(path: &Path) -> io::Result<Self> {
+        let file = std::fs::OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .truncate(true)
+            .open(path)?;
+        Ok(Self {
+            file: Mutex::new(file),
+        })
+    }
+}
+
+impl AsyncFile for StdFile {
+    type Buffer = Vec<u8>;
+
+    fn alloc_buffer(&self, len: usize) -> Vec<u8> {
+        vec![0u8; len]
+    }
+
+    async fn read_into(&self, offset: u64, mut buf: Vec<u8>) -> Result<Vec<u8>, io::Error> {
+        use std::io::Read;
+        use std::io::Seek;
+        use std::io::SeekFrom;
+        let mut file = self.file.lock();
+        file.seek(SeekFrom::Start(offset))?;
+        file.read_exact(buf.as_mut())?;
+        Ok(buf)
+    }
+
+    async fn write_from(
+        &self,
+        offset: u64,
+        buf: impl Borrow<Vec<u8>> + Send + 'static,
+    ) -> Result<(), io::Error> {
+        use std::io::Seek;
+        use std::io::SeekFrom;
+        use std::io::Write;
+        let buf = buf.borrow();
+        let mut file = self.file.lock();
+        file.seek(SeekFrom::Start(offset))?;
+        file.write_all(buf.as_ref())
+    }
+
+    async fn flush(&self) -> Result<(), io::Error> {
+        use std::io::Write;
+        let mut file = self.file.lock();
+        file.flush()
+    }
+
+    async fn file_size(&self) -> Result<u64, io::Error> {
+        let file = self.file.lock();
+        file.metadata().map(|m| m.len())
+    }
+
+    async fn set_file_size(&self, size: u64) -> Result<(), io::Error> {
+        let file = self.file.lock();
+        file.set_len(size)
+    }
+}
+
+// ---------------------------------------------------------------------
+// Windows Virtual Disk Type Constants
+// ---------------------------------------------------------------------
+
+const VIRTUAL_STORAGE_TYPE_DEVICE_VHDX: u32 = 3;
+
+// Microsoft vendor GUID: {EC984AEC-A0F9-47e9-901F-71415A66345B}
+const VIRTUAL_STORAGE_TYPE_VENDOR_MICROSOFT: windows::core::GUID = windows::core::GUID {
+    data1: 0xEC984AEC,
+    data2: 0xA0F9,
+    data3: 0x47e9,
+    data4: [0x90, 0x1F, 0x71, 0x41, 0x5A, 0x66, 0x34, 0x5B],
+};
+
+// ---------------------------------------------------------------------
+// Path helper
+// ---------------------------------------------------------------------
+
+fn to_wide(path: &Path) -> Vec<u16> {
+    use std::os::windows::ffi::OsStrExt;
+    path.as_os_str()
+        .encode_wide()
+        .chain(std::iter::once(0))
+        .collect()
+}
+
+// ---------------------------------------------------------------------
+// NativeVhdx — RAII wrapper around Windows virtual disk APIs
+// ---------------------------------------------------------------------
+
+struct NativeVhdx {
+    handle: HANDLE,
+    attached: bool,
+}
+
+impl NativeVhdx {
+    /// Create a new dynamic VHDX via CreateVirtualDisk.
+    fn create_dynamic(path: &Path, size_bytes: u64, block_size: u32, sector_size: u32) -> Self {
+        let storage_type = VIRTUAL_STORAGE_TYPE {
+            DeviceId: VIRTUAL_STORAGE_TYPE_DEVICE_VHDX,
+            VendorId: VIRTUAL_STORAGE_TYPE_VENDOR_MICROSOFT,
+        };
+
+        let wide = to_wide(path);
+
+        let mut params = CREATE_VIRTUAL_DISK_PARAMETERS {
+            Version: CREATE_VIRTUAL_DISK_VERSION_2,
+            ..Default::default()
+        };
+        params.Anonymous.Version2.MaximumSize = size_bytes;
+        params.Anonymous.Version2.BlockSizeInBytes = block_size;
+        params.Anonymous.Version2.SectorSizeInBytes = sector_size;
+
+        let mut handle = HANDLE::default();
+
+        // SAFETY: All parameters are correctly initialized, wide path is
+        // null-terminated, and handle is written by the API on success.
+ let result = unsafe { + CreateVirtualDisk( + &storage_type, + PCWSTR(wide.as_ptr()), + VIRTUAL_DISK_ACCESS_MASK(0), + None, + CREATE_VIRTUAL_DISK_FLAG_NONE, + 0, + ¶ms, + None, + &mut handle, + ) + }; + assert!(result.is_ok(), "CreateVirtualDisk failed: {result:?}"); + + NativeVhdx { + handle, + attached: false, + } + } + + /// Open an existing VHDX via OpenVirtualDisk. + fn open(path: &Path, _read_only: bool) -> Self { + let storage_type = VIRTUAL_STORAGE_TYPE { + DeviceId: VIRTUAL_STORAGE_TYPE_DEVICE_VHDX, + VendorId: VIRTUAL_STORAGE_TYPE_VENDOR_MICROSOFT, + }; + + let wide = to_wide(path); + + let params = OPEN_VIRTUAL_DISK_PARAMETERS { + Version: OPEN_VIRTUAL_DISK_VERSION_2, + ..Default::default() + }; + + let mut handle = HANDLE::default(); + + // SAFETY: All parameters are correctly initialized, wide path is + // null-terminated, and handle is written by the API on success. + let result = unsafe { + OpenVirtualDisk( + &storage_type, + PCWSTR(wide.as_ptr()), + VIRTUAL_DISK_ACCESS_MASK(0), + OPEN_VIRTUAL_DISK_FLAG_NONE, + Some(¶ms), + &mut handle, + ) + }; + assert!(result.is_ok(), "OpenVirtualDisk failed: {result:?}"); + + NativeVhdx { + handle, + attached: false, + } + } + + /// Attach with NO_LOCAL_HOST for raw byte-level I/O. + /// With NO_LOCAL_HOST, no PhysicalDrive device is surfaced — instead, + /// ReadFile/WriteFile work directly on the virtual disk handle. + fn attach_raw(&mut self) -> RawDiskHandle { + let flags = ATTACH_VIRTUAL_DISK_FLAG( + ATTACH_VIRTUAL_DISK_FLAG_NO_LOCAL_HOST.0 | ATTACH_VIRTUAL_DISK_FLAG_NO_DRIVE_LETTER.0, + ); + + // SAFETY: Handle is valid (from Create/OpenVirtualDisk). Flags are valid. + let result = unsafe { AttachVirtualDisk(self.handle, None, flags, 0, None, None) }; + assert!(result.is_ok(), "AttachVirtualDisk failed: {result:?}"); + self.attached = true; + + // With NO_LOCAL_HOST the virtual disk handle itself supports + // ReadFile/WriteFile at virtual-disk offsets. No PhysicalDrive path. + RawDiskHandle { + handle: self.handle, + owned: false, + } + } +} + +impl Drop for NativeVhdx { + fn drop(&mut self) { + if self.attached { + // SAFETY: Handle is valid and was successfully attached. + let _ = unsafe { DetachVirtualDisk(self.handle, Default::default(), 0) }; + self.attached = false; + } + if !self.handle.is_invalid() { + // SAFETY: Handle is valid (from Create/OpenVirtualDisk). + let _ = unsafe { CloseHandle(self.handle) }; + } + } +} + +// --------------------------------------------------------------------- +// RawDiskHandle — read/write at byte offsets on attached virtual disk +// --------------------------------------------------------------------- + +struct RawDiskHandle { + handle: HANDLE, + /// Whether this handle is owned (should be closed on drop). + /// When borrowed from NativeVhdx (NO_LOCAL_HOST attach), this is false. + owned: bool, +} + +impl RawDiskHandle { + /// Read `buf.len()` bytes from the raw disk at the given byte offset. + /// Offset and length must be sector-aligned (multiples of 512). + fn read_at(&self, offset: u64, buf: &mut [u8]) -> io::Result { + // SAFETY: Creating a manual-reset event for overlapped I/O. 
+ let event = unsafe { CreateEventW(None, true, false, None) } + .map_err(|e| io::Error::from_raw_os_error(e.code().0))?; + let mut overlapped: OVERLAPPED = Default::default(); + overlapped.Anonymous.Anonymous.Offset = (offset & 0xFFFF_FFFF) as u32; + overlapped.Anonymous.Anonymous.OffsetHigh = (offset >> 32) as u32; + overlapped.hEvent = event; + + let mut bytes_read = 0u32; + // SAFETY: Handle is valid, buf is valid for buf.len() bytes, + // overlapped is correctly initialized with event and offset. + let result = unsafe { + ReadFile( + self.handle, + Some(buf), + Some(&mut bytes_read), + Some(&mut overlapped), + ) + }; + match result { + Ok(()) => {} + Err(e) if e.code() == windows::Win32::Foundation::ERROR_IO_PENDING.into() => { + // ERROR_IO_PENDING — wait for completion. + // SAFETY: Handle and overlapped are valid; bWait=true blocks. + unsafe { GetOverlappedResult(self.handle, &overlapped, &mut bytes_read, true) } + .map_err(|e| io::Error::from_raw_os_error(e.code().0))?; + } + Err(e) => { + // SAFETY: Event handle is valid. + let _ = unsafe { CloseHandle(event) }; + return Err(io::Error::from_raw_os_error(e.code().0)); + } + } + // SAFETY: Event handle is valid. + let _ = unsafe { CloseHandle(event) }; + Ok(bytes_read as usize) + } + + /// Write `data.len()` bytes to the raw disk at the given byte offset. + /// Offset and length must be sector-aligned (multiples of 512). + fn write_at(&self, offset: u64, data: &[u8]) -> io::Result { + // SAFETY: Creating a manual-reset event for overlapped I/O. + let event = unsafe { CreateEventW(None, true, false, None) } + .map_err(|e| io::Error::from_raw_os_error(e.code().0))?; + let mut overlapped: OVERLAPPED = Default::default(); + overlapped.Anonymous.Anonymous.Offset = (offset & 0xFFFF_FFFF) as u32; + overlapped.Anonymous.Anonymous.OffsetHigh = (offset >> 32) as u32; + overlapped.hEvent = event; + + let mut bytes_written = 0u32; + // SAFETY: Handle is valid, data is valid for data.len() bytes, + // overlapped is correctly initialized with event and offset. + let result = unsafe { + WriteFile( + self.handle, + Some(data), + Some(&mut bytes_written), + Some(&mut overlapped), + ) + }; + match result { + Ok(()) => {} + Err(e) if e.code() == windows::Win32::Foundation::ERROR_IO_PENDING.into() => { + // ERROR_IO_PENDING — wait for completion. + // SAFETY: Handle and overlapped are valid; bWait=true blocks. + unsafe { GetOverlappedResult(self.handle, &overlapped, &mut bytes_written, true) } + .map_err(|e| io::Error::from_raw_os_error(e.code().0))?; + } + Err(e) => { + // SAFETY: Event handle is valid. + let _ = unsafe { CloseHandle(event) }; + return Err(io::Error::from_raw_os_error(e.code().0)); + } + } + // SAFETY: Event handle is valid. + let _ = unsafe { CloseHandle(event) }; + Ok(bytes_written as usize) + } +} + +impl Drop for RawDiskHandle { + fn drop(&mut self) { + if self.owned && !self.handle.is_invalid() { + // SAFETY: Handle is valid and owned by this struct. + let _ = unsafe { CloseHandle(self.handle) }; + } + } +} + +// --------------------------------------------------------------------- +// RustVhdx — helper wrapping the Rust VHDX API for test scenarios +// --------------------------------------------------------------------- + +struct RustVhdx { + vhdx: vhdx::VhdxFile, + /// Separate file handle for data I/O (shared backing path). 
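+    /// The vhdx crate only resolves virtual offsets into file ranges; the test
+    /// issues the actual data reads and writes itself through this handle.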
+    io_file: Arc<StdFile>,
+}
+
+impl RustVhdx {
+    async fn create(path: &Path, disk_size: u64, block_size: u32, driver: &DefaultDriver) -> Self {
+        let file = StdFile::create(path).expect("create backing file");
+        let mut params = vhdx::CreateParams {
+            disk_size,
+            block_size,
+            ..Default::default()
+        };
+        vhdx::create(&file, &mut params).await.expect("vhdx create");
+        drop(file);
+
+        // Re-open for use with log task.
+        Self::open(path, false, Some(driver)).await
+    }
+
+    async fn open(path: &Path, read_only: bool, driver: Option<&DefaultDriver>) -> Self {
+        let file = StdFile::open(path, read_only).expect("open backing file");
+        let io_file = Arc::new(StdFile::open(path, read_only).expect("open io file"));
+        let vhdx = if read_only {
+            vhdx::VhdxFile::open(file)
+                .read_only()
+                .await
+                .expect("vhdx open")
+        } else {
+            let driver = driver.expect("writable open requires a driver/spawner");
+            vhdx::VhdxFile::open(file)
+                .writable(driver)
+                .await
+                .expect("vhdx open_writable")
+        };
+        RustVhdx { vhdx, io_file }
+    }
+
+    /// Read data at a virtual offset. Returns a Vec of `len` bytes.
+    async fn read_data(&self, offset: u64, len: u32) -> Vec<u8> {
+        let mut ranges = Vec::new();
+        let guard = self
+            .vhdx
+            .resolve_read(offset, len, &mut ranges)
+            .await
+            .expect("resolve_read");
+
+        let mut result = vec![0u8; len as usize];
+
+        for range in &ranges {
+            match range {
+                ReadRange::Data {
+                    guest_offset,
+                    length,
+                    file_offset,
+                } => {
+                    let buf_offset = (*guest_offset - offset) as usize;
+                    let buf_len = *length as usize;
+                    self.io_file
+                        .read_at(*file_offset, &mut result[buf_offset..buf_offset + buf_len])
+                        .await
+                        .expect("read data from file");
+                }
+                ReadRange::Zero { .. } | ReadRange::Unmapped { .. } => {
+                    // Already zero-initialized.
+                }
+            }
+        }
+
+        drop(guard);
+        result
+    }
+
+    /// Write data at a virtual offset.
+    async fn write_data(&self, offset: u64, data: &[u8]) {
+        let mut ranges = Vec::new();
+        let guard = self
+            .vhdx
+            .resolve_write(offset, data.len() as u32, &mut ranges)
+            .await
+            .expect("resolve_write");
+
+        for range in &ranges {
+            match range {
+                WriteRange::Data {
+                    guest_offset,
+                    length,
+                    file_offset,
+                } => {
+                    let buf_offset = (*guest_offset - offset) as usize;
+                    let buf_len = *length as usize;
+                    self.io_file
+                        .write_at(*file_offset, &data[buf_offset..buf_offset + buf_len])
+                        .await
+                        .expect("write data to file");
+                }
+                WriteRange::Zero {
+                    file_offset,
+                    length,
+                } => {
+                    let zeros = vec![0u8; *length as usize];
+                    self.io_file
+                        .write_at(*file_offset, &zeros)
+                        .await
+                        .expect("zero-fill file range");
+                }
+            }
+        }
+
+        guard.complete().await.expect("write complete");
+    }
+
+    /// Flush the VHDX file.
+    async fn flush(&self) {
+        self.vhdx.flush().await.expect("flush");
+    }
+
+    /// Trim a range of the virtual disk.
+    async fn trim_range(&self, offset: u64, length: u64) {
+        self.vhdx
+            .trim(TrimRequest::new(TrimMode::Zero, offset, length))
+            .await
+            .expect("trim");
+    }
+
+    /// Close the VHDX (consume self).
+    async fn close(self) {
+        self.vhdx.close().await.expect("close");
+    }
+
+    /// Abort (crash) the VHDX — drops without clean close, leaving a dirty log.
+    async fn abort(self) {
+        self.vhdx.abort().await;
+    }
+}
+
+// =====================================================================
+// Test Data Pattern
+// =====================================================================
+
+/// Generate a test pattern for a given offset: the pattern byte
+/// is derived from the offset so each location has unique data.
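+/// A mismatch between what one implementation wrote and what the other reads
+/// back therefore shows up as a byte-level difference rather than a missing block.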
+fn test_pattern(offset: u64, len: usize) -> Vec<u8> {
+    (0..len)
+        .map(|i| ((offset as usize + i) % 251) as u8) // prime modulus so the pattern never repeats on power-of-two boundaries
+        .collect()
+}
+
+// =====================================================================
+// Test Cases
+// =====================================================================
+
+/// Test 1: Native-Create → Rust-Open (Metadata Check)
+///
+/// Native creates a dynamic VHDX (1 GiB) → close → Rust opens → verify
+/// disk geometry matches.
+#[pal_async::async_test]
+async fn native_create_rust_open_metadata() {
+    let dir = tempfile::tempdir().unwrap();
+    let vhdx_path = dir.path().join("test.vhdx");
+
+    // Native create: 1 GiB, default block/sector sizes (pass 0 for defaults).
+    {
+        let _native = NativeVhdx::create_dynamic(&vhdx_path, 1024 * 1024 * 1024, 0, 0);
+        // Drop closes the handle.
+    }
+
+    // Rust open and verify metadata.
+    let rust = RustVhdx::open(&vhdx_path, true, None).await;
+
+    // Native defaults: 1 GiB disk, typically 32 MiB block size, 512-byte logical sectors.
+    assert_eq!(rust.vhdx.disk_size(), 1024 * 1024 * 1024, "disk_size");
+    // The native default block size is typically 32 MiB, but may vary.
+    // Just assert it's a power of 2 and > 0.
+    let block_size = rust.vhdx.block_size();
+    assert!(block_size > 0 && block_size.is_power_of_two(), "block_size");
+    // Sector sizes: native defaults to 512 logical, 4096 physical.
+    assert_eq!(rust.vhdx.logical_sector_size(), 512, "logical_sector_size");
+    assert_eq!(
+        rust.vhdx.physical_sector_size(),
+        4096,
+        "physical_sector_size"
+    );
+
+    rust.close().await;
+}
+
+/// Test 2: Rust-Create → Native-Open (Open Succeeds)
+///
+/// Rust creates a dynamic VHDX (1 GiB) → close → native OpenVirtualDisk
+/// succeeds.
+#[pal_async::async_test]
+async fn rust_create_native_open(driver: DefaultDriver) {
+    let dir = tempfile::tempdir().unwrap();
+    let vhdx_path = dir.path().join("test.vhdx");
+
+    // Rust create: 1 GiB, 2 MiB block size (Rust default), 512-byte sectors.
+    {
+        let rust = RustVhdx::create(&vhdx_path, 1024 * 1024 * 1024, 0, &driver).await;
+        rust.close().await;
+    }
+
+    // Native open — this is the most likely test to fail.
+    let _native = NativeVhdx::open(&vhdx_path, true);
+    // If we get here, the native stack accepted the Rust-created file.
+}
+
+/// Test 3: Rust-Create → Native-Attach → Raw-Read Zeros
+///
+/// Rust creates a small dynamic VHDX (4 MiB, 2 MiB blocks) → close →
+/// native opens → attach → raw-read first sector → verify all zeros.
+#[pal_async::async_test]
+async fn rust_create_native_attach_read_zeros(driver: DefaultDriver) {
+    let dir = tempfile::tempdir().unwrap();
+    let vhdx_path = dir.path().join("test.vhdx");
+
+    // Rust create: 4 MiB disk, 2 MiB block size.
+    {
+        let rust = RustVhdx::create(&vhdx_path, 4 * 1024 * 1024, 2 * 1024 * 1024, &driver).await;
+        rust.flush().await;
+        rust.close().await;
+    }
+
+    // Native open + attach.
+    let mut native = NativeVhdx::open(&vhdx_path, false);
+    let raw = native.attach_raw();
+
+    // Read the first sector (512 bytes) at offset 0.
+    let mut buf = vec![0xCCu8; 512];
+    let bytes_read = raw.read_at(0, &mut buf).expect("raw read at offset 0");
+    assert_eq!(bytes_read, 512, "expected 512 bytes read");
+
+    // A freshly-created, never-written VHDX should return all zeros.
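+    // (Block 0 was never written, so its BAT entry should still be unallocated and
+    // the native stack is expected to synthesize zeros for it.)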
+ assert!(buf.iter().all(|&b| b == 0), "first sector should be zeros"); +} + +/// Test 4: Native-Create → Native-Write → Rust-Read (Data) +/// +/// Native creates dynamic VHDX (1 GiB, default sizes) → attach → write +/// known patterns at 3 offsets across different blocks → detach → close → +/// Rust opens → reads at each offset → data matches. +#[pal_async::async_test] +async fn native_create_rust_read_data() { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + // Determine block size after native create (typically 32 MiB). + let block_size: u64; + + // Native create + write. + { + let mut native = NativeVhdx::create_dynamic(&vhdx_path, 1024 * 1024 * 1024, 0, 0); + let raw = native.attach_raw(); + + // We need to know the block size to write across blocks. + // Native defaults to 32 MiB blocks. + block_size = 32 * 1024 * 1024; + + let offsets = [0u64, block_size, 2 * block_size]; + for &off in &offsets { + let pattern = test_pattern(off, 512); + let written = raw.write_at(off, &pattern).expect("native write"); + assert_eq!(written, 512); + } + // Drop detaches and closes. + } + + // Rust open + read + verify. + let rust = RustVhdx::open(&vhdx_path, true, None).await; + + let offsets = [0u64, block_size, 2 * block_size]; + for &off in &offsets { + let expected = test_pattern(off, 512); + let actual = rust.read_data(off, 512).await; + assert_eq!(actual, expected, "data mismatch at offset {off:#x}"); + } + + rust.close().await; +} + +/// Test 5: Native-Create → Rust-Read (Custom 32 MiB Block Size) +/// +/// Native creates with explicit 32 MiB block size → Rust opens → +/// `block_size()` == 32 MiB. +#[pal_async::async_test] +async fn native_create_custom_block_size() { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + { + let _native = + NativeVhdx::create_dynamic(&vhdx_path, 1024 * 1024 * 1024, 32 * 1024 * 1024, 0); + } + + let rust = RustVhdx::open(&vhdx_path, true, None).await; + assert_eq!( + rust.vhdx.block_size(), + 33554432, + "block_size should be 32 MiB" + ); + rust.close().await; +} + +/// Test 6: Native-Create → Rust-Read (4K Logical Sector) +/// +/// Native creates with 4096 logical sector size → Rust opens → +/// `logical_sector_size()` == 4096. +#[pal_async::async_test] +async fn native_create_4k_sector() { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + { + let _native = NativeVhdx::create_dynamic(&vhdx_path, 1024 * 1024 * 1024, 0, 4096); + } + + let rust = RustVhdx::open(&vhdx_path, true, None).await; + assert_eq!( + rust.vhdx.logical_sector_size(), + 4096, + "logical_sector_size should be 4096" + ); + rust.close().await; +} + +/// Test 7: Rust-Create → Native-Read (Data) +/// +/// Rust creates + writes data at multiple offsets across block boundaries → +/// flush → close → native opens → attach → raw-read at each offset → +/// data matches. +#[pal_async::async_test] +async fn rust_create_native_read_data(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; // 2 MiB + + // Rust create + write. + { + let rust = RustVhdx::create(&vhdx_path, 32 * 1024 * 1024, block_size as u32, &driver).await; + + // Write to 3 different blocks (blocks 1, 3, 5 — skip block 0 since test 3 uses it). 
+ let offsets = [block_size, 3 * block_size, 5 * block_size]; + for &off in &offsets { + let pattern = test_pattern(off, 512); + rust.write_data(off, &pattern).await; + } + rust.flush().await; + rust.close().await; + } + + // Native open + attach + read + verify. + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + let offsets = [block_size, 3 * block_size, 5 * block_size]; + for &off in &offsets { + let expected = test_pattern(off, 512); + let mut buf = vec![0u8; 512]; + let bytes_read = raw.read_at(off, &mut buf).expect("native read"); + assert_eq!(bytes_read, 512); + assert_eq!(buf, expected, "data mismatch at offset {off:#x}"); + } +} + +/// Test 8: Rust-Create → Native-Open (Various Block Sizes) +/// +/// Rust creates VHDX files with 2 MiB, 4 MiB, and 32 MiB block sizes → +/// native opens each → open succeeds without error. +#[pal_async::async_test] +async fn rust_create_various_block_sizes(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let block_sizes: &[u32] = &[2 * 1024 * 1024, 4 * 1024 * 1024, 32 * 1024 * 1024]; + + for &bs in block_sizes { + let name = format!("test_bs_{bs}.vhdx"); + let vhdx_path = dir.path().join(&name); + + { + let rust = RustVhdx::create(&vhdx_path, 64 * 1024 * 1024, bs, &driver).await; + rust.close().await; + } + + let _native = NativeVhdx::open(&vhdx_path, true); + // If we get here, the native stack accepted the file. + } +} + +/// Test 9: Interleaved — Native-Write Then Rust-Write +/// +/// Native creates → attach → write region A (offset 0) → detach → close → +/// Rust opens → writes region B (second block) → flush → close → +/// native opens → attach → reads both regions → both intact. +#[pal_async::async_test] +async fn interleaved_native_then_rust(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 32 * 1024 * 1024; // native default + + // Step 1: Native create + write region A at offset 0. + { + let mut native = NativeVhdx::create_dynamic(&vhdx_path, 1024 * 1024 * 1024, 0, 0); + let raw = native.attach_raw(); + let pattern_a = test_pattern(0, 512); + let written = raw.write_at(0, &pattern_a).expect("native write region A"); + assert_eq!(written, 512); + } + + // Step 2: Rust opens → writes region B at block_size offset. + { + let rust = RustVhdx::open(&vhdx_path, false, Some(&driver)).await; + let pattern_b = test_pattern(block_size, 512); + rust.write_data(block_size, &pattern_b).await; + rust.flush().await; + rust.close().await; + } + + // Step 3: Native opens → reads both regions → verifies. + { + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + let expected_a = test_pattern(0, 512); + let mut buf_a = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf_a).expect("read region A"); + assert_eq!(bytes, 512); + assert_eq!(buf_a, expected_a, "region A corrupted"); + + let expected_b = test_pattern(block_size, 512); + let mut buf_b = vec![0u8; 512]; + let bytes = raw.read_at(block_size, &mut buf_b).expect("read region B"); + assert_eq!(bytes, 512); + assert_eq!(buf_b, expected_b, "region B corrupted"); + } +} + +/// Test 10: Interleaved — Rust-Write Then Native-Write +/// +/// Rust creates → writes blocks 0, 2, 4 → flush → close → +/// native opens → attach → writes blocks 1, 3 → detach → close → +/// Rust opens → reads all blocks → all data intact. 
+#[pal_async::async_test] +async fn interleaved_rust_then_native(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; // 2 MiB + + // Step 1: Rust create + write blocks 0, 2, 4. + let rust_offsets = [0u64, 2 * block_size, 4 * block_size]; + { + let rust = RustVhdx::create(&vhdx_path, 32 * 1024 * 1024, block_size as u32, &driver).await; + for &off in &rust_offsets { + rust.write_data(off, &test_pattern(off, 512)).await; + } + rust.flush().await; + rust.close().await; + } + + // Step 2: Native opens → writes blocks 1, 3. + let native_offsets = [block_size, 3 * block_size]; + { + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + for &off in &native_offsets { + let pattern = test_pattern(off, 512); + let written = raw.write_at(off, &pattern).expect("native write"); + assert_eq!(written, 512); + } + } + + // Step 3: Rust opens → reads all blocks → verifies. + { + let rust = RustVhdx::open(&vhdx_path, true, None).await; + + for &off in rust_offsets.iter().chain(native_offsets.iter()) { + let expected = test_pattern(off, 512); + let actual = rust.read_data(off, 512).await; + assert_eq!(actual, expected, "data mismatch at offset {off:#x}"); + } + + rust.close().await; + } +} + +/// Test 11: Three-Way Round-Trip +/// +/// Rust creates → writes block 0 → flush → close → native opens → attach → +/// writes block 1 → detach → close → Rust opens → reads blocks 0 and 1 → +/// both correct. +#[pal_async::async_test] +async fn three_way_round_trip(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Step 1: Rust creates and writes block 0. + { + let rust = RustVhdx::create(&vhdx_path, 16 * 1024 * 1024, block_size as u32, &driver).await; + rust.write_data(0, &test_pattern(0, 512)).await; + rust.flush().await; + rust.close().await; + } + + // Step 2: Native opens and writes block 1. + { + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + let pattern = test_pattern(block_size, 512); + let written = raw + .write_at(block_size, &pattern) + .expect("native write block 1"); + assert_eq!(written, 512); + } + + // Step 3: Rust opens → reads blocks 0 and 1 → verifies. + { + let rust = RustVhdx::open(&vhdx_path, true, None).await; + + let data0 = rust.read_data(0, 512).await; + assert_eq!(data0, test_pattern(0, 512), "block 0 data mismatch"); + + let data1 = rust.read_data(block_size, 512).await; + assert_eq!( + data1, + test_pattern(block_size, 512), + "block 1 data mismatch" + ); + + rust.close().await; + } +} + +/// Test 12: Trim — Rust-Trim → Native-Read +/// +/// Rust creates small disk (4 MiB, 2 MiB blocks) → writes all blocks → +/// trims block 1 → flush → close → native opens → attach → +/// raw-read block 0 (data intact) → raw-read block 1 (zeros). +#[pal_async::async_test] +async fn trim_rust_trim_native_read(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Rust create + write both blocks + trim block 1. + { + let rust = RustVhdx::create(&vhdx_path, 4 * 1024 * 1024, block_size as u32, &driver).await; + + // Write block 0 and block 1. 
+        rust.write_data(0, &test_pattern(0, 512)).await;
+        rust.write_data(block_size, &test_pattern(block_size, 512))
+            .await;
+        rust.flush().await;
+
+        // Trim block 1 entirely.
+        rust.trim_range(block_size, block_size).await;
+        rust.flush().await;
+        rust.close().await;
+    }
+
+    // Native open + attach + verify.
+    let mut native = NativeVhdx::open(&vhdx_path, false);
+    let raw = native.attach_raw();
+
+    // Block 0 should still have data.
+    let mut buf0 = vec![0u8; 512];
+    let bytes = raw.read_at(0, &mut buf0).expect("read block 0");
+    assert_eq!(bytes, 512);
+    assert_eq!(buf0, test_pattern(0, 512), "block 0 should be intact");
+
+    // Block 1 should be zeros after trim.
+    let mut buf1 = vec![0u8; 512];
+    let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1");
+    assert_eq!(bytes, 512);
+    assert!(
+        buf1.iter().all(|&b| b == 0),
+        "block 1 should be zeros after trim"
+    );
+}
+
+/// Test 13: Trim — Native-Write → Rust-Trim → Native-Read
+///
+/// Rust creates (small block size) → native opens → attach → writes blocks
+/// 0 and 1 → detach → close → Rust opens → trims block 1 → flush → close →
+/// native opens → attach → raw-read block 0 (intact) → raw-read block 1
+/// (zeros).
+#[pal_async::async_test]
+async fn trim_native_write_rust_trim_native_read(driver: DefaultDriver) {
+    let dir = tempfile::tempdir().unwrap();
+    let vhdx_path = dir.path().join("test.vhdx");
+
+    // Native default block size is 32 MiB. Use a smaller Rust-created disk
+    // so trim covers a full block efficiently.
+    let block_size: u64 = 2 * 1024 * 1024;
+
+    // Step 1: Rust creates to control block size, then close.
+    {
+        let rust = RustVhdx::create(&vhdx_path, 8 * 1024 * 1024, block_size as u32, &driver).await;
+        rust.close().await;
+    }
+
+    // Step 2: Native writes blocks 0 and 1.
+    {
+        let mut native = NativeVhdx::open(&vhdx_path, false);
+        let raw = native.attach_raw();
+
+        let written = raw
+            .write_at(0, &test_pattern(0, 512))
+            .expect("native write block 0");
+        assert_eq!(written, 512);
+
+        let written = raw
+            .write_at(block_size, &test_pattern(block_size, 512))
+            .expect("native write block 1");
+        assert_eq!(written, 512);
+    }
+
+    // Step 3: Rust opens → trims block 1 → flush → close.
+    {
+        let rust = RustVhdx::open(&vhdx_path, false, Some(&driver)).await;
+        rust.trim_range(block_size, block_size).await;
+        rust.flush().await;
+        rust.close().await;
+    }
+
+    // Step 4: Native opens → reads → verifies.
+    {
+        let mut native = NativeVhdx::open(&vhdx_path, false);
+        let raw = native.attach_raw();
+
+        // Block 0 intact.
+        let mut buf0 = vec![0u8; 512];
+        let bytes = raw.read_at(0, &mut buf0).expect("read block 0");
+        assert_eq!(bytes, 512);
+        assert_eq!(buf0, test_pattern(0, 512), "block 0 should be intact");
+
+        // Block 1 zeros.
+        let mut buf1 = vec![0u8; 512];
+        let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1");
+        assert_eq!(bytes, 512);
+        assert!(
+            buf1.iter().all(|&b| b == 0),
+            "block 1 should be zeros after trim"
+        );
+    }
+}
+
+// =====================================================================
+// Differencing Disk Helpers
+// =====================================================================
+
+impl NativeVhdx {
+    /// Create a differencing VHDX child (parent must already exist).
+    fn create_differencing(path: &Path, parent_path: &Path) -> Self {
+        let storage_type = VIRTUAL_STORAGE_TYPE {
+            DeviceId: VIRTUAL_STORAGE_TYPE_DEVICE_VHDX,
+            VendorId: VIRTUAL_STORAGE_TYPE_VENDOR_MICROSOFT,
+        };
+
+        let wide = to_wide(path);
+        let parent_wide = to_wide(parent_path);
+
+        let mut params = CREATE_VIRTUAL_DISK_PARAMETERS {
+            Version: CREATE_VIRTUAL_DISK_VERSION_2,
+            ..Default::default()
+        };
+        // ParentPath tells CreateVirtualDisk to create a differencing child.
+        // MaximumSize, BlockSizeInBytes, and SectorSizeInBytes are inherited
+        // from the parent (set to 0 / left default).
+        params.Anonymous.Version2.ParentPath = PCWSTR(parent_wide.as_ptr());
+
+        let mut handle = HANDLE::default();
+
+        // SAFETY: All parameters are correctly initialized, paths are
+        // null-terminated, and handle is written by the API on success.
+        // `parent_wide` is alive for the duration of this call.
+        let result = unsafe {
+            CreateVirtualDisk(
+                &storage_type,
+                PCWSTR(wide.as_ptr()),
+                VIRTUAL_DISK_ACCESS_MASK(0),
+                None,
+                CREATE_VIRTUAL_DISK_FLAG_NONE,
+                0,
+                &params,
+                None,
+                &mut handle,
+            )
+        };
+        assert!(
+            result.is_ok(),
+            "CreateVirtualDisk (differencing) failed: {result:?}"
+        );
+
+        NativeVhdx {
+            handle,
+            attached: false,
+        }
+    }
+}
+
+impl RustVhdx {
+    /// Create a differencing VHDX via the Rust API (`has_parent: true`).
+    ///
+    /// No parent locator is written — this is sufficient for Rust-only
+    /// chained reads but NOT for native-open.
+    async fn create_diff(
+        path: &Path,
+        disk_size: u64,
+        block_size: u32,
+        driver: &DefaultDriver,
+    ) -> Self {
+        let file = StdFile::create(path).expect("create backing file");
+        let mut params = vhdx::CreateParams {
+            disk_size,
+            block_size,
+            has_parent: true,
+            ..Default::default()
+        };
+        vhdx::create(&file, &mut params)
+            .await
+            .expect("vhdx create diff");
+        drop(file);
+
+        Self::open(path, false, Some(driver)).await
+    }
+}
+
+/// Read data from a child, resolving Unmapped ranges from the parent.
+///
+/// For each `ReadRange::Unmapped` in the child's read resolution,
+/// reads the corresponding range from the parent. `Data` and `Zero`
+/// ranges are handled normally from the child.
+async fn chained_read(child: &RustVhdx, parent: &RustVhdx, offset: u64, len: u32) -> Vec<u8> {
+    let mut ranges = Vec::new();
+    let guard = child
+        .vhdx
+        .resolve_read(offset, len, &mut ranges)
+        .await
+        .expect("child resolve_read");
+
+    let mut result = vec![0u8; len as usize];
+
+    for range in &ranges {
+        match range {
+            ReadRange::Data {
+                guest_offset,
+                length,
+                file_offset,
+            } => {
+                let buf_offset = (*guest_offset - offset) as usize;
+                let buf_len = *length as usize;
+                child
+                    .io_file
+                    .read_at(*file_offset, &mut result[buf_offset..buf_offset + buf_len])
+                    .await
+                    .expect("read child data");
+            }
+            ReadRange::Zero {
+                guest_offset,
+                length,
+            } => {
+                // Already zero-initialized in result.
+                let _ = (guest_offset, length);
+            }
+            ReadRange::Unmapped {
+                guest_offset,
+                length,
+            } => {
+                // Fall through to parent.
+ let parent_data = parent.read_data(*guest_offset, *length).await; + let buf_offset = (*guest_offset - offset) as usize; + result[buf_offset..buf_offset + parent_data.len()].copy_from_slice(&parent_data); + } + } + } + + drop(guard); + result +} + +// ===================================================================== +// Differencing Disk Test Cases +// ===================================================================== + +/// Test 14: Rust-Only Chained Read — Unwritten Child +/// +/// Rust creates parent + writes data → Rust creates diff child → +/// child read returns zeros (Unmapped) → chained_read falls through +/// to parent → data matches. +#[pal_async::async_test] +async fn diff_rust_chained_read_unwritten_child(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let parent_path = dir.path().join("parent.vhdx"); + let child_path = dir.path().join("child.vhdx"); + + let block_size: u32 = 2 * 1024 * 1024; + let disk_size: u64 = 4 * 1024 * 1024; + + // Step 1: Rust-create parent, write test_pattern at offset 0. + { + let parent = RustVhdx::create(&parent_path, disk_size, block_size, &driver).await; + parent.write_data(0, &test_pattern(0, 512)).await; + parent.flush().await; + parent.close().await; + } + + // Step 2: Rust-create diff child (has_parent: true). + let child = RustVhdx::create_diff(&child_path, disk_size, block_size, &driver).await; + + // Step 3: child.read_data returns zeros (Unmapped treated as zero). + let child_data = child.read_data(0, 512).await; + assert!( + child_data.iter().all(|&b| b == 0), + "unwritten child should return zeros" + ); + + // Step 4: chained_read falls through to parent. + let parent = RustVhdx::open(&parent_path, true, None).await; + let chained = chained_read(&child, &parent, 0, 512).await; + assert_eq!( + chained, + test_pattern(0, 512), + "chained read should return parent data" + ); + + // Step 5: Verify child is a differencing disk. + assert!(child.vhdx.has_parent(), "child should have has_parent set"); + + child.close().await; + parent.close().await; +} + +/// Test 15: Rust-Only Chained Read — Partial Block Write +/// +/// Rust creates parent + writes 2 sectors → Rust creates diff child → +/// writes 1 sector to child with different data → chained_read returns +/// child data for written sector, parent data for unwritten sector. +/// +/// This exercises PartiallyPresent block handling: the Rust write allocates +/// the block as PartiallyPresent (not FullyPresent), and the sector bitmap +/// tracks which sectors are present in the child vs. transparent to parent. +#[pal_async::async_test] +async fn diff_rust_chained_read_partial_block(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let parent_path = dir.path().join("parent.vhdx"); + let child_path = dir.path().join("child.vhdx"); + + let block_size: u32 = 2 * 1024 * 1024; + let disk_size: u64 = 4 * 1024 * 1024; + + // Step 1: Rust-create parent, write 2 sectors at offset 0. + { + let parent = RustVhdx::create(&parent_path, disk_size, block_size, &driver).await; + parent.write_data(0, &test_pattern(0, 1024)).await; + parent.flush().await; + parent.close().await; + } + + // Step 2: Rust-create diff child. + let child = RustVhdx::create_diff(&child_path, disk_size, block_size, &driver).await; + + // Step 3: Write only sector 0 in child with a distinguishable pattern. + // The block should become PartiallyPresent with SBM bit 0 set. 
+ let child_pattern = vec![0xAA; 512]; + child.write_data(0, &child_pattern).await; + child.flush().await; + + // Step 4: chained_read should return child data for sector 0, + // parent data for sector 1. + let parent = RustVhdx::open(&parent_path, true, None).await; + let chained = chained_read(&child, &parent, 0, 1024).await; + + // Sector 0 (bytes 0..512): from child → [0xAA; 512] + assert_eq!( + &chained[..512], + &child_pattern[..], + "sector 0 should come from child" + ); + // Sector 1 (bytes 512..1024): from parent → test_pattern(512, 512) + assert_eq!( + &chained[512..1024], + &test_pattern(512, 512)[..], + "sector 1 should come from parent" + ); + + child.close().await; + parent.close().await; +} + +/// Test 16: Native-Create Diff → Rust Reads +/// +/// Native creates parent + writes data → native creates diff child → +/// writes different data to child block 0 → Rust opens child → reads +/// child data for block 0 + Unmapped for block 1 → chained_read resolves +/// parent data for block 1. +#[pal_async::async_test] +async fn diff_native_create_rust_reads() { + let dir = tempfile::tempdir().unwrap(); + let parent_path = dir.path().join("parent.vhdx"); + let child_path = dir.path().join("child.vhdx"); + + // Native default: 32 MiB blocks. + let block_size: u64 = 32 * 1024 * 1024; + + // Step 1: Native-create parent (1 GiB). + // Write test_pattern at offset 0 and offset block_size. + { + let mut native = NativeVhdx::create_dynamic(&parent_path, 1024 * 1024 * 1024, 0, 0); + let raw = native.attach_raw(); + + let written = raw + .write_at(0, &test_pattern(0, 512)) + .expect("write parent block 0"); + assert_eq!(written, 512); + + let written = raw + .write_at(block_size, &test_pattern(block_size, 512)) + .expect("write parent block 1"); + assert_eq!(written, 512); + } + + // Step 2: Native-create differencing child. + // Write [0xBB; 512] at offset 0 (overwrites parent's block 0). + { + let mut native = NativeVhdx::create_differencing(&child_path, &parent_path); + let raw = native.attach_raw(); + + let child_data = vec![0xBBu8; 512]; + let written = raw.write_at(0, &child_data).expect("write child block 0"); + assert_eq!(written, 512); + } + + // Step 3: Rust opens child (read-only). + let child = RustVhdx::open(&child_path, true, None).await; + + // Block 0, sector 0: child has data → should be [0xBB; 512]. + let data_block0 = child.read_data(0, 512).await; + assert_eq!( + data_block0, + vec![0xBBu8; 512], + "child block 0 sector 0 should be 0xBB" + ); + + // Block 1: Unmapped in child → read_data returns zeros. + let data_block1 = child.read_data(block_size, 512).await; + assert!( + data_block1.iter().all(|&b| b == 0), + "child block 1 should be zeros (Unmapped)" + ); + + // Step 4: Rust opens parent (read-only). + let parent = RustVhdx::open(&parent_path, true, None).await; + + // Verify parent block 1 data directly. + let parent_block1 = parent.read_data(block_size, 512).await; + assert_eq!( + parent_block1, + test_pattern(block_size, 512), + "parent block 1 should have original data" + ); + + // Step 5: chained_read for block 1 → falls through to parent. 
+ let chained = chained_read(&child, &parent, block_size, 512).await; + assert_eq!( + chained, + test_pattern(block_size, 512), + "chained read block 1 should return parent data" + ); + + child.close().await; + parent.close().await; +} + +/// Test 17: Native-Create Diff → Rust Reads Empty Child +/// +/// Native creates parent + writes data → native creates diff child → +/// no writes to child → Rust reads child → all Unmapped → chained read +/// falls through to parent. +#[pal_async::async_test] +async fn diff_native_create_empty_child_rust_reads() { + let dir = tempfile::tempdir().unwrap(); + let parent_path = dir.path().join("parent.vhdx"); + let child_path = dir.path().join("child.vhdx"); + + // Step 1: Native-create parent (1 GiB), write data at offset 0. + { + let mut native = NativeVhdx::create_dynamic(&parent_path, 1024 * 1024 * 1024, 0, 0); + let raw = native.attach_raw(); + + let written = raw + .write_at(0, &test_pattern(0, 512)) + .expect("write parent"); + assert_eq!(written, 512); + } + + // Step 2: Native-create differencing child (no writes). + { + let _native = NativeVhdx::create_differencing(&child_path, &parent_path); + } + + // Step 3: Rust opens child. + let child = RustVhdx::open(&child_path, true, None).await; + + // Child has_parent should be true. + assert!(child.vhdx.has_parent(), "child should be a diff disk"); + + // read_data returns zeros (Unmapped). + let child_data = child.read_data(0, 512).await; + assert!( + child_data.iter().all(|&b| b == 0), + "empty child should return zeros" + ); + + // Step 4: Rust opens parent; chained_read falls through. + let parent = RustVhdx::open(&parent_path, true, None).await; + let chained = chained_read(&child, &parent, 0, 512).await; + assert_eq!( + chained, + test_pattern(0, 512), + "chained read should return parent data" + ); + + child.close().await; + parent.close().await; +} + +/// Test 18: Rust Writes to Native-Created Diff +/// +/// Native creates parent → writes data at offsets 0 and 512 → native creates +/// diff child → Rust opens child writable → writes sector 0 with different +/// data → close → native opens child (with parent chain) → attach → +/// raw-read → child data present at sector 0, parent data for sector 1 +/// (unwritten in child, falls through via native chain and SBM resolution). +#[pal_async::async_test] +async fn diff_rust_writes_to_native_diff(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let parent_path = dir.path().join("parent.vhdx"); + let child_path = dir.path().join("child.vhdx"); + + // Step 1: Native-create parent (1 GiB), write data at offsets 0 and 512. + { + let mut native = NativeVhdx::create_dynamic(&parent_path, 1024 * 1024 * 1024, 0, 0); + let raw = native.attach_raw(); + + let written = raw + .write_at(0, &test_pattern(0, 512)) + .expect("write parent sector 0"); + assert_eq!(written, 512); + + let written = raw + .write_at(512, &test_pattern(512, 512)) + .expect("write parent sector 1"); + assert_eq!(written, 512); + } + + // Step 2: Native-create diff child (no writes yet). + { + let _native = NativeVhdx::create_differencing(&child_path, &parent_path); + } + + // Step 3: Rust opens child writable, writes only sector 0. + // The block should become PartiallyPresent with SBM bit 0 set. + { + let child = RustVhdx::open(&child_path, false, Some(&driver)).await; + let child_data = vec![0xCCu8; 512]; + child.write_data(0, &child_data).await; + child.flush().await; + child.close().await; + } + + // Step 4: Native opens child (chain resolves automatically). 
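+    // (The child carries a parent locator written by CreateVirtualDisk, so the
+    // native stack can resolve the parent chain without being told the path.)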
+ // Sector 0: from child (SBM bit set) → [0xCC; 512] + // Sector 1: from parent (SBM bit clear, falls through) → test_pattern(512, 512) + { + let mut native = NativeVhdx::open(&child_path, false); + let raw = native.attach_raw(); + + let mut buf0 = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf0).expect("read child sector 0"); + assert_eq!(bytes, 512); + assert_eq!(buf0, vec![0xCCu8; 512], "sector 0 should be child's data"); + + let mut buf1 = vec![0u8; 512]; + let bytes = raw.read_at(512, &mut buf1).expect("read child sector 1"); + assert_eq!(bytes, 512); + assert_eq!( + buf1, + test_pattern(512, 512), + "sector 1 should come from parent via chain" + ); + } +} + +/// Test 19: Rust Writes + Trims in Diff Child +/// +/// Rust-create parent → write data to blocks 0 and 1 → native-create diff +/// child → Rust writes to child blocks 0 and 1 → Rust trims block 1 → +/// native reads → block 0 has child data, block 1 is zeros. +#[pal_async::async_test] +async fn diff_rust_writes_and_trims(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let parent_path = dir.path().join("parent.vhdx"); + let child_path = dir.path().join("child.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Step 1: Rust-create parent (to control block size), write blocks 0 and 1. + { + let parent = + RustVhdx::create(&parent_path, 8 * 1024 * 1024, block_size as u32, &driver).await; + parent.write_data(0, &test_pattern(0, 512)).await; + parent + .write_data(block_size, &test_pattern(block_size, 512)) + .await; + parent.flush().await; + parent.close().await; + } + + // Step 2: Native-create diff child. + { + let _native = NativeVhdx::create_differencing(&child_path, &parent_path); + } + + // Step 3: Rust opens child writable. + // - Write [0xDD; 512] at offset 0 (block 0, sector 0) + // - Write [0xEE; 512] at offset block_size (block 1, sector 0) + // - Trim block 1 entirely + { + let child = RustVhdx::open(&child_path, false, Some(&driver)).await; + child.write_data(0, &vec![0xDDu8; 512]).await; + child.write_data(block_size, &vec![0xEEu8; 512]).await; + child.trim_range(block_size, block_size).await; + child.flush().await; + child.close().await; + } + + // Step 4: Native opens child (chain). Attach + read. + { + let mut native = NativeVhdx::open(&child_path, false); + let raw = native.attach_raw(); + + // Block 0: child's write → [0xDD; 512] + let mut buf0 = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf0).expect("read block 0"); + assert_eq!(bytes, 512); + assert_eq!(buf0, vec![0xDDu8; 512], "block 0 should be child's data"); + + // Block 1: trimmed → zeros (TrimMode::Zero makes block Zero state; + // through native chain, Zero means zeros). + let mut buf1 = vec![0u8; 512]; + let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1"); + assert_eq!(bytes, 512); + assert!( + buf1.iter().all(|&b| b == 0), + "block 1 should be zeros after trim" + ); + } +} + +// ===================================================================== +// Log Replay Cross-Validation +// ===================================================================== +// +// These tests exercise crash recovery scenarios where the Rust stack +// writes data with a dirty log (via `abort()`), and the native Windows +// VHD stack replays the log on open — or vice versa. +// +// The key API for simulating a crash in the Rust stack is `VhdxFile::abort()`: +// it drops the log channel without flushing, leaving the log GUID set in +// the header. 
The next open (by either stack) must replay the log before +// the file is usable. +// +// The native Windows VHD stack always performs a clean close on handle drop +// (it flushes the log and clears the log GUID), so we cannot easily create +// a dirty log via native. Tests focus on Rust-crash → Native-replay and +// full lifecycle interleaving scenarios. + +/// Test 20: Rust Crash → Native Replay +/// +/// Rust opens writable → writes data to two blocks → flush → abort +/// (simulated crash, log stays dirty) → native opens (replays log) → +/// attach → raw-read → data is present and correct. +#[pal_async::async_test] +async fn log_replay_rust_crash_native_reads(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Rust create + write + flush + abort (crash). + { + let rust = RustVhdx::create(&vhdx_path, 8 * 1024 * 1024, block_size as u32, &driver).await; + + // Write to blocks 0 and 1. + rust.write_data(0, &test_pattern(0, 512)).await; + rust.write_data(block_size, &test_pattern(block_size, 512)) + .await; + rust.flush().await; + + // Abort — leaves dirty log (log_guid is set in header). + rust.abort().await; + } + + // Native opens — should replay the dirty log automatically. + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + // Block 0: should have Rust's data after log replay. + let mut buf0 = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf0).expect("read block 0"); + assert_eq!(bytes, 512); + assert_eq!(buf0, test_pattern(0, 512), "block 0 data after log replay"); + + // Block 1: should have Rust's data after log replay. + let mut buf1 = vec![0u8; 512]; + let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1"); + assert_eq!(bytes, 512); + assert_eq!( + buf1, + test_pattern(block_size, 512), + "block 1 data after log replay" + ); +} + +/// Test 21: Rust Crash (Multiple Blocks) → Native Replay +/// +/// Rust opens writable → writes data to many blocks across the disk +/// (enough to exercise multiple log entries / batch commits) → flush → +/// abort → native opens (replays all log entries) → all data intact. +#[pal_async::async_test] +async fn log_replay_rust_crash_many_blocks_native_reads(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + let block_count = 8u64; + let disk_size = block_size * (block_count + 1); + + // Rust create + write all blocks + flush + abort. + { + let rust = RustVhdx::create(&vhdx_path, disk_size, block_size as u32, &driver).await; + + for i in 0..block_count { + let offset = i * block_size; + rust.write_data(offset, &test_pattern(offset, 512)).await; + } + rust.flush().await; + rust.abort().await; + } + + // Native opens (replays log) → attach → read all blocks. + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + for i in 0..block_count { + let offset = i * block_size; + let expected = test_pattern(offset, 512); + let mut buf = vec![0u8; 512]; + let bytes = raw.read_at(offset, &mut buf).expect("native read"); + assert_eq!(bytes, 512); + assert_eq!( + buf, expected, + "data mismatch at block {i} (offset {offset:#x})" + ); + } +} + +/// Test 22: Rust Crash → Rust Replay → Native Reads +/// +/// Rust writes → flush → abort → Rust reopens writable (replays log) → +/// clean close → native opens → data intact. 
This verifies Rust's own +/// log replay produces a file the native stack accepts. +#[pal_async::async_test] +async fn log_replay_rust_crash_rust_replay_native_reads(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Rust create + write + flush + abort. + { + let rust = RustVhdx::create(&vhdx_path, 8 * 1024 * 1024, block_size as u32, &driver).await; + rust.write_data(0, &test_pattern(0, 512)).await; + rust.write_data(block_size, &test_pattern(block_size, 512)) + .await; + rust.flush().await; + rust.abort().await; + } + + // Rust reopens writable (replays log) → verify data → clean close. + { + let rust = RustVhdx::open(&vhdx_path, false, Some(&driver)).await; + let data0 = rust.read_data(0, 512).await; + assert_eq!(data0, test_pattern(0, 512), "block 0 after Rust replay"); + let data1 = rust.read_data(block_size, 512).await; + assert_eq!( + data1, + test_pattern(block_size, 512), + "block 1 after Rust replay" + ); + rust.close().await; + } + + // Native opens the cleanly-closed file → data intact. + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + let mut buf0 = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf0).expect("read block 0"); + assert_eq!(bytes, 512); + assert_eq!(buf0, test_pattern(0, 512), "block 0 via native"); + + let mut buf1 = vec![0u8; 512]; + let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1"); + assert_eq!(bytes, 512); + assert_eq!(buf1, test_pattern(block_size, 512), "block 1 via native"); +} + +/// Test 23: Rust Crash → Native Replay → Native Writes More → Rust Reads +/// +/// Full lifecycle: Rust writes block 0 → abort (crash) → native opens +/// (replays log) → native writes block 1 → close → Rust opens → reads +/// both blocks → both correct. +#[pal_async::async_test] +async fn log_replay_lifecycle_crash_replay_more_writes(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Step 1: Rust create + write block 0 + flush + abort. + { + let rust = RustVhdx::create(&vhdx_path, 16 * 1024 * 1024, block_size as u32, &driver).await; + rust.write_data(0, &test_pattern(0, 512)).await; + rust.flush().await; + rust.abort().await; + } + + // Step 2: Native opens (replays dirty log) → writes block 1 → closes. + { + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + // Verify block 0 survived replay. + let mut buf = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf).expect("read block 0 after replay"); + assert_eq!(bytes, 512); + assert_eq!(buf, test_pattern(0, 512), "block 0 after native replay"); + + // Write block 1. + let pattern = test_pattern(block_size, 512); + let written = raw + .write_at(block_size, &pattern) + .expect("native write block 1"); + assert_eq!(written, 512); + } + + // Step 3: Rust opens → reads both blocks → verifies. 
+ { + let rust = RustVhdx::open(&vhdx_path, true, None).await; + + let data0 = rust.read_data(0, 512).await; + assert_eq!(data0, test_pattern(0, 512), "block 0 via Rust"); + + let data1 = rust.read_data(block_size, 512).await; + assert_eq!(data1, test_pattern(block_size, 512), "block 1 via Rust"); + + rust.close().await; + } +} + +/// Test 24: Rust Crash With Trim → Native Replay +/// +/// Rust creates → writes blocks 0 and 1 → trims block 1 → flush → abort → +/// native opens (replays log) → block 0 intact, block 1 is zeros. +/// Verifies that trim state is correctly captured in the WAL and replayed. +#[pal_async::async_test] +async fn log_replay_rust_crash_with_trim_native_reads(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Rust create + write both blocks + trim block 1 + flush + abort. + { + let rust = RustVhdx::create(&vhdx_path, 8 * 1024 * 1024, block_size as u32, &driver).await; + rust.write_data(0, &test_pattern(0, 512)).await; + rust.write_data(block_size, &test_pattern(block_size, 512)) + .await; + rust.flush().await; + + // Trim block 1 → BAT state change (Zero or Unmapped). + rust.trim_range(block_size, block_size).await; + rust.flush().await; + + rust.abort().await; + } + + // Native opens (replays log including the trim BAT update). + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + // Block 0: should have data. + let mut buf0 = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf0).expect("read block 0"); + assert_eq!(bytes, 512); + assert_eq!(buf0, test_pattern(0, 512), "block 0 should be intact"); + + // Block 1: should be zeros (trimmed). + let mut buf1 = vec![0u8; 512]; + let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1"); + assert_eq!(bytes, 512); + assert!( + buf1.iter().all(|&b| b == 0), + "block 1 zeros after trim + crash + log replay" + ); +} + +/// Test 25: Multiple Crash-Recovery Cycles via Native +/// +/// Rust writes → crash → native opens (replays) → writes more → close → +/// Rust writes → crash → native opens (replays) → all data intact. +/// Verifies that the log replay leaves the file in a clean state that +/// supports another full write-crash-recovery cycle. +#[pal_async::async_test] +async fn log_replay_repeated_crash_cycles(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Cycle 1: Rust writes block 0 → crash. + { + let rust = RustVhdx::create(&vhdx_path, 16 * 1024 * 1024, block_size as u32, &driver).await; + rust.write_data(0, &test_pattern(0, 512)).await; + rust.flush().await; + rust.abort().await; + } + + // Cycle 1 recovery: Native opens (replays) → writes block 1 → closes. + { + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + // Verify block 0 survived. + let mut buf = vec![0u8; 512]; + raw.read_at(0, &mut buf).expect("read block 0"); + assert_eq!(buf, test_pattern(0, 512), "cycle 1: block 0"); + + // Write block 1. + let written = raw + .write_at(block_size, &test_pattern(block_size, 512)) + .expect("native write block 1"); + assert_eq!(written, 512); + } + + // Cycle 2: Rust opens (clean file now) → writes block 2 → crash. 
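+    // (The native handle drop above performed a clean close, flushing the log
+    // and clearing the log GUID, so this Rust open has nothing to replay.)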
+ { + let rust = RustVhdx::open(&vhdx_path, false, Some(&driver)).await; + rust.write_data(2 * block_size, &test_pattern(2 * block_size, 512)) + .await; + rust.flush().await; + rust.abort().await; + } + + // Cycle 2 recovery: Native opens (replays) → reads all 3 blocks. + { + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + let mut buf0 = vec![0u8; 512]; + raw.read_at(0, &mut buf0).expect("read block 0"); + assert_eq!(buf0, test_pattern(0, 512), "cycle 2: block 0"); + + let mut buf1 = vec![0u8; 512]; + raw.read_at(block_size, &mut buf1).expect("read block 1"); + assert_eq!(buf1, test_pattern(block_size, 512), "cycle 2: block 1"); + + let mut buf2 = vec![0u8; 512]; + raw.read_at(2 * block_size, &mut buf2) + .expect("read block 2"); + assert_eq!(buf2, test_pattern(2 * block_size, 512), "cycle 2: block 2"); + } +} + +/// Test 26: Clean Rust File → Native Opens Without Replay +/// +/// Rust creates → writes → flush → close (clean shutdown) → native opens → +/// data intact. A cleanly-closed file should not trigger log replay. +#[pal_async::async_test] +async fn log_replay_clean_close_no_replay_needed(driver: DefaultDriver) { + let dir = tempfile::tempdir().unwrap(); + let vhdx_path = dir.path().join("test.vhdx"); + + let block_size: u64 = 2 * 1024 * 1024; + + // Rust create + write + flush + clean close. + { + let rust = RustVhdx::create(&vhdx_path, 8 * 1024 * 1024, block_size as u32, &driver).await; + rust.write_data(0, &test_pattern(0, 512)).await; + rust.write_data(block_size, &test_pattern(block_size, 512)) + .await; + rust.flush().await; + rust.close().await; + } + + // Native opens — should succeed without needing log replay. + let mut native = NativeVhdx::open(&vhdx_path, false); + let raw = native.attach_raw(); + + let mut buf0 = vec![0u8; 512]; + let bytes = raw.read_at(0, &mut buf0).expect("read block 0"); + assert_eq!(bytes, 512); + assert_eq!(buf0, test_pattern(0, 512), "block 0"); + + let mut buf1 = vec![0u8; 512]; + let bytes = raw.read_at(block_size, &mut buf1).expect("read block 1"); + assert_eq!(bytes, 512); + assert_eq!(buf1, test_pattern(block_size, 512), "block 1"); +}